From 5bce672c705e0791169f4d4711a033cbd6706997 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 6 Jun 2024 18:02:39 -0400 Subject: [PATCH] Reformatting --- CMakeLists.txt | 86 +- CMakePresets.json | 321 +- contrib/legendre_rule_fast.cpp | 274 +- contrib/legendre_rule_fast.h | 8 +- devel/eval_ker_expts.cpp | 49 +- devel/eval_ker_expts2.cpp | 62 +- devel/eval_ker_expts_libin_simd64.cpp | 75 +- devel/eval_ker_expts_ludvig.cpp | 76 +- devel/foldrescale.cpp | 151 +- devel/foldrescale_perf.cpp | 83 +- devel/foldrescale_perf2.cpp | 317 +- devel/interp_square_nowrap.cpp | 38 +- devel/padding.cpp | 257 +- devel/test_ker_ppval.cpp | 159 +- devel/time2d2interp.cpp | 102 +- examples/cuda/example2d1many.cpp | 176 +- examples/cuda/example2d2many.cpp | 193 +- examples/cuda/getting_started.cpp | 127 +- examples/guru1d1.cpp | 73 +- examples/guru1d1f.cpp | 73 +- examples/guru2d1.cpp | 87 +- examples/gurumany1d1.cpp | 79 +- examples/many1d1.cpp | 72 +- examples/simple1d1.cpp | 61 +- examples/simple1d1f.cpp | 60 +- examples/simple2d1.cpp | 85 +- examples/simulplans1d1.cpp | 87 +- examples/threadsafe1d1.cpp | 83 +- examples/threadsafe2d2f.cpp | 41 +- fortran/finufftfort.cpp | 227 +- include/cufinufft.h | 18 +- include/cufinufft/common.h | 31 +- include/cufinufft/contrib/helper_cuda.h | 124 +- include/cufinufft/cudeconvolve.h | 47 +- include/cufinufft/defs.h | 15 +- include/cufinufft/impl.h | 688 ++-- include/cufinufft/memtransfer.h | 21 +- include/cufinufft/precision_independent.h | 47 +- include/cufinufft/spreadinterp.h | 141 +- include/cufinufft/types.h | 141 +- include/cufinufft/utils.h | 63 +- include/cufinufft_opts.h | 34 +- include/finufft.h | 5 +- include/finufft/dirft.h | 22 +- include/finufft/fftw_defs.h | 52 +- include/finufft/spreadinterp.h | 60 +- include/finufft/test_defs.h | 10 +- include/finufft/utils.h | 25 +- include/finufft/utils_precindep.h | 51 +- include/finufft_eitherprec.h | 164 +- include/finufft_opts.h | 31 +- include/finufft_spread_opts.h | 30 +- matlab/finufft.cpp | 3626 ++++++++++----------- perftest/big2d2f.cpp | 30 +- perftest/cuda/cuperftest.cu | 485 ++- perftest/guru_timing_test.cpp | 468 +-- perftest/manysmallprobs.cpp | 82 +- perftest/spreadtestnd.cpp | 351 +- src/cuda/1d/cufinufft1d.cu | 137 +- src/cuda/1d/interp1d_wrapper.cu | 98 +- src/cuda/1d/spread1d_wrapper.cu | 409 +-- src/cuda/2d/cufinufft2d.cu | 138 +- src/cuda/2d/interp2d_wrapper.cu | 229 +- src/cuda/2d/spread2d_wrapper.cu | 457 +-- src/cuda/3d/cufinufft3d.cu | 132 +- src/cuda/3d/interp3d_wrapper.cu | 250 +- src/cuda/3d/spread3d_wrapper.cu | 946 +++--- src/cuda/common.cu | 229 +- src/cuda/cufinufft.cu | 131 +- src/cuda/deconvolve_wrapper.cu | 320 +- src/cuda/memtransfer_wrapper.cu | 601 ++-- src/cuda/precision_independent.cu | 359 +- src/cuda/spreadinterp.cpp | 118 +- src/cuda/utils.cpp | 45 +- src/finufft.cpp | 1174 +++---- src/ker_horner_allw_loop_constexpr.h | 1129 +++++-- src/simpleinterfaces.cpp | 299 +- src/utils.cpp | 64 +- src/utils_precindep.cpp | 50 +- test/basicpassfail.cpp | 55 +- test/cuda/cufinufft1d_test.cu | 361 +- test/cuda/cufinufft2d1nupts_test.cu | 404 +-- test/cuda/cufinufft2d_test.cu | 361 +- test/cuda/cufinufft2dmany_test.cu | 385 +-- test/cuda/cufinufft3d_test.cu | 388 +-- test/cuda/fseries_kernel_test.cu | 263 +- test/directft/dirft1d.cpp | 51 +- test/directft/dirft2d.cpp | 80 +- test/directft/dirft3d.cpp | 106 +- test/dumbinputs.cpp | 718 ++-- test/finufft1d_test.cpp | 248 +- test/finufft1dmany_test.cpp | 315 +- test/finufft2d_test.cpp | 260 +- test/finufft2dmany_test.cpp | 342 +- test/finufft3d_test.cpp | 282 +- test/finufft3dmany_test.cpp | 363 ++- test/testutils.cpp | 48 +- 97 files changed, 11922 insertions(+), 10867 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e4888955b..1626ad35e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,22 +5,22 @@ project(finufft VERSION 2.2.0 LANGUAGES C CXX) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(GNU_LIKE_FRONTENDS AppleClang Clang GNU) -if(CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) +if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) # Set custom compiler flags for gcc-compatible compilers set(FINUFFT_CXX_FLAGS_RELEASE -funroll-loops -ffp-contract=fast) set(FINUFFT_CXX_FLAGS_RELWITHDEBINFO -O3 -g -DNDEBUG ${FINUFFT_CXX_FLAGS_RELEASE}) -endif() +endif () include(CTest) if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|ppc64|powerpc|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64")) + if (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|ppc64|powerpc|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64")) # PowerPC arch does not have -march flag. set(FINUFFT_ARCH_FLAGS "-mtune=native" CACHE STRING "Compiler flags for specifying target architecture.") - else() + else () set(FINUFFT_ARCH_FLAGS "-march=native" CACHE STRING "Compiler flags for specifying target architecture.") - endif() -endif() + endif () +endif () set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)") set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library") @@ -38,16 +38,16 @@ option(FINUFFT_STATIC_LINKING "Whether to link the static FINUFFT library (libfi option(FINUFFT_BUILD_DEVEL "Whether to build development executables" OFF) # sphinx tag (don't remove): @cmake_opts_end -if(FINUFFT_USE_CPU) +if (FINUFFT_USE_CPU) # suppress Windows warnings about "unsafe" functions - if(WIN32) + if (WIN32) add_definitions(-D_CRT_SECURE_NO_WARNINGS) - endif() + endif () # make apple with gnu use old linker, new linker breaks, see issue #360 - if((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) + if ((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) add_link_options("-ld64") - endif() + endif () set(CPM_DOWNLOAD_VERSION 0.38.0) set(FFTW_VERSION 3.3.10) @@ -56,7 +56,7 @@ if(FINUFFT_USE_CPU) include(cmake/setupCPM.cmake) include(cmake/setupFFTW.cmake) include(cmake/setupXSIMD.cmake) -endif() +endif () if (FINUFFT_BUILD_MATLAB) # When building for matlab, we will fetch the OpenMP library used by matlab @@ -104,20 +104,20 @@ endfunction() # Utility function to link static/dynamic lib function(finufft_link_test target) - if(FINUFFT_STATIC_LINKING) + if (FINUFFT_STATIC_LINKING) target_link_libraries(${target} PRIVATE finufft_static) - if(FINUFFT_USE_OPENMP) + if (FINUFFT_USE_OPENMP) target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) - if(WIN32) + if (WIN32) target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS}) - endif() - endif() - else() + endif () + endif () + else () target_link_libraries(${target} PRIVATE finufft) - if(WIN32) + if (WIN32) target_compile_definitions(${target} PRIVATE FINUFFT_DLL) - endif() - endif() + endif () + endif () enable_asan(${target}) endfunction() @@ -140,9 +140,9 @@ function(set_finufft_options target) target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) # there are issues on windows with OpenMP and CMake, so we need to manually add the flags # otherwise there are link errors - if(WIN32) + if (WIN32) target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS}) - endif() + endif () else () if (CMAKE_CXX_COMPILER_ID IN_LIST FINUFFT_GNU_LIKE_COMPILERS) # OpenMP disabled, suppress unknown pragma warnings to avoid spam. @@ -154,16 +154,16 @@ function(set_finufft_options target) # include them since we need them for build not for install # trying to include them directly into the fftw and fftwf targets causes issues with # the latest version of cmake, so we do it here instead. - if ( (NOT FFTW_FOUND ) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD)) - list (GET FINUFFT_FFTW_LIBRARIES 0 element) + if ((NOT FFTW_FOUND) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD)) + list(GET FINUFFT_FFTW_LIBRARIES 0 element) get_property(FFTW_SOURCE_DIR TARGET ${element} PROPERTY SOURCE_DIR) set(FFTW_INCLUDE_DIR ${FFTW_SOURCE_DIR}/api) target_include_directories(${target} PUBLIC ${FFTW_INCLUDE_DIR}) - endif() + endif () endfunction() -if(FINUFFT_USE_CPU) +if (FINUFFT_USE_CPU) # Main finufft libraries add_library(finufft_f32 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) target_compile_definitions(finufft_f32 PRIVATE SINGLE) @@ -176,7 +176,7 @@ if(FINUFFT_USE_CPU) set_finufft_options(finufft_f64) target_link_libraries(finufft_f64 PUBLIC ${FINUFFT_FFTW_LIBRARIES}) target_link_libraries(finufft_f64 PRIVATE xsimd) - if(WIN32) + if (WIN32) add_library(finufft_f32_dll OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) target_compile_definitions(finufft_f32_dll PRIVATE SINGLE dll_EXPORTS FINUFFT_DLL) set_finufft_options(finufft_f32_dll) @@ -186,43 +186,43 @@ if(FINUFFT_USE_CPU) target_compile_definitions(finufft_f64_dll PRIVATE dll_EXPORTS FINUFFT_DLL) set_finufft_options(finufft_f64_dll) target_link_libraries(finufft_f64_dll PUBLIC ${FINUFFT_FFTW_LIBRARIES}) - endif() + endif () add_library(finufft SHARED src/utils_precindep.cpp contrib/legendre_rule_fast.cpp) target_compile_definitions(finufft PRIVATE dll_EXPORTS FINUFFT_DLL) set_finufft_options(finufft) - if(NOT WIN32) + if (NOT WIN32) target_link_libraries(finufft PUBLIC finufft_f32 finufft_f64) - else() + else () target_link_libraries(finufft PUBLIC finufft_f32_dll finufft_f64_dll) - endif() + endif () # windows does not have a math library, so we need to exclude it - if(NOT WIN32) + if (NOT WIN32) target_link_libraries(finufft PUBLIC m) - endif() + endif () target_include_directories(finufft PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") add_library(finufft_static STATIC src/utils_precindep.cpp contrib/legendre_rule_fast.cpp) set_finufft_options(finufft_static) target_link_libraries(finufft_static PUBLIC finufft_f32 finufft_f64) # windows does not have a math library, so we need to exclude it - if(NOT WIN32) + if (NOT WIN32) target_link_libraries(finufft_static PUBLIC m) - endif() + endif () target_include_directories(finufft_static PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") file(GLOB FINUFFT_PUBLIC_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/include/finufft*.h") set_target_properties(finufft PROPERTIES PUBLIC_HEADER "${FINUFFT_PUBLIC_HEADERS}") list(APPEND INSTALL_TARGETS finufft finufft_static) -endif() +endif () -if(FINUFFT_USE_CUDA) - if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) +if (FINUFFT_USE_CUDA) + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) message("FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to '60;70;75;'") message("See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.") set(CMAKE_CUDA_ARCHITECTURES "60;70;75" CACHE STRING "" FORCE) - endif() + endif () enable_language(CUDA) find_package(CUDAToolkit REQUIRED) add_subdirectory(src/cuda) @@ -231,7 +231,7 @@ if(FINUFFT_USE_CUDA) endif () list(APPEND INSTALL_TARGETS cufinufft cufinufft_static) -endif() +endif () # Add tests defined in their own directory if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CPU) @@ -278,7 +278,7 @@ if (FINUFFT_USE_CPU) install(FILES ${PROJECT_SOURCE_DIR}/include/finufft.fh DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) - endif() + endif () endif () if (FINUFFT_USE_CUDA) install(DIRECTORY ${PROJECT_SOURCE_DIR}/examples/cuda @@ -286,4 +286,4 @@ if (FINUFFT_USE_CUDA) PATTERN "README" EXCLUDE PATTERN "CMakeLists.txt" EXCLUDE ) -endif() +endif () diff --git a/CMakePresets.json b/CMakePresets.json index 0dcb3a5eb..b04204500 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -1,163 +1,164 @@ { - "version": 2, - "cmakeMinimumRequired": { - "major": 3, - "minor": 19, - "patch": 0 + "version": 2, + "cmakeMinimumRequired": { + "major": 3, + "minor": 19, + "patch": 0 + }, + "configurePresets": [ + { + "name": "default", + "binaryDir": "build/default", + "displayName": "Default", + "description": "Default release configuration (ninja)", + "generator": "Ninja", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo" + } }, - "configurePresets": [ - { - "name": "default", - "binaryDir": "build/default", - "displayName": "Default", - "description": "Default release configuration (ninja)", - "generator": "Ninja", - "cacheVariables": { - "CMAKE_BUILD_TYPE": "RelWithDebInfo" - } - }, - { - "name": "ninja-multi", - "binaryDir": "build/ninja", - "displayName": "Ninja Multi-config", - "description": "Multi-configuration build with ninja", - "generator": "Ninja Multi-Config" - }, - { - "name": "dev", - "binaryDir": "build/dev", - "displayName": "Development", - "description": "Development configuration (full tests and examples)", - "generator": "Ninja Multi-Config", - "cacheVariables": { - "FINUFFT_BUILD_TESTS": "ON", - "FINUFFT_BUILD_EXAMPLES": "ON", - "FINUFFT_BUILD_DEVEL": "ON" - } - }, - { - "name": "benchmark", - "binaryDir": "build/benchmark", - "displayName": "Benchmark", - "description": "Benchmark release configuration (ninja)", - "generator": "Ninja", - "cacheVariables": { - "CMAKE_BUILD_TYPE": "RelWithDebInfo", - "FINUFFT_BUILD_TESTS": "ON", - "FINUFFT_BUILD_EXAMPLES": "ON", - "FINUFFT_FFTW_SUFFIX": "", - "FINUFFT_USE_OPENMP": "OFF" - } - }, - { - "name": "manylinux", - "binaryDir": "build/manylinux", - "displayName": "manylinux", - "description": "Configuration for maximum binary compatibility", - "inherits": "default", - "cacheVariables": { - "FINUFFT_ARCH_FLAGS": "-march=x86-64 -mtune=generic -msse4" - } - }, - { - "name": "singlethreaded", - "binaryDir": "build/singlethreaded", - "displayName": "singlethreaded", - "description": "Configuration for single-threaded build. Disables OpenMP for finufft and FFTW", - "inherits": "default", - "cacheVariables": { - "FINUFFT_FFTW_SUFFIX": "", - "FINUFFT_USE_OPENMP": "OFF" - } - }, - { - "name": "icx", - "binaryDir": "build/icx", - "displayName": "Intel Compiler (llvm)", - "description": "Build with Intel Compiler", - "generator": "Ninja Multi-Config", - "cacheVariables": { - "CMAKE_C_COMPILER": "icx", - "CMAKE_CXX_COMPILER": "icpx", - "CMAKE_Fortran_COMPILER": "ifx", - "FINUFFT_ARCH_FLAGS": "-xHost", - "CMAKE_CXX_FLAGS": "-fp-model=strict" - } - }, - { - "name": "icc", - "binaryDir": "build/icc", - "displayName": "Intel Compiler", - "description": "Build with Intel Compiler", - "generator": "Ninja Multi-Config", - "cacheVariables": { - "CMAKE_C_COMPILER": "icc", - "CMAKE_CXX_COMPILER": "icpc", - "CMAKE_Fortran_COMPILER": "ifort", - "FINUFFT_ARCH_FLAGS": "-xHost", - "CMAKE_CXX_FLAGS": "-fp-model=strict" - } - }, - { - "name": "matlab", - "binaryDir": "build/matlab", - "displayName": "matlab", - "description": "Build with the matlab interface", - "generator": "Ninja Multi-Config", - "cacheVariables": { - "FINUFFT_FFTW_SUFFIX": "Threads", - "FINUFFT_BUILD_MATLAB": "ON", - "FINUFFT_ENABLE_SANITIZERS": "OFF" - } - } - ], - "buildPresets": [ - { - "name": "default", - "configurePreset": "default" - }, - { - "name": "dev", - "configurePreset": "dev", - "configuration": "RelWithDebInfo" - }, - { - "name": "ninja-multi", - "configurePreset": "ninja-multi", - "configuration": "RelWithDebInfo" - }, - { - "name": "manylinux", - "configurePreset": "manylinux" - }, - { - "name": "singlethreaded", - "configurePreset": "singlethreaded" - }, - { - "name": "icc", - "configurePreset": "icc", - "configuration": "RelWithDebInfo" - }, - { - "name": "icx", - "configurePreset": "icx", - "configuration": "RelWithDebInfo" - }, - { - "name": "matlab", - "configurePreset": "matlab", - "configuration": "Release" - } - ], - "testPresets": [ - { - "name": "dev", - "configurePreset": "dev", - "configuration": "Debug", - "environment": { - "OMP_NUM_THREADS": "1" - } - } - ] + { + "name": "ninja-multi", + "binaryDir": "build/ninja", + "displayName": "Ninja Multi-config", + "description": "Multi-configuration build with ninja", + "generator": "Ninja Multi-Config" + }, + { + "name": "dev", + "binaryDir": "build/dev", + "displayName": "Development", + "description": "Development configuration (full tests and examples)", + "generator": "Ninja Multi-Config", + "cacheVariables": { + "FINUFFT_BUILD_TESTS": "ON", + "FINUFFT_BUILD_EXAMPLES": "ON", + "FINUFFT_BUILD_DEVEL": "ON" + } + }, + { + "name": "benchmark", + "binaryDir": "build/benchmark", + "displayName": "Benchmark", + "description": "Benchmark release configuration (ninja)", + "generator": "Ninja", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo", + "FINUFFT_BUILD_TESTS": "ON", + "FINUFFT_BUILD_EXAMPLES": "ON", + "FINUFFT_FFTW_SUFFIX": "", + "FINUFFT_USE_OPENMP": "OFF" + } + }, + { + "name": "manylinux", + "binaryDir": "build/manylinux", + "displayName": "manylinux", + "description": "Configuration for maximum binary compatibility", + "inherits": "default", + "cacheVariables": { + "FINUFFT_ARCH_FLAGS": "-march=x86-64 -mtune=generic -msse4" + } + }, + { + "name": "singlethreaded", + "binaryDir": "build/singlethreaded", + "displayName": "singlethreaded", + "description": + "Configuration for single-threaded build. Disables OpenMP for finufft and FFTW", + "inherits": "default", + "cacheVariables": { + "FINUFFT_FFTW_SUFFIX": "", + "FINUFFT_USE_OPENMP": "OFF" + } + }, + { + "name": "icx", + "binaryDir": "build/icx", + "displayName": "Intel Compiler (llvm)", + "description": "Build with Intel Compiler", + "generator": "Ninja Multi-Config", + "cacheVariables": { + "CMAKE_C_COMPILER": "icx", + "CMAKE_CXX_COMPILER": "icpx", + "CMAKE_Fortran_COMPILER": "ifx", + "FINUFFT_ARCH_FLAGS": "-xHost", + "CMAKE_CXX_FLAGS": "-fp-model=strict" + } + }, + { + "name": "icc", + "binaryDir": "build/icc", + "displayName": "Intel Compiler", + "description": "Build with Intel Compiler", + "generator": "Ninja Multi-Config", + "cacheVariables": { + "CMAKE_C_COMPILER": "icc", + "CMAKE_CXX_COMPILER": "icpc", + "CMAKE_Fortran_COMPILER": "ifort", + "FINUFFT_ARCH_FLAGS": "-xHost", + "CMAKE_CXX_FLAGS": "-fp-model=strict" + } + }, + { + "name": "matlab", + "binaryDir": "build/matlab", + "displayName": "matlab", + "description": "Build with the matlab interface", + "generator": "Ninja Multi-Config", + "cacheVariables": { + "FINUFFT_FFTW_SUFFIX": "Threads", + "FINUFFT_BUILD_MATLAB": "ON", + "FINUFFT_ENABLE_SANITIZERS": "OFF" + } + } + ], + "buildPresets": [ + { + "name": "default", + "configurePreset": "default" + }, + { + "name": "dev", + "configurePreset": "dev", + "configuration": "RelWithDebInfo" + }, + { + "name": "ninja-multi", + "configurePreset": "ninja-multi", + "configuration": "RelWithDebInfo" + }, + { + "name": "manylinux", + "configurePreset": "manylinux" + }, + { + "name": "singlethreaded", + "configurePreset": "singlethreaded" + }, + { + "name": "icc", + "configurePreset": "icc", + "configuration": "RelWithDebInfo" + }, + { + "name": "icx", + "configurePreset": "icx", + "configuration": "RelWithDebInfo" + }, + { + "name": "matlab", + "configurePreset": "matlab", + "configuration": "Release" + } + ], + "testPresets": [ + { + "name": "dev", + "configurePreset": "dev", + "configuration": "Debug", + "environment": { + "OMP_NUM_THREADS": "1" + } + } + ] } diff --git a/contrib/legendre_rule_fast.cpp b/contrib/legendre_rule_fast.cpp index 01b626cc3..a91119161 100644 --- a/contrib/legendre_rule_fast.cpp +++ b/contrib/legendre_rule_fast.cpp @@ -12,16 +12,16 @@ #include namespace finufft { - namespace quadrature { - -void legendre_compute_glr ( int n, double x[], double w[] ); -void legendre_compute_glr0 ( int n, double *p, double *pp ); -void legendre_compute_glr1 ( int n, double *roots, double *ders ); -void legendre_compute_glr2 ( double p, int n, double *roots, double *ders ); -double rk2_leg ( double t, double tn, double x, int n ); -double ts_mult ( double *u, double h, int n ); - -void legendre_compute_glr ( int n, double x[], double w[] ) +namespace quadrature { + +void legendre_compute_glr(int n, double x[], double w[]); +void legendre_compute_glr0(int n, double *p, double *pp); +void legendre_compute_glr1(int n, double *roots, double *ders); +void legendre_compute_glr2(double p, int n, double *roots, double *ders); +double rk2_leg(double t, double tn, double x, int n); +double ts_mult(double *u, double h, int n); + +void legendre_compute_glr(int n, double x[], double w[]) /******************************************************************************/ /* Purpose: @@ -30,7 +30,7 @@ void legendre_compute_glr ( int n, double x[], double w[] ) Licensing: - This code is distributed under the GNU LGPL license. + This code is distributed under the GNU LGPL license. Modified: @@ -43,8 +43,8 @@ void legendre_compute_glr ( int n, double x[], double w[] ) Reference: - Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, - A fast algorithm for the calculation of the roots of special functions, + Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, + A fast algorithm for the calculation of the roots of special functions, SIAM Journal on Scientific Computing, Volume 29, Number 4, pages 1420-1438, 2007. @@ -61,47 +61,41 @@ void legendre_compute_glr ( int n, double x[], double w[] ) double p; double pp; double w_sum; -/* - Get the value and derivative of the N-th Legendre polynomial at 0. -*/ - legendre_compute_glr0 ( n, &p, &pp ); -/* - Either zero is a root, or we have to call a function to find the first root. -*/ - if ( n % 2 == 1 ) - { - x[(n-1)/2] = p; - w[(n-1)/2] = pp; + /* + Get the value and derivative of the N-th Legendre polynomial at 0. + */ + legendre_compute_glr0(n, &p, &pp); + /* + Either zero is a root, or we have to call a function to find the first root. + */ + if (n % 2 == 1) { + x[(n - 1) / 2] = p; + w[(n - 1) / 2] = pp; + } else { + legendre_compute_glr2(p, n, &x[n / 2], &w[n / 2]); } - else - { - legendre_compute_glr2 ( p, n, &x[n/2], &w[n/2] ); - } -/* - Get the complete set of roots and derivatives. -*/ - legendre_compute_glr1 ( n, x, w ); -/* - Compute the weights. -*/ - for ( i = 0; i < n; i++ ) - { - w[i] = 2.0 / ( 1.0 - x[i] ) / ( 1.0 + x[i] ) / w[i] / w[i]; + /* + Get the complete set of roots and derivatives. + */ + legendre_compute_glr1(n, x, w); + /* + Compute the weights. + */ + for (i = 0; i < n; i++) { + w[i] = 2.0 / (1.0 - x[i]) / (1.0 + x[i]) / w[i] / w[i]; } w_sum = 0.0; - for ( i = 0; i < n; i++ ) - { + for (i = 0; i < n; i++) { w_sum = w_sum + w[i]; } - for ( i = 0; i < n; i++ ) - { + for (i = 0; i < n; i++) { w[i] = 2.0 * w[i] / w_sum; } return; } /******************************************************************************/ -void legendre_compute_glr0 ( int n, double *p, double *pp ) +void legendre_compute_glr0(int n, double *p, double *pp) /******************************************************************************/ /* @@ -111,7 +105,7 @@ void legendre_compute_glr0 ( int n, double *p, double *pp ) Licensing: - This code is distributed under the GNU LGPL license. + This code is distributed under the GNU LGPL license. Modified: @@ -124,8 +118,8 @@ void legendre_compute_glr0 ( int n, double *p, double *pp ) Reference: - Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, - A fast algorithm for the calculation of the roots of special functions, + Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, + A fast algorithm for the calculation of the roots of special functions, SIAM Journal on Scientific Computing, Volume 29, Number 4, pages 1420-1438, 2007. @@ -144,18 +138,17 @@ void legendre_compute_glr0 ( int n, double *p, double *pp ) double ppm1; double ppm2; - pm2 = 0.0; - pm1 = 1.0; + pm2 = 0.0; + pm1 = 1.0; ppm2 = 0.0; ppm1 = 0.0; - for ( k = 0; k < n; k++ ) - { - dk = ( double ) k; - *p = - dk * pm2 / ( dk + 1.0 ); - *pp = ( ( 2.0 * dk + 1.0 ) * pm1 - dk * ppm2 ) / ( dk + 1.0 ); - pm2 = pm1; - pm1 = *p; + for (k = 0; k < n; k++) { + dk = (double)k; + *p = -dk * pm2 / (dk + 1.0); + *pp = ((2.0 * dk + 1.0) * pm1 - dk * ppm2) / (dk + 1.0); + pm2 = pm1; + pm1 = *p; ppm2 = ppm1; ppm1 = *pp; } @@ -163,7 +156,7 @@ void legendre_compute_glr0 ( int n, double *p, double *pp ) } /******************************************************************************/ -void legendre_compute_glr1 ( int n, double *x, double *ders ) +void legendre_compute_glr1(int n, double *x, double *ders) /******************************************************************************/ /* @@ -179,7 +172,7 @@ void legendre_compute_glr1 ( int n, double *x, double *ders ) Licensing: - This code is distributed under the GNU LGPL license. + This code is distributed under the GNU LGPL license. Modified: @@ -192,8 +185,8 @@ void legendre_compute_glr1 ( int n, double *x, double *ders ) Reference: - Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, - A fast algorithm for the calculation of the roots of special functions, + Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, + A fast algorithm for the calculation of the roots of special functions, SIAM Journal on Scientific Computing, Volume 29, Number 4, pages 1420-1438, 2007. @@ -202,11 +195,11 @@ void legendre_compute_glr1 ( int n, double *x, double *ders ) Input, int N, the order of the Legendre polynomial. Input/output, double X[N]. On input, a starting value - has been set in one entry. On output, the roots of the Legendre + has been set in one entry. On output, the roots of the Legendre polynomial. Input/output, double DERS[N]. On input, a starting value - has been set in one entry. On output, the derivatives of the Legendre + has been set in one entry. On output, the derivatives of the Legendre polynomial at the zeros. Local Parameters: @@ -228,27 +221,23 @@ void legendre_compute_glr1 ( int n, double *x, double *ders ) double *up; double xp; - if ( n % 2 == 1 ) - { - n2 = ( n - 1 ) / 2; - s = 1; - } - else - { + if (n % 2 == 1) { + n2 = (n - 1) / 2; + s = 1; + } else { n2 = n / 2; - s = 0; + s = 0; } - u = ( double * ) malloc ( ( m + 2 ) * sizeof ( double ) ); - up = ( double * ) malloc ( ( m + 1 ) * sizeof ( double ) ); + u = (double *)malloc((m + 2) * sizeof(double)); + up = (double *)malloc((m + 1) * sizeof(double)); - dn = ( double ) n; + dn = (double)n; - for ( j = n2; j < n - 1; j++ ) - { + for (j = n2; j < n - 1; j++) { xp = x[j]; - h = rk2_leg ( pi/2.0, -pi/2.0, xp, n ) - xp; + h = rk2_leg(pi / 2.0, -pi / 2.0, xp, n) - xp; u[0] = 0.0; u[1] = 0.0; @@ -257,41 +246,36 @@ void legendre_compute_glr1 ( int n, double *x, double *ders ) up[0] = 0.0; up[1] = u[2]; - for ( k = 0; k <= m - 2; k++ ) - { - dk = ( double ) k; + for (k = 0; k <= m - 2; k++) { + dk = (double)k; - u[k+3] = - ( - 2.0 * xp * ( dk + 1.0 ) * u[k+2] - + ( dk * ( dk + 1.0 ) - dn * ( dn + 1.0 ) ) * u[k+1] / ( dk + 1.0 ) - ) / ( 1.0 - xp ) / ( 1.0 + xp ) / ( dk + 2.0 ); + u[k + 3] = (2.0 * xp * (dk + 1.0) * u[k + 2] + + (dk * (dk + 1.0) - dn * (dn + 1.0)) * u[k + 1] / (dk + 1.0)) / + (1.0 - xp) / (1.0 + xp) / (dk + 2.0); - up[k+2] = ( dk + 2.0 ) * u[k+3]; + up[k + 2] = (dk + 2.0) * u[k + 3]; } - for ( l = 0; l < 5; l++ ) - { - h = h - ts_mult ( u, h, m ) / ts_mult ( up, h, m-1 ); + for (l = 0; l < 5; l++) { + h = h - ts_mult(u, h, m) / ts_mult(up, h, m - 1); } - x[j+1] = xp + h; - ders[j+1] = ts_mult ( up, h, m-1 ); + x[j + 1] = xp + h; + ders[j + 1] = ts_mult(up, h, m - 1); } - free ( u ); - free ( up ); + free(u); + free(up); - for ( k = 0; k < n2 + s; k++ ) - { - x[k] = - x[n-k-1]; - ders[k] = ders[n-k-1]; + for (k = 0; k < n2 + s; k++) { + x[k] = -x[n - k - 1]; + ders[k] = ders[n - k - 1]; } return; } /******************************************************************************/ -void legendre_compute_glr2 ( double pn0, int n, double *x1, double *d1 ) +void legendre_compute_glr2(double pn0, int n, double *x1, double *d1) /******************************************************************************/ /* @@ -308,7 +292,7 @@ void legendre_compute_glr2 ( double pn0, int n, double *x1, double *d1 ) Licensing: - This code is distributed under the GNU LGPL license. + This code is distributed under the GNU LGPL license. Modified: @@ -321,8 +305,8 @@ void legendre_compute_glr2 ( double pn0, int n, double *x1, double *d1 ) Reference: - Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, - A fast algorithm for the calculation of the roots of special functions, + Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, + A fast algorithm for the calculation of the roots of special functions, SIAM Journal on Scientific Computing, Volume 29, Number 4, pages 1420-1438, 2007. @@ -345,55 +329,52 @@ void legendre_compute_glr2 ( double pn0, int n, double *x1, double *d1 ) double dn; int k; int l; - int m = 30; + int m = 30; const double pi = 3.141592653589793; double t; double *u; double *up; - t = 0.0; - *x1 = rk2_leg ( t, -pi/2.0, 0.0, n ); + t = 0.0; + *x1 = rk2_leg(t, -pi / 2.0, 0.0, n); - u = ( double * ) malloc ( ( m + 2 ) * sizeof ( double ) ); - up = ( double * ) malloc ( ( m + 1 ) * sizeof ( double ) ); + u = (double *)malloc((m + 2) * sizeof(double)); + up = (double *)malloc((m + 1) * sizeof(double)); - dn = ( double ) n; -/* - U[0] and UP[0] are never used. - U[M+1] is set, but not used, and UP[M] is set and not used. - What gives? -*/ + dn = (double)n; + /* + U[0] and UP[0] are never used. + U[M+1] is set, but not used, and UP[M] is set and not used. + What gives? + */ u[0] = 0.0; u[1] = pn0; up[0] = 0.0; - - for ( k = 0; k <= m - 2; k = k + 2 ) - { - dk = ( double ) k; - - u[k+2] = 0.0; - u[k+3] = ( dk * ( dk + 1.0 ) - dn * ( dn + 1.0 ) ) * u[k+1] - / ( dk + 1.0 ) / ( dk + 2.0 ); - - up[k+1] = 0.0; - up[k+2] = ( dk + 2.0 ) * u[k+3]; + + for (k = 0; k <= m - 2; k = k + 2) { + dk = (double)k; + + u[k + 2] = 0.0; + u[k + 3] = (dk * (dk + 1.0) - dn * (dn + 1.0)) * u[k + 1] / (dk + 1.0) / (dk + 2.0); + + up[k + 1] = 0.0; + up[k + 2] = (dk + 2.0) * u[k + 3]; } - - for ( l = 0; l < 5; l++ ) - { - *x1 = *x1 - ts_mult ( u, *x1, m ) / ts_mult ( up, *x1, m-1 ); + + for (l = 0; l < 5; l++) { + *x1 = *x1 - ts_mult(u, *x1, m) / ts_mult(up, *x1, m - 1); } - *d1 = ts_mult ( up, *x1, m-1 ); + *d1 = ts_mult(up, *x1, m - 1); - free ( u ); - free ( up) ; + free(u); + free(up); return; } /******************************************************************************/ -double rk2_leg ( double t1, double t2, double x, int n ) +double rk2_leg(double t1, double t2, double x, int n) /******************************************************************************/ /* @@ -403,7 +384,7 @@ double rk2_leg ( double t1, double t2, double x, int n ) Licensing: - This code is distributed under the GNU LGPL license. + This code is distributed under the GNU LGPL license. Modified: @@ -434,29 +415,27 @@ double rk2_leg ( double t1, double t2, double x, int n ) double snn1; double t; - h = ( t2 - t1 ) / ( double ) m; - snn1 = sqrt ( ( double ) ( n * ( n + 1 ) ) ); + h = (t2 - t1) / (double)m; + snn1 = sqrt((double)(n * (n + 1))); t = t1; - for ( j = 0; j < m; j++ ) - { - f = ( 1.0 - x ) * ( 1.0 + x ); - k1 = - h * f / ( snn1 * sqrt ( f ) - 0.5 * x * sin ( 2.0 * t ) ); - x = x + k1; + for (j = 0; j < m; j++) { + f = (1.0 - x) * (1.0 + x); + k1 = -h * f / (snn1 * sqrt(f) - 0.5 * x * sin(2.0 * t)); + x = x + k1; t = t + h; - f = ( 1.0 - x ) * ( 1.0 + x ); - k2 = - h * f / ( snn1 * sqrt ( f ) - 0.5 * x * sin ( 2.0 * t ) ); - x = x + 0.5 * ( k2 - k1 ); + f = (1.0 - x) * (1.0 + x); + k2 = -h * f / (snn1 * sqrt(f) - 0.5 * x * sin(2.0 * t)); + x = x + 0.5 * (k2 - k1); } return x; } /******************************************************************************/ - -double ts_mult ( double *u, double h, int n ) +double ts_mult(double *u, double h, int n) /******************************************************************************/ /* @@ -470,7 +449,7 @@ double ts_mult ( double *u, double h, int n ) Licensing: - This code is distributed under the GNU LGPL license. + This code is distributed under the GNU LGPL license. Modified: @@ -496,11 +475,10 @@ double ts_mult ( double *u, double h, int n ) double hk; int k; double ts; - + ts = 0.0; hk = 1.0; - for ( k = 1; k<= n; k++ ) - { + for (k = 1; k <= n; k++) { ts = ts + u[k] * hk; hk = hk * h; } @@ -508,5 +486,5 @@ double ts_mult ( double *u, double h, int n ) } /******************************************************************************/ - } // namespace -} // namespace +} // namespace quadrature +} // namespace finufft diff --git a/contrib/legendre_rule_fast.h b/contrib/legendre_rule_fast.h index 49c5bcf13..357909f9e 100644 --- a/contrib/legendre_rule_fast.h +++ b/contrib/legendre_rule_fast.h @@ -2,9 +2,9 @@ #define GAUSSQUAD_H namespace finufft { - namespace quadrature { - void legendre_compute_glr ( int n, double x[], double w[] ); - } // namespace -} // namespace +namespace quadrature { +void legendre_compute_glr(int n, double x[], double w[]); +} // namespace quadrature +} // namespace finufft #endif diff --git a/devel/eval_ker_expts.cpp b/devel/eval_ker_expts.cpp index 015bb8a38..8da4a1699 100644 --- a/devel/eval_ker_expts.cpp +++ b/devel/eval_ker_expts.cpp @@ -3,22 +3,25 @@ compile with: -g++ eval_ker_expts.cpp -o eval_ker_expts -Ofast -funroll-loops -march=native; time ./eval_ker_expts +g++ eval_ker_expts.cpp -o eval_ker_expts -Ofast -funroll-loops -march=native; time +./eval_ker_expts Barnett 3/28/18 for JD Patel (Intel). Single-prec version also of interest, if faster. */ -#include -#include -#include #include +#include +#include +#include // Choose prec... typedef double FLT; -//typedef float FLT; +// typedef float FLT; -static inline void evaluate_kernel_vector(FLT * __restrict__ ker, const FLT * __restrict__ args, const FLT beta, const FLT c, const int N) +static inline void evaluate_kernel_vector(FLT *__restrict__ ker, + const FLT *__restrict__ args, const FLT beta, + const FLT c, const int N) /* Evaluate kernel for a vector of N arguments. Can comment out either or both loops. The #pragra's need to be removed for icpc if -fopenmp not used. @@ -26,33 +29,31 @@ static inline void evaluate_kernel_vector(FLT * __restrict__ ker, const FLT * __ { #pragma omp simd for (int i = 0; i < N; i++) // Loop 1: Compute exponential arguments - ker[i] = beta * sqrt(1.0 - c*args[i]*args[i]); - //ker[i] = beta * (1.0 - c*args[i]*args[i]); // no-sqrt version - + ker[i] = beta * sqrt(1.0 - c * args[i] * args[i]); + // ker[i] = beta * (1.0 - c*args[i]*args[i]); // no-sqrt version + #pragma omp simd for (int i = 0; i < N; i++) // Loop 2: Compute exponentials ker[i] = exp(ker[i]); } -int main(int argc, char* argv[]) -{ - int M = (int) 1e7; // # of reps - int w=10; // spread width (small), needn't be mult of 4 - FLT beta=2.3*w, c = 4.0/(w*w); // ker params - FLT iw = 1.0/(FLT)w; - FLT ans = 0.0; // dummy answer +int main(int argc, char *argv[]) { + int M = (int)1e7; // # of reps + int w = 10; // spread width (small), needn't be mult of 4 + FLT beta = 2.3 * w, c = 4.0 / (w * w); // ker params + FLT iw = 1.0 / (FLT)w; + FLT ans = 0.0; // dummy answer std::vector x(w); std::vector f(w); - for (int i=1;i -#include -#include #include +#include +#include +#include // Choose prec... typedef double FLT; -//typedef float FLT; +// typedef float FLT; -static inline void evaluate_kernel_vector(FLT* ker, const FLT* args, const FLT beta, const FLT c, const int N) +static inline void evaluate_kernel_vector(FLT *ker, const FLT *args, const FLT beta, + const FLT c, const int N) /* Evaluate kernel for a vector of N arguments. The #pragmas need to be removed for icpc if -fopenmp not used. For g++-7, this pragma (with -fopenmp) slows it down from 0.2s to 0.4s! THe __restrict__ on the I/O args don't matter. */ { - //#pragma omp simd - for (int i = 0; i < N; i++) - ker[i] = exp(beta * sqrt(FLT(1.0) - c*args[i]*args[i])); + // #pragma omp simd + for (int i = 0; i < N; i++) ker[i] = exp(beta * sqrt(FLT(1.0) - c * args[i] * args[i])); // FLT(1.0) suggested by mreineck - // slows down from 0.2s to 2.0s for w=12, unless it's at 0.4s when no effect... - // for (int i = 0; i < N; i++) - // if (fabs(args[i]) >= (FLT)N/2) // note fabs not abs! - // ker[i] = 0.0; + // slows down from 0.2s to 2.0s for w=12, unless it's at 0.4s when no effect... + // for (int i = 0; i < N; i++) + // if (fabs(args[i]) >= (FLT)N/2) // note fabs not abs! + // ker[i] = 0.0; } -int main(int argc, char* argv[]) -{ - int M = (int) 1e7; // # of reps - if (argc>1) - sscanf(argv[1],"%d",&M); // find not needed to get the 0.2 s time. - int w=11; // spread width: 10 0.17s, 11 1.8s, 12 0.2s, 13 2.0s, 15 2.5s - //if (argc>2) // even including this code slows to 0.4s !! - //sscanf(argv[2],"%d",&w); // .. but speeds up w=13 from 2s to 0.4s ! - FLT beta=2.3*w, c = 4.0/(w*w); // typ ker params - FLT iw = 1.0/(FLT)w; - FLT ans = 0.0; // dummy answer +int main(int argc, char *argv[]) { + int M = (int)1e7; // # of reps + if (argc > 1) sscanf(argv[1], "%d", &M); // find not needed to get the 0.2 s time. + int w = 11; // spread width: 10 0.17s, 11 1.8s, 12 0.2s, 13 2.0s, 15 2.5s + // if (argc>2) // even including this code slows to 0.4s !! + // sscanf(argv[2],"%d",&w); // .. but speeds up w=13 from 2s to 0.4s ! + FLT beta = 2.3 * w, c = 4.0 / (w * w); // typ ker params + FLT iw = 1.0 / (FLT)w; + FLT ans = 0.0; // dummy answer std::vector x(w); std::vector f(w); - for (int i=1;i<=M;++i) { // i=0 to M-1 : 2.1s; i=1 to M : 0.2s !!!!! - FLT xi = -w/(FLT)2.0 + i/(FLT)M; // dummy offset to make each rep different - for (int j=0;j -#include -#include #include - +#include +#include +#include #ifdef VCL // Use Agner Fog's vector class library @@ -33,65 +33,66 @@ that correlates w/ 0.2s magic. // Choose prec... typedef double FLT; -//typedef float FLT; +// typedef float FLT; -static inline void evaluate_kernel_vector(FLT * __restrict__ ker, const FLT * __restrict__ args, const FLT beta, const FLT c, const int N) +static inline void evaluate_kernel_vector(FLT *__restrict__ ker, + const FLT *__restrict__ args, const FLT beta, + const FLT c, const int N) /* Evaluate kernel for a vector of N arguments. -*/ + */ { -#ifdef VCL - for (int i = 0; i < N; i+=4) // Assume w divisible by 4 +#ifdef VCL + for (int i = 0; i < N; i += 4) // Assume w divisible by 4 { Vec4d vec; vec.load(args + i); - vec = exp(beta*sqrt(1.0 - c*vec*vec)); + vec = exp(beta * sqrt(1.0 - c * vec * vec)); vec.store(ker + i); - } + } #else for (int i = 0; i < N; i++) // Straight computation, note no pragma omp simd - ker[i] = exp(beta * sqrt(1.0 - c*args[i]*args[i])); + ker[i] = exp(beta * sqrt(1.0 - c * args[i] * args[i])); #endif - } -int main(int argc, char* argv[]) -{ - int M = (int) 1e7; // # of reps - int w=12; // 12, spread width (small), needn't be mult of 4, 15 takes 3.2s but 12 only 0.2s, in g++-7. But not in gcc 5.4.0 +int main(int argc, char *argv[]) { + int M = (int)1e7; // # of reps + int w = 12; // 12, spread width (small), needn't be mult of 4, 15 takes 3.2s but 12 only + // 0.2s, in g++-7. But not in gcc 5.4.0 - if (1) { // 0 makes 10x slower (2s) than 1, which is 0.2 s, for g++-7 - ahb - if (argc == 3) - { - sscanf(argv[1],"%d",&M); - //sscanf(argv[2],"%d",&w); // slows down from 0.2s to 0.44s if use - why?? - } + if (1) { // 0 makes 10x slower (2s) than 1, which is 0.2 s, for g++-7 - ahb + if (argc == 3) { + sscanf(argv[1], "%d", &M); + // sscanf(argv[2],"%d",&w); // slows down from 0.2s to 0.44s if use - why?? + } } - - - FLT beta=2.3*w, c = 4.0/(w*w); // ker params - FLT iw = 1.0/(FLT)w; - FLT ans = 0.0; // dummy answer + + FLT beta = 2.3 * w, c = 4.0 / (w * w); // ker params + FLT iw = 1.0 / (FLT)w; + FLT ans = 0.0; // dummy answer std::vector x(w); std::vector f(w); FLT xi; FLT tmp_val; - + #pragma omp simd simdlen(64) // this pragma makes no difference on modern gcc. - for (int i=1;i<=M;++i) { // changing from i=1 to i=0 slows from 0.2s to 2.4s!!!! I don't understand - has to be a better way to control (assembly code?) - xi = i/(FLT)M; // dummy offset to make each rep different + for (int i = 1; i <= M; ++i) { // changing from i=1 to i=0 slows from 0.2s to 2.4s!!!! I + // don't understand - has to be a better way to control + // (assembly code?) + xi = i / (FLT)M; // dummy offset to make each rep different /* for (int j=0;j -#include -#include #include - +#include +#include +#include #ifdef VCL // Use Agner Fog's vector class library @@ -33,56 +33,56 @@ that correlates w/ 0.2s magic. // Choose prec... typedef double FLT; -//typedef float FLT; +// typedef float FLT; -static inline void evaluate_kernel_vector(FLT * __restrict__ ker, const FLT * __restrict__ args, const FLT beta, const FLT c, const int N) +static inline void evaluate_kernel_vector(FLT *__restrict__ ker, + const FLT *__restrict__ args, const FLT beta, + const FLT c, const int N) /* Evaluate kernel for a vector of N arguments. -*/ + */ { -#ifdef VCL - for (int i = 0; i < N; i+=4) // Assume w divisible by 4 +#ifdef VCL + for (int i = 0; i < N; i += 4) // Assume w divisible by 4 { Vec4d vec; vec.load(args + i); - vec = exp(beta*sqrt(1.0 - c*vec*vec)); + vec = exp(beta * sqrt(1.0 - c * vec * vec)); vec.store(ker + i); - } + } #else for (int i = 0; i < N; i++) // Straight computation, note no pragma omp simd - ker[i] = exp(beta * sqrt(1.0 - c*args[i]*args[i])); + ker[i] = exp(beta * sqrt(1.0 - c * args[i] * args[i])); #endif - } -int main(int argc, char* argv[]) -{ - int M = (int) 1e7; // # of reps - int w=12; // 12, spread width (small), needn't be mult of 4, 15 takes 3.2s but 12 only 0.2s, in g++-7. But not in gcc 5.4.0 +int main(int argc, char *argv[]) { + int M = (int)1e7; // # of reps + int w = 12; // 12, spread width (small), needn't be mult of 4, 15 takes 3.2s but 12 only + // 0.2s, in g++-7. But not in gcc 5.4.0 - if (1) { // 0 makes 10x slower (2s) than 1, which is 0.2 s, for g++-7 - ahb - if (argc == 3) - { - sscanf(argv[1],"%d",&M); - //sscanf(argv[2],"%d",&w); // slows down from 0.2s to 0.44s if use - why?? + if (1) { // 0 makes 10x slower (2s) than 1, which is 0.2 s, for g++-7 - ahb + if (argc == 3) { + sscanf(argv[1], "%d", &M); + // sscanf(argv[2],"%d",&w); // slows down from 0.2s to 0.44s if use - why?? + } } - } - - - FLT beta=2.3*w, c = 4.0/(w*w); // ker params - FLT iw = 1.0/(FLT)w; - FLT ans = 0.0; // dummy answer + + FLT beta = 2.3 * w, c = 4.0 / (w * w); // ker params + FLT iw = 1.0 / (FLT)w; + FLT ans = 0.0; // dummy answer std::vector x(w); std::vector f(w); - - for (int i=1;i<=M;++i) { // changing from i=1 to i=0 slows from 0.2s to 2.4s!!!! I don't understand - has to be a better way to control (assembly code?) - FLT xi = i/(FLT)M; // dummy offset to make each rep different - for (int j=0;j +#include #include #include -#include #include // no vectorize -//#pragma GCC optimize("no-tree-vectorize") +// #pragma GCC optimize("no-tree-vectorize") /* local NU coord fold+rescale macro: does the following affine transform to x: when p=true: map [-3pi,-pi) and [-pi,pi) and [pi,3pi) each to [0,N) otherwise, map [-N,0) and [0,N) and [N,2N) each to [0,N) @@ -17,63 +17,58 @@ The macro wins hands-down on i7, even for modern GCC9. This should be done in C++ not as a macro, someday. */ -#define FOLDRESCALE(x, N, p) (p ? \ - (x + (x>=-PI ? (x=0.0 ? (x<(FLT)N ? x : x-(FLT)N) : x+(FLT)N)) - +#define FOLDRESCALE(x, N, p) \ + (p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N) \ + : (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N)) -#define FOLDRESCALE04(x, N, p) (p ? \ - ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) * FLT(N) : \ - ((x/FLT(N))-floor(x/FLT(N)))*FLT(N)) +#define FOLDRESCALE04(x, N, p) \ + (p ? ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) * FLT(N) \ + : ((x / FLT(N)) - floor(x / FLT(N))) * FLT(N)) -#define FOLDRESCALE05(x, N, p) FLT(N) * (p ? \ - ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) : \ - ((x/FLT(N))-floor(x/FLT(N)))) +#define FOLDRESCALE05(x, N, p) \ + FLT(N) * (p ? ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) \ + : ((x / FLT(N)) - floor(x / FLT(N)))) -inline __attribute__((always_inline)) -FLT foldRescale00(FLT x, BIGINT N, bool p) { +inline __attribute__((always_inline)) FLT foldRescale00(FLT x, BIGINT N, bool p) { FLT result; FLT fN = FLT(N); if (p) { static constexpr FLT x2pi = FLT(M_1_2PI); - result = x * x2pi + FLT(0.5); + result = x * x2pi + FLT(0.5); result -= floor(result); } else { const FLT invN = FLT(1.0) / fN; - result = x * invN; + result = x * invN; result -= floor(result); } return result * fN; } -inline __attribute__((always_inline)) -FLT foldRescale01(FLT x, BIGINT N, bool p) { - return p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N) : - (x >= 0.0 ? (x < (FLT) N ? x : x - (FLT) N) : x + (FLT) N); +inline __attribute__((always_inline)) FLT foldRescale01(FLT x, BIGINT N, bool p) { + return p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N) + : (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N); } template -inline __attribute__((always_inline)) -FLT foldRescale02(FLT x, BIGINT N) { +inline __attribute__((always_inline)) FLT foldRescale02(FLT x, BIGINT N) { if constexpr (p) { - return (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N); + return (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N); } else { - return (x >= 0.0 ? (x < (FLT) N ? x : x - (FLT) N) : x + (FLT) N); + return (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N); } } template -inline __attribute__((always_inline)) -FLT foldRescale03(FLT x, BIGINT N) { +inline __attribute__((always_inline)) FLT foldRescale03(FLT x, BIGINT N) { FLT result; FLT fN = FLT(N); if constexpr (p) { static constexpr FLT x2pi = FLT(M_1_2PI); - result = std::fma(x, x2pi, FLT(0.5)); + result = std::fma(x, x2pi, FLT(0.5)); result -= floor(result); } else { const FLT invN = FLT(1.0) / fN; - result = x * invN; + result = x * invN; result -= floor(result); } return result * fN; @@ -81,10 +76,10 @@ FLT foldRescale03(FLT x, BIGINT N) { xsimd::batch fold_rescale_vec(xsimd::batch x, BIGINT N) { xsimd::batch result; - const xsimd::batch fN = xsimd::batch(FLT(N)); + const xsimd::batch fN = xsimd::batch(FLT(N)); static const xsimd::batch x2pi = xsimd::batch(FLT(M_1_2PI)); static const xsimd::batch half = xsimd::batch(FLT(0.5)); - result = xsimd::fma(x, x2pi, half); + result = xsimd::fma(x, x2pi, half); result -= xsimd::floor(result); return result * fN; } @@ -92,122 +87,116 @@ xsimd::batch fold_rescale_vec(xsimd::batch x, BIGINT N) { static std::mt19937_64 gen; static std::uniform_real_distribution<> dis(-10, 10); static const auto N = std::uniform_int_distribution<>{0, 1000}(gen); -static std::uniform_real_distribution<> disN(-N, 2*N); -static volatile auto pirange = true; +static std::uniform_real_distribution<> disN(-N, 2 * N); +static volatile auto pirange = true; static volatile auto notPirange = !pirange; static void BM_BASELINE(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { benchmark::DoNotOptimize(dis(gen)); } } static void BM_FoldRescaleMacro(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(FOLDRESCALE(x, N, pirange)); } } static void BM_FoldRescaleMacroN(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(FOLDRESCALE(x, N, notPirange)); } } static void BM_FoldRescale00(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(foldRescale00(x, N, pirange)); } } - static void BM_FoldRescale00N(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(foldRescale00(x, N, notPirange)); } } - static void BM_FoldRescale01(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(foldRescale01(x, N, pirange)); } } - static void BM_FoldRescale01N(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(foldRescale01(x, N, notPirange)); } } static void BM_FoldRescale02(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(foldRescale02(x, N)); } } - static void BM_FoldRescale02N(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(foldRescale02(x, N)); } } - static void BM_FoldRescale03(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(foldRescale03(x, N)); } } static void BM_FoldRescale03N(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(foldRescale03(x, N)); } } static void BM_FoldRescale04(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(FOLDRESCALE04(x, N, pirange)); } } static void BM_FoldRescale04N(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(FOLDRESCALE04(x, N, notPirange)); } } static void BM_FoldRescale05(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(FOLDRESCALE05(x, N, pirange)); } } static void BM_FoldRescale05N(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(FOLDRESCALE05(x, N, notPirange)); } } - static void BM_FoldRescaleVec(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { // Generate 4 floating point numbers constexpr auto size = xsimd::batch::size; std::array arr; @@ -220,7 +209,6 @@ static void BM_FoldRescaleVec(benchmark::State &state) { } } - BENCHMARK(BM_BASELINE)->Iterations(10000000); BENCHMARK(BM_FoldRescaleMacro)->Iterations(1000000); BENCHMARK(BM_FoldRescale00)->Iterations(1000000); @@ -229,7 +217,7 @@ BENCHMARK(BM_FoldRescale02)->Iterations(1000000); BENCHMARK(BM_FoldRescale03)->Iterations(10000000); BENCHMARK(BM_FoldRescale04)->Iterations(1000000); BENCHMARK(BM_FoldRescale05)->Iterations(1000000); -BENCHMARK(BM_FoldRescaleVec)->Iterations(1000000/4); +BENCHMARK(BM_FoldRescaleVec)->Iterations(1000000 / 4); BENCHMARK(BM_FoldRescaleMacroN)->Iterations(1000000); BENCHMARK(BM_FoldRescale00N)->Iterations(1000000); BENCHMARK(BM_FoldRescale01N)->Iterations(1000000); @@ -238,14 +226,13 @@ BENCHMARK(BM_FoldRescale03N)->Iterations(1000000); BENCHMARK(BM_FoldRescale04N)->Iterations(1000000); BENCHMARK(BM_FoldRescale05N)->Iterations(1000000); - void testFoldRescaleVec() { constexpr auto size = xsimd::batch::size; std::array xVec; for (int i = 0; i < size; ++i) { xVec[i] = dis(gen); } - const auto x = xsimd::load(xVec.data()); + const auto x = xsimd::load(xVec.data()); const auto result = fold_rescale_vec(x, N); std::array resultVec; xsimd::store(resultVec.data(), result); @@ -255,51 +242,59 @@ void testFoldRescaleVec() { for (int i = 0; i < size; ++i) { double result00 = foldRescale03(xVec[i], N); if (std::abs(1 - result00 / resultVec[i]) > 1e-14) { - std::cout << "input: " << xVec[i] << " result00: " << result00 << " resultVec: " << resultVec[i] << std::endl; + std::cout << "input: " << xVec[i] << " result00: " << result00 + << " resultVec: " << resultVec[i] << std::endl; throw std::runtime_error("foldRescaleVec is not equivalent to foldRescale00"); } } } void testFoldRescaleFunctions() { - for (bool p: {false, true}) { - for (int i = 0; i < 1024; ++i) { // Run the test 1000 times - FLT x = dis(gen); + for (bool p : {false, true}) { + for (int i = 0; i < 1024; ++i) { // Run the test 1000 times + FLT x = dis(gen); FLT resultMacro = FOLDRESCALE(x, N, p); - FLT result00 = foldRescale00(x, N, p); - FLT result01 = foldRescale01(x, N, p); - FLT result02 = p ? foldRescale02(x, N) : foldRescale02(x, N); - FLT result03 = p ? foldRescale03(x, N) : foldRescale03(x, N); - FLT result04 = FOLDRESCALE04(x, N, p); - FLT result05 = FOLDRESCALE05(x, N, p); - - // function that compares two floating point number with a tolerance, using relative error + FLT result00 = foldRescale00(x, N, p); + FLT result01 = foldRescale01(x, N, p); + FLT result02 = p ? foldRescale02(x, N) : foldRescale02(x, N); + FLT result03 = p ? foldRescale03(x, N) : foldRescale03(x, N); + FLT result04 = FOLDRESCALE04(x, N, p); + FLT result05 = FOLDRESCALE05(x, N, p); + + // function that compares two floating point number with a tolerance, using relative + // error auto compare = [](FLT a, FLT b) { return std::abs(a - b) > std::max(std::abs(a), std::abs(b)) * 10e-13; }; if (compare(resultMacro, result00)) { - std::cout << "resultMacro: " << resultMacro << " result00: " << result00 << std::endl; + std::cout << "resultMacro: " << resultMacro << " result00: " << result00 + << std::endl; throw std::runtime_error("function00 is wrong"); } if (compare(resultMacro, result01)) { - std::cout << "resultMacro: " << resultMacro << " result01: " << result01 << std::endl; + std::cout << "resultMacro: " << resultMacro << " result01: " << result01 + << std::endl; throw std::runtime_error("function01 is wrong"); } if (compare(resultMacro, result02)) { - std::cout << "resultMacro: " << resultMacro << " result02: " << result02 << std::endl; + std::cout << "resultMacro: " << resultMacro << " result02: " << result02 + << std::endl; throw std::runtime_error("function02 is wrong"); } if (compare(resultMacro, result03)) { - std::cout << "resultMacro: " << resultMacro << " result03: " << result03 << std::endl; + std::cout << "resultMacro: " << resultMacro << " result03: " << result03 + << std::endl; throw std::runtime_error("function03 is wrong"); } if (compare(resultMacro, result04)) { - std::cout << "resultMacro: " << resultMacro << " result04: " << result04 << std::endl; + std::cout << "resultMacro: " << resultMacro << " result04: " << result04 + << std::endl; throw std::runtime_error("function04 is wrong"); } if (compare(resultMacro, result05)) { - std::cout << "resultMacro: " << resultMacro << " result05: " << result05 << std::endl; + std::cout << "resultMacro: " << resultMacro << " result05: " << result05 + << std::endl; throw std::runtime_error("function05 is wrong"); } } @@ -313,7 +308,7 @@ class BaselineSubtractingReporter : public benchmark::ConsoleReporter { } void ReportRuns(const std::vector &reports) override { - for (const auto &run: reports) { + for (const auto &run : reports) { if (run.benchmark_name() == "BM_BASELINE") { baseline_time = run.cpu_accumulated_time; } else { @@ -329,7 +324,7 @@ class BaselineSubtractingReporter : public benchmark::ConsoleReporter { }; int main(int argc, char **argv) { - pirange = argc & 1; + pirange = argc & 1; notPirange = !pirange; static std::random_device rd; const auto seed = rd(); diff --git a/devel/foldrescale_perf.cpp b/devel/foldrescale_perf.cpp index 3d423cdba..a4ac38c99 100644 --- a/devel/foldrescale_perf.cpp +++ b/devel/foldrescale_perf.cpp @@ -4,7 +4,8 @@ Compile with, eg on linux, double-prec: - g++ -O3 -funroll-loops -march=native -I../include foldrescale_perf.cpp -o foldrescale_perf + g++ -O3 -funroll-loops -march=native -I../include foldrescale_perf.cpp -o + foldrescale_perf Use -DSINGLE for single-prec @@ -35,9 +36,13 @@ using namespace std::chrono; #endif // old coord-handling macro ------------------------------------------------ -#define RESCALE(x,N,p) (p ? \ - (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5 : (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5 : (FLT)0.5))*N) : \ - (x<0 ? x+N : (x>N ? x-N : x))) +#define RESCALE(x, N, p) \ + (p ? (x * (FLT)M_1_2PI * N + \ + (x * (FLT)M_1_2PI * N < -N / (FLT)2.0 \ + ? (FLT)1.5 \ + : (x * (FLT)M_1_2PI * N > N / (FLT)2.0 ? (FLT) - 0.5 : (FLT)0.5)) * \ + N) \ + : (x < 0 ? x + N : (x > N ? x - N : x))) // function equivalent ----------------------------------------------------- FLT foldrescale(FLT x, BIGINT N, int pirange) @@ -48,58 +53,68 @@ FLT foldrescale(FLT x, BIGINT N, int pirange) // affine rescale... FLT z = x; if (pirange) - z = (N/(2*PI)) * (x+PI); // PI is (FLT)M_PI in defs.h + z = (N / (2 * PI)) * (x + PI); // PI is (FLT)M_PI in defs.h else z = x; // fold... - if (z<(FLT)0.0) + if (z < (FLT)0.0) z += (FLT)N; - else if (z>=(FLT)N) + else if (z >= (FLT)N) z -= (FLT)N; return z; -} +} // ========================================================================== -int main(int argc, char* argv[]) -{ - int M=100000000; // default: # pts to test - long int N = 1000000; // default: grid size, doesn't matter - - if (argc>1) { double w; sscanf(argv[1],"%lf",&w); M = (int)w; } - if (argc>2) { double w; sscanf(argv[2],"%lf",&w); N = (long int)w; } +int main(int argc, char *argv[]) { + int M = 100000000; // default: # pts to test + long int N = 1000000; // default: grid size, doesn't matter - FLT sum=0.0; + if (argc > 1) { + double w; + sscanf(argv[1], "%lf", &w); + M = (int)w; + } + if (argc > 2) { + double w; + sscanf(argv[2], "%lf", &w); + N = (long int)w; + } + + FLT sum = 0.0; auto tbegin = system_clock::now(); - for (int i=0;i dur = system_clock::now() - tbegin; // dur.count() is sec - printf("backgnd ops: \t%.3g s/call\t\t(sum:%.12g)\n",dur.count()/M,sum); + duration dur = system_clock::now() - tbegin; // dur.count() is sec + printf("backgnd ops: \t%.3g s/call\t\t(sum:%.12g)\n", dur.count() / M, + sum); sum = 0.0; - for (int pirange=0;pirange<2;++pirange) { + for (int pirange = 0; pirange < 2; ++pirange) { tbegin = system_clock::now(); - for (int i=0;i g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2 -simple array sum: 1.9 ns/call (sum:540.8833119415621) -simple bin over [-3pi,3pi): 1.1 ns/call (ans:100667) -w/ RESCALE1 macro: 4.3 ns/call (sum:499894508.4253364) -w/ RESCALE macro (pir=0): 6.7 ns/call (sum:499894508.4253364) -w/ RESCALE macro (pir=1): 4.5 ns/call (sum:499894508.4253364) -w/ foldrescale1: 8.3 ns/call (sum:499894508.4253364) -w/ foldrescale2: 7.0 ns/call (sum:499894508.4253364) -w/ foldrescale3: 7.0 ns/call (sum:499894508.4253364) -w/ foldrescale (pir=0): 6.7 ns/call (sum:499894508.4253364) -w/ foldrescale (pir=1): 8.2 ns/call (sum:499894508.4253364) - (ans:905754) - -alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -Ofast -fno-finite-math-only -alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2 +alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native +-I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp alex@fiona +/home/alex/numerics/finufft/devel> ./foldrescale_perf2 simple array sum: 1.9 +ns/call (sum:540.8833119415621) simple bin over [-3pi,3pi): 1.1 ns/call (ans:100667) w/ +RESCALE1 macro: 4.3 ns/call (sum:499894508.4253364) w/ RESCALE macro (pir=0): 6.7 +ns/call (sum:499894508.4253364) w/ RESCALE macro (pir=1): 4.5 ns/call +(sum:499894508.4253364) w/ foldrescale1: 8.3 ns/call (sum:499894508.4253364) w/ +foldrescale2: 7.0 ns/call (sum:499894508.4253364) w/ +foldrescale3: 7.0 ns/call (sum:499894508.4253364) w/ foldrescale +(pir=0): 6.7 ns/call (sum:499894508.4253364) w/ foldrescale (pir=1): 8.2 ns/call +(sum:499894508.4253364) (ans:905754) + +alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native +-I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -Ofast +-fno-finite-math-only alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2 simple array sum: 0.4 ns/call (sum:-9554.451222028649) simple bin over [-3pi,3pi): 1.5 ns/call (ans:100815) w/ RESCALE1 macro: 2.0 ns/call (sum:499919136.1859143) @@ -50,35 +50,31 @@ w/ foldrescale2: 6.7 ns/call (sum:499919136.1859144) w/ foldrescale3: 7.0 ns/call (sum:499919136.1859144) w/ foldrescale (pir=0): 6.4 ns/call (sum:499919136.1859144) w/ foldrescale (pir=1): 8.1 ns/call (sum:499919136.1859143) - (ans:904913) + (ans:904913) NOBIN: -alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -DNOBIN -alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2 -simple array sum: 1.3 ns/call (sum:-5028.023988434961) -w/ RESCALE1 macro: 1.3 ns/call (sum:499984776.5128576) -w/ RESCALE macro (pir=0): 6.4 ns/call (sum:499984776.5128576) -w/ RESCALE macro (pir=1): 1.4 ns/call (sum:499984776.5128576) -w/ foldrescale1: 7.8 ns/call (sum:499984776.5128576) -w/ foldrescale2: 6.2 ns/call (sum:499984776.5128576) -w/ foldrescale3: 6.4 ns/call (sum:499984776.5128576) -w/ foldrescale (pir=0): 6.3 ns/call (sum:499984776.5128576) -w/ foldrescale (pir=1): 8.2 ns/call (sum:499984776.5128576) - (ans:0) - -alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -Ofast -fno-finite-math-only -DNOBIN -alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2 -simple array sum: 0.4 ns/call (sum:-14573.38274652959) -w/ RESCALE1 macro: 0.7 ns/call (sum:499926457.4098142) -w/ RESCALE macro (pir=0): 0.7 ns/call (sum:499926457.4098142) -w/ RESCALE macro (pir=1): 0.8 ns/call (sum:499926457.4098142) -w/ foldrescale1: 1.0 ns/call (sum:499926457.4098143) -w/ foldrescale2: 0.8 ns/call (sum:499926457.4098142) -w/ foldrescale3: 0.8 ns/call (sum:499926457.4098142) -w/ foldrescale (pir=0): 0.9 ns/call (sum:499926457.4098143) -w/ foldrescale (pir=1): 1.0 ns/call (sum:499926457.4098144) - (ans:0) -Concl: +alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native +-I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -DNOBIN alex@fiona +/home/alex/numerics/finufft/devel> ./foldrescale_perf2 simple array sum: 1.3 +ns/call (sum:-5028.023988434961) w/ RESCALE1 macro: 1.3 ns/call +(sum:499984776.5128576) w/ RESCALE macro (pir=0): 6.4 ns/call (sum:499984776.5128576) w/ +RESCALE macro (pir=1): 1.4 ns/call (sum:499984776.5128576) w/ +foldrescale1: 7.8 ns/call (sum:499984776.5128576) w/ +foldrescale2: 6.2 ns/call (sum:499984776.5128576) w/ +foldrescale3: 6.4 ns/call (sum:499984776.5128576) w/ foldrescale +(pir=0): 6.3 ns/call (sum:499984776.5128576) w/ foldrescale (pir=1): 8.2 ns/call +(sum:499984776.5128576) (ans:0) + +alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native +-I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -Ofast +-fno-finite-math-only -DNOBIN alex@fiona /home/alex/numerics/finufft/devel> +./foldrescale_perf2 simple array sum: 0.4 ns/call (sum:-14573.38274652959) w/ +RESCALE1 macro: 0.7 ns/call (sum:499926457.4098142) w/ RESCALE macro (pir=0): 0.7 +ns/call (sum:499926457.4098142) w/ RESCALE macro (pir=1): 0.8 ns/call +(sum:499926457.4098142) w/ foldrescale1: 1.0 ns/call (sum:499926457.4098143) w/ +foldrescale2: 0.8 ns/call (sum:499926457.4098142) w/ foldrescale3: 0.8 ns/call +(sum:499926457.4098142) w/ foldrescale (pir=0): 0.9 ns/call (sum:499926457.4098143) w/ +foldrescale (pir=1): 1.0 ns/call (sum:499926457.4098144) (ans:0) Concl: * foldrescale FUNCTION is only fast when Ofast & NOBIN, really weird. * macro *is* faster than function, even modern g++. * RESCALE is same as RESCALE1 @@ -118,32 +114,34 @@ can recover isnan handling with -Ofast -fno-finite-math-only .. good! #include "finufft/defs.h" #include +#include #include #include -#include // let's try the "modern" C++ way to time... yuk... #include using namespace std::chrono; - // old coord-handling macro ------------------------------------------------ -//#define RESCALE(x,N,p) (p ? \ -// (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5 : (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5 : (FLT)0.5))*N) : \ +// #define RESCALE(x,N,p) (p ? \ +// (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5 : +// (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5 : (FLT)0.5))*N) : \ // (x<(FLT)0.0 ? x+(FLT)N : (x>(FLT)N ? x-(FLT)N : x))) // casting makes no difference // cleaner rewrite, no slower: -#define RESCALE(x,N,p) (p ? \ - (x + (x>=-PI ? (x=0.0 ? (x<(FLT)N ? x : x-(FLT)N) : x+(FLT)N)) +#define RESCALE(x, N, p) \ + (p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N) \ + : (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N)) // pirange=1 fixed ver of old coord-handling macro ------------------------ -//#define RESCALE1(x,N) (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5*N : (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5*N : (FLT)0.5*N))) +// #define RESCALE1(x,N) (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5*N : +// (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5*N : (FLT)0.5*N))) // it does matter how written: this made faster... -//#define RESCALE1(x,N) (x*(FLT)M_1_2PI + (x*(FLT)M_1_2PI<-0.5 ? 1.5 : (x*(FLT)M_1_2PI>0.5 ? -0.5 : 0.5)))*N - -#define RESCALE1(x,N) (x + (x>=-PI ? (x0.5 ? -0.5 : 0.5)))*N +#define RESCALE1(x, N) \ + (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N) // function equivalents ----------------------------------------------------- static inline FLT foldrescale(FLT x, BIGINT N, int pirange) @@ -153,184 +151,199 @@ static inline FLT foldrescale(FLT x, BIGINT N, int pirange) { // affine rescale... FLT z = x; - if (pirange) - z = (N/(2*PI)) * (x+PI); // PI is (FLT)M_PI in defs.h + if (pirange) z = (N / (2 * PI)) * (x + PI); // PI is (FLT)M_PI in defs.h // fold... - if (z<(FLT)0.0) + if (z < (FLT)0.0) z += (FLT)N; - else if (z>=(FLT)N) + else if (z >= (FLT)N) z -= (FLT)N; return z; -} +} static inline FLT foldrescale1(FLT x, BIGINT N) // same as above but hardwired pirange=1. rescale then fold { // affine rescale always... - FLT z = (N/(2*PI)) * (x+PI); // PI is (FLT)M_PI in defs.h + FLT z = (N / (2 * PI)) * (x + PI); // PI is (FLT)M_PI in defs.h // fold... - if (z<(FLT)0.0) + if (z < (FLT)0.0) z += (FLT)N; - else if (z>=(FLT)N) + else if (z >= (FLT)N) z -= (FLT)N; return z; -} +} static inline FLT foldrescale2(FLT x, BIGINT N) // same as above but hardwired pirange=1, flip so fold done before rescale { - if (x<-PI) - x += 2*PI; - else if (x>PI) - x -= 2*PI; - return (N/(2*PI)) * (x+PI); -} + if (x < -PI) + x += 2 * PI; + else if (x > PI) + x -= 2 * PI; + return (N / (2 * PI)) * (x + PI); +} static inline FLT foldrescale3(FLT x, BIGINT N) // same as above but hardwired pirange=1, flip so fold done before rescale { - if (x<-PI) - x += 3*PI; - else if (x>PI) + if (x < -PI) + x += 3 * PI; + else if (x > PI) x -= PI; else x += PI; - return (N/(2*PI)) * x; + return (N / (2 * PI)) * x; } - - // ========================================================================== -int main(int argc, char* argv[]) -{ - int M=10000000; // default: # pts to test (>=1e7 is acc) - int N = 100; // grid size, matters that unknown @ compile - - if (argc>1) { double w; sscanf(argv[1],"%lf",&w); M = (int)w; } - if (argc>2) { double w; sscanf(argv[2],"%lf",&w); N = (int)w; } - std::vector c(N,0); // let's do basic binning while we're at it - // to prevent compiler optims - int maxc=0; // use for max bin count - +int main(int argc, char *argv[]) { + int M = 10000000; // default: # pts to test (>=1e7 is acc) + int N = 100; // grid size, matters that unknown @ compile + + if (argc > 1) { + double w; + sscanf(argv[1], "%lf", &w); + M = (int)w; + } + if (argc > 2) { + double w; + sscanf(argv[2], "%lf", &w); + N = (int)w; + } + std::vector c(N, 0); // let's do basic binning while we're at it + // to prevent compiler optims + int maxc = 0; // use for max bin count + // fill array w/ random #s (in par), deterministic seeds based on threads std::vector x(M); #pragma omp parallel { - unsigned int s=omp_get_thread_num(); // needed for parallel random #s -#pragma omp for schedule(dynamic,1000000) - for (int i=0; i=0 still 1:2 random) // We'll reuse this array by rescaling/unrescaling by hand. - - FLT sum=0.0; + + FLT sum = 0.0; auto tbegin = system_clock::now(); - for (int i=0;i dur = system_clock::now() - tbegin; // dur.count() is sec - printf("simple array sum: \t%.1f ns/call\t(sum:%.16g)\n",1e9*dur.count()/(double)M,sum); + for (int i = 0; i < M; ++i) sum += x[i]; // simply sweep through array + duration dur = system_clock::now() - tbegin; // dur.count() is sec + printf("simple array sum: \t%.1f ns/call\t(sum:%.16g)\n", + 1e9 * dur.count() / (double)M, sum); #ifndef NOBIN tbegin = system_clock::now(); - for (int i=0;i=N) printf("b[%d]=%d (x=%.16g, flt b=%.16g)\n",i,b,x[i],N*((1.0/(6*PI))*x[i] + 0.5)); // chk all indices ok! + // if (b<0 || b>=N) printf("b[%d]=%d (x=%.16g, flt + // b=%.16g)\n",i,b,x[i],N*((1.0/(6*PI))*x[i] + 0.5)); // chk all indices ok! } - dur = system_clock::now() - tbegin; // dur.count() is sec - for(int b=0;bmaxc) maxc=c[b]; // somehow use it - printf("simple bin over [-3pi,3pi): \t%.1f ns/call\t(ans:%d)\n",1e9*dur.count()/(double)M,maxc); + dur = system_clock::now() - tbegin; // dur.count() is sec + for (int b = 0; b < N; ++b) + if (c[b] > maxc) maxc = c[b]; // somehow use it + printf("simple bin over [-3pi,3pi): \t%.1f ns/call\t(ans:%d)\n", + 1e9 * dur.count() / (double)M, maxc); #endif - - sum = 0.0; // hardwired pirange=1 MACRO....................... + + sum = 0.0; // hardwired pirange=1 MACRO....................... tbegin = system_clock::now(); - for (int i=0;imx) mx=x[i]; // chk max - //printf("max x=%.3g\n",mx); - sum = 0.0; + for (int i = 0; i < M; ++i) x[i] = (N / (2 * PI)) * (x[i] + PI); // rescale to [0,N) + // FLT mx=0.0; for (int i=0;imx) mx=x[i]; // chk max + // printf("max x=%.3g\n",mx); + sum = 0.0; tbegin = system_clock::now(); - for (int i=0;imaxc) maxc=c[b]; // somehow use it - printf("\t\t\t\t\t\t(ans:%d)\n",maxc); + maxc = 0; + for (int b = 0; b < N; ++b) + if (c[b] > maxc) maxc = c[b]; // somehow use it + printf("\t\t\t\t\t\t(ans:%d)\n", maxc); return 0; } diff --git a/devel/interp_square_nowrap.cpp b/devel/interp_square_nowrap.cpp index d17b32a89..8cd3758b5 100644 --- a/devel/interp_square_nowrap.cpp +++ b/devel/interp_square_nowrap.cpp @@ -1,29 +1,31 @@ // this is code I was messing with timing using time2d2interp.cpp // around May 3, 2018, to figure how wrapping was slowing down spreading. -void interp_square_nowrap(FLT *out,FLT *du, FLT *ker1, FLT *ker2, BIGINT i1,BIGINT i2,BIGINT N1,BIGINT N2,int ns) +void interp_square_nowrap(FLT *out, FLT *du, FLT *ker1, FLT *ker2, BIGINT i1, BIGINT i2, + BIGINT N1, BIGINT N2, int ns) // *************** don't periodic wrap, avoid ptrs. correct if no NU pts nr edge { - out[0] = 0.0; out[1] = 0.0; - if (0) { // plain - for (int dy=0; dy #include -#include #include +#include +#include -template -static constexpr auto BestSIMDHelper(); +template static constexpr auto BestSIMDHelper(); -template -static constexpr auto GetPaddedSIMDSize(); +template static constexpr auto GetPaddedSIMDSize(); -template -static uint16_t get_padding(uint16_t ns); +template static uint16_t get_padding(uint16_t ns); -template -static constexpr auto get_padding(); +template static constexpr auto get_padding(); template using BestSIMD = typename decltype(BestSIMDHelper::size>())::type; -template -static constexpr uint16_t min_batch_size(); +template static constexpr uint16_t min_batch_size(); -template()> -constexpr uint16_t max_batch_size(); +template()> constexpr uint16_t max_batch_size(); -template -static constexpr auto find_optimal_batch_size(); +template static constexpr auto find_optimal_batch_size(); // below there is some trickery to obtain the padded SIMD type to vectorize // the given number of elements. @@ -36,55 +29,50 @@ static constexpr auto find_optimal_batch_size(); // or on older ones... "compiler internal error please report" // you have been warned. -template -static constexpr auto BestSIMDHelper() { +template static constexpr auto BestSIMDHelper() { if constexpr (N % K == 0) { // returns void in the worst case return xsimd::make_sized_batch{}; } else { - return BestSIMDHelper>1)>(); + return BestSIMDHelper> 1)>(); } } -template -constexpr uint16_t min_batch_size() { +template constexpr uint16_t min_batch_size() { if constexpr (std::is_void_v>) { - return min_batch_size(); + return min_batch_size(); } else { return N; } }; -template -constexpr uint16_t max_batch_size() { - if constexpr (!std::is_void_v>) { - return max_batch_size(); +template constexpr uint16_t max_batch_size() { + if constexpr (!std::is_void_v>) { + return max_batch_size(); } else { return N; } }; -template -static constexpr auto find_optimal_batch_size() { - uint16_t min_iterations = N; +template static constexpr auto find_optimal_batch_size() { + uint16_t min_iterations = N; uint16_t optimal_batch_size = 1; - for (uint16_t batch_size = min_batch_size(); batch_size <= xsimd::batch::size; batch_size *= 2) { + for (uint16_t batch_size = min_batch_size(); batch_size <= xsimd::batch::size; + batch_size *= 2) { uint16_t iterations = (N + batch_size - 1) / batch_size; if (iterations < min_iterations) { - min_iterations = iterations; + min_iterations = iterations; optimal_batch_size = batch_size; } } return optimal_batch_size; } -template -static constexpr auto GetPaddedSIMDSize() { +template static constexpr auto GetPaddedSIMDSize() { static_assert(N < 128); - return xsimd::make_sized_batch()>::type::size; + return xsimd::make_sized_batch()>::type::size; } -template -static constexpr auto get_padding() { +template static constexpr auto get_padding() { constexpr uint16_t width = GetPaddedSIMDSize(); return ns % width == 0 ? 0 : width - (ns % width); } @@ -102,13 +90,11 @@ static constexpr auto get_padding_helper(uint16_t runtime_ns) { } } -template -static uint16_t get_padding(uint16_t ns) { +template static uint16_t get_padding(uint16_t ns) { return get_padding_helper(ns); } -template -std::ostream & print(T arg) { +template std::ostream &print(T arg) { typename T::value_type sum = 0; for (const auto &elem : arg) { std::cout << elem << " "; @@ -118,34 +104,31 @@ std::ostream & print(T arg) { return std::cout; } - -template -constexpr uint16_t po2_in_between() { +template constexpr uint16_t po2_in_between() { std::uint16_t result = 0; - for (auto i = low; i <= high; i<<=1 ) { + for (auto i = low; i <= high; i <<= 1) { result++; } return result; } -template -constexpr auto mixed_vectors() { +template constexpr auto mixed_vectors() { constexpr auto min_batch = min_batch_size(); constexpr auto max_batch = max_batch_size(); // compute all the power of 2 between min_batch and max_batch - std::array()+1> batch_sizes{1}; + std::array() + 1> batch_sizes{1}; for (uint16_t i = 1; i < batch_sizes.size(); i++) { batch_sizes[i] = min_batch << (i - 1); } print(batch_sizes); - std::array chosen_batch_sizes{0}, dp{N+1}; - dp[0] = 0; // 0 amount requires 0 coins + std::array chosen_batch_sizes{0}, dp{N + 1}; + dp[0] = 0; // 0 amount requires 0 coins - for (uint16_t i = 0; i < N+1; ++i) { + for (uint16_t i = 0; i < N + 1; ++i) { for (const auto batch_size : batch_sizes) { if (batch_size <= i && dp[i - batch_size] + 1 < dp[i]) { - dp[i] = dp[i - batch_size] + 1; + dp[i] = dp[i - batch_size] + 1; chosen_batch_sizes[i] = batch_size; } } @@ -160,8 +143,6 @@ constexpr auto mixed_vectors() { return sequence; } - - int main(int argc, char *argv[]) { std::cout << "sequence for 16 single precision is "; print(mixed_vectors()) << std::endl; @@ -183,86 +164,140 @@ int main(int argc, char *argv[]) { std::cout << "sequence for 18 double precision is "; print(mixed_vectors()) << std::endl; - std::cout << "sequence for 31 single precision is "; print(mixed_vectors()) << std::endl; std::cout << "sequence for 31 double precision is "; print(mixed_vectors()) << std::endl; - std::cout << "Min batch size for single precision is " << uint64_t(min_batch_size()) << std::endl; - std::cout << "Max batch size for single precision is " << uint64_t(max_batch_size()) << std::endl; - std::cout << "Min batch size for double precision is " << uint64_t(min_batch_size()) << std::endl; - std::cout << "Max batch size for double precision is " << uint64_t(max_batch_size()) << std::endl; + std::cout << "Min batch size for single precision is " + << uint64_t(min_batch_size()) << std::endl; + std::cout << "Max batch size for single precision is " + << uint64_t(max_batch_size()) << std::endl; + std::cout << "Min batch size for double precision is " + << uint64_t(min_batch_size()) << std::endl; + std::cout << "Max batch size for double precision is " + << uint64_t(max_batch_size()) << std::endl; std::cout << "Best SIMD single precision" << std::endl; - std::cout << "SIMD for " << 4 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 8 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 12 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 16 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 20 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 24 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 28 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 32 << " is " << uint64_t(BestSIMD::size) << std::endl; + std::cout << "SIMD for " << 4 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 8 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 12 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 16 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 20 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 24 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 28 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 32 << " is " << uint64_t(BestSIMD::size) + << std::endl; std::cout << "Best SIMD double precision" << std::endl; - std::cout << "SIMD for " << 4 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 8 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 12 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 16 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 20 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 24 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 28 << " is " << uint64_t(BestSIMD::size) << std::endl; - std::cout << "SIMD for " << 32 << " is " << uint64_t(BestSIMD::size) << std::endl; + std::cout << "SIMD for " << 4 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 8 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 12 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 16 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 20 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 24 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 28 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 32 << " is " << uint64_t(BestSIMD::size) + << std::endl; std::cout << "Padded SIMD single precision" << std::endl; - std::cout << "Padded SIMD for " << 4 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 6 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 10 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 12 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 15 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 18 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 22 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 26 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 30 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 32 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 4 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 6 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 10 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 12 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 15 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 18 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 22 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 26 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 30 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 32 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; std::cout << "Padded SIMD double precision" << std::endl; - std::cout << "Padded SIMD for " << 4 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 6 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 10 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 12 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 15 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 18 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 22 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 26 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 30 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; - std::cout << "Padded SIMD for " << 32 << " is " << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 4 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 6 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 10 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 12 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 15 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 18 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 22 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 26 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 30 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 32 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; std::cout << "single precision" << std::endl; - for(auto i = 2; i < 16; i++){ - std::cout << "Padding for " << i*2 << " is " << uint64_t(get_padding(i*2)) << std::endl; + for (auto i = 2; i < 16; i++) { + std::cout << "Padding for " << i * 2 << " is " << uint64_t(get_padding(i * 2)) + << std::endl; } std::cout << "double precision" << std::endl; - for(auto i = 2; i < 16; i++){ - std::cout << "Padding for " << i*2 << " is " << uint64_t(get_padding(i*2)) << std::endl; + for (auto i = 2; i < 16; i++) { + std::cout << "Padding for " << i * 2 << " is " << uint64_t(get_padding(i * 2)) + << std::endl; } std::cout << "single precision" << std::endl; - std::cout << "Padding for " << 3 * 2 << " is " << uint64_t(get_padding()) << std::endl; - std::cout << "Padding for " << 5 * 2 << " is " << uint64_t(get_padding()) << std::endl; - std::cout << "Padding for " << 9 * 2 << " is " << uint64_t(get_padding()) << std::endl; - std::cout << "Padding for " << 11 * 2 << " is " << uint64_t(get_padding()) << std::endl; - std::cout << "Padding for " << 13 * 2 << " is " << uint64_t(get_padding()) << std::endl; - std::cout << "Padding for " << 15 * 2 << " is " << uint64_t(get_padding()) << std::endl; + std::cout << "Padding for " << 3 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 5 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 9 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 11 * 2 << " is " + << uint64_t(get_padding()) << std::endl; + std::cout << "Padding for " << 13 * 2 << " is " + << uint64_t(get_padding()) << std::endl; + std::cout << "Padding for " << 15 * 2 << " is " + << uint64_t(get_padding()) << std::endl; std::cout << "double precision" << std::endl; - std::cout << "Padding for " << 3*2 << " is " << uint64_t(get_padding()) << std::endl; - std::cout << "Padding for " << 5*2 << " is " << uint64_t(get_padding()) << std::endl; - std::cout << "Padding for " << 7*2 << " is " << uint64_t(get_padding()) << std::endl; - std::cout << "Padding for " << 9*2 << " is " << uint64_t(get_padding()) << std::endl; - std::cout << "Padding for " << 11*2 << " is " << uint64_t(get_padding()) << std::endl; - std::cout << "Padding for " << 13*2 << " is " << uint64_t(get_padding()) << std::endl; - std::cout << "Padding for " << 15*2 << " is " << uint64_t(get_padding()) << std::endl; + std::cout << "Padding for " << 3 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 5 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 7 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 9 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 11 * 2 << " is " + << uint64_t(get_padding()) << std::endl; + std::cout << "Padding for " << 13 * 2 << " is " + << uint64_t(get_padding()) << std::endl; + std::cout << "Padding for " << 15 * 2 << " is " + << uint64_t(get_padding()) << std::endl; return 0; } \ No newline at end of file diff --git a/devel/test_ker_ppval.cpp b/devel/test_ker_ppval.cpp index e8089121e..5c2131542 100644 --- a/devel/test_ker_ppval.cpp +++ b/devel/test_ker_ppval.cpp @@ -4,7 +4,8 @@ For dyn linked: g++-9 test_ker_ppval.cpp -o test_ker_ppval -O3 -funroll-loops -march=native -fopenmp For statically linked so can control glibc (avoid Matlab calling being different): -g++-9 test_ker_ppval.cpp -o test_ker_ppval -O3 -funroll-loops -march=native -fopenmp -static -lmvec +g++-9 test_ker_ppval.cpp -o test_ker_ppval -O3 -funroll-loops -march=native -fopenmp +-static -lmvec For GCC vectorization info: -fopt-info @@ -29,46 +30,43 @@ I have even seen 1e-7 error for w=12 (which should be good to 1e-11) Demo that sscanf for w can speed plain eval but slow some magic horner speed: -alex@fiona /home/alex/numerics/finufft/devel> g++-7 test_ker_ppval.cpp -o test_ker_ppval -Ofast -march=native -funroll-loops -fopenmp -WITHOUT SSCANF FOR w: -alex@fiona /home/alex/numerics/finufft/devel> ./test_ker_ppval 10000000 -acc test: sup err scaled to kernel peak of 1: 6.53e-11 -exp(sqrt): M=10000000 w=12 in 1.03 s: 116 Meval/s (ans=2.73717868002952e+19) -Horner: M=10000000 w=12 in 0.0812 s: 1.48e+03 Meval/s (ans=2.73717867964406e+19) -rel err in sum = 1.41e-10 -WITH SSCANF FOR w: -alex@fiona /home/alex/numerics/finufft/devel> ./test_ker_ppval 10000000 -acc test: sup err scaled to kernel peak of 1: 6.53e-11 -exp(sqrt): M=10000000 w=12 in 0.45 s: 267 Meval/s (ans=2.73717867952762e+19) -Horner: M=10000000 w=12 in 0.483 s: 248 Meval/s (ans=2.73717867952754e+19) -rel err in sum = 3.01e-14 +alex@fiona /home/alex/numerics/finufft/devel> g++-7 test_ker_ppval.cpp -o test_ker_ppval +-Ofast -march=native -funroll-loops -fopenmp WITHOUT SSCANF FOR w: alex@fiona +/home/alex/numerics/finufft/devel> ./test_ker_ppval 10000000 acc test: sup err scaled to +kernel peak of 1: 6.53e-11 exp(sqrt): M=10000000 w=12 in 1.03 s: 116 Meval/s +(ans=2.73717868002952e+19) Horner: M=10000000 w=12 in 0.0812 s: 1.48e+03 Meval/s +(ans=2.73717867964406e+19) rel err in sum = 1.41e-10 WITH SSCANF FOR w: alex@fiona +/home/alex/numerics/finufft/devel> ./test_ker_ppval 10000000 acc test: sup err scaled to +kernel peak of 1: 6.53e-11 exp(sqrt): M=10000000 w=12 in 0.45 s: 267 Meval/s +(ans=2.73717867952762e+19) Horner: M=10000000 w=12 in 0.483 s: 248 Meval/s +(ans=2.73717867952754e+19) rel err in sum = 3.01e-14 */ -#include -#include -#include #include +#include +#include #include +#include // Choose prec... (w=7 enough for single) typedef double FLT; -//typedef float FLT; +// typedef float FLT; -static inline void evaluate_kernel_vector(FLT *ker, const FLT *args, const FLT beta, const FLT c, const int w) +static inline void evaluate_kernel_vector(FLT *ker, const FLT *args, const FLT beta, + const FLT c, const int w) /* Evaluate kernel for a vector of w arguments, must also be the int width par. The #pragra's need to be removed for icpc if -fopenmp not used; g++ is ok. */ { #pragma omp simd - for (int i = 0; i < w; i++) - ker[i] = exp(beta * sqrt((FLT)1.0 - c*args[i]*args[i])); - // gcc 5.4 can't simd the combined loop, hence we split the - // out-of-support test to subsequent loop... - // This check loop prevents getting 0.2s (600 Meval/s): + for (int i = 0; i < w; i++) ker[i] = exp(beta * sqrt((FLT)1.0 - c * args[i] * args[i])); + // gcc 5.4 can't simd the combined loop, hence we split the + // out-of-support test to subsequent loop... + // This check loop prevents getting 0.2s (600 Meval/s): #pragma omp simd for (int i = 0; i < w; i++) - if (fabs(args[i]) >= (FLT)w/2) // note fabs not abs! + if (fabs(args[i]) >= (FLT)w / 2) // note fabs not abs! ker[i] = 0.0; } @@ -78,79 +76,78 @@ static inline void kernel_vector_Horner(FLT *ker, FLT z, int w) See: gen_all_horner_C_code.m */ { - //#include "../src/ker_horner_allw.c" + // #include "../src/ker_horner_allw.c" #include "../src/ker_horner_allw_loop.c" } -int main(int argc, char* argv[]) -{ - int M = (int) 1e7; // # of reps (<2^31) - if (argc>1) - sscanf(argv[1],"%d",&M); // weirdly allows exp simd 10x faster, even on gcc 5.4.0 - int w=12; // spread width - if (argc>2) - sscanf(argv[2],"%d",&w); // prevents the magic 0.2s, keeps at 0.4s - FLT beta=2.30*w; // should match kernel params for acc test - if (w==2) beta = 2.20*w; - if (w==3) beta = 2.26*w; - if (w==4) beta = 2.38*w; - FLT c = 4.0/(FLT)(w*w); // set up ker params for plain eval - FLT iw = 1.0/(FLT)w; // scale factor +int main(int argc, char *argv[]) { + int M = (int)1e7; // # of reps (<2^31) + if (argc > 1) + sscanf(argv[1], "%d", &M); // weirdly allows exp simd 10x faster, even on gcc 5.4.0 + int w = 12; // spread width + if (argc > 2) sscanf(argv[2], "%d", &w); // prevents the magic 0.2s, keeps at 0.4s + FLT beta = 2.30 * w; // should match kernel params for acc test + if (w == 2) beta = 2.20 * w; + if (w == 3) beta = 2.26 * w; + if (w == 4) beta = 2.38 * w; + FLT c = 4.0 / (FLT)(w * w); // set up ker params for plain eval + FLT iw = 1.0 / (FLT)w; // scale factor std::vector x(w); - std::vector f(16), f2(16); // length=MAX_NSPREAD + std::vector f(16), f2(16); // length=MAX_NSPREAD - int Macc = 100; // test accuracy....... + int Macc = 100; // test accuracy....... FLT superr = 0.0; - for (int i=0;isuperr) superr = err; + for (int i = 0; i < Macc; ++i) { // loop over eval grid sets + FLT z = (2 * i) / (FLT)(Macc - 1) - 1.0; // local offset sweep through z in [-1,1] + // printf("z=%g:\n",z); // useful for calling w/ eg Macc=3 + kernel_vector_Horner(&f2[0], z, w); // eval kernel to f2, given offset z + for (int j = 0; j < w; ++j) // vector of args in [-w/2,w/2] ker supp + x[j] = (-(FLT)w + 1.0 + z) / 2 + j; + evaluate_kernel_vector(&f[0], &x[0], beta, c, w); // eval kernel into f + for (int j = 0; j < w; ++j) { + // printf("x=%.3g\tf=%.6g\tf2=%.6g\tf2-f=%.3g\n",x[j],f[j],f2[j],f2[j]-f[j]); + FLT err = abs(f[j] - f2[j]); + if (err > superr) superr = err; } } superr /= exp(beta); - printf("acc test: sup err scaled to kernel peak of 1: %.3g\n",superr); - + printf("acc test: sup err scaled to kernel peak of 1: %.3g\n", superr); + // test speed...... plain eval - clock_t start=clock(); - FLT ans = 0.0; // dummy answer - for (int i=0;i -#include -#include #include +#include +#include #include +#include // Choose prec for floating pt... typedef double FLT; #define MAXNS 16 -int main(int argc, char *argv[]) -{ - int M = 10000000; // NU pts - int n = 2000; // U grid pts per dimension (needn't be huge) - if (argc>1) - sscanf(argv[1],"%d",&M); - if (argc>2) - sscanf(argv[2],"%d",&n); - int ns=10; // kernel width - if (argc>3) - sscanf(argv[3],"%d",&ns); - FLT ker1[MAXNS],ker2[MAXNS]; - - std::vector du(2*n*n); // U "input" array, with... - for (int i=0;i<2*n*n;++i) // something in it +int main(int argc, char *argv[]) { + int M = 10000000; // NU pts + int n = 2000; // U grid pts per dimension (needn't be huge) + if (argc > 1) sscanf(argv[1], "%d", &M); + if (argc > 2) sscanf(argv[2], "%d", &n); + int ns = 10; // kernel width + if (argc > 3) sscanf(argv[3], "%d", &ns); + FLT ker1[MAXNS], ker2[MAXNS]; + + std::vector du(2 * n * n); // U "input" array, with... + for (int i = 0; i < 2 * n * n; ++i) // something in it du[i] = (FLT)i; - - clock_t start=clock(); - FLT tot[2] = {0.0,0.0}; // complex output total - int N1=n, N2=n; - int i1=n/4, i2=n/4+7; // starting pt for bottom left coords of interp box - - for (int i=0;i3*n/4) {i1-=n/2; i2+=1;} // keep spread box away from edges - //i2 += 57; // move far in slow direc - causes pain - if (i2>3*n/4) i2-=n/2; - - } // ....................... - double t=(double)(clock()-start)/CLOCKS_PER_SEC; - printf("M=%d from N=%d^2, ns=%d: tot[0]=%.15g \t%.3g s\n",M,n,ns,tot[0],t); - printf("%.3g spread pts/s\n",M*ns*ns/t); + if (i1 > 3 * n / 4) { + i1 -= n / 2; + i2 += 1; + } // keep spread box away from edges + // i2 += 57; // move far in slow direc - causes pain + if (i2 > 3 * n / 4) i2 -= n / 2; + + } // ....................... + double t = (double)(clock() - start) / CLOCKS_PER_SEC; + printf("M=%d from N=%d^2, ns=%d: tot[0]=%.15g \t%.3g s\n", M, n, ns, tot[0], t); + printf("%.3g spread pts/s\n", M * ns * ns / t); return 0; } diff --git a/examples/cuda/example2d1many.cpp b/examples/cuda/example2d1many.cpp index e67f8c30d..a5d0ecd5d 100644 --- a/examples/cuda/example2d1many.cpp +++ b/examples/cuda/example2d1many.cpp @@ -21,97 +21,101 @@ int main(int argc, char *argv[]) * example code for 2D Type 1 transformation. * * To compile the code: - * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include /loc/to/cufinufft/lib-static/libcufinufft.a - * -lcudart -lcufft -lnvToolsExt + * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include + * /loc/to/cufinufft/lib-static/libcufinufft.a -lcudart -lcufft -lnvToolsExt * * or * export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/loc/to/cufinufft/lib - * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include -L/loc/to/cufinufft/lib/ -lcufinufft + * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include + * -L/loc/to/cufinufft/lib/ -lcufinufft * * */ { - std::cout << std::scientific << std::setprecision(3); - - int ier; - int N1 = 256; - int N2 = 256; - int M = 65536; - int ntransf = 2; - int iflag = 1; - float tol = 1e-6; - - float *x, *y; - std::complex *c, *fk; - cudaMallocHost(&x, M * sizeof(float)); - cudaMallocHost(&y, M * sizeof(float)); - cudaMallocHost(&c, M * ntransf * sizeof(std::complex)); - cudaMallocHost(&fk, N1 * N2 * ntransf * sizeof(std::complex)); - - float *d_x, *d_y; - cuFloatComplex *d_c, *d_fk; - cudaMalloc(&d_x, M * sizeof(float)); - cudaMalloc(&d_y, M * sizeof(float)); - cudaMalloc(&d_c, M * ntransf * sizeof(cuFloatComplex)); - cudaMalloc(&d_fk, N1 * N2 * ntransf * sizeof(cuFloatComplex)); - - std::default_random_engine eng(1); - std::uniform_real_distribution distr(-1, 1); - - for (int i = 0; i < M; i++) { - x[i] = M_PI * distr(eng); - y[i] = M_PI * distr(eng); - } - - for (int i = 0; i < M * ntransf; i++) { - c[i].real(distr(eng)); - c[i].imag(distr(eng)); - } - cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_y, y, M * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_c, c, M * ntransf * sizeof(cuFloatComplex), cudaMemcpyHostToDevice); - - cufinufftf_plan dplan; - - int dim = 2; - int64_t nmodes[3]; - int type = 1; - - nmodes[0] = N1; - nmodes[1] = N2; - nmodes[2] = 1; - - ier = cufinufftf_makeplan(type, dim, nmodes, iflag, ntransf, tol, &dplan, NULL); - - ier = cufinufftf_setpts(dplan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL); - - ier = cufinufftf_execute(dplan, d_c, d_fk); - - ier = cufinufftf_destroy(dplan); - - cudaMemcpy(fk, d_fk, N1 * N2 * ntransf * sizeof(cuFloatComplex), cudaMemcpyDeviceToHost); - - std::cout << std::endl << "Accuracy check:" << std::endl; - int N = N1 * N2; - for (int i = 0; i < ntransf; i += 1) { - int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check - std::complex Ft = std::complex(0, 0), J = std::complex(0, 1) * (float)iflag; - for (CUFINUFFT_BIGINT j = 0; j < M; ++j) - Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct - int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array - printf("[gpu %3d] one mode: abs err in F[%d,%d] is %.3g\n", i, nt1, nt2, abs(Ft - fk[it + i * N])); - printf("[gpu %3d] one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2, - abs(Ft - fk[it + i * N]) / infnorm(N, fk + i * N)); - } - - cudaFreeHost(x); - cudaFreeHost(y); - cudaFreeHost(c); - cudaFreeHost(fk); - - cudaFree(d_x); - cudaFree(d_y); - cudaFree(d_c); - cudaFree(d_fk); - return 0; + std::cout << std::scientific << std::setprecision(3); + + int ier; + int N1 = 256; + int N2 = 256; + int M = 65536; + int ntransf = 2; + int iflag = 1; + float tol = 1e-6; + + float *x, *y; + std::complex *c, *fk; + cudaMallocHost(&x, M * sizeof(float)); + cudaMallocHost(&y, M * sizeof(float)); + cudaMallocHost(&c, M * ntransf * sizeof(std::complex)); + cudaMallocHost(&fk, N1 * N2 * ntransf * sizeof(std::complex)); + + float *d_x, *d_y; + cuFloatComplex *d_c, *d_fk; + cudaMalloc(&d_x, M * sizeof(float)); + cudaMalloc(&d_y, M * sizeof(float)); + cudaMalloc(&d_c, M * ntransf * sizeof(cuFloatComplex)); + cudaMalloc(&d_fk, N1 * N2 * ntransf * sizeof(cuFloatComplex)); + + std::default_random_engine eng(1); + std::uniform_real_distribution distr(-1, 1); + + for (int i = 0; i < M; i++) { + x[i] = M_PI * distr(eng); + y[i] = M_PI * distr(eng); + } + + for (int i = 0; i < M * ntransf; i++) { + c[i].real(distr(eng)); + c[i].imag(distr(eng)); + } + cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_y, y, M * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_c, c, M * ntransf * sizeof(cuFloatComplex), cudaMemcpyHostToDevice); + + cufinufftf_plan dplan; + + int dim = 2; + int64_t nmodes[3]; + int type = 1; + + nmodes[0] = N1; + nmodes[1] = N2; + nmodes[2] = 1; + + ier = cufinufftf_makeplan(type, dim, nmodes, iflag, ntransf, tol, &dplan, NULL); + + ier = cufinufftf_setpts(dplan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL); + + ier = cufinufftf_execute(dplan, d_c, d_fk); + + ier = cufinufftf_destroy(dplan); + + cudaMemcpy(fk, d_fk, N1 * N2 * ntransf * sizeof(cuFloatComplex), + cudaMemcpyDeviceToHost); + + std::cout << std::endl << "Accuracy check:" << std::endl; + int N = N1 * N2; + for (int i = 0; i < ntransf; i += 1) { + int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check + std::complex Ft = std::complex(0, 0), + J = std::complex(0, 1) * (float)iflag; + for (CUFINUFFT_BIGINT j = 0; j < M; ++j) + Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct + int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array + printf("[gpu %3d] one mode: abs err in F[%d,%d] is %.3g\n", i, nt1, nt2, + abs(Ft - fk[it + i * N])); + printf("[gpu %3d] one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2, + abs(Ft - fk[it + i * N]) / infnorm(N, fk + i * N)); + } + + cudaFreeHost(x); + cudaFreeHost(y); + cudaFreeHost(c); + cudaFreeHost(fk); + + cudaFree(d_x); + cudaFree(d_y); + cudaFree(d_c); + cudaFree(d_fk); + return 0; } diff --git a/examples/cuda/example2d2many.cpp b/examples/cuda/example2d2many.cpp index f35b10205..a6b0c6d3e 100644 --- a/examples/cuda/example2d2many.cpp +++ b/examples/cuda/example2d2many.cpp @@ -21,106 +21,109 @@ int main(int argc, char *argv[]) * example code for 2D Type 1 transformation. * * To compile the code: - * nvcc example2d2many.cpp -o example2d2many loc/to/cufinufft/lib-static/libcufinufft.a -I/loc/to/cufinufft/include - * -lcudart -lcufft -lnvToolsExt + * nvcc example2d2many.cpp -o example2d2many loc/to/cufinufft/lib-static/libcufinufft.a + * -I/loc/to/cufinufft/include -lcudart -lcufft -lnvToolsExt * * or * export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/loc/to/cufinufft/lib - * nvcc example2d2many.cpp -L/loc/to/cufinufft/lib/ -I/loc/to/cufinufft/include -o example2d1 -lcufinufft + * nvcc example2d2many.cpp -L/loc/to/cufinufft/lib/ -I/loc/to/cufinufft/include -o + * example2d1 -lcufinufft * * */ { - std::cout << std::scientific << std::setprecision(3); - - int ier; - int N1 = 128; - int N2 = 128; - int M = 10; - int ntransf = 4; - int maxbatchsize = 4; - int iflag = 1; - double tol = 1e-6; - - double *x, *y; - std::complex *c, *fk; - cudaMallocHost(&x, M * sizeof(double)); - cudaMallocHost(&y, M * sizeof(double)); - cudaMallocHost(&c, M * ntransf * sizeof(std::complex)); - cudaMallocHost(&fk, N1 * N2 * ntransf * sizeof(std::complex)); - - double *d_x, *d_y; - cuDoubleComplex *d_c, *d_fk; - cudaMalloc(&d_x, M * sizeof(double)); - cudaMalloc(&d_y, M * sizeof(double)); - cudaMalloc(&d_c, M * ntransf * sizeof(cuDoubleComplex)); - cudaMalloc(&d_fk, N1 * N2 * ntransf * sizeof(cuDoubleComplex)); - - std::default_random_engine eng(1); - std::uniform_real_distribution distr(-1, 1); - - for (int i = 0; i < M; i++) { - x[i] = M_PI * distr(eng); - y[i] = M_PI * distr(eng); - } - - for (int i = 0; i < N1 * N2 * ntransf; i++) { - fk[i].real(distr(eng)); - fk[i].imag(distr(eng)); - } - cudaMemcpy(d_x, x, M * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_y, y, M * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_fk, fk, N1 * N2 * ntransf * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice); - - cufinufft_plan dplan; - - int dim = 2; - int64_t nmodes[3]; - int type = 2; - - nmodes[0] = N1; - nmodes[1] = N2; - nmodes[2] = 1; - - cufinufft_opts opts; - cufinufft_default_opts(&opts); - opts.gpu_maxbatchsize = maxbatchsize; - - ier = cufinufft_makeplan(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); - - ier = cufinufft_setpts(dplan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL); - - ier = cufinufft_execute(dplan, d_c, d_fk); - - ier = cufinufft_destroy(dplan); - - cudaMemcpy(c, d_c, M * ntransf * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost); - - std::cout << std::endl << "Accuracy check:" << std::endl; - std::complex *fkstart; - std::complex *cstart; - for (int t = 0; t < ntransf; t++) { - fkstart = fk + t * N1 * N2; - cstart = c + t * M; - int jt = M / 2; // check arbitrary choice of one targ pt - std::complex J(0, iflag * 1); - std::complex ct(0, 0); - int m = 0; - for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F - for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) - ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct - - printf("[gpu %3d] one targ: rel err in c[%d] is %.3g\n", t, jt, abs(cstart[jt] - ct) / infnorm(M, c)); - } - - cudaFreeHost(x); - cudaFreeHost(y); - cudaFreeHost(c); - cudaFreeHost(fk); - - cudaFree(d_x); - cudaFree(d_y); - cudaFree(d_c); - cudaFree(d_fk); - return 0; + std::cout << std::scientific << std::setprecision(3); + + int ier; + int N1 = 128; + int N2 = 128; + int M = 10; + int ntransf = 4; + int maxbatchsize = 4; + int iflag = 1; + double tol = 1e-6; + + double *x, *y; + std::complex *c, *fk; + cudaMallocHost(&x, M * sizeof(double)); + cudaMallocHost(&y, M * sizeof(double)); + cudaMallocHost(&c, M * ntransf * sizeof(std::complex)); + cudaMallocHost(&fk, N1 * N2 * ntransf * sizeof(std::complex)); + + double *d_x, *d_y; + cuDoubleComplex *d_c, *d_fk; + cudaMalloc(&d_x, M * sizeof(double)); + cudaMalloc(&d_y, M * sizeof(double)); + cudaMalloc(&d_c, M * ntransf * sizeof(cuDoubleComplex)); + cudaMalloc(&d_fk, N1 * N2 * ntransf * sizeof(cuDoubleComplex)); + + std::default_random_engine eng(1); + std::uniform_real_distribution distr(-1, 1); + + for (int i = 0; i < M; i++) { + x[i] = M_PI * distr(eng); + y[i] = M_PI * distr(eng); + } + + for (int i = 0; i < N1 * N2 * ntransf; i++) { + fk[i].real(distr(eng)); + fk[i].imag(distr(eng)); + } + cudaMemcpy(d_x, x, M * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(d_y, y, M * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(d_fk, fk, N1 * N2 * ntransf * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice); + + cufinufft_plan dplan; + + int dim = 2; + int64_t nmodes[3]; + int type = 2; + + nmodes[0] = N1; + nmodes[1] = N2; + nmodes[2] = 1; + + cufinufft_opts opts; + cufinufft_default_opts(&opts); + opts.gpu_maxbatchsize = maxbatchsize; + + ier = cufinufft_makeplan(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); + + ier = cufinufft_setpts(dplan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL); + + ier = cufinufft_execute(dplan, d_c, d_fk); + + ier = cufinufft_destroy(dplan); + + cudaMemcpy(c, d_c, M * ntransf * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost); + + std::cout << std::endl << "Accuracy check:" << std::endl; + std::complex *fkstart; + std::complex *cstart; + for (int t = 0; t < ntransf; t++) { + fkstart = fk + t * N1 * N2; + cstart = c + t * M; + int jt = M / 2; // check arbitrary choice of one targ pt + std::complex J(0, iflag * 1); + std::complex ct(0, 0); + int m = 0; + for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F + for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct + + printf("[gpu %3d] one targ: rel err in c[%d] is %.3g\n", t, jt, + abs(cstart[jt] - ct) / infnorm(M, c)); + } + + cudaFreeHost(x); + cudaFreeHost(y); + cudaFreeHost(c); + cudaFreeHost(fk); + + cudaFree(d_x); + cudaFree(d_y); + cudaFree(d_c); + cudaFree(d_fk); + return 0; } diff --git a/examples/cuda/getting_started.cpp b/examples/cuda/getting_started.cpp index 113a73e7c..da2bf6f5f 100644 --- a/examples/cuda/getting_started.cpp +++ b/examples/cuda/getting_started.cpp @@ -26,90 +26,91 @@ #include int main() { - // Problem size: number of nonuniform points (M) and grid size (N). - const int M = 100000, N = 10000; + // Problem size: number of nonuniform points (M) and grid size (N). + const int M = 100000, N = 10000; - // Size of the grid as an array. - int64_t modes[1] = {N}; + // Size of the grid as an array. + int64_t modes[1] = {N}; - // Host pointers: frequencies (x), coefficients (c), and output (f). - float *x; - float _Complex *c; - float _Complex *f; + // Host pointers: frequencies (x), coefficients (c), and output (f). + float *x; + float _Complex *c; + float _Complex *f; - // Device pointers. - float *d_x; - cuFloatComplex *d_c, *d_f; + // Device pointers. + float *d_x; + cuFloatComplex *d_c, *d_f; - // Store cufinufft plan. - cufinufftf_plan plan; + // Store cufinufft plan. + cufinufftf_plan plan; - // Manual calculation at a single point idx. - int idx; - float _Complex f0; + // Manual calculation at a single point idx. + int idx; + float _Complex f0; - // Allocate the host arrays. - x = (float *)malloc(M * sizeof(float)); - c = (float _Complex *)malloc(M * sizeof(float _Complex)); - f = (float _Complex *)malloc(N * sizeof(float _Complex)); + // Allocate the host arrays. + x = (float *)malloc(M * sizeof(float)); + c = (float _Complex *)malloc(M * sizeof(float _Complex)); + f = (float _Complex *)malloc(N * sizeof(float _Complex)); - // Fill with random numbers. Frequencies must be in the interval [-pi, pi) - // while strengths can be any value. - srand(0); + // Fill with random numbers. Frequencies must be in the interval [-pi, pi) + // while strengths can be any value. + srand(0); - for (int j = 0; j < M; ++j) { - x[j] = 2 * M_PI * (((float)rand()) / RAND_MAX - 1); - c[j] = (2 * ((float)rand()) / RAND_MAX - 1) + I * (2 * ((float)rand()) / RAND_MAX - 1); - } + for (int j = 0; j < M; ++j) { + x[j] = 2 * M_PI * (((float)rand()) / RAND_MAX - 1); + c[j] = + (2 * ((float)rand()) / RAND_MAX - 1) + I * (2 * ((float)rand()) / RAND_MAX - 1); + } - // Allocate the device arrays and copy the x and c arrays. - cudaMalloc(&d_x, M * sizeof(float)); - cudaMalloc(&d_c, M * sizeof(float _Complex)); - cudaMalloc(&d_f, N * sizeof(float _Complex)); + // Allocate the device arrays and copy the x and c arrays. + cudaMalloc(&d_x, M * sizeof(float)); + cudaMalloc(&d_c, M * sizeof(float _Complex)); + cudaMalloc(&d_f, N * sizeof(float _Complex)); - cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_c, c, M * sizeof(float _Complex), cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_c, c, M * sizeof(float _Complex), cudaMemcpyHostToDevice); - // Make the cufinufft plan for a 1D type-1 transform with six digits of - // tolerance. - cufinufftf_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, NULL); + // Make the cufinufft plan for a 1D type-1 transform with six digits of + // tolerance. + cufinufftf_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, NULL); - // Set the frequencies of the nonuniform points. - cufinufftf_setpts(plan, M, d_x, NULL, NULL, 0, NULL, NULL, NULL); + // Set the frequencies of the nonuniform points. + cufinufftf_setpts(plan, M, d_x, NULL, NULL, 0, NULL, NULL, NULL); - // Actually execute the plan on the given coefficients and store the result - // in the d_f array. - cufinufftf_execute(plan, d_c, d_f); + // Actually execute the plan on the given coefficients and store the result + // in the d_f array. + cufinufftf_execute(plan, d_c, d_f); - // Copy the result back onto the host. - cudaMemcpy(f, d_f, N * sizeof(float _Complex), cudaMemcpyDeviceToHost); + // Copy the result back onto the host. + cudaMemcpy(f, d_f, N * sizeof(float _Complex), cudaMemcpyDeviceToHost); - // Destroy the plan and free the device arrays after we're done. - cufinufftf_destroy(plan); + // Destroy the plan and free the device arrays after we're done. + cufinufftf_destroy(plan); - cudaFree(d_x); - cudaFree(d_c); - cudaFree(d_f); + cudaFree(d_x); + cudaFree(d_c); + cudaFree(d_f); - // Pick an index to check the result of the calculation. - idx = 4 * N / 7; + // Pick an index to check the result of the calculation. + idx = 4 * N / 7; - printf("f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx])); + printf("f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx])); - // Calculate the result manually using the formula for the type-1 - // transform. - f0 = 0; + // Calculate the result manually using the formula for the type-1 + // transform. + f0 = 0; - for (int j = 0; j < M; ++j) { - f0 += c[j] * cexp(I * x[j] * (idx - N / 2)); - } + for (int j = 0; j < M; ++j) { + f0 += c[j] * cexp(I * x[j] * (idx - N / 2)); + } - printf("f0[%d] = %lf + %lfi\n", idx, crealf(f0), cimagf(f0)); + printf("f0[%d] = %lf + %lfi\n", idx, crealf(f0), cimagf(f0)); - // Finally free the host arrays. - free(x); - free(c); - free(f); + // Finally free the host arrays. + free(x); + free(c); + free(f); - return 0; + return 0; } diff --git a/examples/guru1d1.cpp b/examples/guru1d1.cpp index eb7189da0..35c626093 100644 --- a/examples/guru1d1.cpp +++ b/examples/guru1d1.cpp @@ -1,87 +1,90 @@ // this is all you must include for the finufft lib... -#include #include +#include // specific to this example... +#include #include -#include #include #include -#include +#include // only good for small projects... using namespace std; // allows 1i to be the imaginary unit... (C++14 onwards) using namespace std::complex_literals; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Example calling guru C++ interface to FINUFFT library, passing pointers to STL vectors of C++ double complex numbers, with a math check. Barnett 2/27/20 Compile on linux with (or see ../makefile): - g++ -std=c++14 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1 -lfftw3 -lfftw3_omp -lm + g++ -std=c++14 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1 + -lfftw3 -lfftw3_omp -lm Or if you have built a single-thread library, remove -fopenmp and -lfftw3_omp Usage: ./guru1d1 */ { - int M = 1e6; // number of nonuniform points - int N = 1e6; // number of modes - double tol = 1e-9; // desired accuracy + int M = 1e6; // number of nonuniform points + int N = 1e6; // number of modes + double tol = 1e-9; // desired accuracy - int type = 1, dim = 1; // 1d1 - int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] - Ns[0] = N; - int ntransf = 1; // we want to do a single transform at a time - finufft_plan plan; // creates a plan struct - int changeopts = 0; // do you want to try changing opts? 0 or 1 - if (changeopts) { // demo how to change options away from defaults.. + int type = 1, dim = 1; // 1d1 + int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] + Ns[0] = N; + int ntransf = 1; // we want to do a single transform at a time + finufft_plan plan; // creates a plan struct + int changeopts = 0; // do you want to try changing opts? 0 or 1 + if (changeopts) { // demo how to change options away from defaults.. finufft_opts opts; finufft_default_opts(&opts); - opts.debug = 1; // example options change + opts.debug = 1; // example options change finufft_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, &opts); - } else // or, NULL here means use default opts... + } else // or, NULL here means use default opts... finufft_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, NULL); // generate some random nonuniform points vector x(M); - for (int j=0; j> c(M); - for (int j=0; j> F(N); int ier = finufft_execute(plan, c.data(), F.data()); // for fun, do another with same NU pts (no re-sorting), but new strengths... - for (int j=0; j=-(double)N/2 && n<(double)N/2); // ensure meaningful test - complex Ftest = complex(0,0); - for (int j=0; j= -(double)N / 2 && n < (double)N / 2); // ensure meaningful test + complex Ftest = complex(0, 0); + for (int j = 0; j < M; ++j) Ftest += c[j] * exp(1i * (double)n * x[j]); + int nout = n + N / 2; // index in output array for freq mode n double Fmax = 0.0; // compute inf norm of F - for (int m=0; mFmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - double err = abs(F[nout] - Ftest)/Fmax; - printf("guru 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,n,err); + double err = abs(F[nout] - Ftest) / Fmax; + printf("guru 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier, + n, err); return ier; } diff --git a/examples/guru1d1f.cpp b/examples/guru1d1f.cpp index a46c4a735..d890d3081 100644 --- a/examples/guru1d1f.cpp +++ b/examples/guru1d1f.cpp @@ -1,85 +1,88 @@ // this is all you must include for the finufft lib... -#include #include +#include // specific to this example... #include -#include #include #include +#include // only good for small projects... using namespace std; // allows 1i to be the imaginary unit... (C++14 onwards) using namespace std::complex_literals; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Example calling guru C++ interface to FINUFFT library, single-prec, passing pointers to STL vectors of C++ float complex numbers, with a math check. Barnett 7/5/20 Compile on linux with: - g++-7 -std=c++14 -fopenmp guru1d1f.cpp -I../include ../lib-static/libfinufft.a -o guru1d1f -lfftw3f -lfftw3f_omp -lm + g++-7 -std=c++14 -fopenmp guru1d1f.cpp -I../include ../lib-static/libfinufft.a -o + guru1d1f -lfftw3f -lfftw3f_omp -lm Or if you have built a single-core library, remove -fopenmp and -lfftw3f_omp Usage: ./guru1d1f */ { - int M = 1e5; // number of nonuniform points - int N = 1e5; // number of modes - float tol = 1e-5; // desired accuracy + int M = 1e5; // number of nonuniform points + int N = 1e5; // number of modes + float tol = 1e-5; // desired accuracy - int type = 1, dim = 1; // 1d1 - int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] - Ns[0] = N; - int ntransf = 1; // we want to do a single transform at a time - finufftf_plan plan; // creates single-prec plan struct: note the "f" - int changeopts = 1; // do you want to try changing opts? 0 or 1 - if (changeopts) { // demo how to change options away from defaults.. + int type = 1, dim = 1; // 1d1 + int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] + Ns[0] = N; + int ntransf = 1; // we want to do a single transform at a time + finufftf_plan plan; // creates single-prec plan struct: note the "f" + int changeopts = 1; // do you want to try changing opts? 0 or 1 + if (changeopts) { // demo how to change options away from defaults.. finufft_opts opts; - finufftf_default_opts(&opts); // note "f" for single-prec, throughout... - opts.debug = 2; // example options change + finufftf_default_opts(&opts); // note "f" for single-prec, throughout... + opts.debug = 2; // example options change finufftf_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, &opts); - } else // or, NULL here means use default opts... + } else // or, NULL here means use default opts... finufftf_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, NULL); // generate some random nonuniform points vector x(M); - for (int j=0; j> c(M); - for (int j=0; j> F(N); int ier = finufftf_execute(plan, &c[0], &F[0]); // for fun, do another with same NU pts (no re-sorting), but new strengths... - for (int j=0; j Ftest = complex(0,0); - for (int j=0; j Ftest = complex(0, 0); + for (int j = 0; j < M; ++j) Ftest += c[j] * exp(1if * (float)n * x[j]); + int nout = n + N / 2; // index in output array for freq mode n + float Fmax = 0.0; // compute inf norm of F + for (int m = 0; m < N; ++m) { float aF = abs(F[m]); - if (aF>Fmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - float err = abs(F[nout] - Ftest)/Fmax; - printf("guru 1D type-1 single-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,n,err); + float err = abs(F[nout] - Ftest) / Fmax; + printf("guru 1D type-1 single-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier, + n, err); return ier; } diff --git a/examples/guru2d1.cpp b/examples/guru2d1.cpp index 06d25e064..cfc39109e 100644 --- a/examples/guru2d1.cpp +++ b/examples/guru2d1.cpp @@ -1,51 +1,53 @@ #include #include -#include #include +#include #include using namespace std; -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) { + + /* 2D type 1 guru interface example of calling the FINUFFT library from C++, + using STL double complex vectors, with a math test. Similar to simple2d1 + except illustrates the guru interface. -/* 2D type 1 guru interface example of calling the FINUFFT library from C++, - using STL double complex vectors, with a math test. Similar to simple2d1 - except illustrates the guru interface. + Compile multithreaded with + g++ -fopenmp guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 + -lfftw3_omp -lm single core with: g++ guru2d1.cpp -I ../src + ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lm - Compile multithreaded with - g++ -fopenmp guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lfftw3_omp -lm - single core with: - g++ guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lm - - Usage: ./guru2d1 -*/ - int M = 1e6; // number of nonuniform points - int N = 1e6; // approximate total number of modes (N1*N2) - double tol = 1e-6; // desired accuracy - finufft_opts opts; finufft_default_opts(&opts); + Usage: ./guru2d1 + */ + int M = 1e6; // number of nonuniform points + int N = 1e6; // approximate total number of modes (N1*N2) + double tol = 1e-6; // desired accuracy + finufft_opts opts; + finufft_default_opts(&opts); opts.upsampfac = 1.25; complex I(0.0, 1.0); // the imaginary unit // generate random non-uniform points on (x,y) and complex strengths (c): vector x(M), y(M); - vector > c(M); + vector> c(M); - for(int i = 0; i < M; i++){ - x[i] = M_PI*(2*(double)rand()/RAND_MAX-1); //uniform random in [-pi, pi) - y[i] = M_PI*(2*(double)rand()/RAND_MAX-1); //uniform random in [-pi, pi) + for (int i = 0; i < M; i++) { + x[i] = M_PI * (2 * (double)rand() / RAND_MAX - 1); // uniform random in [-pi, pi) + y[i] = M_PI * (2 * (double)rand() / RAND_MAX - 1); // uniform random in [-pi, pi) // each component uniform random in [-1,1] - c[i] = 2*((double)rand()/RAND_MAX-1) + I*(2*((double)rand()/RAND_MAX)-1); + c[i] = + 2 * ((double)rand() / RAND_MAX - 1) + I * (2 * ((double)rand() / RAND_MAX) - 1); } // choose numbers of output Fourier coefficients in each dimension - int N1 = round(2.0*sqrt(N)); - int N2 = round(N/N1); - + int N1 = round(2.0 * sqrt(N)); + int N2 = round(N / N1); + // output array for the Fourier modes - vector > F(N1*N2); + vector> F(N1 * N2); - int type=1, dim=2, ntrans=1; // you could also do ntrans>1 - int64_t Ns[] = {N1,N2}; // N1,N2 as 64-bit int array + int type = 1, dim = 2, ntrans = 1; // you could also do ntrans>1 + int64_t Ns[] = {N1, N2}; // N1,N2 as 64-bit int array // step 1: make a plan... finufft_plan plan; int ier = finufft_makeplan(type, dim, Ns, +1, ntrans, tol, &plan, NULL); @@ -58,27 +60,28 @@ int main(int argc, char *argv[]){ // step 4: free the memory used by the plan... finufft_destroy(plan); - int k1 = round(0.45*N1); // check the answer for mode frequency (k1,k2) - int k2 = round(-0.35*N2); - - complex Ftest(0,0); - for(int j = 0; j < M; j++) - Ftest += c[j]*exp(I*((double)k1*x[j]+(double)k2*y[j])); + int k1 = round(0.45 * N1); // check the answer for mode frequency (k1,k2) + int k2 = round(-0.35 * N2); + + complex Ftest(0, 0); + for (int j = 0; j < M; j++) + Ftest += c[j] * exp(I * ((double)k1 * x[j] + (double)k2 * y[j])); - // compute inf norm of F + // compute inf norm of F double Fmax = 0.0; - for (int m=0; mFmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - + // indices in output array for this frequency pair (k1,k2) - int k1out = k1 + (int)N1/2; - int k2out = k2 + (int)N2/2; - int indexOut = k1out + k2out*(N1); + int k1out = k1 + (int)N1 / 2; + int k2out = k2 + (int)N2 / 2; + int indexOut = k1out + k2out * (N1); // compute relative error - double err = abs(F[indexOut] - Ftest)/Fmax; - cout << "2D type-1 NUFFT done. ier=" << ier << ", err in F[" << indexOut << "] rel to max(F) is " << setprecision(2) << err << endl; + double err = abs(F[indexOut] - Ftest) / Fmax; + cout << "2D type-1 NUFFT done. ier=" << ier << ", err in F[" << indexOut + << "] rel to max(F) is " << setprecision(2) << err << endl; return ier; } diff --git a/examples/gurumany1d1.cpp b/examples/gurumany1d1.cpp index 8f150e609..01503af14 100644 --- a/examples/gurumany1d1.cpp +++ b/examples/gurumany1d1.cpp @@ -1,4 +1,4 @@ -/* Demonstrate guru FINUFFT interface performing a stack of 1d type 1 +/* Demonstrate guru FINUFFT interface performing a stack of 1d type 1 transforms in a single execute call. See guru1d1.cpp for other guru features demonstrated. @@ -11,70 +11,73 @@ */ // this is all you must include for the finufft lib... -#include #include +#include // specific to this demo... +#include #include -#include #include #include -#include +#include // only good for small projects... using namespace std; // allows 1i to be the imaginary unit... (C++14 onwards) using namespace std::complex_literals; -int main(int argc, char* argv[]) -{ - int M = 2e5; // number of nonuniform points - int N = 1e5; // number of modes - double tol = 1e-9; // desired accuracy - int ntrans = 100; // request a bunch of transforms in the execute - int isign = +1; // sign of i in the transform math definition - +int main(int argc, char *argv[]) { + int M = 2e5; // number of nonuniform points + int N = 1e5; // number of modes + double tol = 1e-9; // desired accuracy + int ntrans = 100; // request a bunch of transforms in the execute + int isign = +1; // sign of i in the transform math definition + int type = 1, dim = 1; // 1d1 - int64_t Ns[3] = {N,0,0}; // guru describes mode array by vector [N1,N2..] + int64_t Ns[3] = {N, 0, 0}; // guru describes mode array by vector [N1,N2..] finufft_plan plan; // creates a plan struct (NULL below: default opts) finufft_makeplan(type, dim, Ns, isign, ntrans, tol, &plan, NULL); // generate random nonuniform points and pass to FINUFFT vector x(M); - for (int j=0; j> c(M*ntrans); // plain contiguous storage - for (int j=0; j> c(M * ntrans); // plain contiguous storage + for (int j = 0; j < M * ntrans; ++j) + c[j] = + 2 * ((double)rand() / RAND_MAX) - 1 + 1i * (2 * ((double)rand() / RAND_MAX) - 1); // alloc output array for the Fourier modes, then do the transform - vector> F(N*ntrans); - printf("guru many 1D type-1 double-prec, tol=%.3g, executing %d transforms (vectorized), each size %d NU pts to %d modes...\n",tol,ntrans,M,N); + vector> F(N * ntrans); + printf("guru many 1D type-1 double-prec, tol=%.3g, executing %d transforms " + "(vectorized), each size %d NU pts to %d modes...\n", + tol, ntrans, M, N); int ier = finufft_execute(plan, c.data(), F.data()); // could now change c, do another execute, do another setpts, execute, etc... - - finufft_destroy(plan); // don't forget! we're done with transforms of this size - + + finufft_destroy(plan); // don't forget! we're done with transforms of this size + // rest is math checking and reporting... - int k = 42519; // check the answer just for this mode - int trans = 71; // ...testing in just this transform - assert(k>=-(double)N/2 && k<(double)N/2); // ensure meaningful test - assert(trans>=0 && trans Ftest = complex(0,0); - for (int j=0; jFmax) Fmax=aF; + int k = 42519; // check the answer just for this mode + int trans = 71; // ...testing in just this transform + assert(k >= -(double)N / 2 && k < (double)N / 2); // ensure meaningful test + assert(trans >= 0 && trans < ntrans); + complex Ftest = complex(0, 0); + for (int j = 0; j < M; ++j) + Ftest += c[j + M * trans] * exp(1i * (double)k * x[j]); // c offset to trans + double Fmax = 0.0; // compute inf norm of F for selected transform + for (int m = 0; m < N; ++m) { + double aF = abs(F[m + N * trans]); + if (aF > Fmax) Fmax = aF; } - int nout = k+N/2 + N*trans; // output index for freq mode k in the trans - double err = abs(F[nout] - Ftest)/Fmax; - printf("\tdone: ier=%d; for transform %d, rel err in F[%d] is %.3g\n",ier,trans,k,err); + int nout = k + N / 2 + N * trans; // output index for freq mode k in the trans + double err = abs(F[nout] - Ftest) / Fmax; + printf("\tdone: ier=%d; for transform %d, rel err in F[%d] is %.3g\n", ier, trans, k, + err); return ier; } diff --git a/examples/many1d1.cpp b/examples/many1d1.cpp index 8176007c9..4b884d028 100644 --- a/examples/many1d1.cpp +++ b/examples/many1d1.cpp @@ -1,59 +1,61 @@ #include -#include +#include #include #include #include -#include +#include using namespace std; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Example of calling the vectorized FINUFFT library from C++, using STL double complex vectors, with a math test. Compile with: - g++ -fopenmp many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lfftw3_omp -lm - or if you have built a single-core version: - g++ many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lm + g++ -fopenmp many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 + -lfftw3_omp -lm or if you have built a single-core version: g++ many1d1.cpp + -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lm Usage: ./many1d1 */ { - int ntrans = 3; // how many stacked transforms to do - int M = 1e6; // nonuniform points (same for all transforms) - int N = 1e6; // number of modes (same for all transforms) - double tol = 1e-9; // desired accuracy - finufft_opts* opts = new finufft_opts; // opts is pointer to struct + int ntrans = 3; // how many stacked transforms to do + int M = 1e6; // nonuniform points (same for all transforms) + int N = 1e6; // number of modes (same for all transforms) + double tol = 1e-9; // desired accuracy + finufft_opts *opts = new finufft_opts; // opts is pointer to struct finufft_default_opts(opts); - complex I = complex(0.0,1.0); // the imaginary unit - + complex I = complex(0.0, 1.0); // the imaginary unit + // generate some random nonuniform points (x) and complex strengths (c)... vector x(M); - vector > c(M*ntrans); - for (int j=0; j> c(M * ntrans); + for (int j = 0; j < M; ++j) + x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi) + for (int j = 0; j < M * ntrans; ++j) // fill all ntrans vectors... + c[j] = + 2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1); // allocate output array for the Fourier modes... - vector > F(N*ntrans); + vector> F(N * ntrans); // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed... - int ier = finufft1d1many(ntrans,M,&x[0],&c[0],+1,tol,N,&F[0],NULL); - - int k = 142519; // check the answer just for this mode... - int trans = ntrans-1; // ...in this transform - assert(k>=-(double)N/2 && k<(double)N/2); - - complex Ftest = complex(0,0); // do the naive calc... - for (int j=0; jFmax) Fmax=aF; + int ier = finufft1d1many(ntrans, M, &x[0], &c[0], +1, tol, N, &F[0], NULL); + + int k = 142519; // check the answer just for this mode... + int trans = ntrans - 1; // ...in this transform + assert(k >= -(double)N / 2 && k < (double)N / 2); + + complex Ftest = complex(0, 0); // do the naive calc... + for (int j = 0; j < M; ++j) + Ftest += c[j + M * trans] * exp(I * (double)k * x[j]); // c from transform # trans + double Fmax = 0.0; // compute inf norm of F for transform # trans + for (int m = 0; m < N; ++m) { + double aF = abs(F[m + N * trans]); + if (aF > Fmax) Fmax = aF; } - int kout = k+N/2+N*trans; // output index, freq mode k, transform # trans - double err = abs(F[kout] - Ftest)/Fmax; - printf("1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,k,err); + int kout = k + N / 2 + N * trans; // output index, freq mode k, transform # trans + double err = abs(F[kout] - Ftest) / Fmax; + printf("1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier, k, + err); return ier; } diff --git a/examples/simple1d1.cpp b/examples/simple1d1.cpp index 1e7f16858..4e547eafc 100644 --- a/examples/simple1d1.cpp +++ b/examples/simple1d1.cpp @@ -2,60 +2,61 @@ #include // also used in this example... -#include +#include #include #include #include -#include +#include using namespace std; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Example of calling the FINUFFT library from C++, using STL double complex vectors, with a math test. Double-precision version (see simple1d1f for single-precision) Compile with (static library case): - g++ simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 -lfftw3 -lfftw3_omp - or if you have built a single-core version: g++ simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 -lfftw3 + -lfftw3_omp or if you have built a single-core version: g++ simple1d1.cpp -I../include + ../lib-static/libfinufft.a -o simple1d1 -lfftw3 Usage: ./simple1d1 Also see ../docs/cex.rst or online documentation. */ { - int M = 1e6; // number of nonuniform points - int N = 1e6; // number of modes - double acc = 1e-9; // desired accuracy - finufft_opts* opts = new finufft_opts; // opts is pointer to struct + int M = 1e6; // number of nonuniform points + int N = 1e6; // number of modes + double acc = 1e-9; // desired accuracy + finufft_opts *opts = new finufft_opts; // opts is pointer to struct finufft_default_opts(opts); - complex I = complex(0.0,1.0); // the imaginary unit - + complex I = complex(0.0, 1.0); // the imaginary unit + // generate some random nonuniform points (x) and complex strengths (c)... vector x(M); - vector > c(M); - for (int j=0; j> c(M); + for (int j = 0; j < M; ++j) { + x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi) + c[j] = + 2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1); } // allocate output array for the Fourier modes... - vector > F(N); - + vector> F(N); + // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed... - int ier = finufft1d1(M,&x[0],&c[0],+1,acc,N,&F[0],opts); - - int k = 142519; // check the answer just for this mode frequency... - assert(k>=-(double)N/2 && k<(double)N/2); - complex Ftest = complex(0,0); - for (int j=0; j= -(double)N / 2 && k < (double)N / 2); + complex Ftest = complex(0, 0); + for (int j = 0; j < M; ++j) Ftest += c[j] * exp(I * (double)k * x[j]); + double Fmax = 0.0; // compute inf norm of F + for (int m = 0; m < N; ++m) { double aF = abs(F[m]); - if (aF>Fmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - int kout = k+N/2; // index in output array for freq mode k - double err = abs(F[kout] - Ftest)/Fmax; - printf("1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,k,err); + int kout = k + N / 2; // index in output array for freq mode k + double err = abs(F[kout] - Ftest) / Fmax; + printf("1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier, k, + err); return ier; } diff --git a/examples/simple1d1f.cpp b/examples/simple1d1f.cpp index fea98b8d6..3882d8ea1 100644 --- a/examples/simple1d1f.cpp +++ b/examples/simple1d1f.cpp @@ -2,58 +2,58 @@ #include // also needed for this example... -#include +#include #include #include #include -#include +#include using namespace std; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Example of calling the FINUFFT library from C++, using STL single complex vectors, with a math test. (See simple1d1 for double-precision version.) Compile with: - g++ -fopenmp simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lfftw3f_omp -lm - or if you have built a single-core version: - g++ simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lm + g++ -fopenmp simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f + -lfftw3f -lfftw3f_omp -lm or if you have built a single-core version: g++ + simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lm Usage: ./simple1d1f */ { - int M = 1e5; // number of nonuniform points - int N = 1e5; // number of modes (NB if too large lose acc in 1d) - float acc = 1e-3; // desired accuracy - finufft_opts* opts = new finufft_opts; // opts is pointer to struct - finufftf_default_opts(opts); // note finufft "f" suffix - complex I = complex(0.0,1.0); // the imaginary unit - + int M = 1e5; // number of nonuniform points + int N = 1e5; // number of modes (NB if too large lose acc in 1d) + float acc = 1e-3; // desired accuracy + finufft_opts *opts = new finufft_opts; // opts is pointer to struct + finufftf_default_opts(opts); // note finufft "f" suffix + complex I = complex(0.0, 1.0); // the imaginary unit + // generate some random nonuniform points (x) and complex strengths (c)... vector x(M); - vector > c(M); - for (int j=0; j> c(M); + for (int j = 0; j < M; ++j) { + x[j] = M_PI * (2 * ((float)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi) + c[j] = 2 * ((float)rand() / RAND_MAX) - 1 + I * (2 * ((float)rand() / RAND_MAX) - 1); } // allocate output array for the Fourier modes... - vector > F(N); + vector> F(N); // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed... - int ier = finufftf1d1(M,&x[0],&c[0],+1,acc,N,&F[0],opts); // note "f" + int ier = finufftf1d1(M, &x[0], &c[0], +1, acc, N, &F[0], opts); // note "f" - int k = 14251; // check the answer just for this mode... - assert(k>=-(double)N/2 && k<(double)N/2); - complex Ftest = complex(0,0); - for (int j=0; j= -(double)N / 2 && k < (double)N / 2); + complex Ftest = complex(0, 0); + for (int j = 0; j < M; ++j) Ftest += c[j] * exp(I * (float)k * x[j]); + float Fmax = 0.0; // compute inf norm of F + for (int m = 0; m < N; ++m) { float aF = abs(F[m]); - if (aF>Fmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - int kout = k+N/2; // index in output array for freq mode k - float err = abs(F[kout] - Ftest)/Fmax; - printf("1D type-1 single-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,k,err); + int kout = k + N / 2; // index in output array for freq mode k + float err = abs(F[kout] - Ftest) / Fmax; + printf("1D type-1 single-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier, k, + err); return ier; } diff --git a/examples/simple2d1.cpp b/examples/simple2d1.cpp index cf912445b..91cce0bd1 100644 --- a/examples/simple2d1.cpp +++ b/examples/simple2d1.cpp @@ -1,76 +1,79 @@ // this is all you must include for the finufft lib... -#include #include +#include // also needed for this example... -#include #include +#include #include using namespace std; -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) { -/* Simple 2D type-1 example of calling the FINUFFT library from C++, using plain - arrays of C++ complex numbers, with a math test. Double precision version. + /* Simple 2D type-1 example of calling the FINUFFT library from C++, using plain + arrays of C++ complex numbers, with a math test. Double precision version. - Compile multithreaded with - g++ -fopenmp simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lfftw3_omp -lm - single core with: - g++ simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lm - - Usage: ./simple2d1 -*/ + Compile multithreaded with + g++ -fopenmp simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 + -lfftw3_omp -lm single core with: g++ simple2d1.cpp -I ../src + ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lm - int M = 1e6; // number of nonuniform points - int N = 1e6; // approximate total number of modes (N1*N2) - double tol = 1e-6; // desired accuracy - finufft_opts opts; finufft_default_opts(&opts); + Usage: ./simple2d1 + */ + + int M = 1e6; // number of nonuniform points + int N = 1e6; // approximate total number of modes (N1*N2) + double tol = 1e-6; // desired accuracy + finufft_opts opts; + finufft_default_opts(&opts); complex I(0.0, 1.0); // the imaginary unit // generate random non-uniform points on (x,y) and complex strengths (c): vector x(M), y(M); - vector > c(M); + vector> c(M); - for(int i = 0; i < M; i++){ - x[i] = M_PI*(2*(double)rand()/RAND_MAX-1); //uniform random in [-pi, pi) - y[i] = M_PI*(2*(double)rand()/RAND_MAX-1); //uniform random in [-pi, pi) + for (int i = 0; i < M; i++) { + x[i] = M_PI * (2 * (double)rand() / RAND_MAX - 1); // uniform random in [-pi, pi) + y[i] = M_PI * (2 * (double)rand() / RAND_MAX - 1); // uniform random in [-pi, pi) // each component uniform random in [-1,1] - c[i] = 2*((double)rand()/RAND_MAX-1) + I*(2*((double)rand()/RAND_MAX)-1); + c[i] = + 2 * ((double)rand() / RAND_MAX - 1) + I * (2 * ((double)rand() / RAND_MAX) - 1); } // choose numbers of output Fourier coefficients in each dimension - int N1 = round(2.0*sqrt(N)); - int N2 = round(N/N1); - + int N1 = round(2.0 * sqrt(N)); + int N2 = round(N / N1); + // output array for the Fourier modes - vector > F(N1*N2); + vector> F(N1 * N2); // call the NUFFT (with iflag += 1): note passing in pointers... opts.upsampfac = 1.25; - int ier = finufft2d1(M,&x[0],&y[0], &c[0], 1, tol, N1, N2, &F[0], &opts); + int ier = finufft2d1(M, &x[0], &y[0], &c[0], 1, tol, N1, N2, &F[0], &opts); - int k1 = round(0.45*N1); // check the answer for mode frequency (k1,k2) - int k2 = round(-0.35*N2); - - complex Ftest(0,0); - for(int j = 0; j < M; j++) - Ftest += c[j]*exp(I*((double)k1*x[j]+(double)k2*y[j])); + int k1 = round(0.45 * N1); // check the answer for mode frequency (k1,k2) + int k2 = round(-0.35 * N2); - // compute inf norm of F + complex Ftest(0, 0); + for (int j = 0; j < M; j++) + Ftest += c[j] * exp(I * ((double)k1 * x[j] + (double)k2 * y[j])); + + // compute inf norm of F double Fmax = 0.0; - for (int m=0; mFmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - + // indices in output array for this frequency pair (k1,k2) - int k1out = k1 + N1/2; - int k2out = k2 + N2/2; - int indexOut = k1out + k2out*(N1); + int k1out = k1 + N1 / 2; + int k2out = k2 + N2 / 2; + int indexOut = k1out + k2out * (N1); // compute relative error - double err = abs(F[indexOut] - Ftest)/Fmax; - cout << "2D type-1 NUFFT done. ier=" << ier << ", err in F[" << indexOut << "] rel to max(F) is " << setprecision(2) << err << endl; + double err = abs(F[indexOut] - Ftest) / Fmax; + cout << "2D type-1 NUFFT done. ier=" << ier << ", err in F[" << indexOut + << "] rel to max(F) is " << setprecision(2) << err << endl; return ier; } diff --git a/examples/simulplans1d1.cpp b/examples/simulplans1d1.cpp index b814876a2..4fb5f9449 100644 --- a/examples/simulplans1d1.cpp +++ b/examples/simulplans1d1.cpp @@ -2,37 +2,41 @@ #include // also used in this example... -#include +#include #include #include #include -#include +#include using namespace std; -void strengths(vector>& c) { // fill random complex array - for (long unsigned int j=0; j> &c) { // fill random complex array + for (long unsigned int j = 0; j < c.size(); ++j) + c[j] = + 2 * ((double)rand() / RAND_MAX) - 1 + 1i * (2 * ((double)rand() / RAND_MAX) - 1); } -double chk1d1(int n, vector& x, vector>& c, - vector>& F) +double chk1d1(int n, vector &x, vector> &c, + vector> &F) // return error in output array F, for n'th mode only, rel to ||F||_inf { int N = F.size(); - if (n>=N/2 || n<-N/2) { printf("n out of bounds!\n"); return NAN; } - complex Ftest = complex(0,0); - for (long unsigned int j=0; j= N / 2 || n < -N / 2) { + printf("n out of bounds!\n"); + return NAN; + } + complex Ftest = complex(0, 0); + for (long unsigned int j = 0; j < x.size(); ++j) + Ftest += c[j] * exp(1i * (double)n * x[j]); + int nout = n + N / 2; // index in output array for freq mode n double Fmax = 0.0; // compute inf norm of F - for (int m=0; mFmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - return abs(F[nout] - Ftest)/Fmax; + return abs(F[nout] - Ftest) / Fmax; } -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Demo two simultaneous FINUFFT plans (A,B) being handled in C++ without interacting (or at least without crashing; note that FFTW initialization is the only global state of FINUFFT library). @@ -40,20 +44,21 @@ int main(int argc, char* argv[]) Edited from guru1d1, Barnett 2/15/22 Compile & run: - g++ -fopenmp simulplans1d1.cpp -I../include ../lib-static/libfinufft.a -o simulplans1d1 -lfftw3 -lfftw3_omp -lm && ./simulplans1d1 + g++ -fopenmp simulplans1d1.cpp -I../include ../lib-static/libfinufft.a -o simulplans1d1 + -lfftw3 -lfftw3_omp -lm && ./simulplans1d1 */ { - double tol = 1e-9; // desired accuracy for both plans + double tol = 1e-9; // desired accuracy for both plans int type = 1, dim = 1; // 1d1 - int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] + int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] int ntransf = 1; // we want to do a single transform at a time - - int MA = 3e6; // number of nonuniform points PLAN A - int NA = 1e6; // number of modes - int MB = 2e6; // number of nonuniform points PLAN B, diff sizes - int NB = 1e5; // number of modes - finufft_plan planA, planB; // creates plan structs + int MA = 3e6; // number of nonuniform points PLAN A + int NA = 1e6; // number of modes + int MB = 2e6; // number of nonuniform points PLAN B, diff sizes + int NB = 1e5; // number of modes + + finufft_plan planA, planB; // creates plan structs Ns[0] = NA; finufft_makeplan(type, dim, Ns, +1, ntransf, tol, &planA, NULL); Ns[0] = NB; @@ -61,22 +66,22 @@ int main(int argc, char* argv[]) // generate some random nonuniform points vector xA(MA), xB(MB); - for (int j=0; j> cA(MA), cB(MB); strengths(cA); strengths(cB); - + // allocate output arrays for the Fourier modes... - vector > FA(NA), FB(NB); + vector> FA(NA), FB(NB); int ierA = finufft_execute(planA, &cA[0], &FA[0]); int ierB = finufft_execute(planB, &cB[0], &FB[0]); @@ -87,14 +92,16 @@ int main(int argc, char* argv[]) ierB = finufft_execute(planB, &cB[0], &FB[0]); finufft_destroy(planA); finufft_destroy(planB); - + // math checking and reporting... - int n = 116354; - double errA = chk1d1(n,xA,cA,FA); - printf("planA: 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ierA,n,errA); - n = 27152; - double errB = chk1d1(n,xB,cB,FB); - printf("planB: 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ierB,n,errB); + int n = 116354; + double errA = chk1d1(n, xA, cA, FA); + printf("planA: 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", + ierA, n, errA); + n = 27152; + double errB = chk1d1(n, xB, cB, FB); + printf("planB: 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", + ierB, n, errB); return ierA + ierB; } diff --git a/examples/threadsafe1d1.cpp b/examples/threadsafe1d1.cpp index f25f25b8b..da267fa6c 100644 --- a/examples/threadsafe1d1.cpp +++ b/examples/threadsafe1d1.cpp @@ -2,15 +2,15 @@ #include // also used in this example... -#include +#include #include #include -#include -#include #include +#include +#include using namespace std; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Demo single-threaded FINUFFT calls from inside a OMP parallel block. Adapted from simple1d1.cpp: C++, STL double complex vectors, with math test. Barnett 4/19/21, eg for Goran Zauhar, issue #183. Also see: many1d1.cpp. @@ -26,50 +26,51 @@ int main(int argc, char* argv[]) reporting small error. */ { - int M = 1e5; // number of nonuniform points - int N = 1e5; // number of modes - double acc = 1e-9; // desired accuracy - finufft_opts* opts = new finufft_opts; // opts is pointer to struct + int M = 1e5; // number of nonuniform points + int N = 1e5; // number of modes + double acc = 1e-9; // desired accuracy + finufft_opts *opts = new finufft_opts; // opts is pointer to struct finufft_default_opts(opts); - complex I = complex(0.0,1.0); // the imaginary unit - - opts->nthreads=1; // *crucial* so that each call single-thread (otherwise segfaults) + complex I = complex(0.0, 1.0); // the imaginary unit + + opts->nthreads = 1; // *crucial* so that each call single-thread (otherwise segfaults) // Now have each thread do independent 1D type 1 on their own data: #pragma omp parallel { - // generate some random nonuniform points (x) and complex strengths (c)... - // Note that these are local to the thread (if you have the *same* sets of - // NU pts x for each thread, consider instead using one vectorized multithreaded - // transform, which would be faster). - vector x(M); - vector > c(M); - for (int j=0; j > F(N); + // generate some random nonuniform points (x) and complex strengths (c)... + // Note that these are local to the thread (if you have the *same* sets of + // NU pts x for each thread, consider instead using one vectorized multithreaded + // transform, which would be faster). + vector x(M); + vector> c(M); + for (int j = 0; j < M; ++j) { + x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi) + c[j] = + 2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1); + } - // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed... - int ier = finufft1d1(M,&x[0],&c[0],+1,acc,N,&F[0],opts); + // allocate output array for the Fourier modes... local to the thread + vector> F(N); - int k = 42519; // check the answer just for this mode frequency... - assert(k>=-(double)N/2 && k<(double)N/2); - complex Ftest = complex(0,0); - for (int j=0; jFmax) Fmax=aF; - } - int kout = k+N/2; // index in output array for freq mode k - double err = abs(F[kout] - Ftest)/Fmax; - - printf("[thread %2d] 1D t-1 dbl-prec NUFFT done. ier=%d, rel err in F[%d]: %.3g\n",omp_get_thread_num(),ier,k,err); + // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed... + int ier = finufft1d1(M, &x[0], &c[0], +1, acc, N, &F[0], opts); + + int k = 42519; // check the answer just for this mode frequency... + assert(k >= -(double)N / 2 && k < (double)N / 2); + complex Ftest = complex(0, 0); + for (int j = 0; j < M; ++j) Ftest += c[j] * exp(I * (double)k * x[j]); + double Fmax = 0.0; // compute inf norm of F + for (int m = 0; m < N; ++m) { + double aF = abs(F[m]); + if (aF > Fmax) Fmax = aF; + } + int kout = k + N / 2; // index in output array for freq mode k + double err = abs(F[kout] - Ftest) / Fmax; + + printf("[thread %2d] 1D t-1 dbl-prec NUFFT done. ier=%d, rel err in F[%d]: %.3g\n", + omp_get_thread_num(), ier, k, err); } - + return 0; } diff --git a/examples/threadsafe2d2f.cpp b/examples/threadsafe2d2f.cpp index 9844af54a..e2ad64bb1 100644 --- a/examples/threadsafe2d2f.cpp +++ b/examples/threadsafe2d2f.cpp @@ -8,7 +8,8 @@ To compile (note uses threads rather than omp version of FFTW3): - g++ -fopenmp threadsafe2d2f.cpp -I../include ../lib/libfinufft.so -o threadsafe2d2f -g -Wall + g++ -fopenmp threadsafe2d2f.cpp -I../include ../lib/libfinufft.so -o threadsafe2d2f -g + -Wall ./threadsafe2d2f <-- use all threads OMP_NUM_THREADS=1 ./threadsafe2d2f <-- sequential, 1 thread @@ -23,43 +24,43 @@ #include // also used in this example... -#include #include #include #include +#include using namespace std; -int test_finufft(finufft_opts* opts) - // self-contained small test that one single-prec FINUFFT2D2 has no error/crash +int test_finufft(finufft_opts *opts) +// self-contained small test that one single-prec FINUFFT2D2 has no error/crash { - size_t n_rows = 256, n_cols = 256; // 2d image size - size_t n_read = 512, n_spokes = 128; // some k-space point params - size_t M = n_read*n_spokes; // how many k-space pts; MRI-specific - std::vector x(M); // bunch of zero input data + size_t n_rows = 256, n_cols = 256; // 2d image size + size_t n_read = 512, n_spokes = 128; // some k-space point params + size_t M = n_read * n_spokes; // how many k-space pts; MRI-specific + std::vector x(M); // bunch of zero input data std::vector y(M); - std::vector> img(n_rows * n_cols); // coeffs - std::vector> ksp(M); // output array (vals @ k-space pts) + std::vector> img(n_rows * n_cols); // coeffs + std::vector> ksp(M); // output array (vals @ k-space pts) - int ier = finufftf2d2(M, x.data(), y.data(), ksp.data(), - -1, 1e-3, n_rows, n_cols, img.data(), opts); + int ier = finufftf2d2(M, x.data(), y.data(), ksp.data(), -1, 1e-3, n_rows, n_cols, + img.data(), opts); - std::cout << "\ttest_finufft: exit code " << ier << ", thread " << omp_get_thread_num() << std::endl; + std::cout << "\ttest_finufft: exit code " << ier << ", thread " << omp_get_thread_num() + << std::endl; return ier; } -int main(int argc, char* argv[]) -{ +int main(int argc, char *argv[]) { finufft_opts opts; finufftf_default_opts(&opts); - opts.nthreads = 1; // *crucial* so each call single-thread; else segfaults + opts.nthreads = 1; // *crucial* so each call single-thread; else segfaults - int n_slices = 50; // number of transforms. parallelize over slices - int overallstatus=0; + int n_slices = 50; // number of transforms. parallelize over slices + int overallstatus = 0; #pragma omp parallel for for (int i = 0; i < n_slices; i++) { int ier = test_finufft(&opts); - if (ier!=0) overallstatus=1; + if (ier != 0) overallstatus = 1; } - + return overallstatus; } diff --git a/fortran/finufftfort.cpp b/fortran/finufftfort.cpp index 9f415d647..799a10041 100644 --- a/fortran/finufftfort.cpp +++ b/fortran/finufftfort.cpp @@ -26,205 +26,182 @@ // local prec-switching macros for fortran names, ie // underscore-suffixed versions of those at end of defs.h #define FINUFFT_DEFAULT_OPTS_ FINUFFTIFY(_default_opts_) -#define FINUFFT_MAKEPLAN_ FINUFFTIFY(_makeplan_) -#define FINUFFT_SETPTS_ FINUFFTIFY(_setpts_) -#define FINUFFT_EXECUTE_ FINUFFTIFY(_execute_) -#define FINUFFT_DESTROY_ FINUFFTIFY(_destroy_) -#define FINUFFT1D1_ FINUFFTIFY(1d1_) -#define FINUFFT1D2_ FINUFFTIFY(1d2_) -#define FINUFFT1D3_ FINUFFTIFY(1d3_) -#define FINUFFT2D1_ FINUFFTIFY(2d1_) -#define FINUFFT2D2_ FINUFFTIFY(2d2_) -#define FINUFFT2D3_ FINUFFTIFY(2d3_) -#define FINUFFT3D1_ FINUFFTIFY(3d1_) -#define FINUFFT3D2_ FINUFFTIFY(3d2_) -#define FINUFFT3D3_ FINUFFTIFY(3d3_) -#define FINUFFT1D1MANY_ FINUFFTIFY(1d1many_) -#define FINUFFT1D2MANY_ FINUFFTIFY(1d2many_) -#define FINUFFT1D3MANY_ FINUFFTIFY(1d3many_) -#define FINUFFT2D1MANY_ FINUFFTIFY(2d1many_) -#define FINUFFT2D2MANY_ FINUFFTIFY(2d2many_) -#define FINUFFT2D3MANY_ FINUFFTIFY(2d3many_) -#define FINUFFT3D1MANY_ FINUFFTIFY(3d1many_) -#define FINUFFT3D2MANY_ FINUFFTIFY(3d2many_) -#define FINUFFT3D3MANY_ FINUFFTIFY(3d3many_) +#define FINUFFT_MAKEPLAN_ FINUFFTIFY(_makeplan_) +#define FINUFFT_SETPTS_ FINUFFTIFY(_setpts_) +#define FINUFFT_EXECUTE_ FINUFFTIFY(_execute_) +#define FINUFFT_DESTROY_ FINUFFTIFY(_destroy_) +#define FINUFFT1D1_ FINUFFTIFY(1d1_) +#define FINUFFT1D2_ FINUFFTIFY(1d2_) +#define FINUFFT1D3_ FINUFFTIFY(1d3_) +#define FINUFFT2D1_ FINUFFTIFY(2d1_) +#define FINUFFT2D2_ FINUFFTIFY(2d2_) +#define FINUFFT2D3_ FINUFFTIFY(2d3_) +#define FINUFFT3D1_ FINUFFTIFY(3d1_) +#define FINUFFT3D2_ FINUFFTIFY(3d2_) +#define FINUFFT3D3_ FINUFFTIFY(3d3_) +#define FINUFFT1D1MANY_ FINUFFTIFY(1d1many_) +#define FINUFFT1D2MANY_ FINUFFTIFY(1d2many_) +#define FINUFFT1D3MANY_ FINUFFTIFY(1d3many_) +#define FINUFFT2D1MANY_ FINUFFTIFY(2d1many_) +#define FINUFFT2D2MANY_ FINUFFTIFY(2d2many_) +#define FINUFFT2D3MANY_ FINUFFTIFY(2d3many_) +#define FINUFFT3D1MANY_ FINUFFTIFY(3d1many_) +#define FINUFFT3D2MANY_ FINUFFTIFY(3d2many_) +#define FINUFFT3D3MANY_ FINUFFTIFY(3d3many_) #ifdef __cplusplus extern "C" { #endif - + // --------------------- guru interface from fortran ------------------------ -void FINUFFT_MAKEPLAN_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf, FLT *tol, FINUFFT_PLAN *plan, finufft_opts *o, int *ier) -{ +void FINUFFT_MAKEPLAN_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf, + FLT *tol, FINUFFT_PLAN *plan, finufft_opts *o, int *ier) { if (!plan) - fprintf(stderr,"%s fortran: plan must be allocated as at least the size of a C pointer (usually 8 bytes)!\n",__func__); + fprintf(stderr, + "%s fortran: plan must be allocated as at least the size of a C pointer " + "(usually 8 bytes)!\n", + __func__); else { // pass o whether it's a NULL or pointer to a fortran-allocated finufft_opts: *ier = FINUFFT_MAKEPLAN(*type, *n_dims, n_modes, *iflag, *n_transf, *tol, plan, o); } } -void FINUFFT_SETPTS_(FINUFFT_PLAN *plan, BIGINT *M, FLT *xj, FLT *yj, FLT *zj, BIGINT *nk, FLT *s, FLT *t, FLT *u, int *ier) -{ +void FINUFFT_SETPTS_(FINUFFT_PLAN *plan, BIGINT *M, FLT *xj, FLT *yj, FLT *zj, BIGINT *nk, + FLT *s, FLT *t, FLT *u, int *ier) { if (!*plan) { - fprintf(stderr,"%s fortran: finufft_plan unallocated!",__func__); + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); return; } - int nk_safe = 0; // catches the case where user passes NULL in - if (nk) - nk_safe = *nk; + int nk_safe = 0; // catches the case where user passes NULL in + if (nk) nk_safe = *nk; *ier = FINUFFT_SETPTS(*plan, *M, xj, yj, zj, nk_safe, s, t, u); } -void FINUFFT_EXECUTE_(FINUFFT_PLAN *plan, CPX *weights, CPX *result, int *ier) -{ +void FINUFFT_EXECUTE_(FINUFFT_PLAN *plan, CPX *weights, CPX *result, int *ier) { if (!plan) - fprintf(stderr,"%s fortran: finufft_plan unallocated!",__func__); + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); else *ier = FINUFFT_EXECUTE(*plan, weights, result); } -void FINUFFT_DESTROY_(FINUFFT_PLAN *plan, int *ier) -{ +void FINUFFT_DESTROY_(FINUFFT_PLAN *plan, int *ier) { if (!plan) - fprintf(stderr,"%s fortran: finufft_plan unallocated!",__func__); + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); else *ier = FINUFFT_DESTROY(*plan); } - // ------------ use FINUFFT to set the default options --------------------- // (Note the finufft_opts is created in f90-style derived types, not here) -void FINUFFT_DEFAULT_OPTS_(finufft_opts* o) -{ +void FINUFFT_DEFAULT_OPTS_(finufft_opts *o) { if (!o) - fprintf(stderr,"%s fortran: opts must be allocated!\n",__func__); + fprintf(stderr, "%s fortran: opts must be allocated!\n", __func__); else // o is a ptr to already-allocated fortran finufft_opts derived type... FINUFFT_DEFAULT_OPTS(o); } - // -------------- simple and many-vector interfaces -------------------- // --- 1D --- -void FINUFFT1D1_(BIGINT* nj, FLT* xj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT1D1(*nj,xj,cj,*iflag,*eps,*ms,fk,o); +void FINUFFT1D1_(BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, CPX *fk, + finufft_opts *o, int *ier) { + *ier = FINUFFT1D1(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D1MANY_(int* ntransf, - BIGINT* nj, FLT* xj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT1D1MANY(*ntransf,*nj,xj,cj,*iflag,*eps,*ms,fk,o); +void FINUFFT1D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, + BIGINT *ms, CPX *fk, finufft_opts *o, int *ier) { + *ier = FINUFFT1D1MANY(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D2_(BIGINT* nj, FLT* xj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT1D2(*nj,xj,cj,*iflag,*eps,*ms,fk,o); +void FINUFFT1D2_(BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, CPX *fk, + finufft_opts *o, int *ier) { + *ier = FINUFFT1D2(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D2MANY_(int* ntransf, - BIGINT* nj, FLT* xj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT1D2MANY(*ntransf,*nj,xj,cj,*iflag,*eps,*ms,fk,o); +void FINUFFT1D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, + BIGINT *ms, CPX *fk, finufft_opts *o, int *ier) { + *ier = FINUFFT1D2MANY(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D3_(BIGINT* nj, FLT* x, CPX* c, int* iflag, FLT* eps, - BIGINT* nk, FLT* s, CPX* f, finufft_opts* o, int* ier) -{ - *ier = FINUFFT1D3(*nj,x,c,*iflag,*eps,*nk,s,f,o); +void FINUFFT1D3_(BIGINT *nj, FLT *x, CPX *c, int *iflag, FLT *eps, BIGINT *nk, FLT *s, + CPX *f, finufft_opts *o, int *ier) { + *ier = FINUFFT1D3(*nj, x, c, *iflag, *eps, *nk, s, f, o); } -void FINUFFT1D3MANY_(int* ntransf, - BIGINT* nj, FLT* x, CPX* c, int* iflag, FLT* eps, - BIGINT* nk, FLT* s, CPX* f, finufft_opts* o, int* ier) -{ - *ier = FINUFFT1D3MANY(*ntransf,*nj,x,c,*iflag,*eps,*nk,s,f,o); +void FINUFFT1D3MANY_(int *ntransf, BIGINT *nj, FLT *x, CPX *c, int *iflag, FLT *eps, + BIGINT *nk, FLT *s, CPX *f, finufft_opts *o, int *ier) { + *ier = FINUFFT1D3MANY(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o); } // --- 2D --- -void FINUFFT2D1_(BIGINT* nj, FLT* xj, FLT* yj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT2D1(*nj,xj,yj,cj,*iflag,*eps,*ms,*mt,fk,o); +void FINUFFT2D1_(BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, + BIGINT *mt, CPX *fk, finufft_opts *o, int *ier) { + *ier = FINUFFT2D1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void FINUFFT2D1MANY_(int* ntransf, - BIGINT* nj, FLT* xj, FLT* yj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT2D1MANY(*ntransf,*nj,xj,yj,cj,*iflag,*eps,*ms,*mt,fk,o); +void FINUFFT2D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, + FLT *eps, BIGINT *ms, BIGINT *mt, CPX *fk, finufft_opts *o, + int *ier) { + *ier = FINUFFT2D1MANY(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void FINUFFT2D2_(BIGINT* nj, FLT* xj, FLT* yj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT2D2(*nj,xj,yj,cj,*iflag,*eps,*ms,*mt,fk,o); +void FINUFFT2D2_(BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, + BIGINT *mt, CPX *fk, finufft_opts *o, int *ier) { + *ier = FINUFFT2D2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void FINUFFT2D2MANY_(int* ntransf, - BIGINT* nj, FLT* xj, FLT* yj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT2D2MANY(*ntransf,*nj,xj,yj,cj,*iflag,*eps,*ms,*mt,fk,o); +void FINUFFT2D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, + FLT *eps, BIGINT *ms, BIGINT *mt, CPX *fk, finufft_opts *o, + int *ier) { + *ier = FINUFFT2D2MANY(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void FINUFFT2D3_(BIGINT* nj, FLT* x, FLT* y, CPX* c, int* iflag, FLT* eps, - BIGINT* nk, FLT* s, FLT* t, CPX* f, finufft_opts* o, int* ier) -{ - *ier = FINUFFT2D3(*nj,x,y,c,*iflag,*eps,*nk,s,t,f,o); +void FINUFFT2D3_(BIGINT *nj, FLT *x, FLT *y, CPX *c, int *iflag, FLT *eps, BIGINT *nk, + FLT *s, FLT *t, CPX *f, finufft_opts *o, int *ier) { + *ier = FINUFFT2D3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); } -void FINUFFT2D3MANY_(int* ntransf, - BIGINT* nj, FLT* x, FLT* y, CPX* c, int* iflag, FLT* eps, - BIGINT* nk, FLT* s, FLT* t, CPX* f, finufft_opts* o, int* ier) -{ - *ier = FINUFFT2D3MANY(*ntransf,*nj,x,y,c,*iflag,*eps,*nk,s,t,f,o); +void FINUFFT2D3MANY_(int *ntransf, BIGINT *nj, FLT *x, FLT *y, CPX *c, int *iflag, + FLT *eps, BIGINT *nk, FLT *s, FLT *t, CPX *f, finufft_opts *o, + int *ier) { + *ier = FINUFFT2D3MANY(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); } // --- 3D --- -void FINUFFT3D1_(BIGINT* nj, FLT* xj, FLT* yj, FLT* zj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, BIGINT* mu, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT3D1(*nj,xj,yj,zj,cj,*iflag,*eps,*ms,*mt,*mu,fk,o); +void FINUFFT3D1_(BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int *iflag, FLT *eps, + BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, finufft_opts *o, int *ier) { + *ier = FINUFFT3D1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D1MANY_(int* ntransf, - BIGINT* nj, FLT* xj, FLT* yj, FLT* zj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, BIGINT* mu, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT3D1MANY(*ntransf,*nj,xj,yj,zj,cj,*iflag,*eps,*ms,*mt,*mu,fk,o); +void FINUFFT3D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, + int *iflag, FLT *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, + finufft_opts *o, int *ier) { + *ier = + FINUFFT3D1MANY(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D2_(BIGINT* nj, FLT* xj, FLT* yj, FLT* zj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, BIGINT* mu, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT3D2(*nj,xj,yj,zj,cj,*iflag,*eps,*ms,*mt,*mu,fk,o); +void FINUFFT3D2_(BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int *iflag, FLT *eps, + BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, finufft_opts *o, int *ier) { + *ier = FINUFFT3D2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D2MANY_(int* ntransf, - BIGINT* nj, FLT* xj, FLT* yj, FLT* zj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, BIGINT* mu, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT3D2MANY(*ntransf,*nj,xj,yj,zj,cj,*iflag,*eps,*ms,*mt,*mu,fk,o); +void FINUFFT3D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, + int *iflag, FLT *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, + finufft_opts *o, int *ier) { + *ier = + FINUFFT3D2MANY(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D3_(BIGINT* nj, FLT* x, FLT* y, FLT* z, CPX* c, int* iflag, FLT* eps, - BIGINT* nk, FLT* s, FLT* t, FLT* u, CPX* f, finufft_opts* o, int* ier) -{ - *ier = FINUFFT3D3(*nj,x,y,z,c,*iflag,*eps,*nk,s,t,u,f,o); +void FINUFFT3D3_(BIGINT *nj, FLT *x, FLT *y, FLT *z, CPX *c, int *iflag, FLT *eps, + BIGINT *nk, FLT *s, FLT *t, FLT *u, CPX *f, finufft_opts *o, int *ier) { + *ier = FINUFFT3D3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } -void FINUFFT3D3MANY_(int* ntransf, - BIGINT* nj, FLT* x, FLT* y, FLT* z, CPX* c, int* iflag, FLT* eps, - BIGINT* nk, FLT* s, FLT* t, FLT* u, CPX* f, finufft_opts* o, int* ier) -{ - *ier = FINUFFT3D3MANY(*ntransf,*nj,x,y,z,c,*iflag,*eps,*nk,s,t,u,f,o); +void FINUFFT3D3MANY_(int *ntransf, BIGINT *nj, FLT *x, FLT *y, FLT *z, CPX *c, int *iflag, + FLT *eps, BIGINT *nk, FLT *s, FLT *t, FLT *u, CPX *f, + finufft_opts *o, int *ier) { + *ier = FINUFFT3D3MANY(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } - #ifdef __cplusplus } #endif diff --git a/include/cufinufft.h b/include/cufinufft.h index 3c498fed0..b323d94c0 100644 --- a/include/cufinufft.h +++ b/include/cufinufft.h @@ -14,15 +14,15 @@ extern "C" { #endif void cufinufft_default_opts(cufinufft_opts *opts); -int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, double eps, - cufinufft_plan *d_plan_ptr, cufinufft_opts *opts); -int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, float eps, - cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts); - -int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, int N, double *d_s, - double *d_t, double *d_u); -int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, int N, float *d_s, - float *d_t, float *d_u); +int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, + double eps, cufinufft_plan *d_plan_ptr, cufinufft_opts *opts); +int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, + float eps, cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts); + +int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, + int N, double *d_s, double *d_t, double *d_u); +int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, + int N, float *d_s, float *d_t, float *d_u); int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, cuDoubleComplex *d_fk); int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_c, cuFloatComplex *d_fk); diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h index 3ea437448..7bddc188e 100644 --- a/include/cufinufft/common.h +++ b/include/cufinufft/common.h @@ -10,24 +10,27 @@ namespace cufinufft { namespace common { -template -__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, cuDoubleComplex *a, T *fwkerhalf1, - T *fwkerhalf2, T *fwkerhalf3, int ns); -template -int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, cuDoubleComplex *d_a, T *d_fwkerhalf1, - T *d_fwkerhalf2, T *d_fwkerhalf3, int ns, cudaStream_t stream); -template +template +__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, + cuDoubleComplex *a, T *fwkerhalf1, T *fwkerhalf2, + T *fwkerhalf3, int ns); +template +int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, + cuDoubleComplex *d_a, T *d_fwkerhalf1, T *d_fwkerhalf2, + T *d_fwkerhalf3, int ns, cudaStream_t stream); +template int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts); -void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts, CUFINUFFT_BIGINT *nf, - CUFINUFFT_BIGINT b); -template +void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts, + CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT b); +template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts); -template -void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex *a, finufft_spread_opts opts); -template -void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, T *fwkerhalf, +template +void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex *a, finufft_spread_opts opts); +template +void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, + T *fwkerhalf, finufft_spread_opts opts); } // namespace common } // namespace cufinufft diff --git a/include/cufinufft/contrib/helper_cuda.h b/include/cufinufft/contrib/helper_cuda.h index 69dad3b86..3f3f931c6 100644 --- a/include/cufinufft/contrib/helper_cuda.h +++ b/include/cufinufft/contrib/helper_cuda.h @@ -37,95 +37,97 @@ #include -static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); } +static const char *_cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} // This will output the proper CUDA error strings in the event // that a CUDA host call returns an error #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) -#define RETURN_IF_CUDA_ERROR \ - { \ - cudaError_t err = cudaGetLastError(); \ - if (err != cudaSuccess) { \ - printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \ - return FINUFFT_ERR_CUDA_FAILURE; \ - } \ - } - -#define CUDA_FREE_AND_NULL(val, stream) \ - { \ - if (val != nullptr) { \ - check(cudaFreeAsync(val, stream), #val, __FILE__, __LINE__); \ - val = nullptr; \ - } \ - } +#define RETURN_IF_CUDA_ERROR \ + { \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess) { \ + printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \ + return FINUFFT_ERR_CUDA_FAILURE; \ + } \ + } + +#define CUDA_FREE_AND_NULL(val, stream) \ + { \ + if (val != nullptr) { \ + check(cudaFreeAsync(val, stream), #val, __FILE__, __LINE__); \ + val = nullptr; \ + } \ + } static const char *cufftGetErrorString(cufftResult error) { - switch (error) { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; - case CUFFT_INCOMPLETE_PARAMETER_LIST: - return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; - case CUFFT_INVALID_DEVICE: - return "CUFFT_INVALID_DEVICE"; + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; - case CUFFT_PARSE_ERROR: - return "CUFFT_PARSE_ERROR"; + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; - case CUFFT_NO_WORKSPACE: - return "CUFFT_NO_WORKSPACE"; + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; - case CUFFT_NOT_IMPLEMENTED: - return "CUFFT_NOT_IMPLEMENTED"; + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; - case CUFFT_LICENSE_ERROR: - return "CUFFT_LICENSE_ERROR"; + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; - case CUFFT_NOT_SUPPORTED: - return "CUFFT_NOT_SUPPORTED"; - } + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } - return ""; + return ""; } -template +template int check(T result, char const *const func, const char *const file, int const line) { - if (result) { - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), - _cudaGetErrorEnum(result), func); - return FINUFFT_ERR_CUDA_FAILURE; - } + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), _cudaGetErrorEnum(result), func); + return FINUFFT_ERR_CUDA_FAILURE; + } - return 0; + return 0; } #endif // COMMON_HELPER_CUDA_H_ diff --git a/include/cufinufft/cudeconvolve.h b/include/cufinufft/cudeconvolve.h index 4daa4767e..ed701ed28 100644 --- a/include/cufinufft/cudeconvolve.h +++ b/include/cufinufft/cudeconvolve.h @@ -5,30 +5,33 @@ namespace cufinufft { namespace deconvolve { -template -__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1); -template -__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex *fw, cuda_complex *fk, T *fwkerhalf2); -template -__global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex *fw, cuda_complex *fk, - T *fwkerhalf1, T *fwkerhalf2); -template -__global__ void amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex *fw, cuda_complex *fk, - T *fwkerhalf1, T *fwkerhalf2); +template +__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex *fw, + cuda_complex *fk, T *fwkerhalf1); +template +__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex *fw, + cuda_complex *fk, T *fwkerhalf2); +template +__global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width, + cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1, + T *fwkerhalf2); +template +__global__ void amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width, + cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1, + T *fwkerhalf2); -template -__global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex *fw, - cuda_complex *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); -template -__global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex *fw, - cuda_complex *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); +template +__global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, + int fw_width, cuda_complex *fw, cuda_complex *fk, + T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); +template +__global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, + int fw_width, cuda_complex *fw, cuda_complex *fk, + T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); -template -int cudeconvolve1d(cufinufft_plan_t *d_mem, int blksize); -template -int cudeconvolve2d(cufinufft_plan_t *d_mem, int blksize); -template -int cudeconvolve3d(cufinufft_plan_t *d_mem, int blksize); +template int cudeconvolve1d(cufinufft_plan_t *d_mem, int blksize); +template int cudeconvolve2d(cufinufft_plan_t *d_mem, int blksize); +template int cudeconvolve3d(cufinufft_plan_t *d_mem, int blksize); } // namespace deconvolve } // namespace cufinufft #endif diff --git a/include/cufinufft/defs.h b/include/cufinufft/defs.h index 6cdb84340..6b2a075ea 100644 --- a/include/cufinufft/defs.h +++ b/include/cufinufft/defs.h @@ -4,11 +4,12 @@ #include // constants needed within common -// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for common +// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for +// common #define MAX_NSPREAD 16 // max number of positive quadr nodes -#define MAX_NQUAD 100 +#define MAX_NQUAD 100 // FIXME: If cufft ever takes N > INT_MAX... constexpr int32_t MAX_NF = std::numeric_limits::max(); @@ -18,16 +19,16 @@ constexpr int32_t MAX_NF = std::numeric_limits::max(); #ifdef _OPENMP #include // point to actual omp utils -#define MY_OMP_GET_NUM_THREADS() omp_get_num_threads() -#define MY_OMP_GET_MAX_THREADS() omp_get_max_threads() -#define MY_OMP_GET_THREAD_NUM() omp_get_thread_num() +#define MY_OMP_GET_NUM_THREADS() omp_get_num_threads() +#define MY_OMP_GET_MAX_THREADS() omp_get_max_threads() +#define MY_OMP_GET_THREAD_NUM() omp_get_thread_num() #define MY_OMP_SET_NUM_THREADS(x) omp_set_num_threads(x) -#define MY_OMP_SET_NESTED(x) omp_set_nested(x) +#define MY_OMP_SET_NESTED(x) omp_set_nested(x) #else // non-omp safe dummy versions of omp utils #define MY_OMP_GET_NUM_THREADS() 1 #define MY_OMP_GET_MAX_THREADS() 1 -#define MY_OMP_GET_THREAD_NUM() 0 +#define MY_OMP_GET_THREAD_NUM() 0 #define MY_OMP_SET_NUM_THREADS(x) #define MY_OMP_SET_NESTED(x) #endif diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 34b969b46..3b8d3db2c 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -16,255 +16,269 @@ #include // 1d -template -int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template -int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); +template +int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); +template +int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); // 2d -template -int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template -int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); +template +int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); +template +int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); // 3d -template -int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template -int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); +template +int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); +template +int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); static void cufinufft_setup_binsize(int type, int dim, cufinufft_opts *opts) { - switch (dim) { - case 1: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 1024 : opts->gpu_binsizex; - opts->gpu_binsizey = 1; - opts->gpu_binsizez = 1; - } break; + switch (dim) { + case 1: { + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 1024 : opts->gpu_binsizex; + opts->gpu_binsizey = 1; + opts->gpu_binsizez = 1; + } break; + case 2: { + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey; + opts->gpu_binsizez = 1; + } break; + case 3: { + switch (opts->gpu_method) { + case 1: case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey; - opts->gpu_binsizez = 1; + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 16 : opts->gpu_binsizey; + opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez; } break; - case 3: { - switch (opts->gpu_method) { - case 1: - case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 16 : opts->gpu_binsizey; - opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez; - } break; - case 4: { - opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex; - opts->gpu_obinsizey = (opts->gpu_obinsizey < 0) ? 8 : opts->gpu_obinsizey; - opts->gpu_obinsizez = (opts->gpu_obinsizez < 0) ? 8 : opts->gpu_obinsizez; - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 4 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 4 : opts->gpu_binsizey; - opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 4 : opts->gpu_binsizez; - } break; - } + case 4: { + opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex; + opts->gpu_obinsizey = (opts->gpu_obinsizey < 0) ? 8 : opts->gpu_obinsizey; + opts->gpu_obinsizez = (opts->gpu_obinsizez < 0) ? 8 : opts->gpu_obinsizez; + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 4 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 4 : opts->gpu_binsizey; + opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 4 : opts->gpu_binsizez; } break; } + } break; + } } -template +template int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntransf, T tol, cufinufft_plan_t **d_plan_ptr, cufinufft_opts *opts) { - /* - "plan" stage (in single or double precision). - See ../docs/cppdoc.md for main user-facing documentation. - Note that *d_plan_ptr in the args list was called simply *plan there. - This is the remaining dev-facing doc: - - This performs: - (0) creating a new plan struct (d_plan), a pointer to which is passed - back by writing that pointer into *d_plan_ptr. - (1) set up the spread option, d_plan.spopts. - (2) calculate the correction factor on cpu, copy the value from cpu to - gpu - (3) allocate gpu arrays with size determined by number of fourier modes - and method related options that had been set in d_plan.opts - (4) call cufftPlanMany and save the cufft plan inside cufinufft plan - Variables and arrays inside the plan struct are set and allocated. - - Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21. - */ - int ier; - cuDoubleComplex *d_a = nullptr; // fseries temp data - T *d_f = nullptr; // fseries temp data - - if (type < 1 || type > 2) { - fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type); - return FINUFFT_ERR_TYPE_NOTVALID; - } - if (ntransf < 1) { - fprintf(stderr, "[%s] Invalid ntransf (%d): should be at least 1.\n", __func__, ntransf); - return FINUFFT_ERR_NTRANS_NOTVALID; - } - - // Mult-GPU support: set the CUDA Device ID: - const int device_id = opts == NULL ? 0 : opts->gpu_device_id; - cufinufft::utils::WithCudaDevice device_swapper(device_id); - - /* allocate the plan structure, assign address to user pointer. */ - cufinufft_plan_t *d_plan = new cufinufft_plan_t; - *d_plan_ptr = d_plan; - // Zero out your struct, (sets all pointers to NULL) - memset(d_plan, 0, sizeof(*d_plan)); - - /* If a user has not supplied their own options, assign defaults for them. */ - if (opts == NULL) { // use default opts - cufinufft_default_opts(&(d_plan->opts)); - } else { // or read from what's passed in - d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect - } - - auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; - - /* Automatically set GPU method. */ - if (d_plan->opts.gpu_method == 0) { - /* For type 1, we default to method 2 (SM) since this is generally faster. - * However, in the special case of _double precision_ in _three dimensions_ - * with more than _three digits of precision_, there is note enough shared - * memory for this to work. As a result, we will default to method 1 (GM) in - * this special case. - * - * For type 2, we always default to method 1 (GM). */ - if (type == 1 && (sizeof(T) == 4 || dim < 3 || tol >= 1e-3)) - d_plan->opts.gpu_method = 2; - else if (type == 1 && tol < 1e-3) - d_plan->opts.gpu_method = 1; - else if (type == 2) - d_plan->opts.gpu_method = 1; - } - - /* Setup Spreader */ - using namespace cufinufft::common; - // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK - if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { - delete *d_plan_ptr; - *d_plan_ptr = nullptr; - return ier; - } - - d_plan->dim = dim; - d_plan->ms = nmodes[0]; - d_plan->mt = nmodes[1]; - d_plan->mu = nmodes[2]; - - cufinufft_setup_binsize(type, dim, &d_plan->opts); - CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; - set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, d_plan->opts.gpu_obinsizex); + /* + "plan" stage (in single or double precision). + See ../docs/cppdoc.md for main user-facing documentation. + Note that *d_plan_ptr in the args list was called simply *plan there. + This is the remaining dev-facing doc: + + This performs: + (0) creating a new plan struct (d_plan), a pointer to which is passed + back by writing that pointer into *d_plan_ptr. + (1) set up the spread option, d_plan.spopts. + (2) calculate the correction factor on cpu, copy the value from cpu to + gpu + (3) allocate gpu arrays with size determined by number of fourier modes + and method related options that had been set in d_plan.opts + (4) call cufftPlanMany and save the cufft plan inside cufinufft plan + Variables and arrays inside the plan struct are set and allocated. + + Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21. + */ + int ier; + cuDoubleComplex *d_a = nullptr; // fseries temp data + T *d_f = nullptr; // fseries temp data + + if (type < 1 || type > 2) { + fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type); + return FINUFFT_ERR_TYPE_NOTVALID; + } + if (ntransf < 1) { + fprintf(stderr, "[%s] Invalid ntransf (%d): should be at least 1.\n", __func__, + ntransf); + return FINUFFT_ERR_NTRANS_NOTVALID; + } + + // Mult-GPU support: set the CUDA Device ID: + const int device_id = opts == NULL ? 0 : opts->gpu_device_id; + cufinufft::utils::WithCudaDevice device_swapper(device_id); + + /* allocate the plan structure, assign address to user pointer. */ + cufinufft_plan_t *d_plan = new cufinufft_plan_t; + *d_plan_ptr = d_plan; + // Zero out your struct, (sets all pointers to NULL) + memset(d_plan, 0, sizeof(*d_plan)); + + /* If a user has not supplied their own options, assign defaults for them. */ + if (opts == NULL) { // use default opts + cufinufft_default_opts(&(d_plan->opts)); + } else { // or read from what's passed in + d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect + } + + auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; + + /* Automatically set GPU method. */ + if (d_plan->opts.gpu_method == 0) { + /* For type 1, we default to method 2 (SM) since this is generally faster. + * However, in the special case of _double precision_ in _three dimensions_ + * with more than _three digits of precision_, there is note enough shared + * memory for this to work. As a result, we will default to method 1 (GM) in + * this special case. + * + * For type 2, we always default to method 1 (GM). */ + if (type == 1 && (sizeof(T) == 4 || dim < 3 || tol >= 1e-3)) + d_plan->opts.gpu_method = 2; + else if (type == 1 && tol < 1e-3) + d_plan->opts.gpu_method = 1; + else if (type == 2) + d_plan->opts.gpu_method = 1; + } + + /* Setup Spreader */ + using namespace cufinufft::common; + // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK + if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { + delete *d_plan_ptr; + *d_plan_ptr = nullptr; + return ier; + } + + d_plan->dim = dim; + d_plan->ms = nmodes[0]; + d_plan->mt = nmodes[1]; + d_plan->mu = nmodes[2]; + + cufinufft_setup_binsize(type, dim, &d_plan->opts); + CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; + set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, + d_plan->opts.gpu_obinsizex); + if (dim > 1) + set_nf_type12(d_plan->mt, d_plan->opts, d_plan->spopts, &nf2, + d_plan->opts.gpu_obinsizey); + if (dim > 2) + set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, + d_plan->opts.gpu_obinsizez); + int fftsign = (iflag >= 0) ? 1 : -1; + + d_plan->nf1 = nf1; + d_plan->nf2 = nf2; + d_plan->nf3 = nf3; + d_plan->iflag = fftsign; + d_plan->ntransf = ntransf; + int maxbatchsize = opts ? opts->gpu_maxbatchsize : 0; + if (maxbatchsize == 0) // implies: use a heuristic. + maxbatchsize = std::min(ntransf, 8); // heuristic from test codes + d_plan->maxbatchsize = maxbatchsize; + d_plan->type = type; + + if (d_plan->type == 1) d_plan->spopts.spread_direction = 1; + if (d_plan->type == 2) d_plan->spopts.spread_direction = 2; + + using namespace cufinufft::memtransfer; + switch (d_plan->dim) { + case 1: { + if ((ier = allocgpumem1d_plan(d_plan))) goto finalize; + } break; + case 2: { + if ((ier = allocgpumem2d_plan(d_plan))) goto finalize; + } break; + case 3: { + if ((ier = allocgpumem3d_plan(d_plan))) goto finalize; + } break; + } + + cufftHandle fftplan; + cufftResult_t cufft_status; + switch (d_plan->dim) { + case 1: { + int n[] = {(int)nf1}; + int inembed[] = {(int)nf1}; + + cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, + inembed[0], cufft_type(), maxbatchsize); + } break; + case 2: { + int n[] = {(int)nf2, (int)nf1}; + int inembed[] = {(int)nf2, (int)nf1}; + + cufft_status = + cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, + inembed[0] * inembed[1], cufft_type(), maxbatchsize); + } break; + case 3: { + int n[] = {(int)nf3, (int)nf2, (int)nf1}; + int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; + + cufft_status = cufftPlanMany( + &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, + inembed[0] * inembed[1] * inembed[2], cufft_type(), maxbatchsize); + } break; + } + + if (cufft_status != CUFFT_SUCCESS) { + fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, + cufftGetErrorString(cufft_status)); + ier = FINUFFT_ERR_CUDA_FAILURE; + goto finalize; + } + cufftSetStream(fftplan, stream); + + d_plan->fftplan = fftplan; + { + std::complex *a = d_plan->fseries_precomp_a; + T *f = d_plan->fseries_precomp_f; + + onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts); if (dim > 1) - set_nf_type12(d_plan->mt, d_plan->opts, d_plan->spopts, &nf2, d_plan->opts.gpu_obinsizey); + onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts); if (dim > 2) - set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, d_plan->opts.gpu_obinsizez); - int fftsign = (iflag >= 0) ? 1 : -1; - - d_plan->nf1 = nf1; - d_plan->nf2 = nf2; - d_plan->nf3 = nf3; - d_plan->iflag = fftsign; - d_plan->ntransf = ntransf; - int maxbatchsize = opts ? opts->gpu_maxbatchsize : 0; - if (maxbatchsize == 0) // implies: use a heuristic. - maxbatchsize = std::min(ntransf, 8); // heuristic from test codes - d_plan->maxbatchsize = maxbatchsize; - d_plan->type = type; - - if (d_plan->type == 1) - d_plan->spopts.spread_direction = 1; - if (d_plan->type == 2) - d_plan->spopts.spread_direction = 2; - - using namespace cufinufft::memtransfer; - switch (d_plan->dim) { - case 1: { - if ((ier = allocgpumem1d_plan(d_plan))) - goto finalize; - } break; - case 2: { - if ((ier = allocgpumem2d_plan(d_plan))) - goto finalize; - } break; - case 3: { - if ((ier = allocgpumem3d_plan(d_plan))) - goto finalize; - } break; - } - - cufftHandle fftplan; - cufftResult_t cufft_status; - switch (d_plan->dim) { - case 1: { - int n[] = {(int)nf1}; - int inembed[] = {(int)nf1}; - - cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, inembed[0], cufft_type(), - maxbatchsize); - } break; - case 2: { - int n[] = {(int)nf2, (int)nf1}; - int inembed[] = {(int)nf2, (int)nf1}; - - cufft_status = cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, - inembed[0] * inembed[1], cufft_type(), maxbatchsize); - } break; - case 3: { - int n[] = {(int)nf3, (int)nf2, (int)nf1}; - int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; - - cufft_status = cufftPlanMany(&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, - inembed[0] * inembed[1] * inembed[2], cufft_type(), maxbatchsize); - } break; - } - - if (cufft_status != CUFFT_SUCCESS) { - fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, cufftGetErrorString(cufft_status)); - ier = FINUFFT_ERR_CUDA_FAILURE; - goto finalize; - } - cufftSetStream(fftplan, stream); - - d_plan->fftplan = fftplan; - { - std::complex *a = d_plan->fseries_precomp_a; - T *f = d_plan->fseries_precomp_f; - - onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts); - if (dim > 1) - onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts); - if (dim > 2) - onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, d_plan->spopts); - - if ((ier = checkCudaErrors(cudaMallocAsync(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_f, dim * MAX_NQUAD * sizeof(T), stream)))) - goto finalize; - if ((ier = checkCudaErrors( - cudaMemcpyAsync(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice, stream)))) - goto finalize; - if ((ier = - checkCudaErrors(cudaMemcpyAsync(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice, stream)))) - goto finalize; - if ((ier = cufserieskernelcompute(d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1, d_plan->fwkerhalf2, - d_plan->fwkerhalf3, d_plan->spopts.nspread, stream))) - goto finalize; - } + onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, + d_plan->spopts); + + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream)))) + goto finalize; + if ((ier = + checkCudaErrors(cudaMallocAsync(&d_f, dim * MAX_NQUAD * sizeof(T), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMemcpyAsync(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice, stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMemcpyAsync(d_f, f, dim * MAX_NQUAD * sizeof(T), + cudaMemcpyHostToDevice, stream)))) + goto finalize; + if ((ier = cufserieskernelcompute( + d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1, d_plan->fwkerhalf2, + d_plan->fwkerhalf3, d_plan->spopts.nspread, stream))) + goto finalize; + } finalize: - cudaFreeAsync(d_a, stream); - cudaFreeAsync(d_f, stream); + cudaFreeAsync(d_a, stream); + cudaFreeAsync(d_f, stream); - if (ier > 1) { - delete *d_plan_ptr; - *d_plan_ptr = nullptr; - } + if (ier > 1) { + delete *d_plan_ptr; + *d_plan_ptr = nullptr; + } - return ier; + return ier; } -template -int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u, cufinufft_plan_t *d_plan) +template +int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u, + cufinufft_plan_t *d_plan) /* "setNUpts" stage (in single or double precision). @@ -302,66 +316,78 @@ Notes: the type T means either single or double, matching the Melody Shih 07/25/19; Barnett 2/16/21 moved out docs. */ { - cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nf3 = d_plan->nf3; - int dim = d_plan->dim; - - d_plan->M = M; - - using namespace cufinufft::memtransfer; - int ier; - switch (d_plan->dim) { - case 1: { - ier = allocgpumem1d_nupts(d_plan); - } break; - case 2: { - ier = allocgpumem2d_nupts(d_plan); - } break; - case 3: { - ier = allocgpumem3d_nupts(d_plan); - } break; - } - if (ier) - return ier; - - d_plan->kx = d_kx; - if (dim > 1) - d_plan->ky = d_ky; - if (dim > 2) - d_plan->kz = d_kz; - - using namespace cufinufft::spreadinterp; - switch (d_plan->dim) { - case 1: { - if (d_plan->opts.gpu_method == 1 && (ier = cuspread1d_nuptsdriven_prop(nf1, M, d_plan))) - fprintf(stderr, "error: cuspread1d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method); - if (d_plan->opts.gpu_method == 2 && (ier = cuspread1d_subprob_prop(nf1, M, d_plan))) - fprintf(stderr, "error: cuspread1d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method); - } break; - case 2: { - if (d_plan->opts.gpu_method == 1 && (ier = cuspread2d_nuptsdriven_prop(nf1, nf2, M, d_plan))) - fprintf(stderr, "error: cuspread2d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method); - if (d_plan->opts.gpu_method == 2 && (ier = cuspread2d_subprob_prop(nf1, nf2, M, d_plan))) - fprintf(stderr, "error: cuspread2d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method); - } break; - case 3: { - if (d_plan->opts.gpu_method == 1 && (ier = cuspread3d_nuptsdriven_prop(nf1, nf2, nf3, M, d_plan))) - fprintf(stderr, "error: cuspread3d_nuptsdriven_prop, method(%d)\n", d_plan->opts.gpu_method); - if (d_plan->opts.gpu_method == 2 && (ier = cuspread3d_subprob_prop(nf1, nf2, nf3, M, d_plan))) - fprintf(stderr, "error: cuspread3d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method); - if (d_plan->opts.gpu_method == 4 && (ier = cuspread3d_blockgather_prop(nf1, nf2, nf3, M, d_plan))) - fprintf(stderr, "error: cuspread3d_blockgather_prop, method(%d)\n", d_plan->opts.gpu_method); - } break; - } - - return ier; + cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int nf3 = d_plan->nf3; + int dim = d_plan->dim; + + d_plan->M = M; + + using namespace cufinufft::memtransfer; + int ier; + switch (d_plan->dim) { + case 1: { + ier = allocgpumem1d_nupts(d_plan); + } break; + case 2: { + ier = allocgpumem2d_nupts(d_plan); + } break; + case 3: { + ier = allocgpumem3d_nupts(d_plan); + } break; + } + if (ier) return ier; + + d_plan->kx = d_kx; + if (dim > 1) d_plan->ky = d_ky; + if (dim > 2) d_plan->kz = d_kz; + + using namespace cufinufft::spreadinterp; + switch (d_plan->dim) { + case 1: { + if (d_plan->opts.gpu_method == 1 && + (ier = cuspread1d_nuptsdriven_prop(nf1, M, d_plan))) + fprintf(stderr, "error: cuspread1d_nupts_prop, method(%d)\n", + d_plan->opts.gpu_method); + if (d_plan->opts.gpu_method == 2 && + (ier = cuspread1d_subprob_prop(nf1, M, d_plan))) + fprintf(stderr, "error: cuspread1d_subprob_prop, method(%d)\n", + d_plan->opts.gpu_method); + } break; + case 2: { + if (d_plan->opts.gpu_method == 1 && + (ier = cuspread2d_nuptsdriven_prop(nf1, nf2, M, d_plan))) + fprintf(stderr, "error: cuspread2d_nupts_prop, method(%d)\n", + d_plan->opts.gpu_method); + if (d_plan->opts.gpu_method == 2 && + (ier = cuspread2d_subprob_prop(nf1, nf2, M, d_plan))) + fprintf(stderr, "error: cuspread2d_subprob_prop, method(%d)\n", + d_plan->opts.gpu_method); + } break; + case 3: { + if (d_plan->opts.gpu_method == 1 && + (ier = cuspread3d_nuptsdriven_prop(nf1, nf2, nf3, M, d_plan))) + fprintf(stderr, "error: cuspread3d_nuptsdriven_prop, method(%d)\n", + d_plan->opts.gpu_method); + if (d_plan->opts.gpu_method == 2 && + (ier = cuspread3d_subprob_prop(nf1, nf2, nf3, M, d_plan))) + fprintf(stderr, "error: cuspread3d_subprob_prop, method(%d)\n", + d_plan->opts.gpu_method); + if (d_plan->opts.gpu_method == 4 && + (ier = cuspread3d_blockgather_prop(nf1, nf2, nf3, M, d_plan))) + fprintf(stderr, "error: cuspread3d_blockgather_prop, method(%d)\n", + d_plan->opts.gpu_method); + } break; + } + + return ier; } -template -int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* "exec" stage (single and double precision versions). @@ -377,53 +403,47 @@ int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinuff Type 2; output for Type 1) Notes: - i) Here CUFINUFFT_CPX is a defined type meaning either complex or complex - to match the precision of the library called. - ii) All operations are done on the GPU device (hence the d_* names) + i) Here CUFINUFFT_CPX is a defined type meaning either complex or + complex to match the precision of the library called. ii) All operations are + done on the GPU device (hence the d_* names) Melody Shih 07/25/19; Barnett 2/16/21. */ { - cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - int ier; - int type = d_plan->type; - switch (d_plan->dim) { - case 1: { - if (type == 1) - ier = cufinufft1d1_exec(d_c, d_fk, d_plan); - if (type == 2) - ier = cufinufft1d2_exec(d_c, d_fk, d_plan); - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = FINUFFT_ERR_TYPE_NOTVALID; - } - } break; - case 2: { - if (type == 1) - ier = cufinufft2d1_exec(d_c, d_fk, d_plan); - if (type == 2) - ier = cufinufft2d2_exec(d_c, d_fk, d_plan); - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = FINUFFT_ERR_TYPE_NOTVALID; - } - } break; - case 3: { - if (type == 1) - ier = cufinufft3d1_exec(d_c, d_fk, d_plan); - if (type == 2) - ier = cufinufft3d2_exec(d_c, d_fk, d_plan); - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = FINUFFT_ERR_TYPE_NOTVALID; - } - } break; + cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + int ier; + int type = d_plan->type; + switch (d_plan->dim) { + case 1: { + if (type == 1) ier = cufinufft1d1_exec(d_c, d_fk, d_plan); + if (type == 2) ier = cufinufft1d2_exec(d_c, d_fk, d_plan); + if (type == 3) { + std::cerr << "Not Implemented yet" << std::endl; + ier = FINUFFT_ERR_TYPE_NOTVALID; } + } break; + case 2: { + if (type == 1) ier = cufinufft2d1_exec(d_c, d_fk, d_plan); + if (type == 2) ier = cufinufft2d2_exec(d_c, d_fk, d_plan); + if (type == 3) { + std::cerr << "Not Implemented yet" << std::endl; + ier = FINUFFT_ERR_TYPE_NOTVALID; + } + } break; + case 3: { + if (type == 1) ier = cufinufft3d1_exec(d_c, d_fk, d_plan); + if (type == 2) ier = cufinufft3d2_exec(d_c, d_fk, d_plan); + if (type == 3) { + std::cerr << "Not Implemented yet" << std::endl; + ier = FINUFFT_ERR_TYPE_NOTVALID; + } + } break; + } - return ier; + return ier; } -template +template int cufinufft_destroy_impl(cufinufft_plan_t *d_plan) /* "destroy" stage (single and double precision versions). @@ -435,21 +455,19 @@ int cufinufft_destroy_impl(cufinufft_plan_t *d_plan) Also see ../docs/cppdoc.md for main user-facing documentation. */ { - cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - // Can't destroy a null pointer. - if (!d_plan) - return FINUFFT_ERR_PLAN_NOTVALID; + // Can't destroy a null pointer. + if (!d_plan) return FINUFFT_ERR_PLAN_NOTVALID; - using namespace cufinufft::memtransfer; - freegpumemory(d_plan); + using namespace cufinufft::memtransfer; + freegpumemory(d_plan); - if (d_plan->fftplan) - cufftDestroy(d_plan->fftplan); + if (d_plan->fftplan) cufftDestroy(d_plan->fftplan); - /* free/destruct the plan */ - delete d_plan; + /* free/destruct the plan */ + delete d_plan; - return 0; + return 0; } // namespace cufinufft #endif diff --git a/include/cufinufft/memtransfer.h b/include/cufinufft/memtransfer.h index 382f911e9..4c4788b9d 100644 --- a/include/cufinufft/memtransfer.h +++ b/include/cufinufft/memtransfer.h @@ -6,20 +6,13 @@ namespace cufinufft { namespace memtransfer { -template -int allocgpumem1d_plan(cufinufft_plan_t *d_plan); -template -int allocgpumem1d_nupts(cufinufft_plan_t *d_plan); -template -void freegpumemory(cufinufft_plan_t *d_plan); -template -int allocgpumem2d_plan(cufinufft_plan_t *d_plan); -template -int allocgpumem2d_nupts(cufinufft_plan_t *d_plan); -template -int allocgpumem3d_plan(cufinufft_plan_t *d_plan); -template -int allocgpumem3d_nupts(cufinufft_plan_t *d_plan); +template int allocgpumem1d_plan(cufinufft_plan_t *d_plan); +template int allocgpumem1d_nupts(cufinufft_plan_t *d_plan); +template void freegpumemory(cufinufft_plan_t *d_plan); +template int allocgpumem2d_plan(cufinufft_plan_t *d_plan); +template int allocgpumem2d_nupts(cufinufft_plan_t *d_plan); +template int allocgpumem3d_plan(cufinufft_plan_t *d_plan); +template int allocgpumem3d_nupts(cufinufft_plan_t *d_plan); } // namespace memtransfer } // namespace cufinufft diff --git a/include/cufinufft/precision_independent.h b/include/cufinufft/precision_independent.h index ff98506bf..9fa48a07e 100644 --- a/include/cufinufft/precision_independent.h +++ b/include/cufinufft/precision_independent.h @@ -6,8 +6,8 @@ #define PRECISION_INDEPENDENT_H #include -#define rpart(x) (cuCreal(x)) -#define ipart(x) (cuCimag(x)) +#define rpart(x) (cuCreal(x)) +#define ipart(x) (cuCimag(x)) #define cmplx(x, y) (make_cuDoubleComplex(x, y)) namespace cufinufft { namespace common { @@ -20,42 +20,51 @@ __device__ RT cabs(const CT &z); __device__ CT cpow(const CT &z, const int &n); /* Common Kernels from spreadinterp3d */ -__host__ __device__ int calc_global_index(int xidx, int yidx, int zidx, int onx, int ony, int onz, int bnx, int bny, - int bnz); -__device__ int calc_global_index_v2(int xidx, int yidx, int zidx, int nbinx, int nbiny, int nbinz); +__host__ __device__ int calc_global_index(int xidx, int yidx, int zidx, int onx, int ony, + int onz, int bnx, int bny, int bnz); +__device__ int calc_global_index_v2(int xidx, int yidx, int zidx, int nbinx, int nbiny, + int nbinz); /* spreadinterp 1d */ -__global__ void calc_subprob_1d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins); +__global__ void calc_subprob_1d(int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins); -__global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins); +__global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstartpts, + int *d_numsubprob, int numbins); __global__ void trivial_global_sort_index_1d(int M, int *index); /* spreadinterp 2d */ -__global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins); +__global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins); -__global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins); +__global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts, + int *d_numsubprob, int numbins); __global__ void trivial_global_sort_index_2d(int M, int *index); /* spreadinterp3d */ -__global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins); +__global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins); -__global__ void map_b_into_subprob_3d_v2(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins); +__global__ void map_b_into_subprob_3d_v2(int *d_subprob_to_bin, int *d_subprobstartpts, + int *d_numsubprob, int numbins); -__global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz, int *bin_size, - int *num_subprob, int maxsubprobsize, int numbins); +__global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz, + int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins); -__global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts, int *d_numsubprob, - int numbins); +__global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts, + int *d_numsubprob, int numbins); __global__ void trivial_global_sort_index_3d(int M, int *index); -__global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny, - int nobinz, int *binsize); +__global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz, + int nobinx, int nobiny, int nobinz, int *binsize); -__global__ void ghost_bin_pts_index(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny, - int nobinz, int *binsize, int *index, int *binstartpts, int M); +__global__ void ghost_bin_pts_index(int binsperobinx, int binsperobiny, int binsperobinz, + int nobinx, int nobiny, int nobinz, int *binsize, + int *index, int *binstartpts, int M); } // namespace common } // namespace cufinufft #endif diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index 85850e92a..da1c59930 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -1,21 +1,20 @@ #ifndef __CUSPREADINTERP_H__ #define __CUSPREADINTERP_H__ +#include #include #include -#include namespace cufinufft { namespace spreadinterp { -template -static __forceinline__ __device__ T fold_rescale(T x, int N) { +template static __forceinline__ __device__ T fold_rescale(T x, int N) { static constexpr const auto x2pi = T(0.159154943091895345554011992339482617); - const T result = x * x2pi + T(0.5); - return (result-floor(result)) * T(N); + const T result = x * x2pi + T(0.5); + return (result - floor(result)) * T(N); } -template +template static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) /* ES ("exp sqrt") kernel evaluation at single real argument: phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)), for |x| < nspread/2 @@ -23,17 +22,17 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) approximation to prolate spheroidal wavefunction (PSWF) of order 0. This is the "reference implementation", used by eg common/onedim_* 2/17/17 */ { - if (abs(x) >= opts.ES_halfwidth) - // if spreading/FT careful, shouldn't need this if, but causes no speed hit - return 0.0; - else - return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c * x * x)); + if (abs(x) >= opts.ES_halfwidth) + // if spreading/FT careful, shouldn't need this if, but causes no speed hit + return 0.0; + else + return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c * x * x)); } -template +template int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth); -template +template static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int ns) /* ES ("exp sqrt") kernel evaluation at single real argument: phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)), for |x| < nspread/2 @@ -42,89 +41,95 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int This is the "reference implementation", used by eg common/onedim_* 2/17/17 */ { - return abs(x) < ns / 2.0 ? exp(es_beta * (sqrt(1.0 - es_c * x * x))) : 0.0; + return abs(x) < ns / 2.0 ? exp(es_beta * (sqrt(1.0 - es_c * x * x))) : 0.0; } -template -static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w, const double upsampfac) +template +static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w, + const double upsampfac) /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. This is the current evaluation method, since it's faster (except i7 w=16). Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ { - T z = 2 * x + w - 1.0; // scale so local grid offset z in [-1,1] - // insert the auto-generated code which expects z, w args, writes to ker... - if (upsampfac == 2.0) { // floating point equality is fine here - using FLT = T; - using CUFINUFFT_FLT = T; + T z = 2 * x + w - 1.0; // scale so local grid offset z in [-1,1] + // insert the auto-generated code which expects z, w args, writes to ker... + if (upsampfac == 2.0) { // floating point equality is fine here + using FLT = T; + using CUFINUFFT_FLT = T; #include "cufinufft/contrib/ker_horner_allw_loop.inc" - } + } } -template -static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const int w, const T es_c, const T es_beta) { - for (int i = 0; i < w; i++) { - ker[i] = evaluate_kernel(abs(x + i), es_c, es_beta, w); - } +template +static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const int w, + const T es_c, const T es_beta) { + for (int i = 0; i < w; i++) { + ker[i] = evaluate_kernel(abs(x + i), es_c, es_beta, w); + } } // Functions for calling different methods of spreading & interpolation -template -int cuspread1d(cufinufft_plan_t *d_plan, int blksize); -template -int cuinterp1d(cufinufft_plan_t *d_plan, int blksize); +template int cuspread1d(cufinufft_plan_t *d_plan, int blksize); +template int cuinterp1d(cufinufft_plan_t *d_plan, int blksize); -template -int cuspread2d(cufinufft_plan_t *d_plan, int blksize); -template -int cuinterp2d(cufinufft_plan_t *d_plan, int blksize); -template -int cuspread3d(cufinufft_plan_t *d_plan, int blksize); -template -int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); +template int cuspread2d(cufinufft_plan_t *d_plan, int blksize); +template int cuinterp2d(cufinufft_plan_t *d_plan, int blksize); +template int cuspread3d(cufinufft_plan_t *d_plan, int blksize); +template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); // Wrappers for methods of spreading -template +template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template +template int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize); -template +template int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template +template int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize); -template +template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template -int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize); -template +template +int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize); +template int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template +template int cuspread2d_subprob(int nf1, int nf2, int m, cufinufft_plan_t *d_plan, int blksize); -template -int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template -int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); -template -int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template -int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); -template -int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template -int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); +template +int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template +int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize); +template +int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template +int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize); +template +int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template +int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize); // Wrappers for methods of interpolation -template +template int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize); -template -int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize); -template +template +int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize); +template int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize); -template -int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); -template -int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); +template +int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize); +template +int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize); } // namespace spreadinterp } // namespace cufinufft diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h index 246b4aaa1..05b2f6c36 100644 --- a/include/cufinufft/types.h +++ b/include/cufinufft/types.h @@ -3,99 +3,88 @@ #include +#include #include #include #include -#include #include #define CUFINUFFT_BIGINT int // Ugly trick to map a template to a fixed type, here cuda_complex -template -struct cuda_complex_impl; -template <> -struct cuda_complex_impl { - using type = cuFloatComplex; +template struct cuda_complex_impl; +template<> struct cuda_complex_impl { + using type = cuFloatComplex; }; -template <> -struct cuda_complex_impl { - using type = cuDoubleComplex; +template<> struct cuda_complex_impl { + using type = cuDoubleComplex; }; -template -using cuda_complex = typename cuda_complex_impl::type; - -template -struct cufinufft_plan_t { - cufinufft_opts opts; - finufft_spread_opts spopts; - - int type; - int dim; - CUFINUFFT_BIGINT M; - CUFINUFFT_BIGINT nf1; - CUFINUFFT_BIGINT nf2; - CUFINUFFT_BIGINT nf3; - CUFINUFFT_BIGINT ms; - CUFINUFFT_BIGINT mt; - CUFINUFFT_BIGINT mu; - int ntransf; - int maxbatchsize; - int iflag; - - int totalnumsubprob; - T *fwkerhalf1; - T *fwkerhalf2; - T *fwkerhalf3; - - T *kx; - T *ky; - T *kz; - cuda_complex *c; - cuda_complex *fw; - cuda_complex *fk; - - // Arrays that used in subprob method - int *idxnupts; // length: #nupts, index of the nupts in the bin-sorted order - int *sortidx; // length: #nupts, order inside the bin the nupt belongs to - int *numsubprob; // length: #bins, number of subproblems in each bin - int *binsize; // length: #bins, number of nonuniform ponits in each bin - int *binstartpts; // length: #bins, exclusive scan of array binsize - int *subprob_to_bin; // length: #subproblems, the bin the subproblem works on - int *subprobstartpts; // length: #bins, exclusive scan of array numsubprob - - // Arrays for 3d (need to sort out) - int *numnupts; - int *subprob_to_nupts; - - // Temporary variables to do fseries precomputation - std::complex fseries_precomp_a[3 * MAX_NQUAD]; - T fseries_precomp_f[3 * MAX_NQUAD]; - - cufftHandle fftplan; - cudaStream_t stream; +template using cuda_complex = typename cuda_complex_impl::type; + +template struct cufinufft_plan_t { + cufinufft_opts opts; + finufft_spread_opts spopts; + + int type; + int dim; + CUFINUFFT_BIGINT M; + CUFINUFFT_BIGINT nf1; + CUFINUFFT_BIGINT nf2; + CUFINUFFT_BIGINT nf3; + CUFINUFFT_BIGINT ms; + CUFINUFFT_BIGINT mt; + CUFINUFFT_BIGINT mu; + int ntransf; + int maxbatchsize; + int iflag; + + int totalnumsubprob; + T *fwkerhalf1; + T *fwkerhalf2; + T *fwkerhalf3; + + T *kx; + T *ky; + T *kz; + cuda_complex *c; + cuda_complex *fw; + cuda_complex *fk; + + // Arrays that used in subprob method + int *idxnupts; // length: #nupts, index of the nupts in the bin-sorted order + int *sortidx; // length: #nupts, order inside the bin the nupt belongs to + int *numsubprob; // length: #bins, number of subproblems in each bin + int *binsize; // length: #bins, number of nonuniform ponits in each bin + int *binstartpts; // length: #bins, exclusive scan of array binsize + int *subprob_to_bin; // length: #subproblems, the bin the subproblem works on + int *subprobstartpts; // length: #bins, exclusive scan of array numsubprob + + // Arrays for 3d (need to sort out) + int *numnupts; + int *subprob_to_nupts; + + // Temporary variables to do fseries precomputation + std::complex fseries_precomp_a[3 * MAX_NQUAD]; + T fseries_precomp_f[3 * MAX_NQUAD]; + + cufftHandle fftplan; + cudaStream_t stream; }; -template -static cufftType_t cufft_type(); -template <> -inline cufftType_t cufft_type() { - return CUFFT_C2C; -} +template static cufftType_t cufft_type(); +template<> inline cufftType_t cufft_type() { return CUFFT_C2C; } -template <> -inline cufftType_t cufft_type() { - return CUFFT_Z2Z; -} +template<> inline cufftType_t cufft_type() { return CUFFT_Z2Z; } -static inline cufftResult cufft_ex(cufftHandle plan, cufftComplex *idata, cufftComplex *odata, int direction) { - return cufftExecC2C(plan, idata, odata, direction); +static inline cufftResult cufft_ex(cufftHandle plan, cufftComplex *idata, + cufftComplex *odata, int direction) { + return cufftExecC2C(plan, idata, odata, direction); } -static inline cufftResult cufft_ex(cufftHandle plan, cufftDoubleComplex *idata, cufftDoubleComplex *odata, - int direction) { - return cufftExecZ2Z(plan, idata, odata, direction); +static inline cufftResult cufft_ex(cufftHandle plan, cufftDoubleComplex *idata, + cufftDoubleComplex *odata, int direction) { + return cufftExecZ2Z(plan, idata, odata, direction); } #endif diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index e8deb42e9..3455b99c0 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -15,59 +15,58 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__) #else __inline__ __device__ double atomicAdd(double *address, double val) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); - // Note: uses integer comparison to avoid hang in case of NaN - // (since NaN != NaN) - } while (assumed != old); + // Note: uses integer comparison to avoid hang in case of NaN + // (since NaN != NaN) + } while (assumed != old); - return __longlong_as_double(old); + return __longlong_as_double(old); } #endif namespace cufinufft { namespace utils { class WithCudaDevice { - public: - WithCudaDevice(int device) { - cudaGetDevice(&orig_device_); - cudaSetDevice(device); - } +public: + WithCudaDevice(int device) { + cudaGetDevice(&orig_device_); + cudaSetDevice(device); + } - ~WithCudaDevice() { cudaSetDevice(orig_device_); } + ~WithCudaDevice() { cudaSetDevice(orig_device_); } - private: - int orig_device_; +private: + int orig_device_; }; // jfm timer class class CNTime { - public: - void start(); - double restart(); - double elapsedsec(); +public: + void start(); + double restart(); + double elapsedsec(); - private: - struct timeval initial; +private: + struct timeval initial; }; // ahb math helpers CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b); -template -T infnorm(int n, std::complex *a) { - T nrm = 0.0; - for (int m = 0; m < n; ++m) { - T aa = real(conj(a[m]) * a[m]); - if (aa > nrm) - nrm = aa; - } - return sqrt(nrm); +template T infnorm(int n, std::complex *a) { + T nrm = 0.0; + for (int m = 0; m < n; ++m) { + T aa = real(conj(a[m]) * a[m]); + if (aa > nrm) nrm = aa; + } + return sqrt(nrm); } } // namespace utils } // namespace cufinufft diff --git a/include/cufinufft_opts.h b/include/cufinufft_opts.h index 9760884ae..0fb0d8f62 100644 --- a/include/cufinufft_opts.h +++ b/include/cufinufft_opts.h @@ -2,30 +2,30 @@ #define __CUFINUFFT_OPTS_H__ typedef struct cufinufft_opts { // see cufinufft_default_opts() for defaults - double upsampfac; // upsampling ratio sigma, only 2.0 (standard) is implemented - /* following options are for gpu */ - int gpu_method; // 1: nonuniform-pts driven, 2: shared mem (SM) - int gpu_sort; // when NU-pts driven: 0: no sort (GM), 1: sort (GM-sort) + double upsampfac; // upsampling ratio sigma, only 2.0 (standard) is implemented + /* following options are for gpu */ + int gpu_method; // 1: nonuniform-pts driven, 2: shared mem (SM) + int gpu_sort; // when NU-pts driven: 0: no sort (GM), 1: sort (GM-sort) - int gpu_binsizex; // used for 2D, 3D subproblem method - int gpu_binsizey; - int gpu_binsizez; + int gpu_binsizex; // used for 2D, 3D subproblem method + int gpu_binsizey; + int gpu_binsizez; - int gpu_obinsizex; // used for 3D spread block gather method - int gpu_obinsizey; - int gpu_obinsizez; + int gpu_obinsizex; // used for 3D spread block gather method + int gpu_obinsizey; + int gpu_obinsizez; - int gpu_maxsubprobsize; - int gpu_kerevalmeth; // 0: direct exp(sqrt()), 1: Horner ppval + int gpu_maxsubprobsize; + int gpu_kerevalmeth; // 0: direct exp(sqrt()), 1: Horner ppval - int gpu_spreadinterponly; // 0: NUFFT, 1: spread or interpolation only + int gpu_spreadinterponly; // 0: NUFFT, 1: spread or interpolation only - int gpu_maxbatchsize; + int gpu_maxbatchsize; - /* multi-gpu support */ - int gpu_device_id; + /* multi-gpu support */ + int gpu_device_id; - void *gpu_stream; + void *gpu_stream; } cufinufft_opts; #endif diff --git a/include/finufft.h b/include/finufft.h index 71a38f9be..487a3eb4f 100644 --- a/include/finufft.h +++ b/include/finufft.h @@ -5,7 +5,6 @@ // They will clobber any prior macros starting FINUFFT*, so in the lib/test // sources finufft.h must be included before defs.h - /* Devnotes. A) Two precisions done by including the "either precision" headers twice. No use of the private headers for lib/test/example compilation is made. @@ -37,7 +36,7 @@ #define FINUFFT_BIGINT int64_t #ifndef __cplusplus -#include // for bool type in C (needed for item in plan struct) +#include // for bool type in C (needed for item in plan struct) #endif // this macro name has to be safe since exposed to user @@ -50,4 +49,4 @@ // clean up any purely local defs that are not in finufft_eitherprec.h... #undef FINUFFT_BIGINT -#endif // FINUFFT_H +#endif // FINUFFT_H diff --git a/include/finufft/dirft.h b/include/finufft/dirft.h index 88f1dd2df..5d13265a4 100644 --- a/include/finufft/dirft.h +++ b/include/finufft/dirft.h @@ -3,16 +3,20 @@ #include -void dirft1d1(BIGINT nj,FLT* x,CPX* c,int isign,BIGINT ms, CPX* f); -void dirft1d2(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT ms, CPX* f); -void dirft1d3(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT nk, FLT* s, CPX* f); +void dirft1d1(BIGINT nj, FLT *x, CPX *c, int isign, BIGINT ms, CPX *f); +void dirft1d2(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT ms, CPX *f); +void dirft1d3(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT nk, FLT *s, CPX *f); -void dirft2d1(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f); -void dirft2d2(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f); -void dirft2d3(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT nk, FLT* s, FLT* t, CPX* f); +void dirft2d1(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f); +void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f); +void dirft2d3(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT nk, FLT *s, FLT *t, + CPX *f); -void dirft3d1(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f); -void dirft3d2(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f); -void dirft3d3(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT nk, FLT* s, FLT* t, FLT *u, CPX* f); +void dirft3d1(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt, + BIGINT mu, CPX *f); +void dirft3d2(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt, + BIGINT mu, CPX *f); +void dirft3d3(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT nk, FLT *s, + FLT *t, FLT *u, CPX *f); #endif diff --git a/include/finufft/fftw_defs.h b/include/finufft/fftw_defs.h index 89d86f0de..1771ff259 100644 --- a/include/finufft/fftw_defs.h +++ b/include/finufft/fftw_defs.h @@ -7,42 +7,42 @@ // precision library compilation, which need different FFTW command symbols. // Barnett simplified via FFTWIFY, 6/7/22. -#include // (after complex.h) needed so can typedef FFTW_CPX +#include // (after complex.h) needed so can typedef FFTW_CPX // precision-switching names for interfaces to FFTW... #ifdef SINGLE - // macro to prepend fftw_ (for double) or fftwf_ (for single) to a string - // without a space. The 2nd level of indirection is needed for safety, see: - // https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting - #define FFTWIFY_UNSAFE(x) fftwf_##x +// macro to prepend fftw_ (for double) or fftwf_ (for single) to a string +// without a space. The 2nd level of indirection is needed for safety, see: +// https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting +#define FFTWIFY_UNSAFE(x) fftwf_##x #else - #define FFTWIFY_UNSAFE(x) fftw_##x +#define FFTWIFY_UNSAFE(x) fftw_##x #endif -#define FFTWIFY(x) FFTWIFY_UNSAFE(x) +#define FFTWIFY(x) FFTWIFY_UNSAFE(x) // now use this tool (note we replaced typedefs v<=2.0.4, in favor of macros): -#define FFTW_CPX FFTWIFY(complex) -#define FFTW_PLAN FFTWIFY(plan) -#define FFTW_ALLOC_RE FFTWIFY(alloc_real) -#define FFTW_ALLOC_CPX FFTWIFY(alloc_complex) -#define FFTW_PLAN_1D FFTWIFY(plan_dft_1d) -#define FFTW_PLAN_2D FFTWIFY(plan_dft_2d) -#define FFTW_PLAN_3D FFTWIFY(plan_dft_3d) +#define FFTW_CPX FFTWIFY(complex) +#define FFTW_PLAN FFTWIFY(plan) +#define FFTW_ALLOC_RE FFTWIFY(alloc_real) +#define FFTW_ALLOC_CPX FFTWIFY(alloc_complex) +#define FFTW_PLAN_1D FFTWIFY(plan_dft_1d) +#define FFTW_PLAN_2D FFTWIFY(plan_dft_2d) +#define FFTW_PLAN_3D FFTWIFY(plan_dft_3d) #define FFTW_PLAN_MANY_DFT FFTWIFY(plan_many_dft) -#define FFTW_EX FFTWIFY(execute) -#define FFTW_DE FFTWIFY(destroy_plan) -#define FFTW_FR FFTWIFY(free) +#define FFTW_EX FFTWIFY(execute) +#define FFTW_DE FFTWIFY(destroy_plan) +#define FFTW_FR FFTWIFY(free) #define FFTW_FORGET_WISDOM FFTWIFY(forget_wisdom) -#define FFTW_CLEANUP FFTWIFY(cleanup) +#define FFTW_CLEANUP FFTWIFY(cleanup) // the following OMP switch could be done in the src code instead... #ifdef _OPENMP - #define FFTW_INIT FFTWIFY(init_threads) - #define FFTW_PLAN_TH FFTWIFY(plan_with_nthreads) - #define FFTW_CLEANUP_THREADS FFTWIFY(cleanup_threads) +#define FFTW_INIT FFTWIFY(init_threads) +#define FFTW_PLAN_TH FFTWIFY(plan_with_nthreads) +#define FFTW_CLEANUP_THREADS FFTWIFY(cleanup_threads) #else - // no OMP (no fftw{f}_threads or _omp), need dummy fftw threads calls... - #define FFTW_INIT() - #define FFTW_PLAN_TH(x) - #define FFTW_CLEANUP_THREADS() +// no OMP (no fftw{f}_threads or _omp), need dummy fftw threads calls... +#define FFTW_INIT() +#define FFTW_PLAN_TH(x) +#define FFTW_CLEANUP_THREADS() #endif -#endif // FFTW_DEFS_H +#endif // FFTW_DEFS_H diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h index 853b6c2b1..0900dd31b 100644 --- a/include/finufft/spreadinterp.h +++ b/include/finufft/spreadinterp.h @@ -26,32 +26,38 @@ #define TF_OMIT_SPREADING 8 // don't interp/spread (dir=1: to subgrids) namespace finufft { - namespace spreadinterp { +namespace spreadinterp { // things external (spreadinterp) interface needs... -FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp(BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, - BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, finufft_spread_opts opts); -FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, - BIGINT M, FLT *kx, FLT *ky, FLT *kz, finufft_spread_opts opts); -FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, - FLT *kx, FLT *ky, FLT *kz, finufft_spread_opts opts); -FINUFFT_EXPORT int FINUFFT_CDECL interpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform,BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, finufft_spread_opts opts, int did_sort); -FINUFFT_EXPORT int FINUFFT_CDECL spreadSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform,BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, finufft_spread_opts opts, int did_sort); -FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform,BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, finufft_spread_opts opts, - int did_sort); -FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel(FLT x,const finufft_spread_opts &opts); -FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel_noexp(FLT x,const finufft_spread_opts &opts); -FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts,FLT eps,double upsampfac, - int kerevalmeth, int debug, int showwarn, int dim); - - } // namespace -} // namespace - -#endif // SPREADINTERP_H +FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp( + BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, + FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts); +FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, + FLT *kx, FLT *ky, FLT *kz, + finufft_spread_opts opts); +FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, BIGINT N1, BIGINT N2, + BIGINT N3, BIGINT M, FLT *kx, FLT *ky, FLT *kz, + finufft_spread_opts opts); +FINUFFT_EXPORT int FINUFFT_CDECL interpSorted( + BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, + FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts, + int did_sort); +FINUFFT_EXPORT int FINUFFT_CDECL spreadSorted( + BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, + FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts, + int did_sort); +FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted( + BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, + FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts, + int did_sort); +FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel(FLT x, const finufft_spread_opts &opts); +FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel_noexp(FLT x, + const finufft_spread_opts &opts); +FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, FLT eps, + double upsampfac, int kerevalmeth, + int debug, int showwarn, int dim); + +} // namespace spreadinterp +} // namespace finufft + +#endif // SPREADINTERP_H diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h index 54b058266..6142eadfb 100644 --- a/include/finufft/test_defs.h +++ b/include/finufft/test_defs.h @@ -7,7 +7,7 @@ // TESTER SETTINGS... // how big a problem to check direct DFT for in 1D... -#define TEST_BIGPROB 1e8 +#define TEST_BIGPROB 1e8 // for omp rand filling #define TEST_RANDCHUNK 1000000 @@ -25,11 +25,11 @@ #include // std stuff for tester src -#include -#include #include -#include #include +#include +#include +#include #include -#endif // TEST_DEFS_H +#endif // TEST_DEFS_H diff --git a/include/finufft/utils.h b/include/finufft/utils.h index 8c2b7619e..9039fee96 100644 --- a/include/finufft/utils.h +++ b/include/finufft/utils.h @@ -7,18 +7,19 @@ #include "finufft/defs.h" namespace finufft { - namespace utils { +namespace utils { // ahb's low-level array helpers -FINUFFT_EXPORT FLT FINUFFT_CDECL relerrtwonorm(BIGINT n, CPX* a, CPX* b); -FINUFFT_EXPORT FLT FINUFFT_CDECL errtwonorm(BIGINT n, CPX* a, CPX* b); -FINUFFT_EXPORT FLT FINUFFT_CDECL twonorm(BIGINT n, CPX* a); -FINUFFT_EXPORT FLT FINUFFT_CDECL infnorm(BIGINT n, CPX* a); -FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, FLT* a, FLT *lo, FLT *hi); -FINUFFT_EXPORT void FINUFFT_CDECL indexedarrayrange(BIGINT n, BIGINT* i, FLT* a, FLT *lo, FLT *hi); -FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, FLT* a, FLT *w, FLT *c); +FINUFFT_EXPORT FLT FINUFFT_CDECL relerrtwonorm(BIGINT n, CPX *a, CPX *b); +FINUFFT_EXPORT FLT FINUFFT_CDECL errtwonorm(BIGINT n, CPX *a, CPX *b); +FINUFFT_EXPORT FLT FINUFFT_CDECL twonorm(BIGINT n, CPX *a); +FINUFFT_EXPORT FLT FINUFFT_CDECL infnorm(BIGINT n, CPX *a); +FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, FLT *a, FLT *lo, FLT *hi); +FINUFFT_EXPORT void FINUFFT_CDECL indexedarrayrange(BIGINT n, BIGINT *i, FLT *a, FLT *lo, + FLT *hi); +FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, FLT *a, FLT *w, FLT *c); - } // namespace -} // namespace - -#endif // UTILS_H +} // namespace utils +} // namespace finufft + +#endif // UTILS_H diff --git a/include/finufft/utils_precindep.h b/include/finufft/utils_precindep.h index 866d33198..0504bb8df 100644 --- a/include/finufft/utils_precindep.h +++ b/include/finufft/utils_precindep.h @@ -10,34 +10,35 @@ #include namespace finufft { - namespace utils { - - FINUFFT_EXPORT BIGINT FINUFFT_CDECL next235even(BIGINT n); - - // jfm's timer class - class FINUFFT_EXPORT CNTime { - public: - void start(); - double restart(); - double elapsedsec(); - private: - double initial; - }; - - // openmp helpers - int get_num_threads_parallel_block(); - - } //namespace -} //namespace - +namespace utils { + +FINUFFT_EXPORT BIGINT FINUFFT_CDECL next235even(BIGINT n); + +// jfm's timer class +class FINUFFT_EXPORT CNTime { +public: + void start(); + double restart(); + double elapsedsec(); + +private: + double initial; +}; + +// openmp helpers +int get_num_threads_parallel_block(); + +} // namespace utils +} // namespace finufft + // thread-safe rand number generator for Windows platform #ifdef _WIN32 #include namespace finufft { - namespace utils { - FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp); - } // namespace -} // namespace +namespace utils { +FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp); +} // namespace utils +} // namespace finufft #endif -#endif // UTILS_PRECINDEP_H +#endif // UTILS_PRECINDEP_H diff --git a/include/finufft_eitherprec.h b/include/finufft_eitherprec.h index 25703fb1d..250dec7c0 100644 --- a/include/finufft_eitherprec.h +++ b/include/finufft_eitherprec.h @@ -15,26 +15,26 @@ // The 2nd level of indirection is needed for safety, see: // https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting #define FINUFFTIFY_UNSAFE(x) finufftf##x -#define FINUFFT_FLT float +#define FINUFFT_FLT float #else #define FINUFFTIFY_UNSAFE(x) finufft##x -#define FINUFFT_FLT double +#define FINUFFT_FLT double #endif #define FINUFFTIFY(x) FINUFFTIFY_UNSAFE(x) // decide which kind of complex numbers FINUFFT_CPX is (four options) #ifdef __cplusplus #define _USE_MATH_DEFINES -#include // C++ type +#include // C++ type #define FINUFFT_COMPLEXIFY(X) std::complex #else -#include // C99 type +#include // C99 type #define FINUFFT_COMPLEXIFY(X) X complex #endif -#define FINUFFT_CPX FINUFFT_COMPLEXIFY(FINUFFT_FLT) +#define FINUFFT_CPX FINUFFT_COMPLEXIFY(FINUFFT_FLT) // opaque pointer to finufft_plan private object, for this precision... -#define FINUFFT_PLAN FINUFFTIFY(_plan) +#define FINUFFT_PLAN FINUFFTIFY(_plan) // the plan object pointed to... (doesn't need to be even defined here) #define FINUFFT_PLAN_S FINUFFTIFY(_plan_s) @@ -51,13 +51,13 @@ with it in the future we just need to update cmake for it to work instead of having a check on the msvc version. */ #if defined(FINUFFT_DLL) && (defined(_WIN32) || defined(__WIN32__)) -# if defined(dll_EXPORTS) -# define FINUFFT_EXPORT __declspec(dllexport) -# else -# define FINUFFT_EXPORT __declspec(dllimport) -# endif +#if defined(dll_EXPORTS) +#define FINUFFT_EXPORT __declspec(dllexport) #else -# define FINUFFT_EXPORT +#define FINUFFT_EXPORT __declspec(dllimport) +#endif +#else +#define FINUFFT_EXPORT #endif /* specify calling convention (Windows only) @@ -66,81 +66,115 @@ If the user code changes the default compiler calling convention, may need this when generating DLL. */ #if defined(_WIN32) || defined(__WIN32__) -# define FINUFFT_CDECL __cdecl +#define FINUFFT_CDECL __cdecl #else -# define FINUFFT_CDECL +#define FINUFFT_CDECL #endif //////////////////////////////////////////////////////////////////// // PUBLIC METHOD INTERFACES. All are C-style even when used from C++... #ifdef __cplusplus -extern "C" -{ +extern "C" { #endif -// ----------------- the plan ----------------------------------------------- +// ----------------- the plan ----------------------------------------------- // the plan handle that we pass around is just a pointer to the plan object // that contains all the info. The latter is invisible to the public user. -typedef struct FINUFFT_PLAN_S * FINUFFT_PLAN; +typedef struct FINUFFT_PLAN_S *FINUFFT_PLAN; - // ------------------ the guru interface ------------------------------------ // (sources in finufft.cpp) - - FINUFFT_EXPORT void FINUFFT_CDECL FINUFFTIFY(_default_opts)(finufft_opts *o); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_makeplan)(int type, int dim, FINUFFT_BIGINT* n_modes, int iflag, int n_transf, FINUFFT_FLT tol, FINUFFT_PLAN* plan, finufft_opts* o); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_setpts)(FINUFFT_PLAN plan , FINUFFT_BIGINT M, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, FINUFFT_BIGINT N, FINUFFT_FLT *s, FINUFFT_FLT *t, FINUFFT_FLT *u); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_execute)(FINUFFT_PLAN plan, FINUFFT_CPX* weights, FINUFFT_CPX* result); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_destroy)(FINUFFT_PLAN plan); +FINUFFT_EXPORT void FINUFFT_CDECL FINUFFTIFY(_default_opts)(finufft_opts *o); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_makeplan)( + int type, int dim, FINUFFT_BIGINT *n_modes, int iflag, int n_transf, FINUFFT_FLT tol, + FINUFFT_PLAN *plan, finufft_opts *o); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_setpts)( + FINUFFT_PLAN plan, FINUFFT_BIGINT M, FINUFFT_FLT *xj, FINUFFT_FLT *yj, + FINUFFT_FLT *zj, FINUFFT_BIGINT N, FINUFFT_FLT *s, FINUFFT_FLT *t, FINUFFT_FLT *u); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_execute)( + FINUFFT_PLAN plan, FINUFFT_CPX *weights, FINUFFT_CPX *result); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_destroy)(FINUFFT_PLAN plan); // ----------------- the 18 simple interfaces ------------------------------- // (sources in simpleinterfaces.cpp) - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT ms, - FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT ms, - FINUFFT_CPX* fk, finufft_opts *opts); - - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d2)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT ms, - FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d2many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT ms, - FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d3)(FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_CPX* c,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT nk, FINUFFT_FLT* s, FINUFFT_CPX* f, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d3many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_CPX* c,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT nk, FINUFFT_FLT* s, FINUFFT_CPX* f, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d1)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps, - FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d1many)(int ndata, FINUFFT_BIGINT nj, FINUFFT_FLT* xj, FINUFFT_FLT *yj, FINUFFT_CPX* c, int iflag, - FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d2)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps, - FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d2many)(int ndata, FINUFFT_BIGINT nj, FINUFFT_FLT* xj, FINUFFT_FLT *yj, FINUFFT_CPX* c, int iflag, - FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d3)(FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_FLT *y,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT nk, FINUFFT_FLT* s, FINUFFT_FLT* t, FINUFFT_CPX* fk, finufft_opts *opts); - - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d3many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_FLT *y,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT nk, FINUFFT_FLT* s, FINUFFT_FLT* t, FINUFFT_CPX* fk, finufft_opts *opts); - - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d1)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_FLT *zj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps, - FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d1many)(int ntransfs, FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_FLT *zj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps, - FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, FINUFFT_CPX* fk, finufft_opts *opts); - - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d2)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_FLT *zj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps, - FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d2many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_FLT *zj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps, - FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3)(FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_FLT *y,FINUFFT_FLT *z, FINUFFT_CPX* cj,int iflag, - FINUFFT_FLT eps,FINUFFT_BIGINT nk,FINUFFT_FLT* s, FINUFFT_FLT* t, FINUFFT_FLT *u, - FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_FLT *y,FINUFFT_FLT *z, FINUFFT_CPX* cj,int iflag, - FINUFFT_FLT eps,FINUFFT_BIGINT nk,FINUFFT_FLT* s, FINUFFT_FLT* t, FINUFFT_FLT *u, - FINUFFT_CPX* fk, finufft_opts *opts); - +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1)( + FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, + FINUFFT_BIGINT ms, FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1many)( + int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag, + FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_CPX *fk, finufft_opts *opts); + +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d2)( + FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, + FINUFFT_BIGINT ms, FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d2many)( + int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag, + FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d3)( + FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_CPX *c, int iflag, FINUFFT_FLT eps, + FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_CPX *f, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d3many)( + int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_CPX *c, int iflag, + FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_CPX *f, + finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d1)( + FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_CPX *cj, int iflag, + FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX *fk, + finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d1many)( + int ndata, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_CPX *c, + int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX *fk, + finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d2)( + FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_CPX *cj, int iflag, + FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX *fk, + finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d2many)( + int ndata, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_CPX *c, + int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX *fk, + finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d3)( + FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_FLT *y, FINUFFT_CPX *cj, int iflag, + FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_FLT *t, FINUFFT_CPX *fk, + finufft_opts *opts); + +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d3many)( + int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_FLT *y, FINUFFT_CPX *cj, + int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_FLT *t, + FINUFFT_CPX *fk, finufft_opts *opts); + +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d1)( + FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, FINUFFT_CPX *cj, + int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, + FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d1many)( + int ntransfs, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, + FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, + FINUFFT_BIGINT mu, FINUFFT_CPX *fk, finufft_opts *opts); + +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d2)( + FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, FINUFFT_CPX *cj, + int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, + FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d2many)( + int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, + FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, + FINUFFT_BIGINT mu, FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3)( + FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_FLT *y, FINUFFT_FLT *z, FINUFFT_CPX *cj, + int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_FLT *t, + FINUFFT_FLT *u, FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3many)( + int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_FLT *y, FINUFFT_FLT *z, + FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, + FINUFFT_FLT *t, FINUFFT_FLT *u, FINUFFT_CPX *fk, finufft_opts *opts); + #ifdef __cplusplus } #endif - // clean up things that were purely local to this file #undef FINUFFT_COMPLEXIFY #undef FINUFFTIFY_UNSAFE diff --git a/include/finufft_opts.h b/include/finufft_opts.h index 4f6db1e02..289e779e5 100644 --- a/include/finufft_opts.h +++ b/include/finufft_opts.h @@ -5,19 +5,18 @@ #ifndef FINUFFT_OPTS_H #define FINUFFT_OPTS_H - -typedef struct finufft_opts{ // defaults see finufft.cpp:finufft_default_opts() +typedef struct finufft_opts { // defaults see finufft.cpp:finufft_default_opts() // sphinx tag (don't remove): @opts_start // FINUFFT options: // data handling opts... - int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order - // 1 FFT-style mode order - int chkbnds; // [DEPRECATED] 0 don't check NU pts in [-3pi,3pi), 1 do (= 201402L) -#define DEPRECATED_OPTS [[deprecated ("as of v2.1.0, nufft_opts is obsolete and renamed finufft_opts; please use this instead.")]] +#define DEPRECATED_OPTS \ + [[deprecated("as of v2.1.0, nufft_opts is obsolete and renamed finufft_opts; please " \ + "use this instead.")]] #elif defined(_MSC_VER) -#define DEPRECATED_OPTS __declspec(deprecated("as of v2.1.0, nufft_opts is obsolete and renamed finufft_opts; please use this instead.")) +#define DEPRECATED_OPTS \ + __declspec(deprecated("as of v2.1.0, nufft_opts is obsolete and renamed " \ + "finufft_opts; please use this instead.")) #else -#define DEPRECATED_OPTS __attribute__((deprecated("as of v2.1.0, nufft_opts is obsolete and renamed finufft_opts; please use this instead."))) +#define DEPRECATED_OPTS \ + __attribute__((deprecated("as of v2.1.0, nufft_opts is obsolete and renamed " \ + "finufft_opts; please use this instead."))) #endif // Backwards-compatibility DEPRECATED_OPTS typedef finufft_opts nufft_opts; -#endif // FINUFFT_OPTS_H +#endif // FINUFFT_OPTS_H diff --git a/include/finufft_spread_opts.h b/include/finufft_spread_opts.h index 8549505db..2f3c9ce76 100644 --- a/include/finufft_spread_opts.h +++ b/include/finufft_spread_opts.h @@ -10,25 +10,25 @@ typedef struct finufft_spread_opts { // See spreadinterp:setup_spreader for default values of the following fields. // This is the main documentation for these options... - int nspread; // w, the kernel width in grid pts - int spread_direction; // 1 means spread NU->U, 2 means interpolate U->NU - int chkbnds; // [DEPRECATED] 0: don't check NU pts in 3-period range; 1: do - int sort; // 0: don't sort NU pts, 1: do, 2: heuristic choice - int kerevalmeth; // 0: direct exp(sqrt()), or 1: Horner ppval, fastest - int kerpad; // 0: no pad w to mult of 4, 1: do pad - // (this helps SIMD for kerevalmeth=0, eg on i7). - int nthreads; // # threads for spreadinterp (0: use max avail) - int sort_threads; // # threads for sort (0: auto-choice up to nthreads) + int nspread; // w, the kernel width in grid pts + int spread_direction; // 1 means spread NU->U, 2 means interpolate U->NU + int chkbnds; // [DEPRECATED] 0: don't check NU pts in 3-period range; 1: do + int sort; // 0: don't sort NU pts, 1: do, 2: heuristic choice + int kerevalmeth; // 0: direct exp(sqrt()), or 1: Horner ppval, fastest + int kerpad; // 0: no pad w to mult of 4, 1: do pad + // (this helps SIMD for kerevalmeth=0, eg on i7). + int nthreads; // # threads for spreadinterp (0: use max avail) + int sort_threads; // # threads for sort (0: auto-choice up to nthreads) int max_subproblem_size; // # pts per t1 subprob; sets extra RAM per thread - int flags; // binary flags for timing only (may give wrong ans - // if changed from 0!). See spreadinterp.h - int debug; // 0: silent, 1: small text output, 2: verbose - int atomic_threshold; // num threads before switching spreadSorted to using atomic ops - double upsampfac; // sigma, upsampling factor + int flags; // binary flags for timing only (may give wrong ans + // if changed from 0!). See spreadinterp.h + int debug; // 0: silent, 1: small text output, 2: verbose + int atomic_threshold; // num threads before switching spreadSorted to using atomic ops + double upsampfac; // sigma, upsampling factor // ES kernel specific consts for eval. No longer FLT, to avoid name clash... double ES_beta; double ES_halfwidth; double ES_c; } finufft_spread_opts; -#endif // FINUFFT_SPREAD_OPTS_H +#endif // FINUFFT_SPREAD_OPTS_H diff --git a/matlab/finufft.cpp b/matlab/finufft.cpp index 9a805dade..ccbc9a59a 100644 --- a/matlab/finufft.cpp +++ b/matlab/finufft.cpp @@ -31,9 +31,9 @@ THE SOFTWARE. */ +#include #include #include -#include #include @@ -41,12 +41,10 @@ #include #endif - /* * Records for call profile. */ -int* mexprofrecord_= NULL; - +int *mexprofrecord_ = NULL; /* * Support routines for copying data into and out of the MEX stubs, R2018a @@ -54,502 +52,421 @@ int* mexprofrecord_= NULL; #if MX_HAS_INTERLEAVED_COMPLEX -void* mxWrapGetP(const mxArray* a, const char* fmt, const char** e) -{ - void* p = NULL; +void *mxWrapGetP(const mxArray *a, const char *fmt, const char **e) { + void *p = NULL; #ifdef R2008OO - mxArray* ap; + mxArray *ap; #endif - if (mxGetClassID(a) == mxDOUBLE_CLASS && mxIsComplex(a) ) - { - if( mxGetM(a)*mxGetN(a) == 1 && (*mxGetComplexDoubles(a)).real == 0 ) - return NULL; - } - if (mxGetClassID(a) == mxDOUBLE_CLASS && !mxIsComplex(a) ) - { - if( mxGetM(a)*mxGetN(a) == 1 && *mxGetDoubles(a) == 0) - return NULL; - } - if (mxIsChar(a)) { - char pbuf[128]; - mxGetString(a, pbuf, sizeof(pbuf)); - sscanf(pbuf, fmt, &p); - } + if (mxGetClassID(a) == mxDOUBLE_CLASS && mxIsComplex(a)) { + if (mxGetM(a) * mxGetN(a) == 1 && (*mxGetComplexDoubles(a)).real == 0) return NULL; + } + if (mxGetClassID(a) == mxDOUBLE_CLASS && !mxIsComplex(a)) { + if (mxGetM(a) * mxGetN(a) == 1 && *mxGetDoubles(a) == 0) return NULL; + } + if (mxIsChar(a)) { + char pbuf[128]; + mxGetString(a, pbuf, sizeof(pbuf)); + sscanf(pbuf, fmt, &p); + } #ifdef R2008OO - else if (ap = mxGetProperty(a, 0, "mwptr")) { - return mxWrapGetP(ap, fmt, e); - } + else if (ap = mxGetProperty(a, 0, "mwptr")) { + return mxWrapGetP(ap, fmt, e); + } #endif - if (p == 0) - *e = "Invalid pointer"; - return p; -} - -mxArray* mxWrapCreateP(void* p, const char* fmt) -{ - if (p == 0) { - mxArray* z = mxCreateDoubleMatrix(1,1, mxREAL); - *mxGetDoubles(z) = 0; - return z; - } else { - char pbuf[128]; - sprintf(pbuf, fmt, p); - return mxCreateString(pbuf); - } -} - -mxArray* mxWrapStrncpy(const char* s) -{ - if (s) { - return mxCreateString(s); - } else { - mxArray* z = mxCreateDoubleMatrix(1,1, mxREAL); - *mxGetDoubles(z) = 0; - return z; - } -} - -char* mxWrapGetString(const mxArray* a, const char** e) -{ - char* s; - mwSize slen; - if (!a || (!mxIsChar(a) && mxGetM(a)*mxGetN(a) > 0)) { - *e = "Invalid string argument"; - return NULL; - } - slen = mxGetM(a)*mxGetN(a) + 1; - s = (char*) mxMalloc(slen); - if (mxGetM(a)*mxGetN(a) == 0) - *s = 0; - else - mxGetString(a, s, slen); - return s; -} - - -double mxWrapGetScalar(const mxArray* a, const char** e) -{ - if (!a || mxGetClassID(a) != mxDOUBLE_CLASS || mxGetM(a)*mxGetN(a) != 1) { - *e = "Invalid scalar argument"; - return 0; - } - if( mxIsComplex(a) ) - return (double) (*mxGetComplexDoubles(a)).real; - else - return (double) (*mxGetDoubles(a)); -} - -#define mxWrapGetArrayDef(func, T) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - double* q; \ - mxComplexDouble* z; \ - if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ - *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexDoubles(a); \ - for (i = 0; i < arraylen; ++i) \ - *p++ = (T) (*z++).real; \ - } \ - else \ - { \ - q = mxGetDoubles(a); \ - for (i = 0; i < arraylen; ++i) \ - *p++ = (T) (*q++); \ - } \ - return array; \ -} - - -#define mxWrapCopyDef(func, T) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - double* p; \ - mxComplexDouble* z; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexDoubles(a); \ - for (i = 0; i < n; ++i) \ - (*z++).real = (double) *q++; \ - (*z++).imag = 0; \ - } \ - else \ - { \ - p = mxGetDoubles(a); \ - for (i = 0; i < n; ++i) \ - *p++ = (double) *q++; \ - } \ -} - - -#define mxWrapReturnDef(func, T) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - double* p; \ - if (!q) { \ - return mxCreateDoubleMatrix(0,0, mxREAL); \ - } else { \ - mxArray* a = mxCreateDoubleMatrix(m,n, mxREAL); \ - p = mxGetDoubles(a); \ - for (i = 0; i < m*n; ++i) \ - *p++ = (double) *q++; \ - return a; \ - } \ -} - - -#define mxWrapGetScalarZDef(func, T, ZT, setz) \ -void func(T* z, const mxArray* a) \ -{ \ - if( mxIsComplex(a) ) \ - { \ - setz(z, (ZT) (*mxGetComplexDoubles(a)).real, (ZT) (*mxGetComplexDoubles(a)).imag); \ - } \ - else \ - { \ - setz(z, (ZT) (*mxGetComplexDoubles(a)).real, (ZT) 0); \ - } \ -} - - -#define mxWrapGetArrayZDef(func, T, ZT, setz) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - double* q; \ - mxComplexDouble* z; \ - if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ - *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexDoubles(a); \ - for (i = 0; i < arraylen; ++i) { \ - setz(p, (ZT) (*z).real, (ZT) (*z).imag); \ - ++p; ++z; } \ - } \ - else \ - { \ - q = mxGetDoubles(a); \ - for (i = 0; i < arraylen; ++i) { \ - setz(p, (ZT) (*q), (ZT) 0 ); \ - ++p; ++q; } \ - } \ - return array; \ -} - - -#define mxWrapCopyZDef(func, T, freal, fimag) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - double* p; \ - mxComplexDouble* z; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexDoubles(a); \ - for (i = 0; i < n; ++i) { \ - (*z).real = freal(*q); \ - (*z).imag = fimag(*q); \ - ++z; ++q; } \ - } \ - else \ - { \ - p = mxGetDoubles(a); \ - for (i = 0; i < n; ++i) \ - *p++ = freal(*q++); \ - } \ -} - - -#define mxWrapReturnZDef(func, T, freal, fimag) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - mxComplexDouble* p; \ - if (!q) { \ - return mxCreateDoubleMatrix(0,0, mxCOMPLEX); \ - } else { \ - mxArray* a = mxCreateDoubleMatrix(m,n, mxCOMPLEX); \ - p = mxGetComplexDoubles(a); \ - for (i = 0; i < m*n; ++i) { \ - (*p).real = freal(*q); \ - (*p).imag = fimag(*q); \ - ++p; ++q; } \ - return a; \ - } \ -} - - - - - - -void* mxWrapGetP_single(const mxArray* a, const char* fmt, const char** e) -{ - void* p = NULL; + if (p == 0) *e = "Invalid pointer"; + return p; +} + +mxArray *mxWrapCreateP(void *p, const char *fmt) { + if (p == 0) { + mxArray *z = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(z) = 0; + return z; + } else { + char pbuf[128]; + sprintf(pbuf, fmt, p); + return mxCreateString(pbuf); + } +} + +mxArray *mxWrapStrncpy(const char *s) { + if (s) { + return mxCreateString(s); + } else { + mxArray *z = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(z) = 0; + return z; + } +} + +char *mxWrapGetString(const mxArray *a, const char **e) { + char *s; + mwSize slen; + if (!a || (!mxIsChar(a) && mxGetM(a) * mxGetN(a) > 0)) { + *e = "Invalid string argument"; + return NULL; + } + slen = mxGetM(a) * mxGetN(a) + 1; + s = (char *)mxMalloc(slen); + if (mxGetM(a) * mxGetN(a) == 0) + *s = 0; + else + mxGetString(a, s, slen); + return s; +} + +double mxWrapGetScalar(const mxArray *a, const char **e) { + if (!a || mxGetClassID(a) != mxDOUBLE_CLASS || mxGetM(a) * mxGetN(a) != 1) { + *e = "Invalid scalar argument"; + return 0; + } + if (mxIsComplex(a)) + return (double)(*mxGetComplexDoubles(a)).real; + else + return (double)(*mxGetDoubles(a)); +} + +#define mxWrapGetArrayDef(func, T) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + double *q; \ + mxComplexDouble *z; \ + if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ + *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexDoubles(a); \ + for (i = 0; i < arraylen; ++i) *p++ = (T)(*z++).real; \ + } else { \ + q = mxGetDoubles(a); \ + for (i = 0; i < arraylen; ++i) *p++ = (T)(*q++); \ + } \ + return array; \ + } + +#define mxWrapCopyDef(func, T) \ + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + double *p; \ + mxComplexDouble *z; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexDoubles(a); \ + for (i = 0; i < n; ++i) (*z++).real = (double)*q++; \ + (*z++).imag = 0; \ + } else { \ + p = mxGetDoubles(a); \ + for (i = 0; i < n; ++i) *p++ = (double)*q++; \ + } \ + } + +#define mxWrapReturnDef(func, T) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + double *p; \ + if (!q) { \ + return mxCreateDoubleMatrix(0, 0, mxREAL); \ + } else { \ + mxArray *a = mxCreateDoubleMatrix(m, n, mxREAL); \ + p = mxGetDoubles(a); \ + for (i = 0; i < m * n; ++i) *p++ = (double)*q++; \ + return a; \ + } \ + } + +#define mxWrapGetScalarZDef(func, T, ZT, setz) \ + void func(T *z, const mxArray *a) { \ + if (mxIsComplex(a)) { \ + setz(z, (ZT)(*mxGetComplexDoubles(a)).real, (ZT)(*mxGetComplexDoubles(a)).imag); \ + } else { \ + setz(z, (ZT)(*mxGetComplexDoubles(a)).real, (ZT)0); \ + } \ + } + +#define mxWrapGetArrayZDef(func, T, ZT, setz) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + double *q; \ + mxComplexDouble *z; \ + if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ + *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexDoubles(a); \ + for (i = 0; i < arraylen; ++i) { \ + setz(p, (ZT)(*z).real, (ZT)(*z).imag); \ + ++p; \ + ++z; \ + } \ + } else { \ + q = mxGetDoubles(a); \ + for (i = 0; i < arraylen; ++i) { \ + setz(p, (ZT)(*q), (ZT)0); \ + ++p; \ + ++q; \ + } \ + } \ + return array; \ + } + +#define mxWrapCopyZDef(func, T, freal, fimag) \ + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + double *p; \ + mxComplexDouble *z; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexDoubles(a); \ + for (i = 0; i < n; ++i) { \ + (*z).real = freal(*q); \ + (*z).imag = fimag(*q); \ + ++z; \ + ++q; \ + } \ + } else { \ + p = mxGetDoubles(a); \ + for (i = 0; i < n; ++i) *p++ = freal(*q++); \ + } \ + } + +#define mxWrapReturnZDef(func, T, freal, fimag) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + mxComplexDouble *p; \ + if (!q) { \ + return mxCreateDoubleMatrix(0, 0, mxCOMPLEX); \ + } else { \ + mxArray *a = mxCreateDoubleMatrix(m, n, mxCOMPLEX); \ + p = mxGetComplexDoubles(a); \ + for (i = 0; i < m * n; ++i) { \ + (*p).real = freal(*q); \ + (*p).imag = fimag(*q); \ + ++p; \ + ++q; \ + } \ + return a; \ + } \ + } + +void *mxWrapGetP_single(const mxArray *a, const char *fmt, const char **e) { + void *p = NULL; #ifdef R2008OO - mxArray* ap; + mxArray *ap; #endif - if (mxGetClassID(a) == mxSINGLE_CLASS && mxIsComplex(a) ) - { - if( mxGetM(a)*mxGetN(a) == 1 && (*mxGetComplexSingles(a)).real == 0 ) - return NULL; - } - if (mxGetClassID(a) == mxSINGLE_CLASS && !mxIsComplex(a) ) - { - if( mxGetM(a)*mxGetN(a) == 1 && *mxGetSingles(a) == 0) - return NULL; - } - if (mxIsChar(a)) { - char pbuf[128]; - mxGetString(a, pbuf, sizeof(pbuf)); - sscanf(pbuf, fmt, &p); - } + if (mxGetClassID(a) == mxSINGLE_CLASS && mxIsComplex(a)) { + if (mxGetM(a) * mxGetN(a) == 1 && (*mxGetComplexSingles(a)).real == 0) return NULL; + } + if (mxGetClassID(a) == mxSINGLE_CLASS && !mxIsComplex(a)) { + if (mxGetM(a) * mxGetN(a) == 1 && *mxGetSingles(a) == 0) return NULL; + } + if (mxIsChar(a)) { + char pbuf[128]; + mxGetString(a, pbuf, sizeof(pbuf)); + sscanf(pbuf, fmt, &p); + } #ifdef R2008OO - else if (ap = mxGetProperty(a, 0, "mwptr")) { - return mxWrapGetP(ap, fmt, e); - } + else if (ap = mxGetProperty(a, 0, "mwptr")) { + return mxWrapGetP(ap, fmt, e); + } #endif - if (p == 0) - *e = "Invalid pointer"; - return p; -} - -mxArray* mxWrapCreateP_single(void* p, const char* fmt) -{ - if (p == 0) { - mxArray* z = mxCreateNumericMatrix(1,1, mxSINGLE_CLASS, mxREAL); - *mxGetSingles(z) = 0; - return z; - } else { - char pbuf[128]; - sprintf(pbuf, fmt, p); - return mxCreateString(pbuf); - } -} - -mxArray* mxWrapStrncpy_single(const char* s) -{ - if (s) { - return mxCreateString(s); - } else { - mxArray* z = mxCreateNumericMatrix(1,1, mxSINGLE_CLASS, mxREAL); - *mxGetSingles(z) = 0; - return z; - } -} - -char* mxWrapGetString_single(const mxArray* a, const char** e) -{ - char* s; - mwSize slen; - if (!a || (!mxIsChar(a) && mxGetM(a)*mxGetN(a) > 0)) { - *e = "Invalid string argument"; - return NULL; - } - slen = mxGetM(a)*mxGetN(a) + 1; - s = (char*) mxMalloc(slen); - if (mxGetM(a)*mxGetN(a) == 0) - *s = 0; - else - mxGetString(a, s, slen); - return s; -} - - -float mxWrapGetScalar_single(const mxArray* a, const char** e) -{ - if (!a || mxGetClassID(a) != mxSINGLE_CLASS || mxGetM(a)*mxGetN(a) != 1) { - *e = "Invalid scalar argument"; - return 0; - } - if( mxIsComplex(a) ) - return (float) (*mxGetComplexSingles(a)).real; - else - return (float) (*mxGetSingles(a)); -} - -#define mxWrapGetArrayDef_single(func, T) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - float* q; \ - mxComplexSingle* z; \ - if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ - *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexSingles(a); \ - for (i = 0; i < arraylen; ++i) \ - *p++ = (T) (*z++).real; \ - } \ - else \ - { \ - q = mxGetSingles(a); \ - for (i = 0; i < arraylen; ++i) \ - *p++ = (T) (*q++); \ - } \ - return array; \ -} - - -#define mxWrapCopyDef_single(func, T) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - float* p; \ - mxComplexSingle* z; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexSingles(a); \ - for (i = 0; i < n; ++i) \ - (*z++).real = (float) *q++; \ - (*z++).imag = 0; \ - } \ - else \ - { \ - p = mxGetSingles(a); \ - for (i = 0; i < n; ++i) \ - *p++ = (float) *q++; \ - } \ -} - - -#define mxWrapReturnDef_single(func, T) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - float* p; \ - if (!q) { \ - return mxCreateNumericMatrix(0,0, mxSINGLE_CLASS, mxREAL); \ - } else { \ - mxArray* a = mxCreateNumericMatrix(m,n, mxSINGLE_CLASS, mxREAL); \ - p = mxGetSingles(a); \ - for (i = 0; i < m*n; ++i) \ - *p++ = (float) *q++; \ - return a; \ - } \ -} - - -#define mxWrapGetScalarZDef_single(func, T, ZT, setz) \ -void func(T* z, const mxArray* a) \ -{ \ - if( mxIsComplex(a) ) \ - { \ - setz(z, (ZT) (*mxGetComplexSingles(a)).real, (ZT) (*mxGetComplexSingles(a)).imag); \ - } \ - else \ - { \ - setz(z, (ZT) (*mxGetComplexSingles(a)).real, (ZT) 0); \ - } \ -} - - -#define mxWrapGetArrayZDef_single(func, T, ZT, setz) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - float* q; \ - mxComplexSingle* z; \ - if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ - *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexSingles(a); \ - for (i = 0; i < arraylen; ++i) { \ - setz(p, (ZT) (*z).real, (ZT) (*z).imag); \ - ++p; ++z; } \ - } \ - else \ - { \ - q = mxGetSingles(a); \ - for (i = 0; i < arraylen; ++i) { \ - setz(p, (ZT) (*q), (ZT) 0 ); \ - ++p; ++q; } \ - } \ - return array; \ -} - - -#define mxWrapCopyZDef_single(func, T, freal, fimag) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - float* p; \ - mxComplexSingle* z; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexSingles(a); \ - for (i = 0; i < n; ++i) { \ - (*z).real = freal(*q); \ - (*z).imag = fimag(*q); \ - ++z; ++q; } \ - } \ - else \ - { \ - p = mxGetSingles(a); \ - for (i = 0; i < n; ++i) \ - *p++ = freal(*q++); \ - } \ -} - - -#define mxWrapReturnZDef_single(func, T, freal, fimag) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - mxComplexSingle* p; \ - if (!q) { \ - return mxCreateNumericMatrix(0,0, mxSINGLE_CLASS, mxCOMPLEX); \ - } else { \ - mxArray* a = mxCreateNumericMatrix(m,n, mxSINGLE_CLASS, mxCOMPLEX); \ - p = mxGetComplexSingles(a); \ - for (i = 0; i < m*n; ++i) { \ - (*p).real = freal(*q); \ - (*p).imag = fimag(*q); \ - ++p; ++q; } \ - return a; \ - } \ -} - - + if (p == 0) *e = "Invalid pointer"; + return p; +} + +mxArray *mxWrapCreateP_single(void *p, const char *fmt) { + if (p == 0) { + mxArray *z = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL); + *mxGetSingles(z) = 0; + return z; + } else { + char pbuf[128]; + sprintf(pbuf, fmt, p); + return mxCreateString(pbuf); + } +} + +mxArray *mxWrapStrncpy_single(const char *s) { + if (s) { + return mxCreateString(s); + } else { + mxArray *z = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL); + *mxGetSingles(z) = 0; + return z; + } +} + +char *mxWrapGetString_single(const mxArray *a, const char **e) { + char *s; + mwSize slen; + if (!a || (!mxIsChar(a) && mxGetM(a) * mxGetN(a) > 0)) { + *e = "Invalid string argument"; + return NULL; + } + slen = mxGetM(a) * mxGetN(a) + 1; + s = (char *)mxMalloc(slen); + if (mxGetM(a) * mxGetN(a) == 0) + *s = 0; + else + mxGetString(a, s, slen); + return s; +} + +float mxWrapGetScalar_single(const mxArray *a, const char **e) { + if (!a || mxGetClassID(a) != mxSINGLE_CLASS || mxGetM(a) * mxGetN(a) != 1) { + *e = "Invalid scalar argument"; + return 0; + } + if (mxIsComplex(a)) + return (float)(*mxGetComplexSingles(a)).real; + else + return (float)(*mxGetSingles(a)); +} + +#define mxWrapGetArrayDef_single(func, T) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + float *q; \ + mxComplexSingle *z; \ + if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ + *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexSingles(a); \ + for (i = 0; i < arraylen; ++i) *p++ = (T)(*z++).real; \ + } else { \ + q = mxGetSingles(a); \ + for (i = 0; i < arraylen; ++i) *p++ = (T)(*q++); \ + } \ + return array; \ + } + +#define mxWrapCopyDef_single(func, T) \ + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + float *p; \ + mxComplexSingle *z; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexSingles(a); \ + for (i = 0; i < n; ++i) (*z++).real = (float)*q++; \ + (*z++).imag = 0; \ + } else { \ + p = mxGetSingles(a); \ + for (i = 0; i < n; ++i) *p++ = (float)*q++; \ + } \ + } + +#define mxWrapReturnDef_single(func, T) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + float *p; \ + if (!q) { \ + return mxCreateNumericMatrix(0, 0, mxSINGLE_CLASS, mxREAL); \ + } else { \ + mxArray *a = mxCreateNumericMatrix(m, n, mxSINGLE_CLASS, mxREAL); \ + p = mxGetSingles(a); \ + for (i = 0; i < m * n; ++i) *p++ = (float)*q++; \ + return a; \ + } \ + } + +#define mxWrapGetScalarZDef_single(func, T, ZT, setz) \ + void func(T *z, const mxArray *a) { \ + if (mxIsComplex(a)) { \ + setz(z, (ZT)(*mxGetComplexSingles(a)).real, (ZT)(*mxGetComplexSingles(a)).imag); \ + } else { \ + setz(z, (ZT)(*mxGetComplexSingles(a)).real, (ZT)0); \ + } \ + } + +#define mxWrapGetArrayZDef_single(func, T, ZT, setz) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + float *q; \ + mxComplexSingle *z; \ + if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ + *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexSingles(a); \ + for (i = 0; i < arraylen; ++i) { \ + setz(p, (ZT)(*z).real, (ZT)(*z).imag); \ + ++p; \ + ++z; \ + } \ + } else { \ + q = mxGetSingles(a); \ + for (i = 0; i < arraylen; ++i) { \ + setz(p, (ZT)(*q), (ZT)0); \ + ++p; \ + ++q; \ + } \ + } \ + return array; \ + } + +#define mxWrapCopyZDef_single(func, T, freal, fimag) \ + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + float *p; \ + mxComplexSingle *z; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexSingles(a); \ + for (i = 0; i < n; ++i) { \ + (*z).real = freal(*q); \ + (*z).imag = fimag(*q); \ + ++z; \ + ++q; \ + } \ + } else { \ + p = mxGetSingles(a); \ + for (i = 0; i < n; ++i) *p++ = freal(*q++); \ + } \ + } + +#define mxWrapReturnZDef_single(func, T, freal, fimag) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + mxComplexSingle *p; \ + if (!q) { \ + return mxCreateNumericMatrix(0, 0, mxSINGLE_CLASS, mxCOMPLEX); \ + } else { \ + mxArray *a = mxCreateNumericMatrix(m, n, mxSINGLE_CLASS, mxCOMPLEX); \ + p = mxGetComplexSingles(a); \ + for (i = 0; i < m * n; ++i) { \ + (*p).real = freal(*q); \ + (*p).imag = fimag(*q); \ + ++p; \ + ++q; \ + } \ + return a; \ + } \ + } #else @@ -557,1672 +474,1533 @@ mxArray* func(const T* q, mwSize m, mwSize n) \ * Support routines for copying data into and out of the MEX stubs, -R2017b */ -void* mxWrapGetP(const mxArray* a, const char* fmt, const char** e) -{ - void* p = 0; -#ifdef R2008OO - mxArray* ap; -#endif - if (mxGetClassID(a) == mxDOUBLE_CLASS && - mxGetM(a)*mxGetN(a) == 1 && *mxGetPr(a) == 0) - return p; - if (mxIsChar(a)) { - char pbuf[128]; - mxGetString(a, pbuf, sizeof(pbuf)); - sscanf(pbuf, fmt, &p); - } +void *mxWrapGetP(const mxArray *a, const char *fmt, const char **e) { + void *p = 0; #ifdef R2008OO - else if (ap = mxGetProperty(a, 0, "mwptr")) { - return mxWrapGetP(ap, fmt, e); - } + mxArray *ap; #endif - if (p == 0) - *e = "Invalid pointer"; + if (mxGetClassID(a) == mxDOUBLE_CLASS && mxGetM(a) * mxGetN(a) == 1 && *mxGetPr(a) == 0) return p; -} - -mxArray* mxWrapCreateP(void* p, const char* fmt) -{ - if (p == 0) { - mxArray* z = mxCreateDoubleMatrix(1,1, mxREAL); - *mxGetPr(z) = 0; - return z; - } else { - char pbuf[128]; - sprintf(pbuf, fmt, p); - return mxCreateString(pbuf); - } -} - -mxArray* mxWrapStrncpy(const char* s) -{ - if (s) { - return mxCreateString(s); - } else { - mxArray* z = mxCreateDoubleMatrix(1,1, mxREAL); - *mxGetPr(z) = 0; - return z; - } -} - -double mxWrapGetScalar(const mxArray* a, const char** e) -{ - if (!a || mxGetClassID(a) != mxDOUBLE_CLASS || mxGetM(a)*mxGetN(a) != 1) { - *e = "Invalid scalar argument"; - return 0; - } - return *mxGetPr(a); -} - -char* mxWrapGetString(const mxArray* a, const char** e) -{ - char* s; - mwSize slen; - if (!a || (!mxIsChar(a) && mxGetM(a)*mxGetN(a) > 0)) { - *e = "Invalid string argument"; - return NULL; - } - slen = mxGetM(a)*mxGetN(a) + 1; - s = (char*) mxMalloc(slen); - if (mxGetM(a)*mxGetN(a) == 0) - *s = 0; - else - mxGetString(a, s, slen); - return s; -} - - -#define mxWrapGetArrayDef(func, T) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - double* q; \ - if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ - *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - q = mxGetPr(a); \ - for (i = 0; i < arraylen; ++i) \ - *p++ = (T) (*q++); \ - return array; \ -} - - -#define mxWrapCopyDef(func, T) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - double* p = mxGetPr(a); \ - for (i = 0; i < n; ++i) \ - *p++ = *q++; \ -} - - -#define mxWrapReturnDef(func, T) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - double* p; \ - if (!q) { \ - return mxCreateDoubleMatrix(0,0, mxREAL); \ - } else { \ - mxArray* a = mxCreateDoubleMatrix(m,n, mxREAL); \ - p = mxGetPr(a); \ - for (i = 0; i < m*n; ++i) \ - *p++ = *q++; \ - return a; \ - } \ -} - - -#define mxWrapGetScalarZDef(func, T, ZT, setz) \ -void func(T* z, const mxArray* a) \ -{ \ - double* pr = mxGetPr(a); \ - double* pi = mxGetPi(a); \ - setz(z, (ZT) *pr, (pi ? (ZT) *pi : (ZT) 0)); \ -} - - -#define mxWrapGetArrayZDef(func, T, ZT, setz) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - double* qr; \ - double* qi; \ - if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ - *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - qr = mxGetPr(a); \ - qi = mxGetPi(a); \ - for (i = 0; i < arraylen; ++i) { \ - ZT val_qr = *qr++; \ - ZT val_qi = (qi ? (ZT) *qi++ : (ZT) 0); \ - setz(p, val_qr, val_qi); \ - ++p; \ - } \ - return array; \ -} - - -#define mxWrapCopyZDef(func, T, real, imag) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - double* pr = mxGetPr(a); \ - double* pi = mxGetPi(a); \ - for (i = 0; i < n; ++i) { \ - *pr++ = real(*q); \ - *pi++ = imag(*q); \ - ++q; \ - } \ -} - - -#define mxWrapReturnZDef(func, T, real, imag) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - double* pr; \ - double* pi; \ - if (!q) { \ - return mxCreateDoubleMatrix(0,0, mxCOMPLEX); \ - } else { \ - mxArray* a = mxCreateDoubleMatrix(m,n, mxCOMPLEX); \ - pr = mxGetPr(a); \ - pi = mxGetPi(a); \ - for (i = 0; i < m*n; ++i) { \ - *pr++ = real(*q); \ - *pi++ = imag(*q); \ - ++q; \ - } \ - return a; \ - } \ -} - - - - - - -void* mxWrapGetP_single(const mxArray* a, const char* fmt, const char** e) -{ - void* p = 0; + if (mxIsChar(a)) { + char pbuf[128]; + mxGetString(a, pbuf, sizeof(pbuf)); + sscanf(pbuf, fmt, &p); + } #ifdef R2008OO - mxArray* ap; + else if (ap = mxGetProperty(a, 0, "mwptr")) { + return mxWrapGetP(ap, fmt, e); + } #endif - if (mxGetClassID(a) == mxSINGLE_CLASS && - mxGetM(a)*mxGetN(a) == 1 && *((float*)mxGetData(a)) == 0) - return p; - if (mxIsChar(a)) { - char pbuf[128]; - mxGetString(a, pbuf, sizeof(pbuf)); - sscanf(pbuf, fmt, &p); - } + if (p == 0) *e = "Invalid pointer"; + return p; +} + +mxArray *mxWrapCreateP(void *p, const char *fmt) { + if (p == 0) { + mxArray *z = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(z) = 0; + return z; + } else { + char pbuf[128]; + sprintf(pbuf, fmt, p); + return mxCreateString(pbuf); + } +} + +mxArray *mxWrapStrncpy(const char *s) { + if (s) { + return mxCreateString(s); + } else { + mxArray *z = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(z) = 0; + return z; + } +} + +double mxWrapGetScalar(const mxArray *a, const char **e) { + if (!a || mxGetClassID(a) != mxDOUBLE_CLASS || mxGetM(a) * mxGetN(a) != 1) { + *e = "Invalid scalar argument"; + return 0; + } + return *mxGetPr(a); +} + +char *mxWrapGetString(const mxArray *a, const char **e) { + char *s; + mwSize slen; + if (!a || (!mxIsChar(a) && mxGetM(a) * mxGetN(a) > 0)) { + *e = "Invalid string argument"; + return NULL; + } + slen = mxGetM(a) * mxGetN(a) + 1; + s = (char *)mxMalloc(slen); + if (mxGetM(a) * mxGetN(a) == 0) + *s = 0; + else + mxGetString(a, s, slen); + return s; +} + +#define mxWrapGetArrayDef(func, T) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + double *q; \ + if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ + *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + q = mxGetPr(a); \ + for (i = 0; i < arraylen; ++i) *p++ = (T)(*q++); \ + return array; \ + } + +#define mxWrapCopyDef(func, T) \ + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + double *p = mxGetPr(a); \ + for (i = 0; i < n; ++i) *p++ = *q++; \ + } + +#define mxWrapReturnDef(func, T) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + double *p; \ + if (!q) { \ + return mxCreateDoubleMatrix(0, 0, mxREAL); \ + } else { \ + mxArray *a = mxCreateDoubleMatrix(m, n, mxREAL); \ + p = mxGetPr(a); \ + for (i = 0; i < m * n; ++i) *p++ = *q++; \ + return a; \ + } \ + } + +#define mxWrapGetScalarZDef(func, T, ZT, setz) \ + void func(T *z, const mxArray *a) { \ + double *pr = mxGetPr(a); \ + double *pi = mxGetPi(a); \ + setz(z, (ZT) * pr, (pi ? (ZT) * pi : (ZT)0)); \ + } + +#define mxWrapGetArrayZDef(func, T, ZT, setz) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + double *qr; \ + double *qi; \ + if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ + *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + qr = mxGetPr(a); \ + qi = mxGetPi(a); \ + for (i = 0; i < arraylen; ++i) { \ + ZT val_qr = *qr++; \ + ZT val_qi = (qi ? (ZT) * qi++ : (ZT)0); \ + setz(p, val_qr, val_qi); \ + ++p; \ + } \ + return array; \ + } + +#define mxWrapCopyZDef(func, T, real, imag) \ + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + double *pr = mxGetPr(a); \ + double *pi = mxGetPi(a); \ + for (i = 0; i < n; ++i) { \ + *pr++ = real(*q); \ + *pi++ = imag(*q); \ + ++q; \ + } \ + } + +#define mxWrapReturnZDef(func, T, real, imag) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + double *pr; \ + double *pi; \ + if (!q) { \ + return mxCreateDoubleMatrix(0, 0, mxCOMPLEX); \ + } else { \ + mxArray *a = mxCreateDoubleMatrix(m, n, mxCOMPLEX); \ + pr = mxGetPr(a); \ + pi = mxGetPi(a); \ + for (i = 0; i < m * n; ++i) { \ + *pr++ = real(*q); \ + *pi++ = imag(*q); \ + ++q; \ + } \ + return a; \ + } \ + } + +void *mxWrapGetP_single(const mxArray *a, const char *fmt, const char **e) { + void *p = 0; #ifdef R2008OO - else if (ap = mxGetProperty(a, 0, "mwptr")) { - return mxWrapGetP(ap, fmt, e); - } + mxArray *ap; #endif - if (p == 0) - *e = "Invalid pointer"; + if (mxGetClassID(a) == mxSINGLE_CLASS && mxGetM(a) * mxGetN(a) == 1 && + *((float *)mxGetData(a)) == 0) return p; -} - -mxArray* mxWrapCreateP_single(void* p, const char* fmt) -{ - if (p == 0) { - mxArray* z = mxCreateNumericMatrix(1,1, mxSINGLE_CLASS, mxREAL); - *((float*)mxGetData(z)) = 0; - return z; - } else { - char pbuf[128]; - sprintf(pbuf, fmt, p); - return mxCreateString(pbuf); - } -} -mxArray* mxWrapStrncpy_single(const char* s) -{ - if (s) { - return mxCreateString(s); - } else { - mxArray* z = mxCreateNumericMatrix(1,1, mxSINGLE_CLASS, mxREAL); - *((float*)mxGetData(z)) = 0; - return z; - } -} - -float mxWrapGetScalar_single(const mxArray* a, const char** e) -{ - if (!a || mxGetClassID(a) != mxSINGLE_CLASS || mxGetM(a)*mxGetN(a) != 1) { - *e = "Invalid scalar argument"; - return 0; - } - return *((float*)mxGetData(a)); -} - -char* mxWrapGetString_single(const mxArray* a, const char** e) -{ - char* s; - mwSize slen; - if (!a || (!mxIsChar(a) && mxGetM(a)*mxGetN(a) > 0)) { - *e = "Invalid string argument, mxSINGLE_CLASS expected"; - return NULL; - } - slen = mxGetM(a)*mxGetN(a) + 1; - s = (char*) mxMalloc(slen); - if (mxGetM(a)*mxGetN(a) == 0) - *s = 0; - else - mxGetString(a, s, slen); - return s; -} - - -#define mxWrapGetArrayDef_single(func, T) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - float* q; \ - if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ - *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - q = (float*) mxGetData(a); \ - for (i = 0; i < arraylen; ++i) \ - *p++ = (T) (*q++); \ - return array; \ -} - - -#define mxWrapCopyDef_single(func, T) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - float* p = (float*) mxGetData(a); \ - for (i = 0; i < n; ++i) \ - *p++ = *q++; \ -} - - -#define mxWrapReturnDef_single(func, T) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - float* p; \ - if (!q) { \ - return mxCreateNumericMatrix(0,0, mxSINGLE_CLASS, mxREAL); \ - } else { \ - mxArray* a = mxCreateNumericMatrix(m,n, mxSINGLE_CLASS, mxREAL);\ - p = (float*) mxGetData(a); \ - for (i = 0; i < m*n; ++i) \ - *p++ = *q++; \ - return a; \ - } \ -} - + if (mxIsChar(a)) { + char pbuf[128]; + mxGetString(a, pbuf, sizeof(pbuf)); + sscanf(pbuf, fmt, &p); + } +#ifdef R2008OO + else if (ap = mxGetProperty(a, 0, "mwptr")) { + return mxWrapGetP(ap, fmt, e); + } +#endif + if (p == 0) *e = "Invalid pointer"; + return p; +} + +mxArray *mxWrapCreateP_single(void *p, const char *fmt) { + if (p == 0) { + mxArray *z = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL); + *((float *)mxGetData(z)) = 0; + return z; + } else { + char pbuf[128]; + sprintf(pbuf, fmt, p); + return mxCreateString(pbuf); + } +} +mxArray *mxWrapStrncpy_single(const char *s) { + if (s) { + return mxCreateString(s); + } else { + mxArray *z = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL); + *((float *)mxGetData(z)) = 0; + return z; + } +} + +float mxWrapGetScalar_single(const mxArray *a, const char **e) { + if (!a || mxGetClassID(a) != mxSINGLE_CLASS || mxGetM(a) * mxGetN(a) != 1) { + *e = "Invalid scalar argument"; + return 0; + } + return *((float *)mxGetData(a)); +} + +char *mxWrapGetString_single(const mxArray *a, const char **e) { + char *s; + mwSize slen; + if (!a || (!mxIsChar(a) && mxGetM(a) * mxGetN(a) > 0)) { + *e = "Invalid string argument, mxSINGLE_CLASS expected"; + return NULL; + } + slen = mxGetM(a) * mxGetN(a) + 1; + s = (char *)mxMalloc(slen); + if (mxGetM(a) * mxGetN(a) == 0) + *s = 0; + else + mxGetString(a, s, slen); + return s; +} + +#define mxWrapGetArrayDef_single(func, T) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + float *q; \ + if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ + *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + q = (float *)mxGetData(a); \ + for (i = 0; i < arraylen; ++i) *p++ = (T)(*q++); \ + return array; \ + } + +#define mxWrapCopyDef_single(func, T) \ + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + float *p = (float *)mxGetData(a); \ + for (i = 0; i < n; ++i) *p++ = *q++; \ + } + +#define mxWrapReturnDef_single(func, T) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + float *p; \ + if (!q) { \ + return mxCreateNumericMatrix(0, 0, mxSINGLE_CLASS, mxREAL); \ + } else { \ + mxArray *a = mxCreateNumericMatrix(m, n, mxSINGLE_CLASS, mxREAL); \ + p = (float *)mxGetData(a); \ + for (i = 0; i < m * n; ++i) *p++ = *q++; \ + return a; \ + } \ + } #define mxWrapGetScalarZDef_single(func, T, ZT, setz) \ -void func(T* z, const mxArray* a) \ -{ \ - float* pr = (float*) mxGetData(a); \ - float* pi = (float*) mxGetImagData(a); \ - setz(z, (ZT) *pr, (pi ? (ZT) *pi : (ZT) 0)); \ -} - - -#define mxWrapGetArrayZDef_single(func, T, ZT, setz) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - float* qr; \ - float* qi; \ - if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ - *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - qr = (float*) mxGetData(a); \ - qi = (float*) mxGetImagData(a); \ - for (i = 0; i < arraylen; ++i) { \ - ZT val_qr = *qr++; \ - ZT val_qi = (qi ? (ZT) *qi++ : (ZT) 0); \ - setz(p, val_qr, val_qi); \ - ++p; \ - } \ - return array; \ -} - + void func(T *z, const mxArray *a) { \ + float *pr = (float *)mxGetData(a); \ + float *pi = (float *)mxGetImagData(a); \ + setz(z, (ZT) * pr, (pi ? (ZT) * pi : (ZT)0)); \ + } + +#define mxWrapGetArrayZDef_single(func, T, ZT, setz) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + float *qr; \ + float *qi; \ + if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ + *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + qr = (float *)mxGetData(a); \ + qi = (float *)mxGetImagData(a); \ + for (i = 0; i < arraylen; ++i) { \ + ZT val_qr = *qr++; \ + ZT val_qi = (qi ? (ZT) * qi++ : (ZT)0); \ + setz(p, val_qr, val_qi); \ + ++p; \ + } \ + return array; \ + } #define mxWrapCopyZDef_single(func, T, real, imag) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - float* pr = (float*) mxGetData(a); \ - float* pi = (float*) mxGetImagData(a); \ - for (i = 0; i < n; ++i) { \ - *pr++ = real(*q); \ - *pi++ = imag(*q); \ - ++q; \ - } \ -} - - -#define mxWrapReturnZDef_single(func, T, real, imag) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - float* pr; \ - float* pi; \ - if (!q) { \ - return mxCreateNumericMatrix(0,0, mxSINGLE_CLASS, mxCOMPLEX); \ - } else { \ - mxArray* a = mxCreateNumericMatrix(m,n, mxSINGLE_CLASS, mxCOMPLEX);\ - pr = (float*) mxGetData(a); \ - pi = (float*) mxGetImagData(a); \ - for (i = 0; i < m*n; ++i) { \ - *pr++ = real(*q); \ - *pi++ = imag(*q); \ - ++q; \ - } \ - return a; \ - } \ -} - - - - + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + float *pr = (float *)mxGetData(a); \ + float *pi = (float *)mxGetImagData(a); \ + for (i = 0; i < n; ++i) { \ + *pr++ = real(*q); \ + *pi++ = imag(*q); \ + ++q; \ + } \ + } + +#define mxWrapReturnZDef_single(func, T, real, imag) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + float *pr; \ + float *pi; \ + if (!q) { \ + return mxCreateNumericMatrix(0, 0, mxSINGLE_CLASS, mxCOMPLEX); \ + } else { \ + mxArray *a = mxCreateNumericMatrix(m, n, mxSINGLE_CLASS, mxCOMPLEX); \ + pr = (float *)mxGetData(a); \ + pi = (float *)mxGetImagData(a); \ + for (i = 0; i < m * n; ++i) { \ + *pr++ = real(*q); \ + *pi++ = imag(*q); \ + ++q; \ + } \ + return a; \ + } \ + } #endif #include typedef std::complex dcomplex; -#define real_dcomplex(z) std::real(z) -#define imag_dcomplex(z) std::imag(z) -#define setz_dcomplex(z,r,i) *z = dcomplex(r,i) +#define real_dcomplex(z) std::real(z) +#define imag_dcomplex(z) std::imag(z) +#define setz_dcomplex(z, r, i) *z = dcomplex(r, i) typedef std::complex fcomplex; -#define real_fcomplex(z) std::real(z) -#define imag_fcomplex(z) std::imag(z) -#define setz_fcomplex(z,r,i) *z = fcomplex(r,i) - - #include - #include - #include - #include - #include - void copy_finufft_opts(const mxArray* om, finufft_opts *oc) { - if(!mxIsStruct(om)) - mexErrMsgIdAndTxt("FINUFFT:inputNotStruct","opts input must be a structure."); - mwIndex idx = 0; - int ifield, nfields; - const char **fname; - nfields = mxGetNumberOfFields(om); - fname = (const char**)mxCalloc(nfields, sizeof(*fname)); - for(ifield=0; ifielddebug = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"spread_debug") == 0) { - oc->spread_debug = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"spread_sort") == 0) { - oc->spread_sort = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"spread_kerevalmeth") == 0) { - oc->spread_kerevalmeth = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"spread_kerpad") == 0) { - oc->spread_kerpad = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"fftw") == 0) { - oc->fftw = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"modeord") == 0) { - oc->modeord = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"upsampfac") == 0) { - oc->upsampfac = (double)*mxGetPr(mxGetFieldByNumber(om,idx,ifield)); - } - else if (strcmp(fname[ifield],"spread_thread") == 0) { - oc->spread_thread = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"maxbatchsize") == 0) { - oc->maxbatchsize = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"nthreads") == 0) { - oc->nthreads = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"spread_nthr_atomic") == 0) { - oc->spread_nthr_atomic = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"spread_max_sp_size") == 0) { - oc->spread_max_sp_size = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else - continue; - } - mxFree(fname); - } - void finufft_mex_setup() { - /* Forces MATLAB to properly initialize their FFTW library. */ - mexEvalString("fft(1:8);"); - } - - +#define real_fcomplex(z) std::real(z) +#define imag_fcomplex(z) std::imag(z) +#define setz_fcomplex(z, r, i) *z = fcomplex(r, i) + +#include +#include +#include +#include +#include +void copy_finufft_opts(const mxArray *om, finufft_opts *oc) { + if (!mxIsStruct(om)) + mexErrMsgIdAndTxt("FINUFFT:inputNotStruct", "opts input must be a structure."); + mwIndex idx = 0; + int ifield, nfields; + const char **fname; + nfields = mxGetNumberOfFields(om); + fname = (const char **)mxCalloc(nfields, sizeof(*fname)); + for (ifield = 0; ifield < nfields; ifield++) { + fname[ifield] = mxGetFieldNameByNumber(om, ifield); + if (strcmp(fname[ifield], "debug") == 0) { + oc->debug = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "spread_debug") == 0) { + oc->spread_debug = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "spread_sort") == 0) { + oc->spread_sort = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "spread_kerevalmeth") == 0) { + oc->spread_kerevalmeth = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "spread_kerpad") == 0) { + oc->spread_kerpad = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "fftw") == 0) { + oc->fftw = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "modeord") == 0) { + oc->modeord = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "upsampfac") == 0) { + oc->upsampfac = (double)*mxGetPr(mxGetFieldByNumber(om, idx, ifield)); + } else if (strcmp(fname[ifield], "spread_thread") == 0) { + oc->spread_thread = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "maxbatchsize") == 0) { + oc->maxbatchsize = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "nthreads") == 0) { + oc->nthreads = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "spread_nthr_atomic") == 0) { + oc->spread_nthr_atomic = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "spread_max_sp_size") == 0) { + oc->spread_max_sp_size = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else + continue; + } + mxFree(fname); +} +void finufft_mex_setup() { + /* Forces MATLAB to properly initialize their FFTW library. */ + mexEvalString("fft(1:8);"); +} /* Array copier definitions */ -mxWrapGetArrayDef(mxWrapGetArray_bool, bool) -mxWrapCopyDef (mxWrapCopy_bool, bool) -mxWrapReturnDef (mxWrapReturn_bool, bool) -mxWrapGetArrayDef_single(mxWrapGetArray_single_bool, bool) -mxWrapCopyDef_single (mxWrapCopy_single_bool, bool) -mxWrapReturnDef_single (mxWrapReturn_single_bool, bool) -mxWrapGetArrayDef(mxWrapGetArray_char, char) -mxWrapCopyDef (mxWrapCopy_char, char) -mxWrapReturnDef (mxWrapReturn_char, char) -mxWrapGetArrayDef_single(mxWrapGetArray_single_char, char) -mxWrapCopyDef_single (mxWrapCopy_single_char, char) -mxWrapReturnDef_single (mxWrapReturn_single_char, char) -mxWrapGetArrayDef(mxWrapGetArray_double, double) -mxWrapCopyDef (mxWrapCopy_double, double) -mxWrapReturnDef (mxWrapReturn_double, double) -mxWrapGetArrayDef_single(mxWrapGetArray_single_double, double) -mxWrapCopyDef_single (mxWrapCopy_single_double, double) -mxWrapReturnDef_single (mxWrapReturn_single_double, double) -mxWrapGetArrayDef(mxWrapGetArray_float, float) -mxWrapCopyDef (mxWrapCopy_float, float) -mxWrapReturnDef (mxWrapReturn_float, float) -mxWrapGetArrayDef_single(mxWrapGetArray_single_float, float) -mxWrapCopyDef_single (mxWrapCopy_single_float, float) -mxWrapReturnDef_single (mxWrapReturn_single_float, float) -mxWrapGetArrayDef(mxWrapGetArray_int, int) -mxWrapCopyDef (mxWrapCopy_int, int) -mxWrapReturnDef (mxWrapReturn_int, int) -mxWrapGetArrayDef_single(mxWrapGetArray_single_int, int) -mxWrapCopyDef_single (mxWrapCopy_single_int, int) -mxWrapReturnDef_single (mxWrapReturn_single_int, int) -mxWrapGetArrayDef(mxWrapGetArray_int64_t, int64_t) -mxWrapCopyDef (mxWrapCopy_int64_t, int64_t) -mxWrapReturnDef (mxWrapReturn_int64_t, int64_t) -mxWrapGetArrayDef_single(mxWrapGetArray_single_int64_t, int64_t) -mxWrapCopyDef_single (mxWrapCopy_single_int64_t, int64_t) -mxWrapReturnDef_single (mxWrapReturn_single_int64_t, int64_t) -mxWrapGetArrayDef(mxWrapGetArray_long, long) -mxWrapCopyDef (mxWrapCopy_long, long) -mxWrapReturnDef (mxWrapReturn_long, long) -mxWrapGetArrayDef_single(mxWrapGetArray_single_long, long) -mxWrapCopyDef_single (mxWrapCopy_single_long, long) -mxWrapReturnDef_single (mxWrapReturn_single_long, long) -mxWrapGetArrayDef(mxWrapGetArray_ptrdiff_t, ptrdiff_t) -mxWrapCopyDef (mxWrapCopy_ptrdiff_t, ptrdiff_t) -mxWrapReturnDef (mxWrapReturn_ptrdiff_t, ptrdiff_t) -mxWrapGetArrayDef_single(mxWrapGetArray_single_ptrdiff_t, ptrdiff_t) -mxWrapCopyDef_single (mxWrapCopy_single_ptrdiff_t, ptrdiff_t) -mxWrapReturnDef_single (mxWrapReturn_single_ptrdiff_t, ptrdiff_t) -mxWrapGetArrayDef(mxWrapGetArray_short, short) -mxWrapCopyDef (mxWrapCopy_short, short) -mxWrapReturnDef (mxWrapReturn_short, short) -mxWrapGetArrayDef_single(mxWrapGetArray_single_short, short) -mxWrapCopyDef_single (mxWrapCopy_single_short, short) -mxWrapReturnDef_single (mxWrapReturn_single_short, short) -mxWrapGetArrayDef(mxWrapGetArray_size_t, size_t) -mxWrapCopyDef (mxWrapCopy_size_t, size_t) -mxWrapReturnDef (mxWrapReturn_size_t, size_t) -mxWrapGetArrayDef_single(mxWrapGetArray_single_size_t, size_t) -mxWrapCopyDef_single (mxWrapCopy_single_size_t, size_t) -mxWrapReturnDef_single (mxWrapReturn_single_size_t, size_t) -mxWrapGetScalarZDef(mxWrapGetScalar_fcomplex, fcomplex, - float, setz_fcomplex) -mxWrapGetArrayZDef (mxWrapGetArray_fcomplex, fcomplex, - float, setz_fcomplex) -mxWrapCopyZDef (mxWrapCopy_fcomplex, fcomplex, - real_fcomplex, imag_fcomplex) -mxWrapReturnZDef (mxWrapReturn_fcomplex, fcomplex, - real_fcomplex, imag_fcomplex) -mxWrapGetScalarZDef_single(mxWrapGetScalar_single_fcomplex, fcomplex, - float, setz_fcomplex) -mxWrapGetArrayZDef_single (mxWrapGetArray_single_fcomplex, fcomplex, - float, setz_fcomplex) -mxWrapCopyZDef_single (mxWrapCopy_single_fcomplex, fcomplex, - real_fcomplex, imag_fcomplex) -mxWrapReturnZDef_single (mxWrapReturn_single_fcomplex, fcomplex, - real_fcomplex, imag_fcomplex) -mxWrapGetScalarZDef(mxWrapGetScalar_dcomplex, dcomplex, - double, setz_dcomplex) -mxWrapGetArrayZDef (mxWrapGetArray_dcomplex, dcomplex, - double, setz_dcomplex) -mxWrapCopyZDef (mxWrapCopy_dcomplex, dcomplex, - real_dcomplex, imag_dcomplex) -mxWrapReturnZDef (mxWrapReturn_dcomplex, dcomplex, - real_dcomplex, imag_dcomplex) -mxWrapGetScalarZDef_single(mxWrapGetScalar_single_dcomplex, dcomplex, - double, setz_dcomplex) -mxWrapGetArrayZDef_single (mxWrapGetArray_single_dcomplex, dcomplex, - double, setz_dcomplex) -mxWrapCopyZDef_single (mxWrapCopy_single_dcomplex, dcomplex, - real_dcomplex, imag_dcomplex) -mxWrapReturnZDef_single (mxWrapReturn_single_dcomplex, dcomplex, - real_dcomplex, imag_dcomplex) - -/* ---- finufft.mw: 166 ---- - * finufft_mex_setup(); - */ -static const char* stubids1_ = "finufft_mex_setup()"; - -void mexStub1(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - if (mexprofrecord_) - mexprofrecord_[1]++; - finufft_mex_setup(); +mxWrapGetArrayDef(mxWrapGetArray_bool, bool) mxWrapCopyDef(mxWrapCopy_bool, bool) mxWrapReturnDef( + mxWrapReturn_bool, + bool) mxWrapGetArrayDef_single(mxWrapGetArray_single_bool, + bool) mxWrapCopyDef_single(mxWrapCopy_single_bool, + bool) mxWrapReturnDef_single(mxWrapReturn_single_bool, + bool) + mxWrapGetArrayDef(mxWrapGetArray_char, char) mxWrapCopyDef(mxWrapCopy_char, char) mxWrapReturnDef( + mxWrapReturn_char, + char) mxWrapGetArrayDef_single(mxWrapGetArray_single_char, + char) mxWrapCopyDef_single(mxWrapCopy_single_char, + char) + mxWrapReturnDef_single(mxWrapReturn_single_char, char) mxWrapGetArrayDef( + mxWrapGetArray_double, + double) mxWrapCopyDef(mxWrapCopy_double, + double) mxWrapReturnDef(mxWrapReturn_double, + double) mxWrapGetArrayDef_single(mxWrapGetArray_single_double, + double) + mxWrapCopyDef_single(mxWrapCopy_single_double, double) mxWrapReturnDef_single( + mxWrapReturn_single_double, + double) mxWrapGetArrayDef(mxWrapGetArray_float, + float) mxWrapCopyDef(mxWrapCopy_float, + float) mxWrapReturnDef(mxWrapReturn_float, + float) + mxWrapGetArrayDef_single(mxWrapGetArray_single_float, float) mxWrapCopyDef_single( + mxWrapCopy_single_float, + float) mxWrapReturnDef_single(mxWrapReturn_single_float, + float) mxWrapGetArrayDef(mxWrapGetArray_int, + int) + mxWrapCopyDef(mxWrapCopy_int, int) mxWrapReturnDef(mxWrapReturn_int, int) mxWrapGetArrayDef_single( + mxWrapGetArray_single_int, + int) mxWrapCopyDef_single(mxWrapCopy_single_int, + int) mxWrapReturnDef_single(mxWrapReturn_single_int, + int) mxWrapGetArrayDef(mxWrapGetArray_int64_t, + int64_t) + mxWrapCopyDef(mxWrapCopy_int64_t, int64_t) mxWrapReturnDef(mxWrapReturn_int64_t, int64_t) mxWrapGetArrayDef_single( + mxWrapGetArray_single_int64_t, + int64_t) mxWrapCopyDef_single(mxWrapCopy_single_int64_t, + int64_t) mxWrapReturnDef_single(mxWrapReturn_single_int64_t, + int64_t) + mxWrapGetArrayDef(mxWrapGetArray_long, long) mxWrapCopyDef(mxWrapCopy_long, long) mxWrapReturnDef( + mxWrapReturn_long, + long) mxWrapGetArrayDef_single(mxWrapGetArray_single_long, + long) mxWrapCopyDef_single(mxWrapCopy_single_long, + long) + mxWrapReturnDef_single(mxWrapReturn_single_long, long) mxWrapGetArrayDef( + mxWrapGetArray_ptrdiff_t, + ptrdiff_t) mxWrapCopyDef(mxWrapCopy_ptrdiff_t, + ptrdiff_t) mxWrapReturnDef(mxWrapReturn_ptrdiff_t, ptrdiff_t) + mxWrapGetArrayDef_single(mxWrapGetArray_single_ptrdiff_t, ptrdiff_t) mxWrapCopyDef_single( + mxWrapCopy_single_ptrdiff_t, + ptrdiff_t) mxWrapReturnDef_single(mxWrapReturn_single_ptrdiff_t, + ptrdiff_t) + mxWrapGetArrayDef(mxWrapGetArray_short, short) mxWrapCopyDef( + mxWrapCopy_short, + short) mxWrapReturnDef(mxWrapReturn_short, + short) mxWrapGetArrayDef_single(mxWrapGetArray_single_short, + short) + mxWrapCopyDef_single(mxWrapCopy_single_short, short) mxWrapReturnDef_single( + mxWrapReturn_single_short, + short) mxWrapGetArrayDef(mxWrapGetArray_size_t, + size_t) mxWrapCopyDef(mxWrapCopy_size_t, size_t) + mxWrapReturnDef(mxWrapReturn_size_t, size_t) mxWrapGetArrayDef_single( + mxWrapGetArray_single_size_t, + size_t) mxWrapCopyDef_single(mxWrapCopy_single_size_t, size_t) + mxWrapReturnDef_single(mxWrapReturn_single_size_t, size_t) mxWrapGetScalarZDef( + mxWrapGetScalar_fcomplex, + fcomplex, float, + setz_fcomplex) mxWrapGetArrayZDef(mxWrapGetArray_fcomplex, fcomplex, float, setz_fcomplex) + mxWrapCopyZDef(mxWrapCopy_fcomplex, fcomplex, real_fcomplex, imag_fcomplex) mxWrapReturnZDef( + mxWrapReturn_fcomplex, + fcomplex, real_fcomplex, + imag_fcomplex) + mxWrapGetScalarZDef_single( + mxWrapGetScalar_single_fcomplex, + fcomplex, float, + setz_fcomplex) mxWrapGetArrayZDef_single(mxWrapGetArray_single_fcomplex, + fcomplex, + float, setz_fcomplex) + mxWrapCopyZDef_single( + mxWrapCopy_single_fcomplex, + fcomplex, + real_fcomplex, + imag_fcomplex) + mxWrapReturnZDef_single( + mxWrapReturn_single_fcomplex, + fcomplex, + real_fcomplex, + imag_fcomplex) mxWrapGetScalarZDef(mxWrapGetScalar_dcomplex, + dcomplex, + double, + setz_dcomplex) + mxWrapGetArrayZDef( + mxWrapGetArray_dcomplex, + dcomplex, + double, + setz_dcomplex) + mxWrapCopyZDef( + mxWrapCopy_dcomplex, + dcomplex, + real_dcomplex, + imag_dcomplex) + mxWrapReturnZDef( + mxWrapReturn_dcomplex, + dcomplex, + real_dcomplex, + imag_dcomplex) + mxWrapGetScalarZDef_single( + mxWrapGetScalar_single_dcomplex, + dcomplex, + double, + setz_dcomplex) + mxWrapGetArrayZDef_single( + mxWrapGetArray_single_dcomplex, + dcomplex, + double, + setz_dcomplex) + mxWrapCopyZDef_single( + mxWrapCopy_single_dcomplex, + dcomplex, + real_dcomplex, + imag_dcomplex) + mxWrapReturnZDef_single( + mxWrapReturn_single_dcomplex, + dcomplex, + real_dcomplex, + imag_dcomplex) + + /* ---- finufft.mw: 166 ---- + * finufft_mex_setup(); + */ + static const char *stubids1_ = "finufft_mex_setup()"; + +void mexStub1(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + if (mexprofrecord_) mexprofrecord_[1]++; + finufft_mex_setup(); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 167 ---- * finufft_opts* o = new(); */ -static const char* stubids2_ = "o finufft_opts* = new()"; +static const char *stubids2_ = "o finufft_opts* = new()"; -void mexStub2(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_opts* out0_=0; /* o */ +void mexStub2(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_opts *out0_ = 0; /* o */ - if (mexprofrecord_) - mexprofrecord_[2]++; - out0_ = new finufft_opts(); - plhs[0] = mxWrapCreateP(out0_, "finufft_opts:%p"); + if (mexprofrecord_) mexprofrecord_[2]++; + out0_ = new finufft_opts(); + plhs[0] = mxWrapCreateP(out0_, "finufft_opts:%p"); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 169 ---- * finufft_plan* p = new(); */ -static const char* stubids3_ = "o finufft_plan* = new()"; +static const char *stubids3_ = "o finufft_plan* = new()"; -void mexStub3(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_plan* out0_=0; /* p */ +void mexStub3(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_plan *out0_ = 0; /* p */ - if (mexprofrecord_) - mexprofrecord_[3]++; - out0_ = new finufft_plan(); - plhs[0] = mxWrapCreateP(out0_, "finufft_plan:%p"); + if (mexprofrecord_) mexprofrecord_[3]++; + out0_ = new finufft_plan(); + plhs[0] = mxWrapCreateP(out0_, "finufft_plan:%p"); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 170 ---- * finufft_default_opts(finufft_opts* o); */ -static const char* stubids4_ = "finufft_default_opts(i finufft_opts*)"; +static const char *stubids4_ = "finufft_default_opts(i finufft_opts*)"; -void mexStub4(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_opts* in0_ =0; /* o */ +void mexStub4(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_opts *in0_ = 0; /* o */ - in0_ = (finufft_opts*) mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mexprofrecord_) - mexprofrecord_[4]++; - finufft_default_opts(in0_); + in0_ = (finufft_opts *)mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mexprofrecord_) mexprofrecord_[4]++; + finufft_default_opts(in0_); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 172 ---- * finufftf_plan* p = new(); */ -static const char* stubids5_ = "o finufftf_plan* = new()"; +static const char *stubids5_ = "o finufftf_plan* = new()"; -void mexStub5(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufftf_plan* out0_=0; /* p */ +void mexStub5(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufftf_plan *out0_ = 0; /* p */ - if (mexprofrecord_) - mexprofrecord_[5]++; - out0_ = new finufftf_plan(); - plhs[0] = mxWrapCreateP(out0_, "finufftf_plan:%p"); + if (mexprofrecord_) mexprofrecord_[5]++; + out0_ = new finufftf_plan(); + plhs[0] = mxWrapCreateP(out0_, "finufftf_plan:%p"); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 173 ---- * finufftf_default_opts(finufft_opts* o); */ -static const char* stubids6_ = "finufftf_default_opts(i finufft_opts*)"; +static const char *stubids6_ = "finufftf_default_opts(i finufft_opts*)"; -void mexStub6(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_opts* in0_ =0; /* o */ +void mexStub6(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_opts *in0_ = 0; /* o */ - in0_ = (finufft_opts*) mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mexprofrecord_) - mexprofrecord_[6]++; - finufftf_default_opts(in0_); + in0_ = (finufft_opts *)mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mexprofrecord_) mexprofrecord_[6]++; + finufftf_default_opts(in0_); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 184 ---- * copy_finufft_opts(mxArray opts, finufft_opts* o); */ -static const char* stubids7_ = "copy_finufft_opts(i mxArray, i finufft_opts*)"; - -void mexStub7(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - const mxArray* in0_; /* opts */ - finufft_opts* in1_ =0; /* o */ - - in0_ = prhs[0]; - in1_ = (finufft_opts*) mxWrapGetP(prhs[1], "finufft_opts:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mexprofrecord_) - mexprofrecord_[7]++; - copy_finufft_opts(in0_, in1_); +static const char *stubids7_ = "copy_finufft_opts(i mxArray, i finufft_opts*)"; + +void mexStub7(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + const mxArray *in0_; /* opts */ + finufft_opts *in1_ = 0; /* o */ + + in0_ = prhs[0]; + in1_ = (finufft_opts *)mxWrapGetP(prhs[1], "finufft_opts:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mexprofrecord_) mexprofrecord_[7]++; + copy_finufft_opts(in0_, in1_); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 187 ---- - * int ier = finufft_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int n_trans, double tol, finufft_plan* plan, finufft_opts* o); + * int ier = finufft_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int + * n_trans, double tol, finufft_plan* plan, finufft_opts* o); */ -static const char* stubids8_ = "o int = finufft_makeplan(i int, i int, i int64_t[x], i int, i int, i double, i finufft_plan*, i finufft_opts*)"; - -void mexStub8(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - int in0_; /* type */ - int in1_; /* dim */ - int64_t* in2_ =0; /* n_modes */ - int in3_; /* iflag */ - int in4_; /* n_trans */ - double in5_; /* tol */ - finufft_plan* in6_ =0; /* plan */ - finufft_opts* in7_ =0; /* o */ - int out0_; /* ier */ - mwSize dim8_; /* 3 */ - - dim8_ = (mwSize) mxWrapGetScalar(prhs[8], &mw_err_txt_); - - if (mxGetM(prhs[2])*mxGetN(prhs[2]) != dim8_) { - mw_err_txt_ = "Bad argument size: n_modes"; goto mw_err_label; - } - - if( mxGetClassID(prhs[0]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in0_ = (int) mxWrapGetScalar(prhs[0], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = (int) mxWrapGetScalar(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[2])*mxGetN(prhs[2]) != 0) { - in2_ = mxWrapGetArray_int64_t(prhs[2], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in2_ = NULL; - if( mxGetClassID(prhs[3]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; +static const char *stubids8_ = "o int = finufft_makeplan(i int, i int, i int64_t[x], i " + "int, i int, i double, i finufft_plan*, i finufft_opts*)"; + +void mexStub8(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + int in0_; /* type */ + int in1_; /* dim */ + int64_t *in2_ = 0; /* n_modes */ + int in3_; /* iflag */ + int in4_; /* n_trans */ + double in5_; /* tol */ + finufft_plan *in6_ = 0; /* plan */ + finufft_opts *in7_ = 0; /* o */ + int out0_; /* ier */ + mwSize dim8_; /* 3 */ + + dim8_ = (mwSize)mxWrapGetScalar(prhs[8], &mw_err_txt_); + + if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != dim8_) { + mw_err_txt_ = "Bad argument size: n_modes"; + goto mw_err_label; + } + + if (mxGetClassID(prhs[0]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in0_ = (int)mxWrapGetScalar(prhs[0], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = (int)mxWrapGetScalar(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != 0) { + in2_ = mxWrapGetArray_int64_t(prhs[2], &mw_err_txt_); if (mw_err_txt_) goto mw_err_label; - in3_ = (int) mxWrapGetScalar(prhs[3], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[4]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in4_ = (int) mxWrapGetScalar(prhs[4], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[5]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in5_ = (double) mxWrapGetScalar(prhs[5], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - in6_ = (finufft_plan*) mxWrapGetP(prhs[6], "finufft_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - in7_ = (finufft_opts*) mxWrapGetP(prhs[7], "finufft_opts:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mexprofrecord_) - mexprofrecord_[8]++; - out0_ = finufft_makeplan(in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_); + } else + in2_ = NULL; + if (mxGetClassID(prhs[3]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in3_ = (int)mxWrapGetScalar(prhs[3], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[4]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in4_ = (int)mxWrapGetScalar(prhs[4], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[5]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in5_ = (double)mxWrapGetScalar(prhs[5], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + in6_ = (finufft_plan *)mxWrapGetP(prhs[6], "finufft_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + in7_ = (finufft_opts *)mxWrapGetP(prhs[7], "finufft_opts:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mexprofrecord_) mexprofrecord_[8]++; + out0_ = finufft_makeplan(in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif mw_err_label: - if (in2_) mxFree(in2_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (in2_) mxFree(in2_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 190 ---- - * int ier = finufftf_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int n_trans, float tol, finufftf_plan* plan, finufft_opts* o); + * int ier = finufftf_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int + * n_trans, float tol, finufftf_plan* plan, finufft_opts* o); */ -static const char* stubids9_ = "o int = finufftf_makeplan(i int, i int, i int64_t[x], i int, i int, i float, i finufftf_plan*, i finufft_opts*)"; - -void mexStub9(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - int in0_; /* type */ - int in1_; /* dim */ - int64_t* in2_ =0; /* n_modes */ - int in3_; /* iflag */ - int in4_; /* n_trans */ - float in5_; /* tol */ - finufftf_plan* in6_ =0; /* plan */ - finufft_opts* in7_ =0; /* o */ - int out0_; /* ier */ - mwSize dim8_; /* 3 */ - - dim8_ = (mwSize) mxWrapGetScalar(prhs[8], &mw_err_txt_); - - if (mxGetM(prhs[2])*mxGetN(prhs[2]) != dim8_) { - mw_err_txt_ = "Bad argument size: n_modes"; goto mw_err_label; - } - - if( mxGetClassID(prhs[0]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in0_ = (int) mxWrapGetScalar(prhs[0], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = (int) mxWrapGetScalar(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[2])*mxGetN(prhs[2]) != 0) { - in2_ = mxWrapGetArray_int64_t(prhs[2], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in2_ = NULL; - if( mxGetClassID(prhs[3]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; +static const char *stubids9_ = "o int = finufftf_makeplan(i int, i int, i int64_t[x], i " + "int, i int, i float, i finufftf_plan*, i finufft_opts*)"; + +void mexStub9(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + int in0_; /* type */ + int in1_; /* dim */ + int64_t *in2_ = 0; /* n_modes */ + int in3_; /* iflag */ + int in4_; /* n_trans */ + float in5_; /* tol */ + finufftf_plan *in6_ = 0; /* plan */ + finufft_opts *in7_ = 0; /* o */ + int out0_; /* ier */ + mwSize dim8_; /* 3 */ + + dim8_ = (mwSize)mxWrapGetScalar(prhs[8], &mw_err_txt_); + + if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != dim8_) { + mw_err_txt_ = "Bad argument size: n_modes"; + goto mw_err_label; + } + + if (mxGetClassID(prhs[0]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in0_ = (int)mxWrapGetScalar(prhs[0], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = (int)mxWrapGetScalar(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != 0) { + in2_ = mxWrapGetArray_int64_t(prhs[2], &mw_err_txt_); if (mw_err_txt_) goto mw_err_label; - in3_ = (int) mxWrapGetScalar(prhs[3], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[4]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in4_ = (int) mxWrapGetScalar(prhs[4], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[5]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in5_ = (float) mxWrapGetScalar_single(prhs[5], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - in6_ = (finufftf_plan*) mxWrapGetP(prhs[6], "finufftf_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - in7_ = (finufft_opts*) mxWrapGetP(prhs[7], "finufft_opts:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mexprofrecord_) - mexprofrecord_[9]++; - out0_ = finufftf_makeplan(in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_); + } else + in2_ = NULL; + if (mxGetClassID(prhs[3]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in3_ = (int)mxWrapGetScalar(prhs[3], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[4]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in4_ = (int)mxWrapGetScalar(prhs[4], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[5]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in5_ = (float)mxWrapGetScalar_single(prhs[5], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + in6_ = (finufftf_plan *)mxWrapGetP(prhs[6], "finufftf_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + in7_ = (finufft_opts *)mxWrapGetP(prhs[7], "finufft_opts:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mexprofrecord_) mexprofrecord_[9]++; + out0_ = finufftf_makeplan(in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif mw_err_label: - if (in2_) mxFree(in2_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (in2_) mxFree(in2_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 192 ---- * delete(finufft_opts* o); */ -static const char* stubids10_ = "delete(i finufft_opts*)"; +static const char *stubids10_ = "delete(i finufft_opts*)"; -void mexStub10(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_opts* in0_ =0; /* o */ +void mexStub10(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_opts *in0_ = 0; /* o */ - in0_ = (finufft_opts*) mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mexprofrecord_) - mexprofrecord_[10]++; - delete(in0_); + in0_ = (finufft_opts *)mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mexprofrecord_) mexprofrecord_[10]++; + delete (in0_); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 222 ---- - * int ier = finufft_setpts(finufft_plan plan, int64_t nj, double[] xj, double[] yj, double[] zj, int64_t nk, double[] s, double[] t, double[] u); + * int ier = finufft_setpts(finufft_plan plan, int64_t nj, double[] xj, double[] yj, + * double[] zj, int64_t nk, double[] s, double[] t, double[] u); */ -static const char* stubids11_ = "o int = finufft_setpts(i finufft_plan, i int64_t, i double[], i double[], i double[], i int64_t, i double[], i double[], i double[])"; - -void mexStub11(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_plan* in0_ =0; /* plan */ - int64_t in1_; /* nj */ - double* in2_ =0; /* xj */ - double* in3_ =0; /* yj */ - double* in4_ =0; /* zj */ - int64_t in5_; /* nk */ - double* in6_ =0; /* s */ - double* in7_ =0; /* t */ - double* in8_ =0; /* u */ - int out0_; /* ier */ - - in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; +static const char *stubids11_ = + "o int = finufft_setpts(i finufft_plan, i int64_t, i double[], i double[], i " + "double[], i int64_t, i double[], i double[], i double[])"; + +void mexStub11(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_plan *in0_ = 0; /* plan */ + int64_t in1_; /* nj */ + double *in2_ = 0; /* xj */ + double *in3_ = 0; /* yj */ + double *in4_ = 0; /* zj */ + int64_t in5_; /* nk */ + double *in6_ = 0; /* s */ + double *in7_ = 0; /* t */ + double *in8_ = 0; /* u */ + int out0_; /* ier */ + + in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = (int64_t)mxWrapGetScalar(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != 0) { + if (mxGetClassID(prhs[2]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; if (mw_err_txt_) goto mw_err_label; - in1_ = (int64_t) mxWrapGetScalar(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[2])*mxGetN(prhs[2]) != 0) { - if( mxGetClassID(prhs[2]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in2_ = mxGetDoubles(prhs[2]); + in2_ = mxGetDoubles(prhs[2]); #else - in2_ = mxGetPr(prhs[2]); + in2_ = mxGetPr(prhs[2]); #endif - } else - in2_ = NULL; - if (mxGetM(prhs[3])*mxGetN(prhs[3]) != 0) { - if( mxGetClassID(prhs[3]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in2_ = NULL; + if (mxGetM(prhs[3]) * mxGetN(prhs[3]) != 0) { + if (mxGetClassID(prhs[3]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in3_ = mxGetDoubles(prhs[3]); + in3_ = mxGetDoubles(prhs[3]); #else - in3_ = mxGetPr(prhs[3]); + in3_ = mxGetPr(prhs[3]); #endif - } else - in3_ = NULL; - if (mxGetM(prhs[4])*mxGetN(prhs[4]) != 0) { - if( mxGetClassID(prhs[4]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in3_ = NULL; + if (mxGetM(prhs[4]) * mxGetN(prhs[4]) != 0) { + if (mxGetClassID(prhs[4]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in4_ = mxGetDoubles(prhs[4]); + in4_ = mxGetDoubles(prhs[4]); #else - in4_ = mxGetPr(prhs[4]); + in4_ = mxGetPr(prhs[4]); #endif - } else - in4_ = NULL; - if( mxGetClassID(prhs[5]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + } else + in4_ = NULL; + if (mxGetClassID(prhs[5]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in5_ = (int64_t)mxWrapGetScalar(prhs[5], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[6]) * mxGetN(prhs[6]) != 0) { + if (mxGetClassID(prhs[6]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; if (mw_err_txt_) goto mw_err_label; - in5_ = (int64_t) mxWrapGetScalar(prhs[5], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[6])*mxGetN(prhs[6]) != 0) { - if( mxGetClassID(prhs[6]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in6_ = mxGetDoubles(prhs[6]); + in6_ = mxGetDoubles(prhs[6]); #else - in6_ = mxGetPr(prhs[6]); + in6_ = mxGetPr(prhs[6]); #endif - } else - in6_ = NULL; - if (mxGetM(prhs[7])*mxGetN(prhs[7]) != 0) { - if( mxGetClassID(prhs[7]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in6_ = NULL; + if (mxGetM(prhs[7]) * mxGetN(prhs[7]) != 0) { + if (mxGetClassID(prhs[7]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in7_ = mxGetDoubles(prhs[7]); + in7_ = mxGetDoubles(prhs[7]); #else - in7_ = mxGetPr(prhs[7]); + in7_ = mxGetPr(prhs[7]); #endif - } else - in7_ = NULL; - if (mxGetM(prhs[8])*mxGetN(prhs[8]) != 0) { - if( mxGetClassID(prhs[8]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in7_ = NULL; + if (mxGetM(prhs[8]) * mxGetN(prhs[8]) != 0) { + if (mxGetClassID(prhs[8]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in8_ = mxGetDoubles(prhs[8]); + in8_ = mxGetDoubles(prhs[8]); #else - in8_ = mxGetPr(prhs[8]); + in8_ = mxGetPr(prhs[8]); #endif - } else - in8_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - if (mexprofrecord_) - mexprofrecord_[11]++; - out0_ = finufft_setpts(*in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_, in8_); + } else + in8_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + if (mexprofrecord_) mexprofrecord_[11]++; + out0_ = finufft_setpts(*in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_, in8_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 224 ---- - * int ier = finufftf_setpts(finufftf_plan plan, int64_t nj, float[] xj, float[] yj, float[] zj, int64_t nk, float[] s, float[] t, float[] u); + * int ier = finufftf_setpts(finufftf_plan plan, int64_t nj, float[] xj, float[] yj, + * float[] zj, int64_t nk, float[] s, float[] t, float[] u); */ -static const char* stubids12_ = "o int = finufftf_setpts(i finufftf_plan, i int64_t, i float[], i float[], i float[], i int64_t, i float[], i float[], i float[])"; - -void mexStub12(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufftf_plan* in0_ =0; /* plan */ - int64_t in1_; /* nj */ - float* in2_ =0; /* xj */ - float* in3_ =0; /* yj */ - float* in4_ =0; /* zj */ - int64_t in5_; /* nk */ - float* in6_ =0; /* s */ - float* in7_ =0; /* t */ - float* in8_ =0; /* u */ - int out0_; /* ier */ - - in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; +static const char *stubids12_ = + "o int = finufftf_setpts(i finufftf_plan, i int64_t, i float[], i float[], i " + "float[], i int64_t, i float[], i float[], i float[])"; + +void mexStub12(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufftf_plan *in0_ = 0; /* plan */ + int64_t in1_; /* nj */ + float *in2_ = 0; /* xj */ + float *in3_ = 0; /* yj */ + float *in4_ = 0; /* zj */ + int64_t in5_; /* nk */ + float *in6_ = 0; /* s */ + float *in7_ = 0; /* t */ + float *in8_ = 0; /* u */ + int out0_; /* ier */ + + in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = (int64_t)mxWrapGetScalar(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != 0) { + if (mxGetClassID(prhs[2]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; if (mw_err_txt_) goto mw_err_label; - in1_ = (int64_t) mxWrapGetScalar(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[2])*mxGetN(prhs[2]) != 0) { - if( mxGetClassID(prhs[2]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in2_ = mxGetSingles(prhs[2]); + in2_ = mxGetSingles(prhs[2]); #else - in2_ = (float*) mxGetData(prhs[2]); + in2_ = (float *)mxGetData(prhs[2]); #endif - } else - in2_ = NULL; - if (mxGetM(prhs[3])*mxGetN(prhs[3]) != 0) { - if( mxGetClassID(prhs[3]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in2_ = NULL; + if (mxGetM(prhs[3]) * mxGetN(prhs[3]) != 0) { + if (mxGetClassID(prhs[3]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in3_ = mxGetSingles(prhs[3]); + in3_ = mxGetSingles(prhs[3]); #else - in3_ = (float*) mxGetData(prhs[3]); + in3_ = (float *)mxGetData(prhs[3]); #endif - } else - in3_ = NULL; - if (mxGetM(prhs[4])*mxGetN(prhs[4]) != 0) { - if( mxGetClassID(prhs[4]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in3_ = NULL; + if (mxGetM(prhs[4]) * mxGetN(prhs[4]) != 0) { + if (mxGetClassID(prhs[4]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in4_ = mxGetSingles(prhs[4]); + in4_ = mxGetSingles(prhs[4]); #else - in4_ = (float*) mxGetData(prhs[4]); + in4_ = (float *)mxGetData(prhs[4]); #endif - } else - in4_ = NULL; - if( mxGetClassID(prhs[5]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + } else + in4_ = NULL; + if (mxGetClassID(prhs[5]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in5_ = (int64_t)mxWrapGetScalar(prhs[5], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[6]) * mxGetN(prhs[6]) != 0) { + if (mxGetClassID(prhs[6]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; if (mw_err_txt_) goto mw_err_label; - in5_ = (int64_t) mxWrapGetScalar(prhs[5], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[6])*mxGetN(prhs[6]) != 0) { - if( mxGetClassID(prhs[6]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in6_ = mxGetSingles(prhs[6]); + in6_ = mxGetSingles(prhs[6]); #else - in6_ = (float*) mxGetData(prhs[6]); + in6_ = (float *)mxGetData(prhs[6]); #endif - } else - in6_ = NULL; - if (mxGetM(prhs[7])*mxGetN(prhs[7]) != 0) { - if( mxGetClassID(prhs[7]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in6_ = NULL; + if (mxGetM(prhs[7]) * mxGetN(prhs[7]) != 0) { + if (mxGetClassID(prhs[7]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in7_ = mxGetSingles(prhs[7]); + in7_ = mxGetSingles(prhs[7]); #else - in7_ = (float*) mxGetData(prhs[7]); + in7_ = (float *)mxGetData(prhs[7]); #endif - } else - in7_ = NULL; - if (mxGetM(prhs[8])*mxGetN(prhs[8]) != 0) { - if( mxGetClassID(prhs[8]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in7_ = NULL; + if (mxGetM(prhs[8]) * mxGetN(prhs[8]) != 0) { + if (mxGetClassID(prhs[8]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in8_ = mxGetSingles(prhs[8]); + in8_ = mxGetSingles(prhs[8]); #else - in8_ = (float*) mxGetData(prhs[8]); + in8_ = (float *)mxGetData(prhs[8]); #endif - } else - in8_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - if (mexprofrecord_) - mexprofrecord_[12]++; - out0_ = finufftf_setpts(*in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_, in8_); + } else + in8_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + if (mexprofrecord_) mexprofrecord_[12]++; + out0_ = finufftf_setpts(*in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_, in8_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 251 ---- - * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output dcomplex[ncoeffs] result); + * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output + * dcomplex[ncoeffs] result); */ -static const char* stubids13_ = "o int = finufft_execute(i finufft_plan, i dcomplex[], o dcomplex[x])"; - -void mexStub13(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_plan* in0_ =0; /* plan */ - dcomplex* in1_ =0; /* data_in */ - int out0_; /* ier */ - dcomplex* out1_=0; /* result */ - mwSize dim2_; /* ncoeffs */ - - dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_); - - in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) { - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in1_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - out1_ = (dcomplex*) mxMalloc(dim2_*sizeof(dcomplex)); - if (mexprofrecord_) - mexprofrecord_[13]++; - out0_ = finufft_execute(*in0_, in1_, out1_); +static const char *stubids13_ = + "o int = finufft_execute(i finufft_plan, i dcomplex[], o dcomplex[x])"; + +void mexStub13(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_plan *in0_ = 0; /* plan */ + dcomplex *in1_ = 0; /* data_in */ + int out0_; /* ier */ + dcomplex *out1_ = 0; /* result */ + mwSize dim2_; /* ncoeffs */ + + dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_); + + in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) { + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + } else + in1_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + out1_ = (dcomplex *)mxMalloc(dim2_ * sizeof(dcomplex)); + if (mexprofrecord_) mexprofrecord_[13]++; + out0_ = finufft_execute(*in0_, in1_, out1_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif - plhs[1] = mxCreateDoubleMatrix(dim2_, 1, mxCOMPLEX); - mxWrapCopy_dcomplex(plhs[1], out1_, dim2_); + plhs[1] = mxCreateDoubleMatrix(dim2_, 1, mxCOMPLEX); + mxWrapCopy_dcomplex(plhs[1], out1_, dim2_); mw_err_label: - if (in1_) mxFree(in1_); - if (out1_) mxFree(out1_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (in1_) mxFree(in1_); + if (out1_) mxFree(out1_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 253 ---- - * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output fcomplex[ncoeffs] result); + * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output + * fcomplex[ncoeffs] result); */ -static const char* stubids14_ = "o int = finufftf_execute(i finufftf_plan, i fcomplex[], o fcomplex[x])"; - -void mexStub14(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufftf_plan* in0_ =0; /* plan */ - fcomplex* in1_ =0; /* data_in */ - int out0_; /* ier */ - fcomplex* out1_=0; /* result */ - mwSize dim2_; /* ncoeffs */ - - dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_); - - in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) { - if( mxGetClassID(prhs[1]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in1_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - out1_ = (fcomplex*) mxMalloc(dim2_*sizeof(fcomplex)); - if (mexprofrecord_) - mexprofrecord_[14]++; - out0_ = finufftf_execute(*in0_, in1_, out1_); +static const char *stubids14_ = + "o int = finufftf_execute(i finufftf_plan, i fcomplex[], o fcomplex[x])"; + +void mexStub14(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufftf_plan *in0_ = 0; /* plan */ + fcomplex *in1_ = 0; /* data_in */ + int out0_; /* ier */ + fcomplex *out1_ = 0; /* result */ + mwSize dim2_; /* ncoeffs */ + + dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_); + + in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) { + if (mxGetClassID(prhs[1]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + } else + in1_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + out1_ = (fcomplex *)mxMalloc(dim2_ * sizeof(fcomplex)); + if (mexprofrecord_) mexprofrecord_[14]++; + out0_ = finufftf_execute(*in0_, in1_, out1_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif - plhs[1] = mxCreateNumericMatrix(dim2_, 1, mxSINGLE_CLASS, mxCOMPLEX); - mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_); + plhs[1] = mxCreateNumericMatrix(dim2_, 1, mxSINGLE_CLASS, mxCOMPLEX); + mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_); mw_err_label: - if (in1_) mxFree(in1_); - if (out1_) mxFree(out1_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (in1_) mxFree(in1_); + if (out1_) mxFree(out1_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 259 ---- - * int ier = finufft_execute(finufft_plan plan, output dcomplex[nj, n_trans] result, dcomplex[] data_in); + * int ier = finufft_execute(finufft_plan plan, output dcomplex[nj, n_trans] result, + * dcomplex[] data_in); */ -static const char* stubids15_ = "o int = finufft_execute(i finufft_plan, o dcomplex[xx], i dcomplex[])"; - -void mexStub15(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_plan* in0_ =0; /* plan */ - dcomplex* in1_ =0; /* data_in */ - int out0_; /* ier */ - dcomplex* out1_=0; /* result */ - mwSize dim2_; /* nj */ - mwSize dim3_; /* n_trans */ - - dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_); - dim3_ = (mwSize) mxWrapGetScalar(prhs[3], &mw_err_txt_); - - in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) { - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in1_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - out1_ = (dcomplex*) mxMalloc(dim2_*dim3_*sizeof(dcomplex)); - if (mexprofrecord_) - mexprofrecord_[15]++; - out0_ = finufft_execute(*in0_, out1_, in1_); +static const char *stubids15_ = + "o int = finufft_execute(i finufft_plan, o dcomplex[xx], i dcomplex[])"; + +void mexStub15(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_plan *in0_ = 0; /* plan */ + dcomplex *in1_ = 0; /* data_in */ + int out0_; /* ier */ + dcomplex *out1_ = 0; /* result */ + mwSize dim2_; /* nj */ + mwSize dim3_; /* n_trans */ + + dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_); + dim3_ = (mwSize)mxWrapGetScalar(prhs[3], &mw_err_txt_); + + in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) { + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + } else + in1_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + out1_ = (dcomplex *)mxMalloc(dim2_ * dim3_ * sizeof(dcomplex)); + if (mexprofrecord_) mexprofrecord_[15]++; + out0_ = finufft_execute(*in0_, out1_, in1_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif - plhs[1] = mxCreateDoubleMatrix(dim2_, dim3_, mxCOMPLEX); - mxWrapCopy_dcomplex(plhs[1], out1_, dim2_*dim3_); + plhs[1] = mxCreateDoubleMatrix(dim2_, dim3_, mxCOMPLEX); + mxWrapCopy_dcomplex(plhs[1], out1_, dim2_ * dim3_); mw_err_label: - if (out1_) mxFree(out1_); - if (in1_) mxFree(in1_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (out1_) mxFree(out1_); + if (in1_) mxFree(in1_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 261 ---- - * int ier = finufftf_execute(finufftf_plan plan, output fcomplex[nj, n_trans] result, fcomplex[] data_in); + * int ier = finufftf_execute(finufftf_plan plan, output fcomplex[nj, n_trans] result, + * fcomplex[] data_in); */ -static const char* stubids16_ = "o int = finufftf_execute(i finufftf_plan, o fcomplex[xx], i fcomplex[])"; - -void mexStub16(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufftf_plan* in0_ =0; /* plan */ - fcomplex* in1_ =0; /* data_in */ - int out0_; /* ier */ - fcomplex* out1_=0; /* result */ - mwSize dim2_; /* nj */ - mwSize dim3_; /* n_trans */ - - dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_); - dim3_ = (mwSize) mxWrapGetScalar(prhs[3], &mw_err_txt_); - - in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) { - if( mxGetClassID(prhs[1]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in1_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - out1_ = (fcomplex*) mxMalloc(dim2_*dim3_*sizeof(fcomplex)); - if (mexprofrecord_) - mexprofrecord_[16]++; - out0_ = finufftf_execute(*in0_, out1_, in1_); +static const char *stubids16_ = + "o int = finufftf_execute(i finufftf_plan, o fcomplex[xx], i fcomplex[])"; + +void mexStub16(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufftf_plan *in0_ = 0; /* plan */ + fcomplex *in1_ = 0; /* data_in */ + int out0_; /* ier */ + fcomplex *out1_ = 0; /* result */ + mwSize dim2_; /* nj */ + mwSize dim3_; /* n_trans */ + + dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_); + dim3_ = (mwSize)mxWrapGetScalar(prhs[3], &mw_err_txt_); + + in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) { + if (mxGetClassID(prhs[1]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + } else + in1_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + out1_ = (fcomplex *)mxMalloc(dim2_ * dim3_ * sizeof(fcomplex)); + if (mexprofrecord_) mexprofrecord_[16]++; + out0_ = finufftf_execute(*in0_, out1_, in1_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif - plhs[1] = mxCreateNumericMatrix(dim2_, dim3_, mxSINGLE_CLASS, mxCOMPLEX); - mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_*dim3_); + plhs[1] = mxCreateNumericMatrix(dim2_, dim3_, mxSINGLE_CLASS, mxCOMPLEX); + mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_ * dim3_); mw_err_label: - if (out1_) mxFree(out1_); - if (in1_) mxFree(in1_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (out1_) mxFree(out1_); + if (in1_) mxFree(in1_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 265 ---- - * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output dcomplex[nk, n_trans] result); + * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output dcomplex[nk, + * n_trans] result); */ -static const char* stubids17_ = "o int = finufft_execute(i finufft_plan, i dcomplex[], o dcomplex[xx])"; - -void mexStub17(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_plan* in0_ =0; /* plan */ - dcomplex* in1_ =0; /* data_in */ - int out0_; /* ier */ - dcomplex* out1_=0; /* result */ - mwSize dim2_; /* nk */ - mwSize dim3_; /* n_trans */ - - dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_); - dim3_ = (mwSize) mxWrapGetScalar(prhs[3], &mw_err_txt_); - - in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) { - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in1_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - out1_ = (dcomplex*) mxMalloc(dim2_*dim3_*sizeof(dcomplex)); - if (mexprofrecord_) - mexprofrecord_[17]++; - out0_ = finufft_execute(*in0_, in1_, out1_); +static const char *stubids17_ = + "o int = finufft_execute(i finufft_plan, i dcomplex[], o dcomplex[xx])"; + +void mexStub17(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_plan *in0_ = 0; /* plan */ + dcomplex *in1_ = 0; /* data_in */ + int out0_; /* ier */ + dcomplex *out1_ = 0; /* result */ + mwSize dim2_; /* nk */ + mwSize dim3_; /* n_trans */ + + dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_); + dim3_ = (mwSize)mxWrapGetScalar(prhs[3], &mw_err_txt_); + + in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) { + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + } else + in1_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + out1_ = (dcomplex *)mxMalloc(dim2_ * dim3_ * sizeof(dcomplex)); + if (mexprofrecord_) mexprofrecord_[17]++; + out0_ = finufft_execute(*in0_, in1_, out1_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif - plhs[1] = mxCreateDoubleMatrix(dim2_, dim3_, mxCOMPLEX); - mxWrapCopy_dcomplex(plhs[1], out1_, dim2_*dim3_); + plhs[1] = mxCreateDoubleMatrix(dim2_, dim3_, mxCOMPLEX); + mxWrapCopy_dcomplex(plhs[1], out1_, dim2_ * dim3_); mw_err_label: - if (in1_) mxFree(in1_); - if (out1_) mxFree(out1_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (in1_) mxFree(in1_); + if (out1_) mxFree(out1_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 267 ---- - * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output fcomplex[nk, n_trans] result); + * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output fcomplex[nk, + * n_trans] result); */ -static const char* stubids18_ = "o int = finufftf_execute(i finufftf_plan, i fcomplex[], o fcomplex[xx])"; - -void mexStub18(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufftf_plan* in0_ =0; /* plan */ - fcomplex* in1_ =0; /* data_in */ - int out0_; /* ier */ - fcomplex* out1_=0; /* result */ - mwSize dim2_; /* nk */ - mwSize dim3_; /* n_trans */ - - dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_); - dim3_ = (mwSize) mxWrapGetScalar(prhs[3], &mw_err_txt_); - - in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) { - if( mxGetClassID(prhs[1]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in1_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - out1_ = (fcomplex*) mxMalloc(dim2_*dim3_*sizeof(fcomplex)); - if (mexprofrecord_) - mexprofrecord_[18]++; - out0_ = finufftf_execute(*in0_, in1_, out1_); +static const char *stubids18_ = + "o int = finufftf_execute(i finufftf_plan, i fcomplex[], o fcomplex[xx])"; + +void mexStub18(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufftf_plan *in0_ = 0; /* plan */ + fcomplex *in1_ = 0; /* data_in */ + int out0_; /* ier */ + fcomplex *out1_ = 0; /* result */ + mwSize dim2_; /* nk */ + mwSize dim3_; /* n_trans */ + + dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_); + dim3_ = (mwSize)mxWrapGetScalar(prhs[3], &mw_err_txt_); + + in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) { + if (mxGetClassID(prhs[1]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + } else + in1_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + out1_ = (fcomplex *)mxMalloc(dim2_ * dim3_ * sizeof(fcomplex)); + if (mexprofrecord_) mexprofrecord_[18]++; + out0_ = finufftf_execute(*in0_, in1_, out1_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif - plhs[1] = mxCreateNumericMatrix(dim2_, dim3_, mxSINGLE_CLASS, mxCOMPLEX); - mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_*dim3_); + plhs[1] = mxCreateNumericMatrix(dim2_, dim3_, mxSINGLE_CLASS, mxCOMPLEX); + mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_ * dim3_); mw_err_label: - if (in1_) mxFree(in1_); - if (out1_) mxFree(out1_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (in1_) mxFree(in1_); + if (out1_) mxFree(out1_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 279 ---- * finufft_destroy(finufft_plan plan); */ -static const char* stubids19_ = "finufft_destroy(i finufft_plan)"; - -void mexStub19(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_plan* in0_ =0; /* plan */ - - in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - if (mexprofrecord_) - mexprofrecord_[19]++; - finufft_destroy(*in0_); +static const char *stubids19_ = "finufft_destroy(i finufft_plan)"; + +void mexStub19(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_plan *in0_ = 0; /* plan */ + + in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + if (mexprofrecord_) mexprofrecord_[19]++; + finufft_destroy(*in0_); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 281 ---- * finufftf_destroy(finufftf_plan plan); */ -static const char* stubids20_ = "finufftf_destroy(i finufftf_plan)"; - -void mexStub20(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufftf_plan* in0_ =0; /* plan */ - - in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - if (mexprofrecord_) - mexprofrecord_[20]++; - finufftf_destroy(*in0_); +static const char *stubids20_ = "finufftf_destroy(i finufftf_plan)"; + +void mexStub20(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufftf_plan *in0_ = 0; /* plan */ + + in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + if (mexprofrecord_) mexprofrecord_[20]++; + finufftf_destroy(*in0_); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- */ -void mexFunction(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - char id[512]; - if (nrhs == 0) { - mexPrintf("Mex function installed\n"); - return; +void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + char id[512]; + if (nrhs == 0) { + mexPrintf("Mex function installed\n"); + return; + } + + if (mxGetString(prhs[0], id, sizeof(id)) != 0) + mexErrMsgTxt("Identifier should be a string"); + else if (strcmp(id, stubids1_) == 0) + mexStub1(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids2_) == 0) + mexStub2(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids3_) == 0) + mexStub3(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids4_) == 0) + mexStub4(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids5_) == 0) + mexStub5(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids6_) == 0) + mexStub6(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids7_) == 0) + mexStub7(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids8_) == 0) + mexStub8(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids9_) == 0) + mexStub9(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids10_) == 0) + mexStub10(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids11_) == 0) + mexStub11(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids12_) == 0) + mexStub12(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids13_) == 0) + mexStub13(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids14_) == 0) + mexStub14(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids15_) == 0) + mexStub15(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids16_) == 0) + mexStub16(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids17_) == 0) + mexStub17(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids18_) == 0) + mexStub18(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids19_) == 0) + mexStub19(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids20_) == 0) + mexStub20(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, "*profile on*") == 0) { + if (!mexprofrecord_) { + mexprofrecord_ = (int *)malloc(21 * sizeof(int)); + mexLock(); } - - if (mxGetString(prhs[0], id, sizeof(id)) != 0) - mexErrMsgTxt("Identifier should be a string"); - else if (strcmp(id, stubids1_) == 0) - mexStub1(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids2_) == 0) - mexStub2(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids3_) == 0) - mexStub3(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids4_) == 0) - mexStub4(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids5_) == 0) - mexStub5(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids6_) == 0) - mexStub6(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids7_) == 0) - mexStub7(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids8_) == 0) - mexStub8(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids9_) == 0) - mexStub9(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids10_) == 0) - mexStub10(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids11_) == 0) - mexStub11(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids12_) == 0) - mexStub12(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids13_) == 0) - mexStub13(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids14_) == 0) - mexStub14(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids15_) == 0) - mexStub15(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids16_) == 0) - mexStub16(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids17_) == 0) - mexStub17(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids18_) == 0) - mexStub18(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids19_) == 0) - mexStub19(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids20_) == 0) - mexStub20(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, "*profile on*") == 0) { - if (!mexprofrecord_) { - mexprofrecord_ = (int*) malloc(21 * sizeof(int)); - mexLock(); - } - memset(mexprofrecord_, 0, 21 * sizeof(int)); - } else if (strcmp(id, "*profile off*") == 0) { - if (mexprofrecord_) { - free(mexprofrecord_); - mexUnlock(); - } - mexprofrecord_ = NULL; - } else if (strcmp(id, "*profile report*") == 0) { - if (!mexprofrecord_) - mexPrintf("Profiler inactive\n"); - mexPrintf("%d calls to finufft.mw:166\n", mexprofrecord_[1]); - mexPrintf("%d calls to finufft.mw:167\n", mexprofrecord_[2]); - mexPrintf("%d calls to finufft.mw:169\n", mexprofrecord_[3]); - mexPrintf("%d calls to finufft.mw:170\n", mexprofrecord_[4]); - mexPrintf("%d calls to finufft.mw:172\n", mexprofrecord_[5]); - mexPrintf("%d calls to finufft.mw:173\n", mexprofrecord_[6]); - mexPrintf("%d calls to finufft.mw:184\n", mexprofrecord_[7]); - mexPrintf("%d calls to finufft.mw:187\n", mexprofrecord_[8]); - mexPrintf("%d calls to finufft.mw:190\n", mexprofrecord_[9]); - mexPrintf("%d calls to finufft.mw:192\n", mexprofrecord_[10]); - mexPrintf("%d calls to finufft.mw:222\n", mexprofrecord_[11]); - mexPrintf("%d calls to finufft.mw:224\n", mexprofrecord_[12]); - mexPrintf("%d calls to finufft.mw:251\n", mexprofrecord_[13]); - mexPrintf("%d calls to finufft.mw:253\n", mexprofrecord_[14]); - mexPrintf("%d calls to finufft.mw:259\n", mexprofrecord_[15]); - mexPrintf("%d calls to finufft.mw:261\n", mexprofrecord_[16]); - mexPrintf("%d calls to finufft.mw:265\n", mexprofrecord_[17]); - mexPrintf("%d calls to finufft.mw:267\n", mexprofrecord_[18]); - mexPrintf("%d calls to finufft.mw:279\n", mexprofrecord_[19]); - mexPrintf("%d calls to finufft.mw:281\n", mexprofrecord_[20]); - } else if (strcmp(id, "*profile log*") == 0) { - FILE* logfp; - if (nrhs != 2 || mxGetString(prhs[1], id, sizeof(id)) != 0) - mexErrMsgTxt("Must have two string arguments"); - logfp = fopen(id, "w+"); - if (!logfp) - mexErrMsgTxt("Cannot open log for output"); - if (!mexprofrecord_) - fprintf(logfp, "Profiler inactive\n"); - fprintf(logfp, "%d calls to finufft.mw:166\n", mexprofrecord_[1]); - fprintf(logfp, "%d calls to finufft.mw:167\n", mexprofrecord_[2]); - fprintf(logfp, "%d calls to finufft.mw:169\n", mexprofrecord_[3]); - fprintf(logfp, "%d calls to finufft.mw:170\n", mexprofrecord_[4]); - fprintf(logfp, "%d calls to finufft.mw:172\n", mexprofrecord_[5]); - fprintf(logfp, "%d calls to finufft.mw:173\n", mexprofrecord_[6]); - fprintf(logfp, "%d calls to finufft.mw:184\n", mexprofrecord_[7]); - fprintf(logfp, "%d calls to finufft.mw:187\n", mexprofrecord_[8]); - fprintf(logfp, "%d calls to finufft.mw:190\n", mexprofrecord_[9]); - fprintf(logfp, "%d calls to finufft.mw:192\n", mexprofrecord_[10]); - fprintf(logfp, "%d calls to finufft.mw:222\n", mexprofrecord_[11]); - fprintf(logfp, "%d calls to finufft.mw:224\n", mexprofrecord_[12]); - fprintf(logfp, "%d calls to finufft.mw:251\n", mexprofrecord_[13]); - fprintf(logfp, "%d calls to finufft.mw:253\n", mexprofrecord_[14]); - fprintf(logfp, "%d calls to finufft.mw:259\n", mexprofrecord_[15]); - fprintf(logfp, "%d calls to finufft.mw:261\n", mexprofrecord_[16]); - fprintf(logfp, "%d calls to finufft.mw:265\n", mexprofrecord_[17]); - fprintf(logfp, "%d calls to finufft.mw:267\n", mexprofrecord_[18]); - fprintf(logfp, "%d calls to finufft.mw:279\n", mexprofrecord_[19]); - fprintf(logfp, "%d calls to finufft.mw:281\n", mexprofrecord_[20]); - fclose(logfp); - } else - mexErrMsgTxt("Unknown identifier"); + memset(mexprofrecord_, 0, 21 * sizeof(int)); + } else if (strcmp(id, "*profile off*") == 0) { + if (mexprofrecord_) { + free(mexprofrecord_); + mexUnlock(); + } + mexprofrecord_ = NULL; + } else if (strcmp(id, "*profile report*") == 0) { + if (!mexprofrecord_) mexPrintf("Profiler inactive\n"); + mexPrintf("%d calls to finufft.mw:166\n", mexprofrecord_[1]); + mexPrintf("%d calls to finufft.mw:167\n", mexprofrecord_[2]); + mexPrintf("%d calls to finufft.mw:169\n", mexprofrecord_[3]); + mexPrintf("%d calls to finufft.mw:170\n", mexprofrecord_[4]); + mexPrintf("%d calls to finufft.mw:172\n", mexprofrecord_[5]); + mexPrintf("%d calls to finufft.mw:173\n", mexprofrecord_[6]); + mexPrintf("%d calls to finufft.mw:184\n", mexprofrecord_[7]); + mexPrintf("%d calls to finufft.mw:187\n", mexprofrecord_[8]); + mexPrintf("%d calls to finufft.mw:190\n", mexprofrecord_[9]); + mexPrintf("%d calls to finufft.mw:192\n", mexprofrecord_[10]); + mexPrintf("%d calls to finufft.mw:222\n", mexprofrecord_[11]); + mexPrintf("%d calls to finufft.mw:224\n", mexprofrecord_[12]); + mexPrintf("%d calls to finufft.mw:251\n", mexprofrecord_[13]); + mexPrintf("%d calls to finufft.mw:253\n", mexprofrecord_[14]); + mexPrintf("%d calls to finufft.mw:259\n", mexprofrecord_[15]); + mexPrintf("%d calls to finufft.mw:261\n", mexprofrecord_[16]); + mexPrintf("%d calls to finufft.mw:265\n", mexprofrecord_[17]); + mexPrintf("%d calls to finufft.mw:267\n", mexprofrecord_[18]); + mexPrintf("%d calls to finufft.mw:279\n", mexprofrecord_[19]); + mexPrintf("%d calls to finufft.mw:281\n", mexprofrecord_[20]); + } else if (strcmp(id, "*profile log*") == 0) { + FILE *logfp; + if (nrhs != 2 || mxGetString(prhs[1], id, sizeof(id)) != 0) + mexErrMsgTxt("Must have two string arguments"); + logfp = fopen(id, "w+"); + if (!logfp) mexErrMsgTxt("Cannot open log for output"); + if (!mexprofrecord_) fprintf(logfp, "Profiler inactive\n"); + fprintf(logfp, "%d calls to finufft.mw:166\n", mexprofrecord_[1]); + fprintf(logfp, "%d calls to finufft.mw:167\n", mexprofrecord_[2]); + fprintf(logfp, "%d calls to finufft.mw:169\n", mexprofrecord_[3]); + fprintf(logfp, "%d calls to finufft.mw:170\n", mexprofrecord_[4]); + fprintf(logfp, "%d calls to finufft.mw:172\n", mexprofrecord_[5]); + fprintf(logfp, "%d calls to finufft.mw:173\n", mexprofrecord_[6]); + fprintf(logfp, "%d calls to finufft.mw:184\n", mexprofrecord_[7]); + fprintf(logfp, "%d calls to finufft.mw:187\n", mexprofrecord_[8]); + fprintf(logfp, "%d calls to finufft.mw:190\n", mexprofrecord_[9]); + fprintf(logfp, "%d calls to finufft.mw:192\n", mexprofrecord_[10]); + fprintf(logfp, "%d calls to finufft.mw:222\n", mexprofrecord_[11]); + fprintf(logfp, "%d calls to finufft.mw:224\n", mexprofrecord_[12]); + fprintf(logfp, "%d calls to finufft.mw:251\n", mexprofrecord_[13]); + fprintf(logfp, "%d calls to finufft.mw:253\n", mexprofrecord_[14]); + fprintf(logfp, "%d calls to finufft.mw:259\n", mexprofrecord_[15]); + fprintf(logfp, "%d calls to finufft.mw:261\n", mexprofrecord_[16]); + fprintf(logfp, "%d calls to finufft.mw:265\n", mexprofrecord_[17]); + fprintf(logfp, "%d calls to finufft.mw:267\n", mexprofrecord_[18]); + fprintf(logfp, "%d calls to finufft.mw:279\n", mexprofrecord_[19]); + fprintf(logfp, "%d calls to finufft.mw:281\n", mexprofrecord_[20]); + fclose(logfp); + } else + mexErrMsgTxt("Unknown identifier"); } - diff --git a/perftest/big2d2f.cpp b/perftest/big2d2f.cpp index 4b59a72df..1a87067d2 100644 --- a/perftest/big2d2f.cpp +++ b/perftest/big2d2f.cpp @@ -10,31 +10,29 @@ #include // also used in this example... -#include #include #include #include +#include using namespace std; -int test_finufft(finufft_opts* opts) -{ - size_t nj = 129*129*2; - size_t ms = 129, mt = 129; - size_t ntrans = 75000; // the point is: 129*129*2*75000 > 2^31 ~ 2.15e9 - std::vector x(nj); // bunch of zero data - std::vector y(nj); - std::vector> cj(ntrans*nj); - std::vector> fk(ntrans*ms*mt); +int test_finufft(finufft_opts *opts) { + size_t nj = 129 * 129 * 2; + size_t ms = 129, mt = 129; + size_t ntrans = 75000; // the point is: 129*129*2*75000 > 2^31 ~ 2.15e9 + std::vector x(nj); // bunch of zero data + std::vector y(nj); + std::vector> cj(ntrans * nj); + std::vector> fk(ntrans * ms * mt); - int ier = finufftf2d2many(ntrans, nj, x.data(), y.data(), cj.data(), - -1, 1e-3, ms, mt, fk.data(), opts); + int ier = finufftf2d2many(ntrans, nj, x.data(), y.data(), cj.data(), -1, 1e-3, ms, mt, + fk.data(), opts); - std::cout << "\tbig2d2f finufft status: " << ier << std::endl; - return ier; + std::cout << "\tbig2d2f finufft status: " << ier << std::endl; + return ier; } -int main(int argc, char* argv[]) -{ +int main(int argc, char *argv[]) { finufft_opts opts; finufftf_default_opts(&opts); return test_finufft(&opts); diff --git a/perftest/cuda/cuperftest.cu b/perftest/cuda/cuperftest.cu index 5b51fe3ac..f72ffb3e6 100644 --- a/perftest/cuda/cuperftest.cu +++ b/perftest/cuda/cuperftest.cu @@ -14,34 +14,34 @@ #include #include -std::string get_or(const std::unordered_map &m, const std::string &key, - const std::string &default_value) { - auto it = m.find(key); - if (it == m.end()) { - return default_value; - } - return it->second; +std::string get_or(const std::unordered_map &m, + const std::string &key, const std::string &default_value) { + auto it = m.find(key); + if (it == m.end()) { + return default_value; + } + return it->second; } struct test_options_t { - char prec; - int type; - int n_runs; - int N[3]; - int M; - int ntransf; - int kerevalmethod; - int method; - int sort; - double tol; - - test_options_t(int argc, char *argv[]) { - std::unordered_map options_map; - - while (true) { - int option_index = 0; - - // clang-format off + char prec; + int type; + int n_runs; + int N[3]; + int M; + int ntransf; + int kerevalmethod; + int method; + int sort; + double tol; + + test_options_t(int argc, char *argv[]) { + std::unordered_map options_map; + + while (true) { + int option_index = 0; + + // clang-format off static struct option long_options[] { {"prec", required_argument, 0, 0}, {"type", required_argument, 0, 0}, @@ -57,251 +57,248 @@ struct test_options_t { {"sort", required_argument, 0, 0}, {0, 0, 0, 0}, }; - // clang-format on - - int c = getopt_long(argc, argv, "", long_options, &option_index); - if (c == -1) - break; - - switch (c) { - case 0: - options_map[long_options[option_index].name] = optarg; - break; - - default: - break; - } - } - - prec = get_or(options_map, "prec", "f")[0]; - type = std::stoi(get_or(options_map, "type", "1")); - n_runs = std::stoi(get_or(options_map, "n_runs", "10")); - N[0] = std::stof(get_or(options_map, "N1", "1E6")); - N[1] = std::stof(get_or(options_map, "N2", "1")); - N[2] = std::stof(get_or(options_map, "N3", "1")); - M = std::stof(get_or(options_map, "M", "2E6")); - ntransf = std::stoi(get_or(options_map, "ntransf", "1")); - method = std::stoi(get_or(options_map, "method", "1")); - kerevalmethod = std::stoi(get_or(options_map, "kerevalmethod", "1")); - sort = std::stoi(get_or(options_map, "sort", "1")); - tol = std::stof(get_or(options_map, "tol", "1E-5")); - } + // clang-format on + + int c = getopt_long(argc, argv, "", long_options, &option_index); + if (c == -1) break; + + switch (c) { + case 0: + options_map[long_options[option_index].name] = optarg; + break; - friend std::ostream &operator<<(std::ostream &outs, const test_options_t &opts) { - return outs << "# prec = " << opts.prec << "\n" - << "# type = " << opts.type << "\n" - << "# n_runs = " << opts.n_runs << "\n" - << "# N1 = " << opts.N[0] << "\n" - << "# N2 = " << opts.N[1] << "\n" - << "# N3 = " << opts.N[2] << "\n" - << "# M = " << opts.M << "\n" - << "# ntransf = " << opts.ntransf << "\n" - << "# method = " << opts.method << "\n" - << "# kerevalmethod = " << opts.kerevalmethod << "\n" - << "# sort = " << opts.sort << "\n" - << "# tol = " << opts.tol << "\n"; + default: + break; + } } + + prec = get_or(options_map, "prec", "f")[0]; + type = std::stoi(get_or(options_map, "type", "1")); + n_runs = std::stoi(get_or(options_map, "n_runs", "10")); + N[0] = std::stof(get_or(options_map, "N1", "1E6")); + N[1] = std::stof(get_or(options_map, "N2", "1")); + N[2] = std::stof(get_or(options_map, "N3", "1")); + M = std::stof(get_or(options_map, "M", "2E6")); + ntransf = std::stoi(get_or(options_map, "ntransf", "1")); + method = std::stoi(get_or(options_map, "method", "1")); + kerevalmethod = std::stoi(get_or(options_map, "kerevalmethod", "1")); + sort = std::stoi(get_or(options_map, "sort", "1")); + tol = std::stof(get_or(options_map, "tol", "1E-5")); + } + + friend std::ostream &operator<<(std::ostream &outs, const test_options_t &opts) { + return outs << "# prec = " << opts.prec << "\n" + << "# type = " << opts.type << "\n" + << "# n_runs = " << opts.n_runs << "\n" + << "# N1 = " << opts.N[0] << "\n" + << "# N2 = " << opts.N[1] << "\n" + << "# N3 = " << opts.N[2] << "\n" + << "# M = " << opts.M << "\n" + << "# ntransf = " << opts.ntransf << "\n" + << "# method = " << opts.method << "\n" + << "# kerevalmethod = " << opts.kerevalmethod << "\n" + << "# sort = " << opts.sort << "\n" + << "# tol = " << opts.tol << "\n"; + } }; struct CudaTimer { - CudaTimer() {} + CudaTimer() {} - ~CudaTimer() { - for (auto &event : start_) - cudaEventDestroy(event); - for (auto &event : stop_) - cudaEventDestroy(event); - } + ~CudaTimer() { + for (auto &event : start_) cudaEventDestroy(event); + for (auto &event : stop_) cudaEventDestroy(event); + } - void start() { - start_.push_back(cudaEvent_t{}); - stop_.push_back(cudaEvent_t{}); + void start() { + start_.push_back(cudaEvent_t{}); + stop_.push_back(cudaEvent_t{}); - cudaEventCreate(&start_.back()); - cudaEventCreate(&stop_.back()); + cudaEventCreate(&start_.back()); + cudaEventCreate(&stop_.back()); - cudaEventRecord(start_.back()); - } + cudaEventRecord(start_.back()); + } - void stop() { cudaEventRecord(stop_.back()); } + void stop() { cudaEventRecord(stop_.back()); } - void sync() { - for (auto &event : stop_) - cudaEventSynchronize(event); - } + void sync() { + for (auto &event : stop_) cudaEventSynchronize(event); + } - float mean() { return this->tot() / start_.size(); } + float mean() { return this->tot() / start_.size(); } - float std() { - float avg = this->mean(); + float std() { + float avg = this->mean(); - double var = 0.0; - for (int i = 0; i < start_.size(); ++i) { - float dt; - cudaEventElapsedTime(&dt, start_[i], stop_[i]); - var += (dt - avg) * (dt - avg); - } - var /= start_.size(); - - return sqrt(var); + double var = 0.0; + for (int i = 0; i < start_.size(); ++i) { + float dt; + cudaEventElapsedTime(&dt, start_[i], stop_[i]); + var += (dt - avg) * (dt - avg); } + var /= start_.size(); - float tot() { - float dt_tot = 0.; - for (int i = 0; i < start_.size(); ++i) { - float dt; - cudaEventElapsedTime(&dt, start_[i], stop_[i]); - dt_tot += dt; - } + return sqrt(var); + } - return dt_tot; + float tot() { + float dt_tot = 0.; + for (int i = 0; i < start_.size(); ++i) { + float dt; + cudaEventElapsedTime(&dt, start_[i], stop_[i]); + dt_tot += dt; } - int count() { return start_.size(); } + return dt_tot; + } + + int count() { return start_.size(); } - std::vector start_; - std::vector stop_; + std::vector start_; + std::vector stop_; }; -template -inline void timeit(F f, CudaTimer &timer, Args... args) { - timer.start(); - f(args...); - timer.stop(); +template inline void timeit(F f, CudaTimer &timer, Args... args) { + timer.start(); + f(args...); + timer.stop(); } void gpu_warmup() { - int nf1 = 100; - cufftHandle fftplan; - cufftPlan1d(&fftplan, nf1, CUFFT_Z2Z, 1); - thrust::device_vector in(nf1), out(nf1); - cufftExecZ2Z(fftplan, in.data().get(), out.data().get(), 1); - cudaDeviceSynchronize(); + int nf1 = 100; + cufftHandle fftplan; + cufftPlan1d(&fftplan, nf1, CUFFT_Z2Z, 1); + thrust::device_vector in(nf1), out(nf1); + cufftExecZ2Z(fftplan, in.data().get(), out.data().get(), 1); + cudaDeviceSynchronize(); } -template -void run_test(test_options_t &test_opts) { - std::cout << test_opts; - const int ntransf = test_opts.ntransf; - const int64_t M = test_opts.M; - const int N = test_opts.N[0] * test_opts.N[1] * test_opts.N[2]; - const int type = test_opts.type; - constexpr int iflag = 1; - - thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf); - thrust::host_vector> c(M * ntransf), fk(N * ntransf); - - thrust::device_vector d_x(M * ntransf), d_y(M * ntransf), d_z(M * ntransf); - thrust::device_vector> d_c(M * ntransf), d_fk(N * ntransf); - - std::default_random_engine eng(1); - std::uniform_real_distribution dist11(-1, 1); - auto randm11 = [&eng, &dist11]() { return dist11(eng); }; - - // Making data - for (int64_t i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) - y[i] = M_PI * randm11(); - z[i] = M_PI * randm11(); - } - for (int64_t i = M; i < M * ntransf; ++i) { - int64_t j = i % M; - x[i] = x[j]; - y[i] = y[j]; - z[i] = z[j]; +template void run_test(test_options_t &test_opts) { + std::cout << test_opts; + const int ntransf = test_opts.ntransf; + const int64_t M = test_opts.M; + const int N = test_opts.N[0] * test_opts.N[1] * test_opts.N[2]; + const int type = test_opts.type; + constexpr int iflag = 1; + + thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf); + thrust::host_vector> c(M * ntransf), fk(N * ntransf); + + thrust::device_vector d_x(M * ntransf), d_y(M * ntransf), d_z(M * ntransf); + thrust::device_vector> d_c(M * ntransf), d_fk(N * ntransf); + + std::default_random_engine eng(1); + std::uniform_real_distribution dist11(-1, 1); + auto randm11 = [&eng, &dist11]() { + return dist11(eng); + }; + + // Making data + for (int64_t i = 0; i < M; i++) { + x[i] = M_PI * randm11(); // x in [-pi,pi) + y[i] = M_PI * randm11(); + z[i] = M_PI * randm11(); + } + for (int64_t i = M; i < M * ntransf; ++i) { + int64_t j = i % M; + x[i] = x[j]; + y[i] = y[j]; + z[i] = z[j]; + } + + if (type == 1) { + for (int i = 0; i < M * ntransf; i++) { + c[i].real(randm11()); + c[i].imag(randm11()); } - if (type == 1) { - for (int i = 0; i < M * ntransf; i++) { - c[i].real(randm11()); - c[i].imag(randm11()); - } - - } else if (type == 2) { - for (int i = 0; i < N * ntransf; i++) { - fk[i].real(randm11()); - fk[i].imag(randm11()); - } - } else { - std::cerr << "Invalid type " << type << " supplied\n"; - return; + } else if (type == 2) { + for (int i = 0; i < N * ntransf; i++) { + fk[i].real(randm11()); + fk[i].imag(randm11()); } - - gpu_warmup(); - - cufinufft_opts opts; - int dim = 0; - for (int i = 0; i < 3; ++i) - dim = test_opts.N[i] > 1 ? i + 1 : dim; - - cufinufft_default_opts(&opts); - opts.gpu_method = test_opts.method; - opts.gpu_sort = test_opts.sort; - opts.gpu_kerevalmeth = test_opts.kerevalmethod; - - cufinufft_plan_t *dplan; - CudaTimer h2d_timer, makeplan_timer, setpts_timer, execute_timer, d2h_timer, amortized_timer; - { - amortized_timer.start(); - h2d_timer.start(); - d_x = x, d_y = y, d_z = z; - if (type == 1) - d_c = c; - if (type == 2) - d_fk = fk; - h2d_timer.stop(); - - T *d_x_p = dim >= 1 ? d_x.data().get() : nullptr; - T *d_y_p = dim >= 2 ? d_y.data().get() : nullptr; - T *d_z_p = dim == 3 ? d_z.data().get() : nullptr; - cuda_complex *d_c_p = (cuda_complex *)d_c.data().get(); - cuda_complex *d_fk_p = (cuda_complex *)d_fk.data().get(); - - timeit(cufinufft_makeplan_impl, makeplan_timer, test_opts.type, dim, test_opts.N, iflag, ntransf, - test_opts.tol, &dplan, &opts); - for (int i = 0; i < test_opts.n_runs; ++i) { - timeit(cufinufft_setpts_impl, setpts_timer, M, d_x_p, d_y_p, d_z_p, 0, nullptr, nullptr, nullptr, dplan); - timeit(cufinufft_execute_impl, execute_timer, d_c_p, d_fk_p, dplan); - } - - d2h_timer.start(); - if (type == 1) - fk = d_fk; - if (type == 2) - c = d_c; - d2h_timer.stop(); - - amortized_timer.stop(); - - h2d_timer.sync(); - makeplan_timer.sync(); - setpts_timer.sync(); - execute_timer.sync(); - d2h_timer.sync(); - amortized_timer.sync(); + } else { + std::cerr << "Invalid type " << type << " supplied\n"; + return; + } + + gpu_warmup(); + + cufinufft_opts opts; + int dim = 0; + for (int i = 0; i < 3; ++i) dim = test_opts.N[i] > 1 ? i + 1 : dim; + + cufinufft_default_opts(&opts); + opts.gpu_method = test_opts.method; + opts.gpu_sort = test_opts.sort; + opts.gpu_kerevalmeth = test_opts.kerevalmethod; + + cufinufft_plan_t *dplan; + CudaTimer h2d_timer, makeplan_timer, setpts_timer, execute_timer, d2h_timer, + amortized_timer; + { + amortized_timer.start(); + h2d_timer.start(); + d_x = x, d_y = y, d_z = z; + if (type == 1) d_c = c; + if (type == 2) d_fk = fk; + h2d_timer.stop(); + + T *d_x_p = dim >= 1 ? d_x.data().get() : nullptr; + T *d_y_p = dim >= 2 ? d_y.data().get() : nullptr; + T *d_z_p = dim == 3 ? d_z.data().get() : nullptr; + cuda_complex *d_c_p = (cuda_complex *)d_c.data().get(); + cuda_complex *d_fk_p = (cuda_complex *)d_fk.data().get(); + + timeit(cufinufft_makeplan_impl, makeplan_timer, test_opts.type, dim, test_opts.N, + iflag, ntransf, test_opts.tol, &dplan, &opts); + for (int i = 0; i < test_opts.n_runs; ++i) { + timeit(cufinufft_setpts_impl, setpts_timer, M, d_x_p, d_y_p, d_z_p, 0, nullptr, + nullptr, nullptr, dplan); + timeit(cufinufft_execute_impl, execute_timer, d_c_p, d_fk_p, dplan); } - const int64_t nupts_tot = M * test_opts.n_runs * ntransf; - - printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n"); - printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(), - h2d_timer.mean(), h2d_timer.std()); - printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), makeplan_timer.tot(), makeplan_timer.mean(), - makeplan_timer.std()); - printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(), setpts_timer.mean(), setpts_timer.std(), - nupts_tot * 1000 / setpts_timer.tot(), setpts_timer.tot() * 1E6 / nupts_tot); - printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(), execute_timer.mean(), - execute_timer.std(), nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / nupts_tot); - printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(), - d2h_timer.mean(), d2h_timer.std()); - printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(), amortized_timer.mean(), amortized_timer.std(), - nupts_tot * 1000 / amortized_timer.tot(), amortized_timer.tot() * 1E6 / nupts_tot); + d2h_timer.start(); + if (type == 1) fk = d_fk; + if (type == 2) c = d_c; + d2h_timer.stop(); + + amortized_timer.stop(); + + h2d_timer.sync(); + makeplan_timer.sync(); + setpts_timer.sync(); + execute_timer.sync(); + d2h_timer.sync(); + amortized_timer.sync(); + } + + const int64_t nupts_tot = M * test_opts.n_runs * ntransf; + + printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n"); + printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(), + h2d_timer.mean(), h2d_timer.std()); + printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), makeplan_timer.tot(), + makeplan_timer.mean(), makeplan_timer.std()); + printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(), + setpts_timer.mean(), setpts_timer.std(), nupts_tot * 1000 / setpts_timer.tot(), + setpts_timer.tot() * 1E6 / nupts_tot); + printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(), + execute_timer.mean(), execute_timer.std(), + nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / nupts_tot); + printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(), + d2h_timer.mean(), d2h_timer.std()); + printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(), + amortized_timer.mean(), amortized_timer.std(), + nupts_tot * 1000 / amortized_timer.tot(), + amortized_timer.tot() * 1E6 / nupts_tot); } int main(int argc, char *argv[]) { - if (argc == 2 && (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h")) { - test_options_t default_opts(0, nullptr); - // clang-format off + if (argc == 2 && (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h")) { + test_options_t default_opts(0, nullptr); + // clang-format off std::cout << "Valid options:\n" " --prec \n" " float or double precision. i.e. 'f' or 'd'\n" @@ -347,15 +344,15 @@ int main(int argc, char *argv[]) { " 0: do not sort the points\n" " 1: sort the points\n" " default: " << default_opts.sort << "\n"; - // clang-format on - return 0; - } - test_options_t opts(argc, argv); + // clang-format on + return 0; + } + test_options_t opts(argc, argv); - if (opts.prec == 'f') - run_test(opts); - else if (opts.prec == 'd') - run_test(opts); + if (opts.prec == 'f') + run_test(opts); + else if (opts.prec == 'd') + run_test(opts); - return 0; + return 0; } diff --git a/perftest/guru_timing_test.cpp b/perftest/guru_timing_test.cpp index 145d4f1ef..90055a36b 100644 --- a/perftest/guru_timing_test.cpp +++ b/perftest/guru_timing_test.cpp @@ -1,11 +1,8 @@ #include // for sleep call #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) -#include -void sleep(unsigned long seconds) -{ - Sleep(seconds * 1000); -} +#include +void sleep(unsigned long seconds) { Sleep(seconds * 1000); } #else #include #endif @@ -14,11 +11,10 @@ using namespace finufft; using namespace finufft::utils; // forward declaration of helper to (repeatedly if needed) call finufft?d? -double many_simple_calls(CPX *c,CPX *F,FLT*x, FLT*y, FLT*z,FINUFFT_PLAN plan); - +double many_simple_calls(CPX *c, CPX *F, FLT *x, FLT *y, FLT *z, FINUFFT_PLAN plan); // -------------------------------------------------------------------------- -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Timing-only tester for the guru interface, allowing control of many params and opts from the command line. It compares doing many transforms with same NU pts, with repeated calls to @@ -37,10 +33,10 @@ int main(int argc, char* argv[]) debug = 0: rel errors and overall timing 1: timing breakdowns 2: also spreading output - + spread_scheme = 0: sequential maximally multithreaded spread/interp 1: parallel singlethreaded spread/interp, nested last batch - + Example: guru_timing_test 100 1 2 100 100 0 1000000 1e-3 1 0 0 2 2.0 The unused dimensions of Nmodes may be left as zero. @@ -51,147 +47,159 @@ int main(int argc, char* argv[]) added 2 extra args, 5/22/20. Moved to perftests 7/23/20. */ { - double tsleep = 0.1; // how long wait between tests to let FFTW settle (1.0?) + double tsleep = 0.1; // how long wait between tests to let FFTW settle (1.0?) int ntransf, type, ndim; - BIGINT M, N1, N2, N3; // M = # srcs, N1,N2,N3= # modes in each dim + BIGINT M, N1, N2, N3; // M = # srcs, N1,N2,N3= # modes in each dim double w, tol = 1e-6; - int isign = +1; // choose which exponential sign to test + int isign = +1; // choose which exponential sign to test finufft_opts opts; - FINUFFT_DEFAULT_OPTS(&opts); // for guru interface - + FINUFFT_DEFAULT_OPTS(&opts); // for guru interface + // Collect command line arguments ------------------------------------------ - if (argc<8 || argc>14) { - fprintf(stderr,"Usage: guru_timing_test ntransf type ndim N1 N2 N3 Nsrc [tol [debug [spread_thread [maxbatchsize [spread_sort [upsampfac]]]]]]\n\teg:\tguru_timing_test 100 1 2 1e2 1e2 0 1e6 1e-3 1 0 0 2\n"); + if (argc < 8 || argc > 14) { + fprintf( + stderr, + "Usage: guru_timing_test ntransf type ndim N1 N2 N3 Nsrc [tol [debug " + "[spread_thread [maxbatchsize [spread_sort " + "[upsampfac]]]]]]\n\teg:\tguru_timing_test 100 1 2 1e2 1e2 0 1e6 1e-3 1 0 0 2\n"); return 1; } - sscanf(argv[1],"%d",&ntransf); - sscanf(argv[2],"%d",&type); - sscanf(argv[3],"%d",&ndim); - sscanf(argv[4],"%lf",&w); N1 = (BIGINT)w; - sscanf(argv[5],"%lf",&w); N2 = (BIGINT)w; - sscanf(argv[6],"%lf",&w); N3 = (BIGINT)w; - sscanf(argv[7],"%lf",&w); M = (BIGINT)w; - if (argc>8) sscanf(argv[8],"%lf",&tol); - if (argc>9) sscanf(argv[9],"%d",&opts.debug); - opts.spread_debug = (opts.debug>1) ? 1 : 0; // see output from spreader - if (argc>10) sscanf(argv[10], "%d", &opts.spread_thread); - if (argc>11) sscanf(argv[11], "%d", &opts.maxbatchsize); - if (argc>12) sscanf(argv[12],"%d",&opts.spread_sort); - if (argc>13) { sscanf(argv[13],"%lf",&w); opts.upsampfac = (FLT)w; } - - // Allocate and initialize input ------------------------------------------- + sscanf(argv[1], "%d", &ntransf); + sscanf(argv[2], "%d", &type); + sscanf(argv[3], "%d", &ndim); + sscanf(argv[4], "%lf", &w); + N1 = (BIGINT)w; + sscanf(argv[5], "%lf", &w); + N2 = (BIGINT)w; + sscanf(argv[6], "%lf", &w); + N3 = (BIGINT)w; + sscanf(argv[7], "%lf", &w); + M = (BIGINT)w; + if (argc > 8) sscanf(argv[8], "%lf", &tol); + if (argc > 9) sscanf(argv[9], "%d", &opts.debug); + opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader + if (argc > 10) sscanf(argv[10], "%d", &opts.spread_thread); + if (argc > 11) sscanf(argv[11], "%d", &opts.maxbatchsize); + if (argc > 12) sscanf(argv[12], "%d", &opts.spread_sort); + if (argc > 13) { + sscanf(argv[13], "%lf", &w); + opts.upsampfac = (FLT)w; + } + + // Allocate and initialize input ------------------------------------------- cout << scientific << setprecision(15); - N2 = (N2 == 0) ? 1 : N2; - N3 = (N3 == 0) ? 1 : N3; - BIGINT N = N1*N2*N3; - - FLT* s = NULL; - FLT* t = NULL; - FLT* u = NULL; - if (type == 3) { // make target freq NU pts for type 3 (N of them)... - s = (FLT*)malloc(sizeof(FLT)*N); // targ freqs (1-cmpt) - FLT S1 = (FLT)N1/2; + N2 = (N2 == 0) ? 1 : N2; + N3 = (N3 == 0) ? 1 : N3; + BIGINT N = N1 * N2 * N3; + + FLT *s = NULL; + FLT *t = NULL; + FLT *u = NULL; + if (type == 3) { // make target freq NU pts for type 3 (N of them)... + s = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (1-cmpt) + FLT S1 = (FLT)N1 / 2; #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s -#pragma omp for schedule(dynamic,TEST_RANDCHUNK) - for (BIGINT k=0; k 1) { - t = (FLT*)malloc(sizeof(FLT)*N); // targ freqs (2-cmpt) - FLT S2 = (FLT)N2/2; -#pragma omp for schedule(dynamic,TEST_RANDCHUNK) - for (BIGINT k=0; k 1) { + t = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (2-cmpt) + FLT S2 = (FLT)N2 / 2; +#pragma omp for schedule(dynamic, TEST_RANDCHUNK) + for (BIGINT k = 0; k < N; ++k) { + t[k] = S2 * (-0.5 + randm11r(&se)); } - } - if(ndim > 2) { - u = (FLT*)malloc(sizeof(FLT)*N); // targ freqs (3-cmpt) - FLT S3 = (FLT)N3/2; -#pragma omp for schedule(dynamic,TEST_RANDCHUNK) - for (BIGINT k=0; k 2) { + u = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (3-cmpt) + FLT S3 = (FLT)N3 / 2; +#pragma omp for schedule(dynamic, TEST_RANDCHUNK) + for (BIGINT k = 0; k < N; ++k) { + u[k] = S3 * (0.9 + randm11r(&se)); } } } } - - CPX* c = (CPX*)malloc(sizeof(CPX)*M*ntransf); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N*ntransf); // mode ampls - - FLT *x = (FLT *)malloc(sizeof(FLT)*M), *y=NULL, *z=NULL; // NU pts x coords - if(ndim > 1) - y = (FLT *)malloc(sizeof(FLT)*M); // NU pts y coords - if(ndim > 2) - z = (FLT *)malloc(sizeof(FLT)*M); // NU pts z coords + + CPX *c = (CPX *)malloc(sizeof(CPX) * M * ntransf); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N * ntransf); // mode ampls + + FLT *x = (FLT *)malloc(sizeof(FLT) * M), *y = NULL, *z = NULL; // NU pts x coords + if (ndim > 1) y = (FLT *)malloc(sizeof(FLT) * M); // NU pts y coords + if (ndim > 2) z = (FLT *)malloc(sizeof(FLT) * M); // NU pts z coords #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s -#pragma omp for schedule(dynamic,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else { - if (type!=3) + if (type != 3) printf("\tplan, for %lld modes: \t\t%.3g s\n", (long long)N, plan_t); else printf("\tplan:\t\t\t\t\t%.3g s\n", plan_t); } - - timer.restart(); // Guru Step 2 - ier = FINUFFT_SETPTS(plan, M, x, y, z, N, s, t, u); //(t1,2: N,s,t,u ignored) + + timer.restart(); // Guru Step 2 + ier = FINUFFT_SETPTS(plan, M, x, y, z, N, s, t, u); //(t1,2: N,s,t,u ignored) double sort_t = timer.elapsedsec(); if (ier) { - printf("error (ier=%d)!\n",ier); + printf("error (ier=%d)!\n", ier); return ier; } else { - if (type!=3) + if (type != 3) printf("\tsetpts for %lld NU pts: \t\t%.3g s\n", (long long)M, sort_t); else - printf("\tsetpts for %lld + %lld NU pts: \t%.3g s\n", (long long)M, (long long)N, sort_t); + printf("\tsetpts for %lld + %lld NU pts: \t%.3g s\n", (long long)M, (long long)N, + sort_t); } - - timer.restart(); // Guru Step 3 - ier = FINUFFT_EXECUTE(plan,c,F); - double exec_t=timer.elapsedsec(); + + timer.restart(); // Guru Step 3 + ier = FINUFFT_EXECUTE(plan, c, F); + double exec_t = timer.elapsedsec(); if (ier) { - printf("error (ier=%d)!\n",ier); + printf("error (ier=%d)!\n", ier); return ier; } else printf("\texec \t\t\t\t\t%.3g s\n", exec_t); double totalTime = plan_t + sort_t + exec_t; - if (type!=3) - printf("ntr=%d: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf, (long long)M,(long long)N, totalTime, ntransf*M/totalTime); + if (type != 3) + printf("ntr=%d: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)M, (long long)N, totalTime, ntransf * M / totalTime); else - printf("ntr=%d: %lld NU pts to %lld NU pts in %.3g s \t%.3g tot NU pts/s\n", ntransf, (long long)M,(long long)N, totalTime, ntransf*(N+M)/totalTime); + printf("ntr=%d: %lld NU pts to %lld NU pts in %.3g s \t%.3g tot NU pts/s\n", ntransf, + (long long)M, (long long)N, totalTime, ntransf * (N + M) / totalTime); // Comparing timing results with repeated calls to corresponding finufft function... @@ -199,40 +207,38 @@ int main(int argc, char* argv[]) // by Andrea Malleo, but in this case we need to access the plan later // for many_simple_calls() to work, so we cannot do FFTW cleanup without // apparently causing segfault :(. So we skip them. - //FFTW_CLEANUP(); - //FFTW_CLEANUP_THREADS(); - //FFTW_FORGET_WISDOM(); - - //std::this_thread::sleep_for(std::chrono::seconds(1)); if c++11 is allowed - sleep(tsleep); //sleep for one second using linux sleep call - - - printf("Compare speed of repeated calls to simple interface:------------------------\n"); + // FFTW_CLEANUP(); + // FFTW_CLEANUP_THREADS(); + // FFTW_FORGET_WISDOM(); + + // std::this_thread::sleep_for(std::chrono::seconds(1)); if c++11 is allowed + sleep(tsleep); // sleep for one second using linux sleep call + + printf( + "Compare speed of repeated calls to simple interface:------------------------\n"); // this used to actually call Alex's old (v1.1) src/finufft?d.cpp routines. // Since we don't want to ship those, we now call the simple interfaces. - - double simpleTime = many_simple_calls(c,F, x, y, z, plan); - if (isnan(simpleTime)) - return 1; - - if (type!=3) - printf("%d of:\t%lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", - ntransf,(long long)M,(long long)N, simpleTime, ntransf*M/simpleTime); + + double simpleTime = many_simple_calls(c, F, x, y, z, plan); + if (isnan(simpleTime)) return 1; + + if (type != 3) + printf("%d of:\t%lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)M, (long long)N, simpleTime, ntransf * M / simpleTime); else - printf("%d of:\t%lld NU pts to %lld NU pts in %.3g s \t%.3g tot NU pts/s\n", - ntransf,(long long)M,(long long)N, simpleTime, ntransf*(M+N)/simpleTime); - printf("\tspeedup \t T_finufft%dd%d_simple / T_finufft%dd%d = %.3g\n",ndim,type, - ndim, type, simpleTime/totalTime); + printf("%d of:\t%lld NU pts to %lld NU pts in %.3g s \t%.3g tot NU pts/s\n", ntransf, + (long long)M, (long long)N, simpleTime, ntransf * (M + N) / simpleTime); + printf("\tspeedup \t T_finufft%dd%d_simple / T_finufft%dd%d = %.3g\n", ndim, type, ndim, + type, simpleTime / totalTime); - - FINUFFT_DESTROY(plan); // Guru Step 4 + FINUFFT_DESTROY(plan); // Guru Step 4 // (must be done *after* many_simple_calls, which sneaks a look at the plan!) // however, segfaults, maybe because plan->opts.debug changed? - + //---------------------------- Free Memory (no need to test if NULL) free(F); free(c); - free(x); + free(x); free(y); free(z); free(s); @@ -241,7 +247,6 @@ int main(int argc, char* argv[]) return 0; } - // -------------------------------- HELPER FUNCS ---------------------------- double finufftFunnel(CPX *cStart, CPX *fStart, FLT *x, FLT *y, FLT *z, FINUFFT_PLAN plan) @@ -253,156 +258,161 @@ double finufftFunnel(CPX *cStart, CPX *fStart, FLT *x, FLT *y, FLT *z, FINUFFT_P Malleo 2019; xyz passed in by Barnett 5/26/20 to prevent X_orig fields. */ { - finufft::utils::CNTime timer; timer.start(); - int ier = 0; - double t = 0; - double fail = NAN; // dummy code for failure - finufft_opts* popts = &(plan->opts); // opts ptr, as v1.2 simple calls need - switch (plan->dim){ - - case 1: // 1D - switch (plan->type){ + finufft::utils::CNTime timer; + timer.start(); + int ier = 0; + double t = 0; + double fail = NAN; // dummy code for failure + finufft_opts *popts = &(plan->opts); // opts ptr, as v1.2 simple calls need + switch (plan->dim) { + + case 1: // 1D + switch (plan->type) { case 1: timer.restart(); - ier = FINUFFT1D1(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->ms, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT1D1(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->ms, fStart, + popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + case 2: timer.restart(); - ier = FINUFFT1D2(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->ms, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT1D2(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->ms, fStart, + popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + case 3: timer.restart(); - ier = FINUFFT1D3(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->nk, plan->S, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT1D3(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->nk, plan->S, + fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + default: - return fail; + return fail; } - case 2: // 2D - switch(plan->type){ - + case 2: // 2D + switch (plan->type) { + case 1: timer.restart(); - ier = FINUFFT2D1(plan->nj, x,y, cStart, plan->fftSign, plan->tol, plan->ms, plan->mt, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT2D1(plan->nj, x, y, cStart, plan->fftSign, plan->tol, plan->ms, + plan->mt, fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + case 2: timer.restart(); - ier = FINUFFT2D2(plan->nj, x,y, cStart, plan->fftSign, plan->tol, plan->ms, plan->mt, - fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT2D2(plan->nj, x, y, cStart, plan->fftSign, plan->tol, plan->ms, + plan->mt, fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; + return t; case 3: timer.restart(); - ier = FINUFFT2D3(plan->nj, x,y, cStart, plan->fftSign, plan->tol, plan->nk, plan->S, plan->T, - fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT2D3(plan->nj, x, y, cStart, plan->fftSign, plan->tol, plan->nk, + plan->S, plan->T, fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + default: return fail; } - case 3: // 3D - switch(plan->type){ + case 3: // 3D + switch (plan->type) { case 1: timer.restart(); - ier = FINUFFT3D1(plan->nj, x,y,z, cStart, plan->fftSign, plan->tol, - plan->ms, plan->mt, plan->mu, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT3D1(plan->nj, x, y, z, cStart, plan->fftSign, plan->tol, plan->ms, + plan->mt, plan->mu, fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + case 2: timer.restart(); - ier = FINUFFT3D2(plan->nj, x,y,z, cStart, plan->fftSign, plan->tol, - plan->ms, plan->mt, plan->mu, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT3D2(plan->nj, x, y, z, cStart, plan->fftSign, plan->tol, plan->ms, + plan->mt, plan->mu, fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + case 3: timer.restart(); - ier = FINUFFT3D3(plan->nj, x,y,z, cStart, plan->fftSign, plan->tol, - plan->nk, plan->S, plan->T, plan->U, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT3D3(plan->nj, x, y, z, cStart, plan->fftSign, plan->tol, plan->nk, + plan->S, plan->T, plan->U, fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; + return t; - default: // invalid type + default: // invalid type return fail; } - default: // invalid dimension + default: // invalid dimension return fail; } } -double many_simple_calls(CPX *c,CPX *F, FLT* x, FLT* y, FLT* z, FINUFFT_PLAN plan) +double many_simple_calls(CPX *c, CPX *F, FLT *x, FLT *y, FLT *z, FINUFFT_PLAN plan) /* A unified interface to all of the simple interfaces, with a loop over many such transforms. Returns total time reported by the transforms. (Used to call pre-v1.2 single implementations in finufft, via runOldFinufft. The repo no longer contains those implementations, which used to be in a subdirectory.) */ -{ - CPX *cStart; - CPX *fStart; - - double time = 0; - double temp = 0;; - - for(int k = 0; k < plan->ntrans; k++){ - cStart = c + plan->nj*k; - fStart = F + plan->ms*plan->mt*plan->mu*k; - - //printf("k=%d, debug=%d.................\n",k, plan->opts.debug); - if(k != 0) { // prevent massive debug output - plan->opts.debug = 0; - plan->opts.spread_debug = 0; - } - - temp = finufftFunnel(cStart,fStart, x, y,z,plan); - if (isnan(temp)) { - fprintf(stderr,"[%s] Funnel call to finufft failed!\n",__func__); - return NAN; - } - else - time += temp; +{ + CPX *cStart; + CPX *fStart; + + double time = 0; + double temp = 0; + ; + + for (int k = 0; k < plan->ntrans; k++) { + cStart = c + plan->nj * k; + fStart = F + plan->ms * plan->mt * plan->mu * k; + + // printf("k=%d, debug=%d.................\n",k, plan->opts.debug); + if (k != 0) { // prevent massive debug output + plan->opts.debug = 0; + plan->opts.spread_debug = 0; } - return time; + + temp = finufftFunnel(cStart, fStart, x, y, z, plan); + if (isnan(temp)) { + fprintf(stderr, "[%s] Funnel call to finufft failed!\n", __func__); + return NAN; + } else + time += temp; + } + return time; } diff --git a/perftest/manysmallprobs.cpp b/perftest/manysmallprobs.cpp index c6776cf0e..0f2c9d0bb 100644 --- a/perftest/manysmallprobs.cpp +++ b/perftest/manysmallprobs.cpp @@ -10,14 +10,14 @@ using namespace finufft::utils; #include using namespace std; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* What is small-problem cost of FINUFFT library from C++, using plain arrays of C++ complex numbers? Barnett 10/31/17. for Xi Chen question. Updated to also demo guru interface and compare speed. 6/7/22 made deterministic changes so check answer matches both ways. - g++ -fopenmp manysmallprobs.cpp ../lib-static/libfinufft.a -o manysmallprobs -lfftw3 -lfftw3_omp -lm - # multithreaded is much slower, due to overhead of starting threads?... + g++ -fopenmp manysmallprobs.cpp ../lib-static/libfinufft.a -o manysmallprobs -lfftw3 + -lfftw3_omp -lm # multithreaded is much slower, due to overhead of starting threads?... export OMP_NUM_THREADS=1 time ./manysmallprobs @@ -26,54 +26,64 @@ int main(int argc, char* argv[]) But why is multi-thread so much slower? (thread start-up time?) */ -{ - int M = 2e2; // number of nonuniform points - int N = 2e2; // number of modes - int reps = 2e4; // how many repetitions - double acc = 1e-6; // desired accuracy - - complex I = complex(0.0,1.0); // the imaginary unit +{ + int M = 2e2; // number of nonuniform points + int N = 2e2; // number of modes + int reps = 2e4; // how many repetitions + double acc = 1e-6; // desired accuracy + + complex I = complex(0.0, 1.0); // the imaginary unit int ier; - + // generate some random nonuniform points (x) and complex strengths (c): - double *x = (double *)malloc(sizeof(double)*M); - complex* c = (complex*)malloc(sizeof(complex)*M); - for (int j=0; j *c = (complex *)malloc(sizeof(complex) * M); + for (int j = 0; j < M; ++j) { + x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi] + c[j] = + 2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1); } // allocate output array for the Fourier modes: - complex* F = (complex*)malloc(sizeof(complex)*N); + complex *F = (complex *)malloc(sizeof(complex) * N); printf("repeatedly calling the simple interface: --------------------- \n"); - finufft::utils::CNTime timer; timer.start(); - for (int r=0;r y=F[0]; // actually use the data so not optimized away - printf("%d reps of 1d1 done in %.3g s,\t%.3g NU pts/s\t(last ier=%d)\nF[0]=%.6g + %.6gi\n",reps,timer.elapsedsec(),reps*M/timer.elapsedsec(),ier,real(y),imag(y)); + complex y = F[0]; // actually use the data so not optimized away + printf( + "%d reps of 1d1 done in %.3g s,\t%.3g NU pts/s\t(last ier=%d)\nF[0]=%.6g + %.6gi\n", + reps, timer.elapsedsec(), reps * M / timer.elapsedsec(), ier, real(y), imag(y)); printf("repeatedly executing via the guru interface: -------------------\n"); timer.restart(); - finufft_plan plan; finufft_opts opts; finufft_default_opts(&opts); - opts.debug = 0; - int64_t Ns[]={N,1,1}; - int ntransf = 1; // since we do one at a time (neq reps) - finufft_makeplan(1,1,Ns,+1,ntransf,acc,&plan,&opts); - for (int r=0;r #include +#include #include #include -#include #include #include #include +#include using namespace finufft::spreadinterp; -using namespace finufft::utils; // for timer +using namespace finufft::utils; // for timer -void usage() -{ - printf("usage: spreadtestnd dims [M N [tol [sort [flags [debug [kerpad [kerevalmeth [upsampfac]]]]]]]]\n\twhere dims=1,2 or 3\n\tM=# nonuniform pts\n\tN=# uniform pts\n\ttol=requested accuracy\n\tsort=0 (don't sort NU pts), 1 (do), or 2 (maybe sort; default)\n\tflags: expert timing flags, 0 is default (see spreadinterp.h)\n\tdebug=0 (less text out), 1 (more), 2 (lots)\n\tkerpad=0 (no pad to mult of 4), 1 (do, for kerevalmeth=0 only)\n\tkerevalmeth=0 (direct), 1 (Horner ppval)\n\tupsampfac>1; 2 or 1.25 for Horner\n\nexample: ./spreadtestnd 1 1e6 1e6 1e-6 2 0 1\n"); +void usage() { + printf("usage: spreadtestnd dims [M N [tol [sort [flags [debug [kerpad [kerevalmeth " + "[upsampfac]]]]]]]]\n\twhere dims=1,2 or 3\n\tM=# nonuniform pts\n\tN=# uniform " + "pts\n\ttol=requested accuracy\n\tsort=0 (don't sort NU pts), 1 (do), or 2 " + "(maybe sort; default)\n\tflags: expert timing flags, 0 is default (see " + "spreadinterp.h)\n\tdebug=0 (less text out), 1 (more), 2 (lots)\n\tkerpad=0 (no " + "pad to mult of 4), 1 (do, for kerevalmeth=0 only)\n\tkerevalmeth=0 (direct), 1 " + "(Horner ppval)\n\tupsampfac>1; 2 or 1.25 for Horner\n\nexample: ./spreadtestnd " + "1 1e6 1e6 1e-6 2 0 1\n"); } -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Test executable for the 1D, 2D, or 3D C++ spreader, both directions. * It checks speed, and basic correctness via the grid sum of the result. * See usage() for usage. Note it currently tests only pirange=0, which is not @@ -25,7 +31,8 @@ int main(int argc, char* argv[]) * Example: spreadtestnd 3 8e6 8e6 1e-6 2 0 1 * * Compilation (also check ../makefile): - * g++ spreadtestnd.cpp ../src/spreadinterp.o ../src/utils.o -o spreadtestnd -fPIC -Ofast -funroll-loops -fopenmp + * g++ spreadtestnd.cpp ../src/spreadinterp.o ../src/utils.o -o spreadtestnd -fPIC + * -Ofast -funroll-loops -fopenmp * * Magland; expanded by Barnett 1/14/17. Better cmd line args 3/13/17 * indep setting N 3/27/17. parallel rand() & sort flag 3/28/17 @@ -34,192 +41,258 @@ int main(int argc, char* argv[]) * Barbone, removed pirange 05/09/24. */ { - int d = 3; // Cmd line args & their defaults: default #dims - double w, tol = 1e-6; // default (eg 1e-6 has nspread=7) - BIGINT M = 1e6; // default # NU pts - BIGINT roughNg = 1e6; // default # U pts - int sort = 2; // spread_sort - int flags = 0; // default - int debug = 0; // default - int kerpad = 0; // default - int kerevalmeth = 1; // default: Horner - FLT upsampfac = 2.0; // standard - - if (argc<2 || argc==3 || argc>11) { - usage(); return (argc>1); - } - sscanf(argv[1],"%d",&d); - if (d<1 || d>3) { - printf("d must be 1, 2 or 3!\n"); usage(); return 1; - } - if (argc>2) { - sscanf(argv[2],"%lf",&w); M = (BIGINT)w; // to read "1e6" right! - if (M<1) { - printf("M (# NU pts) must be positive!\n"); usage(); return 1; + int d = 3; // Cmd line args & their defaults: default #dims + double w, tol = 1e-6; // default (eg 1e-6 has nspread=7) + BIGINT M = 1e6; // default # NU pts + BIGINT roughNg = 1e6; // default # U pts + int sort = 2; // spread_sort + int flags = 0; // default + int debug = 0; // default + int kerpad = 0; // default + int kerevalmeth = 1; // default: Horner + FLT upsampfac = 2.0; // standard + + if (argc < 2 || argc == 3 || argc > 11) { + usage(); + return (argc > 1); + } + sscanf(argv[1], "%d", &d); + if (d < 1 || d > 3) { + printf("d must be 1, 2 or 3!\n"); + usage(); + return 1; + } + if (argc > 2) { + sscanf(argv[2], "%lf", &w); + M = (BIGINT)w; // to read "1e6" right! + if (M < 1) { + printf("M (# NU pts) must be positive!\n"); + usage(); + return 1; } - sscanf(argv[3],"%lf",&w); roughNg = (BIGINT)w; - if (roughNg<1) { - printf("N (# U pts) must be positive!\n"); usage(); return 1; + sscanf(argv[3], "%lf", &w); + roughNg = (BIGINT)w; + if (roughNg < 1) { + printf("N (# U pts) must be positive!\n"); + usage(); + return 1; } } - if (argc>4) sscanf(argv[4],"%lf",&tol); - if (argc>5) { - sscanf(argv[5],"%d",&sort); - if ((sort!=0) && (sort!=1) && (sort!=2)) { - printf("sort must be 0, 1 or 2!\n"); usage(); return 1; + if (argc > 4) sscanf(argv[4], "%lf", &tol); + if (argc > 5) { + sscanf(argv[5], "%d", &sort); + if ((sort != 0) && (sort != 1) && (sort != 2)) { + printf("sort must be 0, 1 or 2!\n"); + usage(); + return 1; } } - if (argc>6) - sscanf(argv[6],"%d",&flags); - if (argc>7) { - sscanf(argv[7],"%d",&debug); - if ((debug<0) || (debug>2)) { - printf("debug must be 0, 1 or 2!\n"); usage(); return 1; + if (argc > 6) sscanf(argv[6], "%d", &flags); + if (argc > 7) { + sscanf(argv[7], "%d", &debug); + if ((debug < 0) || (debug > 2)) { + printf("debug must be 0, 1 or 2!\n"); + usage(); + return 1; } } - if (argc>8) { - sscanf(argv[8],"%d",&kerpad); - if ((kerpad<0) || (kerpad>1)) { - printf("kerpad must be 0 or 1!\n"); usage(); return 1; + if (argc > 8) { + sscanf(argv[8], "%d", &kerpad); + if ((kerpad < 0) || (kerpad > 1)) { + printf("kerpad must be 0 or 1!\n"); + usage(); + return 1; } } - if (argc>9) { - sscanf(argv[9],"%d",&kerevalmeth); - if ((kerevalmeth<0) || (kerevalmeth>1)) { - printf("kerevalmeth must be 0 or 1!\n"); usage(); return 1; + if (argc > 9) { + sscanf(argv[9], "%d", &kerevalmeth); + if ((kerevalmeth < 0) || (kerevalmeth > 1)) { + printf("kerevalmeth must be 0 or 1!\n"); + usage(); + return 1; } } - if (argc>10) { - sscanf(argv[10],"%lf",&w); upsampfac = (FLT)w; - if (upsampfac<=1.0) { - printf("upsampfac must be >1.0!\n"); usage(); return 1; + if (argc > 10) { + sscanf(argv[10], "%lf", &w); + upsampfac = (FLT)w; + if (upsampfac <= 1.0) { + printf("upsampfac must be >1.0!\n"); + usage(); + return 1; } } - int dodir1 = true; // control if dir=1 tested at all - BIGINT N = (BIGINT)round(pow(roughNg,1.0/d)); // Fourier grid size per dim - BIGINT Ng = (BIGINT)pow(N,d); // actual total grid points - BIGINT N2 = (d>=2) ? N : 1, N3 = (d==3) ? N : 1; // the y and z grid sizes - std::vector kx(M),ky(1),kz(1),d_nonuniform(2*M); // NU, Re & Im - if (d>1) ky.resize(M); // only alloc needed coords - if (d>2) kz.resize(M); - std::vector d_uniform(2*Ng); // Re and Im + int dodir1 = true; // control if dir=1 tested at all + BIGINT N = (BIGINT)round(pow(roughNg, 1.0 / d)); // Fourier grid size per dim + BIGINT Ng = (BIGINT)pow(N, d); // actual total grid points + BIGINT N2 = (d >= 2) ? N : 1, N3 = (d == 3) ? N : 1; // the y and z grid sizes + std::vector kx(M), ky(1), kz(1), d_nonuniform(2 * M); // NU, Re & Im + if (d > 1) ky.resize(M); // only alloc needed coords + if (d > 2) kz.resize(M); + std::vector d_uniform(2 * Ng); // Re and Im finufft_spread_opts opts; - int ier_set = setup_spreader(opts,(FLT)tol,upsampfac,kerevalmeth,debug,1,d); - if (ier_set>1) { // exit gracefully if can't set up. - printf("error when setting up spreader (ier_set=%d)!\n",ier_set); + int ier_set = setup_spreader(opts, (FLT)tol, upsampfac, kerevalmeth, debug, 1, d); + if (ier_set > 1) { // exit gracefully if can't set up. + printf("error when setting up spreader (ier_set=%d)!\n", ier_set); return ier_set; } - opts.debug = debug; // print more diagnostics? - opts.sort = sort; - opts.flags = flags; - opts.kerpad = kerpad; - opts.upsampfac = upsampfac; - opts.nthreads = 0; // max # threads used, or 0 to use what's avail + opts.debug = debug; // print more diagnostics? + opts.sort = sort; + opts.flags = flags; + opts.kerpad = kerpad; + opts.upsampfac = upsampfac; + opts.nthreads = 0; // max # threads used, or 0 to use what's avail opts.sort_threads = 0; - //opts.max_subproblem_size = 1e5; + // opts.max_subproblem_size = 1e5; FLT maxerr, ansmod; - + // spread a single source, only for reference accuracy check... - opts.spread_direction=1; - d_nonuniform[0] = 1.0; d_nonuniform[1] = 0.0; // unit strength - kx[0] = ky[0] = kz[0] = 0.0; // at center (probably doesn't matter); domain is [-pi,pi)^d - int ier = spreadinterp(N,N2,N3,d_uniform.data(),1,kx.data(),ky.data(),kz.data(),d_nonuniform.data(),opts); // vector::data officially C++11 but works - if (ier!=0) { - printf("error when spreading M=1 pt for ref acc check (ier=%d)!\n",ier); + opts.spread_direction = 1; + d_nonuniform[0] = 1.0; + d_nonuniform[1] = 0.0; // unit strength + kx[0] = ky[0] = kz[0] = 0.0; // at center (probably doesn't matter); domain is + // [-pi,pi)^d + int ier = spreadinterp(N, + N2, + N3, + d_uniform.data(), + 1, + kx.data(), + ky.data(), + kz.data(), + d_nonuniform.data(), + opts); // vector::data officially C++11 but works + if (ier != 0) { + printf("error when spreading M=1 pt for ref acc check (ier=%d)!\n", ier); return ier; } - FLT kersumre = 0.0, kersumim = 0.0; // sum kernel on uniform grid - for (BIGINT i=0;i1) ky[i]=randm11r(&se)*3*M_PI; // only fill needed coords - if (d>2) kz[i]=randm11r(&se)*3*M_PI; - d_nonuniform[i*2]=randm11r(&se); - d_nonuniform[i*2+1]=randm11r(&se); - strre += d_nonuniform[2*i]; - strim += d_nonuniform[2*i+1]; + unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s +#pragma omp for schedule(dynamic, 1000000) reduction(+ : strre, strim) + for (BIGINT i = 0; i < M; ++i) { + kx[i] = randm11r(&se) * 3 * M_PI; + // kx[i]=2.0*kx[i] - 50.0; //// to test folding within +-1 period + if (d > 1) ky[i] = randm11r(&se) * 3 * M_PI; // only fill needed coords + if (d > 2) kz[i] = randm11r(&se) * 3 * M_PI; + d_nonuniform[i * 2] = randm11r(&se); + d_nonuniform[i * 2 + 1] = randm11r(&se); + strre += d_nonuniform[2 * i]; + strim += d_nonuniform[2 * i + 1]; } } CNTime timer; double t; - if (dodir1) { // test direction 1 (NU -> U spreading) ...................... - printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n",d,(double)Ng,opts.spread_direction,tol,opts.nspread); + if (dodir1) { // test direction 1 (NU -> U spreading) ...................... + printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n", + d, + (double)Ng, + opts.spread_direction, + tol, + opts.nspread); timer.start(); - ier = spreadinterp(N,N2,N3,d_uniform.data(),M,kx.data(),ky.data(),kz.data(),d_nonuniform.data(),opts); - t=timer.elapsedsec(); - if (ier!=0) { - printf("error (ier=%d)!\n",ier); + ier = spreadinterp(N, + N2, + N3, + d_uniform.data(), + M, + kx.data(), + ky.data(), + kz.data(), + d_nonuniform.data(), + opts); + t = timer.elapsedsec(); + if (ier != 0) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf(" %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n",(double)M,t,M/t,pow(opts.nspread,d)*M/t); - - FLT sumre = 0.0, sumim = 0.0; // check spreading accuracy, wrapping -#pragma omp parallel for reduction(+:sumre,sumim) - for (BIGINT i=0;i NU interpolation) .............................. printf("making more random NU pts...\n"); - for (BIGINT i=0;i1) ky[i]=randm11r(&se)*3*M_PI; - if (d>2) kz[i]=randm11r(&se)*3*M_PI; - } + unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s +#pragma omp for schedule(dynamic, 1000000) + for (BIGINT i = 0; i < M; ++i) { // random target pts + // kx[i]=10+.9*rand01r(&s)*N; // or if want to keep ns away from edges + kx[i] = randm11r(&se) * 3 * M_PI; + if (d > 1) ky[i] = randm11r(&se) * 3 * M_PI; + if (d > 2) kz[i] = randm11r(&se) * 3 * M_PI; + } } - opts.spread_direction=2; - printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n",d,(double)Ng,opts.spread_direction,tol,opts.nspread); + opts.spread_direction = 2; + printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n", + d, + (double)Ng, + opts.spread_direction, + tol, + opts.nspread); timer.restart(); - ier = spreadinterp(N,N2,N3,d_uniform.data(),M,kx.data(),ky.data(),kz.data(),d_nonuniform.data(),opts); - t=timer.elapsedsec(); - if (ier!=0) { - printf("error (ier=%d)!\n",ier); + ier = spreadinterp(N, + N2, + N3, + d_uniform.data(), + M, + kx.data(), + ky.data(), + kz.data(), + d_nonuniform.data(), + opts); + t = timer.elapsedsec(); + if (ier != 0) { + printf("error (ier=%d)!\n", ier); return 1; } else - printf(" %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n",(double)M,t,M/t,pow(opts.nspread,d)*M/t); + printf(" %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n", + (double)M, + t, + M / t, + pow(opts.nspread, d) * M / t); // math test is worst-case error from pred value (kersum) on interp pts: maxerr = 0.0; - for (BIGINT i=0;imaxerr) maxerr=err; + for (BIGINT i = 0; i < M; ++i) { + FLT err = std::max(fabs(d_nonuniform[2 * i] - kersumre), + fabs(d_nonuniform[2 * i + 1] - kersumim)); + if (err > maxerr) maxerr = err; } - ansmod = sqrt(kersumre*kersumre+kersumim*kersumim); - printf(" max rel err in values at NU pts: %.3g\n",maxerr/ansmod); + ansmod = sqrt(kersumre * kersumre + kersumim * kersumim); + printf(" max rel err in values at NU pts: %.3g\n", maxerr / ansmod); // this is stronger test than for dir=1, since it tests sum of kernel for // each NU pt. However, it cannot detect reading // from wrong grid pts (they are all unity) diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 246a064f6..6bff9cb6e 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -16,8 +16,9 @@ using namespace cufinufft::deconvolve; using namespace cufinufft::spreadinterp; -template -int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* 1D Type-1 NUFFT @@ -31,43 +32,44 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 11/21/21 */ { - assert(d_plan->spopts.spread_direction == 1); - auto &stream = d_plan->stream; - - int ier; - cuda_complex *d_fkstart; - cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; - - // this is needed - if ((ier = checkCudaErrors( - cudaMemsetAsync(d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * sizeof(cuda_complex), stream)))) - return ier; - - // Step 1: Spread - if ((ier = cuspread1d(d_plan, blksize))) - return ier; - - // Step 2: FFT - cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) - return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle - if ((ier = cudeconvolve1d(d_plan, blksize))) - return ier; - } - - return 0; + assert(d_plan->spopts.spread_direction == 1); + auto &stream = d_plan->stream; + + int ier; + cuda_complex *d_fkstart; + cuda_complex *d_cstart; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = + std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; + + // this is needed + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * sizeof(cuda_complex), + stream)))) + return ier; + + // Step 1: Spread + if ((ier = cuspread1d(d_plan, blksize))) return ier; + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + + // Step 3: deconvolve and shuffle + if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; + } + + return 0; } -template -int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* 1D Type-2 NUFFT @@ -81,41 +83,42 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 11/21/21 */ { - assert(d_plan->spopts.spread_direction == 2); - - int ier; - cuda_complex *d_fkstart; - cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; - - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; - - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if ((ier = cudeconvolve1d(d_plan, blksize))) - return ier; - - // Step 2: FFT - cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) - return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle - if ((ier = cuinterp1d(d_plan, blksize))) - return ier; - } - - return 0; + assert(d_plan->spopts.spread_direction == 2); + + int ier; + cuda_complex *d_fkstart; + cuda_complex *d_cstart; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = + std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; + + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; + + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + + // Step 3: deconvolve and shuffle + if ((ier = cuinterp1d(d_plan, blksize))) return ier; + } + + return 0; } template int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, +template int cufinufft1d1_exec(cuda_complex *d_c, + cuda_complex *d_fk, cufinufft_plan_t *d_plan); template int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, +template int cufinufft1d2_exec(cuda_complex *d_c, + cuda_complex *d_fk, cufinufft_plan_t *d_plan); diff --git a/src/cuda/1d/interp1d_wrapper.cu b/src/cuda/1d/interp1d_wrapper.cu index 0940f10de..cd3637c8b 100644 --- a/src/cuda/1d/interp1d_wrapper.cu +++ b/src/cuda/1d/interp1d_wrapper.cu @@ -14,7 +14,7 @@ using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { -template +template int cuinterp1d(cufinufft_plan_t *d_plan, int blksize) /* A wrapper for different interpolation methods. @@ -26,58 +26,60 @@ int cuinterp1d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 11/21/21 */ { - int nf1 = d_plan->nf1; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuinterp1d_nuptsdriven(nf1, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - - return ier; + int nf1 = d_plan->nf1; + int M = d_plan->M; + + int ier; + switch (d_plan->opts.gpu_method) { + case 1: { + ier = cuinterp1d_nuptsdriven(nf1, M, d_plan, blksize); + } break; + default: + std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } + + return ier; } -template +template int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - dim3 threadsPerBlock; - dim3 blocks; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - T sigma = d_plan->opts.upsampfac; - int *d_idxnupts = d_plan->idxnupts; - - T *d_kx = d_plan->kx; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - threadsPerBlock.x = 32; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; - - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - interp_1d_nuptsdriven<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - interp_1d_nuptsdriven<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } + auto &stream = d_plan->stream; + dim3 threadsPerBlock; + dim3 blocks; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + T sigma = d_plan->opts.upsampfac; + int *d_idxnupts = d_plan->idxnupts; + + T *d_kx = d_plan->kx; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + threadsPerBlock.x = 32; + threadsPerBlock.y = 1; + blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; + blocks.y = 1; + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + interp_1d_nuptsdriven<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, + d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } else { + for (int t = 0; t < blksize; t++) { + interp_1d_nuptsdriven<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, + d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } template int cuinterp1d(cufinufft_plan_t *d_plan, int blksize); diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index c41ce0919..e72ade469 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -19,7 +19,7 @@ using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { -template +template int cuspread1d(cufinufft_plan_t *d_plan, int blksize) /* A wrapper for different spreading methods. @@ -31,143 +31,52 @@ int cuspread1d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 11/21/21 */ { - int nf1 = d_plan->nf1; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuspread1d_nuptsdriven(nf1, M, d_plan, blksize); - } break; - case 2: { - ier = cuspread1d_subprob(nf1, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuspread1d] error: incorrect method, should be 1 or 2\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - - return ier; + int nf1 = d_plan->nf1; + int M = d_plan->M; + + int ier; + switch (d_plan->opts.gpu_method) { + case 1: { + ier = cuspread1d_nuptsdriven(nf1, M, d_plan, blksize); + } break; + case 2: { + ier = cuspread1d_subprob(nf1, M, d_plan, blksize); + } break; + default: + std::cerr << "[cuspread1d] error: incorrect method, should be 1 or 2\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } + + return ier; } -template +template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { - auto &stream = d_plan->stream; - - if (d_plan->opts.gpu_sort) { - int bin_size_x = d_plan->opts.gpu_binsizex; - if (bin_size_x < 0) { - std::cerr << "[cuspread1d_nuptsdriven_prop] error: invalid binsize (binsizex) = (" << bin_size_x << ")\n"; - return FINUFFT_ERR_BINSIZE_NOTVALID; - } - - int numbins = ceil((T)nf1 / bin_size_x); - - T *d_kx = d_plan->kx; - - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_idxnupts = d_plan->idxnupts; - - int ier; - if ((ier = checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream)))) - return ier; - calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, nf1, bin_size_x, numbins, d_binsize, - d_kx, d_sortidx); - RETURN_IF_CUDA_ERROR - - int n = numbins; - thrust::device_ptr d_ptr(d_binsize); - thrust::device_ptr d_result(d_binstartpts); - thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, nf1); - RETURN_IF_CUDA_ERROR - } else { - int *d_idxnupts = d_plan->idxnupts; - trivial_global_sort_index_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - - return 0; -} - -template -int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - dim3 threadsPerBlock; - dim3 blocks; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - int *d_idxnupts = d_plan->idxnupts; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - T sigma = d_plan->spopts.upsampfac; - - T *d_kx = d_plan->kx; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - threadsPerBlock.x = 16; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; - - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - spread_1d_nuptsdriven<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - spread_1d_nuptsdriven<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } - - return 0; -} - -template -int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) -/* - This function determines the properties for spreading that are independent - of the strength of the nodes, only relates to the locations of the nodes, - which only needs to be done once. -*/ -{ - auto &stream = d_plan->stream; - int ier; + auto &stream = d_plan->stream; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + if (d_plan->opts.gpu_sort) { int bin_size_x = d_plan->opts.gpu_binsizex; if (bin_size_x < 0) { - std::cerr << "[cuspread1d_subprob_prop] error: invalid binsize (binsizex) = (" << bin_size_x << ")\n"; - return FINUFFT_ERR_BINSIZE_NOTVALID; + std::cerr << "[cuspread1d_nuptsdriven_prop] error: invalid binsize (binsizex) = (" + << bin_size_x << ")\n"; + return FINUFFT_ERR_BINSIZE_NOTVALID; } int numbins = ceil((T)nf1 / bin_size_x); T *d_kx = d_plan->kx; - int *d_binsize = d_plan->binsize; + int *d_binsize = d_plan->binsize; int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; - - int *d_subprob_to_bin = nullptr; + int *d_sortidx = d_plan->sortidx; + int *d_idxnupts = d_plan->idxnupts; - - if ((ier = checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream)))) - return ier; - calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, nf1, bin_size_x, numbins, d_binsize, d_kx, - d_sortidx); + int ier; + if ((ier = checkCudaErrors( + cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream)))) + return ier; + calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, nf1, bin_size_x, numbins, d_binsize, d_kx, d_sortidx); RETURN_IF_CUDA_ERROR int n = numbins; @@ -178,101 +87,207 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, nf1); RETURN_IF_CUDA_ERROR - - calc_subprob_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(d_binsize, d_numsubprob, maxsubprobsize, numbins); + } else { + int *d_idxnupts = d_plan->idxnupts; + trivial_global_sort_index_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, + d_idxnupts); RETURN_IF_CUDA_ERROR + } - d_ptr = thrust::device_pointer_cast(d_numsubprob); - d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); - thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) - return ier; + return 0; +} - int totalnumsubprob; - if ((ier = checkCudaErrors( - cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream)))) - return ier; - cudaStreamSynchronize(stream); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))) - return ier; - map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024, 0, stream>>>(d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, numbins); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_subprob_to_bin); - return FINUFFT_ERR_CUDA_FAILURE; +template +int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { + auto &stream = d_plan->stream; + dim3 threadsPerBlock; + dim3 blocks; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + int *d_idxnupts = d_plan->idxnupts; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + T sigma = d_plan->spopts.upsampfac; + + T *d_kx = d_plan->kx; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + threadsPerBlock.x = 16; + threadsPerBlock.y = 1; + blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; + blocks.y = 1; + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + spread_1d_nuptsdriven<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, + d_idxnupts); + RETURN_IF_CUDA_ERROR } + } else { + for (int t = 0; t < blksize; t++) { + spread_1d_nuptsdriven<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, + d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } - assert(d_subprob_to_bin != NULL); - cudaFreeAsync(d_plan->subprob_to_bin, stream); - d_plan->subprob_to_bin = d_subprob_to_bin; - d_plan->totalnumsubprob = totalnumsubprob; - - return 0; + return 0; } -template -int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; +template +int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) +/* + This function determines the properties for spreading that are independent + of the strength of the nodes, only relates to the locations of the nodes, + which only needs to be done once. +*/ +{ + auto &stream = d_plan->stream; + int ier; - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + int bin_size_x = d_plan->opts.gpu_binsizex; + if (bin_size_x < 0) { + std::cerr << "[cuspread1d_subprob_prop] error: invalid binsize (binsizex) = (" + << bin_size_x << ")\n"; + return FINUFFT_ERR_BINSIZE_NOTVALID; + } - // assume that bin_size_x > ns/2; - int bin_size_x = d_plan->opts.gpu_binsizex; - int numbins = ceil((T)nf1 / bin_size_x); + int numbins = ceil((T)nf1 / bin_size_x); - T *d_kx = d_plan->kx; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; + T *d_kx = d_plan->kx; - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_sortidx = d_plan->sortidx; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; - int totalnumsubprob = d_plan->totalnumsubprob; - int *d_subprob_to_bin = d_plan->subprob_to_bin; + int *d_subprob_to_bin = nullptr; + if ((ier = + checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream)))) + return ier; + calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, nf1, bin_size_x, numbins, d_binsize, d_kx, d_sortidx); + RETURN_IF_CUDA_ERROR - T sigma = d_plan->opts.upsampfac; + int n = numbins; + thrust::device_ptr d_ptr(d_binsize); + thrust::device_ptr d_result(d_binstartpts); + thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread1d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, nf1); + RETURN_IF_CUDA_ERROR + + calc_subprob_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(d_binsize, d_numsubprob, + maxsubprobsize, numbins); + RETURN_IF_CUDA_ERROR + + d_ptr = thrust::device_pointer_cast(d_numsubprob); + d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); + thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - spread_1d_subprob<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_binstartpts, d_binsize, - bin_size_x, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - spread_1d_subprob<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_binstartpts, d_binsize, - bin_size_x, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins, d_idxnupts); - RETURN_IF_CUDA_ERROR - } + if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) + return ier; + + int totalnumsubprob; + if ((ier = + checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], + sizeof(int), cudaMemcpyDeviceToHost, stream)))) + return ier; + cudaStreamSynchronize(stream); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))) + return ier; + map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024, 0, stream>>>( + d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_subprob_to_bin); + return FINUFFT_ERR_CUDA_FAILURE; + } + + assert(d_subprob_to_bin != NULL); + cudaFreeAsync(d_plan->subprob_to_bin, stream); + d_plan->subprob_to_bin = d_subprob_to_bin; + d_plan->totalnumsubprob = totalnumsubprob; + + return 0; +} + +template +int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { + auto &stream = d_plan->stream; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + + // assume that bin_size_x > ns/2; + int bin_size_x = d_plan->opts.gpu_binsizex; + int numbins = ceil((T)nf1 / bin_size_x); + + T *d_kx = d_plan->kx; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + + int totalnumsubprob = d_plan->totalnumsubprob; + int *d_subprob_to_bin = d_plan->subprob_to_bin; + + T sigma = d_plan->opts.upsampfac; + + size_t sharedplanorysize = + (bin_size_x + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex); + if (sharedplanorysize > 49152) { + std::cerr << "[cuspread1d_subprob] error: not enough shared memory\n"; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + spread_1d_subprob<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, + d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, + d_numsubprob, maxsubprobsize, numbins, d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } else { + for (int t = 0; t < blksize; t++) { + spread_1d_subprob<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, + d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, + d_numsubprob, maxsubprobsize, numbins, d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } template int cuspread1d(cufinufft_plan_t *d_plan, int blksize); template int cuspread1d(cufinufft_plan_t *d_plan, int blksize); -template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan); +template int cuspread1d_nuptsdriven_prop(int nf1, int M, + cufinufft_plan_t *d_plan); +template int cuspread1d_nuptsdriven_prop(int nf1, int M, + cufinufft_plan_t *d_plan); +template int cuspread1d_subprob_prop(int nf1, int M, + cufinufft_plan_t *d_plan); +template int cuspread1d_subprob_prop(int nf1, int M, + cufinufft_plan_t *d_plan); } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index b566f49ce..5f1fbd55c 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -15,8 +15,9 @@ using namespace cufinufft::deconvolve; using namespace cufinufft::spreadinterp; using std::min; -template -int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* 2D Type-1 NUFFT @@ -30,44 +31,45 @@ int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 07/25/19 */ { - assert(d_plan->spopts.spread_direction == 1); - - int ier; - cuda_complex *d_fkstart; - cuda_complex *d_cstart; - - auto &stream = d_plan->stream; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt; - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; - - // this is needed - if ((ier = checkCudaErrors(cudaMemsetAsync( - d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * sizeof(cuda_complex), stream)))) - return ier; - - // Step 1: Spread - if ((ier = cuspread2d(d_plan, blksize))) - return ier; - - // Step 2: FFT - cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) - return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle - if ((ier = cudeconvolve2d(d_plan, blksize))) - return ier; - } - - return 0; + assert(d_plan->spopts.spread_direction == 1); + + int ier; + cuda_complex *d_fkstart; + cuda_complex *d_cstart; + + auto &stream = d_plan->stream; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt; + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; + + // this is needed + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, + d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * sizeof(cuda_complex), + stream)))) + return ier; + + // Step 1: Spread + if ((ier = cuspread2d(d_plan, blksize))) return ier; + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + + // Step 3: deconvolve and shuffle + if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; + } + + return 0; } -template -int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* 2D Type-2 NUFFT @@ -81,41 +83,41 @@ int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 07/25/19 */ { - assert(d_plan->spopts.spread_direction == 2); - - int ier; - cuda_complex *d_fkstart; - cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt; - - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; - - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if ((ier = cudeconvolve2d(d_plan, blksize))) - return ier; - - // Step 2: FFT - cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) - return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle - if ((ier = cuinterp2d(d_plan, blksize))) - return ier; - } - - return 0; + assert(d_plan->spopts.spread_direction == 2); + + int ier; + cuda_complex *d_fkstart; + cuda_complex *d_cstart; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt; + + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; + + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + + // Step 3: deconvolve and shuffle + if ((ier = cuinterp2d(d_plan, blksize))) return ier; + } + + return 0; } template int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, +template int cufinufft2d1_exec(cuda_complex *d_c, + cuda_complex *d_fk, cufinufft_plan_t *d_plan); template int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, +template int cufinufft2d2_exec(cuda_complex *d_c, + cuda_complex *d_fk, cufinufft_plan_t *d_plan); diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu index c62188e90..533788482 100644 --- a/src/cuda/2d/interp2d_wrapper.cu +++ b/src/cuda/2d/interp2d_wrapper.cu @@ -14,7 +14,7 @@ using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { -template +template int cuinterp2d(cufinufft_plan_t *d_plan, int blksize) /* A wrapper for different interpolation methods. @@ -26,127 +26,130 @@ int cuinterp2d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 07/25/19 */ { - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuinterp2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); - } break; - case 2: { - ier = cuinterp2d_subprob(nf1, nf2, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuinterp2d] error: incorrect method, should be 1 or 2\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - - return ier; + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int M = d_plan->M; + + int ier; + switch (d_plan->opts.gpu_method) { + case 1: { + ier = cuinterp2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); + } break; + case 2: { + ier = cuinterp2d_subprob(nf1, nf2, M, d_plan, blksize); + } break; + default: + std::cerr << "[cuinterp2d] error: incorrect method, should be 1 or 2\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } + + return ier; } -template -int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - dim3 threadsPerBlock; - dim3 blocks; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - T sigma = d_plan->opts.upsampfac; - - int *d_idxnupts = d_plan->idxnupts; - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - threadsPerBlock.x = 32; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; - - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - interp_2d_nupts_driven - <<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, - es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - interp_2d_nupts_driven - <<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, - es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } +template +int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + + dim3 threadsPerBlock; + dim3 blocks; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + T sigma = d_plan->opts.upsampfac; + + int *d_idxnupts = d_plan->idxnupts; + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + threadsPerBlock.x = 32; + threadsPerBlock.y = 1; + blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; + blocks.y = 1; + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + interp_2d_nupts_driven<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR } + } else { + for (int t = 0; t < blksize; t++) { + interp_2d_nupts_driven<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } - return 0; + return 0; } -template -int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - - // assume that bin_size_x > ns/2; - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int numbins[2]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; - int *d_subprob_to_bin = d_plan->subprob_to_bin; - int totalnumsubprob = d_plan->totalnumsubprob; - - - T sigma = d_plan->opts.upsampfac; - size_t sharedplanorysize = - (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - - if (sharedplanorysize > 49152) { - std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; +template +int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + + // assume that bin_size_x > ns/2; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int numbins[2]; + numbins[0] = ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 / bin_size_y); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + int *d_subprob_to_bin = d_plan->subprob_to_bin; + int totalnumsubprob = d_plan->totalnumsubprob; + + T sigma = d_plan->opts.upsampfac; + size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * + (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); + + if (sharedplanorysize > 49152) { + std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n"; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + interp_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, + d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], + d_idxnupts); + RETURN_IF_CUDA_ERROR } - - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - interp_2d_subprob<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, - d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, - numbins[0], numbins[1], d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - interp_2d_subprob<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, - d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, - numbins[0], numbins[1], d_idxnupts); - RETURN_IF_CUDA_ERROR - } + } else { + for (int t = 0; t < blksize; t++) { + interp_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, + d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], + d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } template int cuinterp2d(cufinufft_plan_t *d_plan, int blksize); diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index 3b27f7efd..f03e658d2 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -19,7 +19,7 @@ using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { -template +template int cuspread2d(cufinufft_plan_t *d_plan, int blksize) /* A wrapper for different spreading methods. @@ -31,135 +31,40 @@ int cuspread2d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 07/25/19 */ { - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuspread2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); - } break; - case 2: { - ier = cuspread2d_subprob(nf1, nf2, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuspread2d] error: incorrect method, should be 1 or 2\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - - return ier; + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int M = d_plan->M; + + int ier; + switch (d_plan->opts.gpu_method) { + case 1: { + ier = cuspread2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); + } break; + case 2: { + ier = cuspread2d_subprob(nf1, nf2, M, d_plan, blksize); + } break; + default: + std::cerr << "[cuspread2d] error: incorrect method, should be 1 or 2\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } + + return ier; } -template +template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan) { - auto &stream = d_plan->stream; - - if (d_plan->opts.gpu_sort) { - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - if (bin_size_x < 0 || bin_size_y < 0) { - std::cerr << "[cuspread2d_nuptsdriven_prop] error: invalid binsize (binsizex, binsizey) = ("; - std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl; - return FINUFFT_ERR_BINSIZE_NOTVALID; - } - - int numbins[2]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_idxnupts = d_plan->idxnupts; - - int ier; - if ((ier = checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * sizeof(int), stream)))) - return ier; - - calc_bin_size_noghost_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx, d_ky, d_sortidx); - RETURN_IF_CUDA_ERROR - - int n = numbins[0] * numbins[1]; - thrust::device_ptr d_ptr(d_binsize); - thrust::device_ptr d_result(d_binstartpts); - thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - calc_inverse_of_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx, d_ky, d_idxnupts, nf1, nf2); - RETURN_IF_CUDA_ERROR - } else { - int *d_idxnupts = d_plan->idxnupts; - - trivial_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - - return 0; -} - -template -int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - dim3 threadsPerBlock; - dim3 blocks; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - int *d_idxnupts = d_plan->idxnupts; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - T sigma = d_plan->spopts.upsampfac; - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - threadsPerBlock.x = 16; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - spread_2d_nupts_driven - <<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, - es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - spread_2d_nupts_driven - <<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, - es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } + auto &stream = d_plan->stream; - return 0; -} - -template -int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan) -/* - This function determines the properties for spreading that are independent - of the strength of the nodes, only relates to the locations of the nodes, - which only needs to be done once. -*/ -{ - auto &stream = d_plan->stream; - - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + if (d_plan->opts.gpu_sort) { int bin_size_x = d_plan->opts.gpu_binsizex; int bin_size_y = d_plan->opts.gpu_binsizey; if (bin_size_x < 0 || bin_size_y < 0) { - std::cerr << "[cuspread2d_subprob_prop] error: invalid binsize (binsizex, binsizey) = ("; - std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl; - return FINUFFT_ERR_BINSIZE_NOTVALID; + std::cerr << "[cuspread2d_nuptsdriven_prop] error: invalid binsize (binsizex, " + "binsizey) = ("; + std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; } + int numbins[2]; numbins[0] = ceil((T)nf1 / bin_size_x); numbins[1] = ceil((T)nf2 / bin_size_y); @@ -167,21 +72,19 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan T *d_kx = d_plan->kx; T *d_ky = d_plan->ky; - int *d_binsize = d_plan->binsize; + int *d_binsize = d_plan->binsize; int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; - - int *d_subprob_to_bin = NULL; + int *d_sortidx = d_plan->sortidx; + int *d_idxnupts = d_plan->idxnupts; int ier; - if ((ier = checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * sizeof(int), stream)))) - return ier; + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_binsize, 0, numbins[0] * numbins[1] * sizeof(int), stream)))) + return ier; calc_bin_size_noghost_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx, d_ky, d_sortidx); + M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx, + d_ky, d_sortidx); RETURN_IF_CUDA_ERROR int n = numbins[0] * numbins[1]; @@ -190,110 +93,226 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); calc_inverse_of_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx, d_ky, d_idxnupts, - nf1, nf2); - RETURN_IF_CUDA_ERROR - calc_subprob_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(d_binsize, d_numsubprob, maxsubprobsize, - numbins[0] * numbins[1]); + M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx, + d_ky, d_idxnupts, nf1, nf2); RETURN_IF_CUDA_ERROR + } else { + int *d_idxnupts = d_plan->idxnupts; - d_ptr = thrust::device_pointer_cast(d_numsubprob); - d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); - thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) - return ier; - - int totalnumsubprob; - if ((ier = checkCudaErrors( - cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream)))) - return ier; - cudaStreamSynchronize(stream); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))) - return ier; - map_b_into_subprob_2d<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, stream>>>( - d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1]); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_subprob_to_bin); - return FINUFFT_ERR_CUDA_FAILURE; - } - - assert(d_subprob_to_bin != NULL); - cudaFreeAsync(d_plan->subprob_to_bin, stream); - d_plan->subprob_to_bin = d_subprob_to_bin; - d_plan->totalnumsubprob = totalnumsubprob; + trivial_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, + d_idxnupts); + RETURN_IF_CUDA_ERROR + } - return 0; + return 0; } -template -int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - - // assume that bin_size_x > ns/2; - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int numbins[2]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); +template +int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + dim3 threadsPerBlock; + dim3 blocks; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + int *d_idxnupts = d_plan->idxnupts; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + T sigma = d_plan->spopts.upsampfac; + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + threadsPerBlock.x = 16; + threadsPerBlock.y = 1; + blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; + blocks.y = 1; + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + spread_2d_nupts_driven<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } else { + for (int t = 0; t < blksize; t++) { + spread_2d_nupts_driven<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; + return 0; +} - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; +template +int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan) +/* + This function determines the properties for spreading that are independent + of the strength of the nodes, only relates to the locations of the nodes, + which only needs to be done once. +*/ +{ + auto &stream = d_plan->stream; + + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + if (bin_size_x < 0 || bin_size_y < 0) { + std::cerr + << "[cuspread2d_subprob_prop] error: invalid binsize (binsizex, binsizey) = ("; + std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; + } + int numbins[2]; + numbins[0] = ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 / bin_size_y); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_sortidx = d_plan->sortidx; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + + int *d_subprob_to_bin = NULL; + + int ier; + if ((ier = checkCudaErrors( + cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * sizeof(int), stream)))) + return ier; - int totalnumsubprob = d_plan->totalnumsubprob; - int *d_subprob_to_bin = d_plan->subprob_to_bin; + calc_bin_size_noghost_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx, d_ky, + d_sortidx); + RETURN_IF_CUDA_ERROR + + int n = numbins[0] * numbins[1]; + thrust::device_ptr d_ptr(d_binsize); + thrust::device_ptr d_result(d_binstartpts); + thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + + calc_inverse_of_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx, + d_ky, d_idxnupts, nf1, nf2); + RETURN_IF_CUDA_ERROR + calc_subprob_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + d_binsize, d_numsubprob, maxsubprobsize, numbins[0] * numbins[1]); + RETURN_IF_CUDA_ERROR + + d_ptr = thrust::device_pointer_cast(d_numsubprob); + d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); + thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + + if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) + return ier; - T sigma = d_plan->opts.upsampfac; + int totalnumsubprob; + if ((ier = + checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], + sizeof(int), cudaMemcpyDeviceToHost, stream)))) + return ier; + cudaStreamSynchronize(stream); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))) + return ier; + map_b_into_subprob_2d<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, stream>>>( + d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1]); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_subprob_to_bin); + return FINUFFT_ERR_CUDA_FAILURE; + } + + assert(d_subprob_to_bin != NULL); + cudaFreeAsync(d_plan->subprob_to_bin, stream); + d_plan->subprob_to_bin = d_subprob_to_bin; + d_plan->totalnumsubprob = totalnumsubprob; + + return 0; +} - size_t sharedplanorysize = - (bin_size_x + 2 * (int)ceil(ns / 2.0)) * (bin_size_y + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread2d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; +template +int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + + // assume that bin_size_x > ns/2; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int numbins[2]; + numbins[0] = ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 / bin_size_y); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + + int totalnumsubprob = d_plan->totalnumsubprob; + int *d_subprob_to_bin = d_plan->subprob_to_bin; + + T sigma = d_plan->opts.upsampfac; + + size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) * + (bin_size_y + 2 * (int)ceil(ns / 2.0)) * + sizeof(cuda_complex); + if (sharedplanorysize > 49152) { + std::cerr << "[cuspread2d_subprob] error: not enough shared memory\n"; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + spread_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, + d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], + d_idxnupts); + RETURN_IF_CUDA_ERROR } - - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - spread_2d_subprob<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, - d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, - numbins[0], numbins[1], d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - spread_2d_subprob<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, - d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, - numbins[0], numbins[1], d_idxnupts); - RETURN_IF_CUDA_ERROR - } + } else { + for (int t = 0; t < blksize; t++) { + spread_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, + d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], + d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } template int cuspread2d(cufinufft_plan_t *d_plan, int blksize); template int cuspread2d(cufinufft_plan_t *d_plan, int blksize); -template int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); +template int cuspread2d_subprob_prop(int nf1, int nf2, int M, + cufinufft_plan_t *d_plan); +template int cuspread2d_subprob_prop(int nf1, int nf2, int M, + cufinufft_plan_t *d_plan); +template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, + cufinufft_plan_t *d_plan); +template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, + cufinufft_plan_t *d_plan); } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index fa02ef860..41d69b03f 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -15,8 +15,9 @@ using namespace cufinufft::deconvolve; using namespace cufinufft::spreadinterp; using std::min; -template -int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* 3D Type-1 NUFFT @@ -30,42 +31,43 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 07/25/19 */ { - auto &stream = d_plan->stream; - int ier; - cuda_complex *d_fkstart; - cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu; - - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; - - if ((ier = checkCudaErrors(cudaMemsetAsync( - d_plan->fw, 0, - d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * d_plan->nf3 * sizeof(cuda_complex), stream)))) - return ier; - - // Step 1: Spread - if ((ier = cuspread3d(d_plan, blksize))) - return ier; - - // Step 2: FFT - cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) - return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle - if ((ier = cudeconvolve3d(d_plan, blksize))) - return ier; - } - - return 0; + auto &stream = d_plan->stream; + int ier; + cuda_complex *d_fkstart; + cuda_complex *d_cstart; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu; + + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; + + if ((ier = checkCudaErrors( + cudaMemsetAsync(d_plan->fw, 0, + d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * + d_plan->nf3 * sizeof(cuda_complex), + stream)))) + return ier; + + // Step 1: Spread + if ((ier = cuspread3d(d_plan, blksize))) return ier; + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + + // Step 3: deconvolve and shuffle + if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; + } + + return 0; } -template -int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* 3D Type-2 NUFFT @@ -79,41 +81,41 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 07/25/19 */ { - int ier; - cuda_complex *d_fkstart; - cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu; - - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; - - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if ((ier = cudeconvolve3d(d_plan, blksize))) - return ier; - - // Step 2: FFT - RETURN_IF_CUDA_ERROR - cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) - return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle - if ((ier = cuinterp3d(d_plan, blksize))) - return ier; - } - - return 0; + int ier; + cuda_complex *d_fkstart; + cuda_complex *d_cstart; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu; + + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; + + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; + + // Step 2: FFT + RETURN_IF_CUDA_ERROR + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + + // Step 3: deconvolve and shuffle + if ((ier = cuinterp3d(d_plan, blksize))) return ier; + } + + return 0; } template int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, +template int cufinufft3d1_exec(cuda_complex *d_c, + cuda_complex *d_fk, cufinufft_plan_t *d_plan); template int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, +template int cufinufft3d2_exec(cuda_complex *d_c, + cuda_complex *d_fk, cufinufft_plan_t *d_plan); diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu index 9cdceccd0..b42231d86 100644 --- a/src/cuda/3d/interp3d_wrapper.cu +++ b/src/cuda/3d/interp3d_wrapper.cu @@ -14,7 +14,7 @@ using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { -template +template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize) /* A wrapper for different interpolation methods. @@ -26,141 +26,147 @@ int cuinterp3d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 07/25/19 */ { - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nf3 = d_plan->nf3; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuinterp3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); - } break; - case 2: { - ier = cuinterp3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuinterp3d] error: incorrect method, should be 1,2\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - - return ier; + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int nf3 = d_plan->nf3; + int M = d_plan->M; + + int ier; + switch (d_plan->opts.gpu_method) { + case 1: { + ier = cuinterp3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); + } break; + case 2: { + ier = cuinterp3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); + } break; + default: + std::cerr << "[cuinterp3d] error: incorrect method, should be 1,2\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } + + return ier; } -template -int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - dim3 threadsPerBlock; - dim3 blocks; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - T sigma = d_plan->spopts.upsampfac; - - int *d_idxnupts = d_plan->idxnupts; - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - threadsPerBlock.x = 16; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; - - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - interp_3d_nupts_driven - <<>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, - ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - interp_3d_nupts_driven - <<>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, - ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } +template +int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; - return 0; -} + dim3 threadsPerBlock; + dim3 blocks; -template -int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - - // assume that bin_size_x > ns/2; - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int bin_size_z = d_plan->opts.gpu_binsizez; - int numbins[3]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - numbins[2] = ceil((T)nf3 / bin_size_z); - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; - int *d_subprob_to_bin = d_plan->subprob_to_bin; - int totalnumsubprob = d_plan->totalnumsubprob; - - T sigma = d_plan->spopts.upsampfac; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * - (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuinterp3d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + T sigma = d_plan->spopts.upsampfac; + + int *d_idxnupts = d_plan->idxnupts; + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + threadsPerBlock.x = 16; + threadsPerBlock.y = 1; + blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; + blocks.y = 1; + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + interp_3d_nupts_driven<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } else { for (int t = 0; t < blksize; t++) { - if (d_plan->opts.gpu_kerevalmeth == 1) { - interp_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, - d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts); - RETURN_IF_CUDA_ERROR - } else { - interp_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, - d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts); - RETURN_IF_CUDA_ERROR - } + interp_3d_nupts_driven<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } + + return 0; +} + +template +int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + + // assume that bin_size_x > ns/2; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int bin_size_z = d_plan->opts.gpu_binsizez; + int numbins[3]; + numbins[0] = ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 / bin_size_y); + numbins[2] = ceil((T)nf3 / bin_size_z); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + int *d_subprob_to_bin = d_plan->subprob_to_bin; + int totalnumsubprob = d_plan->totalnumsubprob; + + T sigma = d_plan->spopts.upsampfac; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * + (bin_size_y + 2 * ceil(ns / 2.0)) * + (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); + if (sharedplanorysize > 49152) { + std::cerr << "[cuinterp3d_subprob] error: not enough shared memory\n"; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + + for (int t = 0; t < blksize; t++) { + if (d_plan->opts.gpu_kerevalmeth == 1) { + interp_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, + bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, + numbins[0], numbins[1], numbins[2], d_idxnupts); + RETURN_IF_CUDA_ERROR + } else { + interp_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, + bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, + numbins[0], numbins[1], numbins[2], d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); -template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, - int blksize); -template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, - int blksize); +template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan, int blksize); +template int cuinterp3d_nuptsdriven( + int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); -template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); -template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, - int blksize); +template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan, int blksize); +template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan, int blksize); } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index 13d435e28..fb5ab0495 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -18,7 +18,7 @@ using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { -template +template int cuspread3d(cufinufft_plan_t *d_plan, int blksize) /* A wrapper for different spreading methods. @@ -31,521 +31,551 @@ int cuspread3d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 07/25/19 */ { - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nf3 = d_plan->nf3; - int M = d_plan->M; - - int ier = 0; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuspread3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); - } break; - case 2: { - ier = cuspread3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); - } break; - case 4: { - ier = cuspread3d_blockgather(nf1, nf2, nf3, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuspread3d] error: incorrect method, should be 1,2,4" << std::endl; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - - return ier; -} - -template -int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan) { - auto &stream = d_plan->stream; - - if (d_plan->opts.gpu_sort) { - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int bin_size_z = d_plan->opts.gpu_binsizez; - if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) { - std::cerr << "[cuspread3d_nuptsdriven_prop] error: invalid binsize (binsizex, binsizey, binsizez) = ("; - std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" << std::endl; - return FINUFFT_ERR_BINSIZE_NOTVALID; - } - - int numbins[3]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - numbins[2] = ceil((T)nf3 / bin_size_z); - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; - - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_idxnupts = d_plan->idxnupts; - - int ier; - if ((ier = checkCudaErrors( - cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream)))) - return ier; - calc_bin_size_noghost_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binsize, d_kx, - d_ky, d_kz, d_sortidx); - RETURN_IF_CUDA_ERROR - - int n = numbins[0] * numbins[1] * numbins[2]; - thrust::device_ptr d_ptr(d_binsize); - thrust::device_ptr d_result(d_binstartpts); - thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - calc_inverse_of_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binstartpts, d_sortidx, d_kx, - d_ky, d_kz, d_idxnupts, nf1, nf2, nf3); - RETURN_IF_CUDA_ERROR - } else { - int *d_idxnupts = d_plan->idxnupts; - - trivial_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - - return 0; -} - -template -int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - dim3 threadsPerBlock; - dim3 blocks; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T sigma = d_plan->spopts.upsampfac; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - - int *d_idxnupts = d_plan->idxnupts; - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - threadsPerBlock.x = 16; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; - - if (d_plan->opts.gpu_kerevalmeth == 1) { - for (int t = 0; t < blksize; t++) { - spread_3d_nupts_driven - <<>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, - ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - spread_3d_nupts_driven - <<>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, - ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } - - return 0; + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int nf3 = d_plan->nf3; + int M = d_plan->M; + + int ier = 0; + switch (d_plan->opts.gpu_method) { + case 1: { + ier = cuspread3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); + } break; + case 2: { + ier = cuspread3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); + } break; + case 4: { + ier = cuspread3d_blockgather(nf1, nf2, nf3, M, d_plan, blksize); + } break; + default: + std::cerr << "[cuspread3d] error: incorrect method, should be 1,2,4" << std::endl; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } + + return ier; } -template -int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan) { - auto &stream = d_plan->stream; - - dim3 threadsPerBlock; - dim3 blocks; - - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - int o_bin_size_x = d_plan->opts.gpu_obinsizex; - int o_bin_size_y = d_plan->opts.gpu_obinsizey; - int o_bin_size_z = d_plan->opts.gpu_obinsizez; - - int numobins[3]; - if (nf1 % o_bin_size_x != 0 || nf2 % o_bin_size_y != 0 || nf3 % o_bin_size_z != 0) { - std::cerr << "[cuspread3d_blockgather_prop] error:\n"; - std::cerr << " mod(nf(1|2|3), opts.gpu_obinsize(x|y|z)) != 0" << std::endl; - std::cerr << " (nf1, nf2, nf3) = (" << nf1 << ", " << nf2 << ", " << nf3 << ")" << std::endl; - std::cerr << " (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " << o_bin_size_y << ", " - << o_bin_size_z << ")" << std::endl; - return FINUFFT_ERR_BINSIZE_NOTVALID; - } - - numobins[0] = ceil((T)nf1 / o_bin_size_x); - numobins[1] = ceil((T)nf2 / o_bin_size_y); - numobins[2] = ceil((T)nf3 / o_bin_size_z); +template +int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan) { + auto &stream = d_plan->stream; + if (d_plan->opts.gpu_sort) { int bin_size_x = d_plan->opts.gpu_binsizex; int bin_size_y = d_plan->opts.gpu_binsizey; int bin_size_z = d_plan->opts.gpu_binsizez; - if (o_bin_size_x % bin_size_x != 0 || o_bin_size_y % bin_size_y != 0 || o_bin_size_z % bin_size_z != 0) { - std::cerr << "[cuspread3d_blockgather_prop] error:\n"; - std::cerr << " mod(ops.gpu_obinsize(x|y|z), opts.gpu_binsize(x|y|z)) != 0" << std::endl; - std::cerr << " (binsizex, binsizey, binsizez) = (" << bin_size_x << ", " << bin_size_y << ", " - << bin_size_z << ")" << std::endl; - std::cerr << " (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " << o_bin_size_y << ", " - << o_bin_size_z << ")" << std::endl; - return FINUFFT_ERR_BINSIZE_NOTVALID; + if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) { + std::cerr << "[cuspread3d_nuptsdriven_prop] error: invalid binsize (binsizex, " + "binsizey, binsizez) = ("; + std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" + << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; } - int binsperobinx, binsperobiny, binsperobinz; int numbins[3]; - binsperobinx = o_bin_size_x / bin_size_x + 2; - binsperobiny = o_bin_size_y / bin_size_y + 2; - binsperobinz = o_bin_size_z / bin_size_z + 2; - numbins[0] = numobins[0] * (binsperobinx); - numbins[1] = numobins[1] * (binsperobiny); - numbins[2] = numobins[2] * (binsperobinz); + numbins[0] = ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 / bin_size_y); + numbins[2] = ceil((T)nf3 / bin_size_z); T *d_kx = d_plan->kx; T *d_ky = d_plan->ky; T *d_kz = d_plan->kz; - int *d_binsize = d_plan->binsize; - int *d_sortidx = d_plan->sortidx; + int *d_binsize = d_plan->binsize; int *d_binstartpts = d_plan->binstartpts; - int *d_numsubprob = d_plan->numsubprob; - int *d_idxnupts = NULL; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_subprob_to_bin = NULL; + int *d_sortidx = d_plan->sortidx; + int *d_idxnupts = d_plan->idxnupts; int ier; - if ((ier = checkCudaErrors( - cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream)))) - return ier; - - locate_nupts_to_bins_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2], binsperobinx, binsperobiny, - binsperobinz, d_binsize, d_kx, d_ky, d_kz, d_sortidx, nf1, nf2, nf3); - RETURN_IF_CUDA_ERROR - - threadsPerBlock.x = 8; - threadsPerBlock.y = 8; - threadsPerBlock.z = 8; - - blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; - blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; - blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z; - - fill_ghost_bins<<>>(binsperobinx, binsperobiny, binsperobinz, numobins[0], - numobins[1], numobins[2], d_binsize); + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream)))) + return ier; + calc_bin_size_noghost_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], + numbins[2], d_binsize, d_kx, d_ky, d_kz, d_sortidx); RETURN_IF_CUDA_ERROR int n = numbins[0] * numbins[1] * numbins[2]; thrust::device_ptr d_ptr(d_binsize); - thrust::device_ptr d_result(d_binstartpts + 1); - thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - if ((ier = checkCudaErrors(cudaMemsetAsync(d_binstartpts, 0, sizeof(int), stream)))) - return ier; - - int totalNUpts; - if ((ier = checkCudaErrors( - cudaMemcpyAsync(&totalNUpts, &d_binstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream)))) - return ier; - cudaStreamSynchronize(stream); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_idxnupts, totalNUpts * sizeof(int), stream)))) - return ier; - - calc_inverse_of_global_sort_index_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2], binsperobinx, binsperobiny, - binsperobinz, d_binstartpts, d_sortidx, d_kx, d_ky, d_kz, d_idxnupts, nf1, nf2, nf3); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_idxnupts); - return FINUFFT_ERR_CUDA_FAILURE; - } - - threadsPerBlock.x = 2; - threadsPerBlock.y = 2; - threadsPerBlock.z = 2; - - blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; - blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; - blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z; - - ghost_bin_pts_index<<>>(binsperobinx, binsperobiny, binsperobinz, numobins[0], - numobins[1], numobins[2], d_binsize, d_idxnupts, - d_binstartpts, M); - err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_idxnupts); - return FINUFFT_ERR_CUDA_FAILURE; - } - - cudaFree(d_plan->idxnupts); - d_plan->idxnupts = d_idxnupts; + thrust::device_ptr d_result(d_binstartpts); + thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - /* --------------------------------------------- */ - // Determining Subproblem properties // - /* --------------------------------------------- */ - n = numobins[0] * numobins[1] * numobins[2]; - calc_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>(binsperobinx, binsperobiny, binsperobinz, d_binsize, - d_numsubprob, maxsubprobsize, - numobins[0] * numobins[1] * numobins[2]); + calc_inverse_of_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], + d_binstartpts, d_sortidx, d_kx, d_ky, d_kz, d_idxnupts, nf1, nf2, nf3); RETURN_IF_CUDA_ERROR + } else { + int *d_idxnupts = d_plan->idxnupts; - n = numobins[0] * numobins[1] * numobins[2]; - d_ptr = thrust::device_pointer_cast(d_numsubprob); - d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); - thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) - return ier; - - int totalnumsubprob; - if ((ier = checkCudaErrors( - cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream)))) - return ier; - cudaStreamSynchronize(stream); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))) - return ier; - map_b_into_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>(d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, n); - err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_subprob_to_bin); - return FINUFFT_ERR_CUDA_FAILURE; - } - - assert(d_subprob_to_bin != NULL); - cudaFree(d_plan->subprob_to_bin); - d_plan->subprob_to_bin = d_subprob_to_bin; - d_plan->totalnumsubprob = totalnumsubprob; + trivial_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, + d_idxnupts); + RETURN_IF_CUDA_ERROR + } - return 0; + return 0; } -template -int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - int ns = d_plan->spopts.nspread; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - T sigma = d_plan->spopts.upsampfac; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - - int obin_size_x = d_plan->opts.gpu_obinsizex; - int obin_size_y = d_plan->opts.gpu_obinsizey; - int obin_size_z = d_plan->opts.gpu_obinsizez; - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int bin_size_z = d_plan->opts.gpu_binsizez; - int numobins[3]; - numobins[0] = ceil((T)nf1 / obin_size_x); - numobins[1] = ceil((T)nf2 / obin_size_y); - numobins[2] = ceil((T)nf3 / obin_size_z); +template +int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; - int binsperobinx, binsperobiny, binsperobinz; - binsperobinx = obin_size_x / bin_size_x + 2; - binsperobiny = obin_size_y / bin_size_y + 2; - binsperobinz = obin_size_z / bin_size_z + 2; + dim3 threadsPerBlock; + dim3 blocks; - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T sigma = d_plan->spopts.upsampfac; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; - int *d_binstartpts = d_plan->binstartpts; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; + int *d_idxnupts = d_plan->idxnupts; + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; - int totalnumsubprob = d_plan->totalnumsubprob; - int *d_subprob_to_bin = d_plan->subprob_to_bin; + threadsPerBlock.x = 16; + threadsPerBlock.y = 1; + blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; + blocks.y = 1; - size_t sharedplanorysize = obin_size_x * obin_size_y * obin_size_z * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread3d_blockgather] error: not enough shared memory" << std::endl; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; + if (d_plan->opts.gpu_kerevalmeth == 1) { + for (int t = 0; t < blksize; t++) { + spread_3d_nupts_driven<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR } - + } else { for (int t = 0; t < blksize; t++) { - if (d_plan->opts.gpu_kerevalmeth == 1) { - spread_3d_block_gather<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, - d_binstartpts, obin_size_x, obin_size_y, obin_size_z, binsperobinx * binsperobiny * binsperobinz, - d_subprob_to_bin, d_subprobstartpts, maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts); - RETURN_IF_CUDA_ERROR - } else { - spread_3d_block_gather<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, - d_binstartpts, obin_size_x, obin_size_y, obin_size_z, binsperobinx * binsperobiny * binsperobinz, - d_subprob_to_bin, d_subprobstartpts, maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts); - RETURN_IF_CUDA_ERROR - } + spread_3d_nupts_driven<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } -template -int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan) { - auto &stream = d_plan->stream; +template +int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan) { + auto &stream = d_plan->stream; + + dim3 threadsPerBlock; + dim3 blocks; + + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + int o_bin_size_x = d_plan->opts.gpu_obinsizex; + int o_bin_size_y = d_plan->opts.gpu_obinsizey; + int o_bin_size_z = d_plan->opts.gpu_obinsizez; + + int numobins[3]; + if (nf1 % o_bin_size_x != 0 || nf2 % o_bin_size_y != 0 || nf3 % o_bin_size_z != 0) { + std::cerr << "[cuspread3d_blockgather_prop] error:\n"; + std::cerr << " mod(nf(1|2|3), opts.gpu_obinsize(x|y|z)) != 0" << std::endl; + std::cerr << " (nf1, nf2, nf3) = (" << nf1 << ", " << nf2 << ", " << nf3 << ")" + << std::endl; + std::cerr << " (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " + << o_bin_size_y << ", " << o_bin_size_z << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; + } + + numobins[0] = ceil((T)nf1 / o_bin_size_x); + numobins[1] = ceil((T)nf2 / o_bin_size_y); + numobins[2] = ceil((T)nf3 / o_bin_size_z); + + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int bin_size_z = d_plan->opts.gpu_binsizez; + if (o_bin_size_x % bin_size_x != 0 || o_bin_size_y % bin_size_y != 0 || + o_bin_size_z % bin_size_z != 0) { + std::cerr << "[cuspread3d_blockgather_prop] error:\n"; + std::cerr << " mod(ops.gpu_obinsize(x|y|z), opts.gpu_binsize(x|y|z)) != 0" + << std::endl; + std::cerr << " (binsizex, binsizey, binsizez) = (" << bin_size_x << ", " + << bin_size_y << ", " << bin_size_z << ")" << std::endl; + std::cerr << " (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " + << o_bin_size_y << ", " << o_bin_size_z << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; + } + + int binsperobinx, binsperobiny, binsperobinz; + int numbins[3]; + binsperobinx = o_bin_size_x / bin_size_x + 2; + binsperobiny = o_bin_size_y / bin_size_y + 2; + binsperobinz = o_bin_size_z / bin_size_z + 2; + numbins[0] = numobins[0] * (binsperobinx); + numbins[1] = numobins[1] * (binsperobiny); + numbins[2] = numobins[2] * (binsperobinz); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + + int *d_binsize = d_plan->binsize; + int *d_sortidx = d_plan->sortidx; + int *d_binstartpts = d_plan->binstartpts; + int *d_numsubprob = d_plan->numsubprob; + int *d_idxnupts = NULL; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_subprob_to_bin = NULL; + + int ier; + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream)))) + return ier; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int bin_size_z = d_plan->opts.gpu_binsizez; - if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) { - std::cerr << "error: invalid binsize (binsizex, binsizey, binsizez) = ("; - std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" << std::endl; - return FINUFFT_ERR_BINSIZE_NOTVALID; - } + locate_nupts_to_bins_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2], + binsperobinx, binsperobiny, binsperobinz, d_binsize, d_kx, d_ky, d_kz, d_sortidx, + nf1, nf2, nf3); + RETURN_IF_CUDA_ERROR - int numbins[3]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - numbins[2] = ceil((T)nf3 / bin_size_z); + threadsPerBlock.x = 8; + threadsPerBlock.y = 8; + threadsPerBlock.z = 8; - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; + blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; + blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; + blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z; - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; + fill_ghost_bins<<>>( + binsperobinx, binsperobiny, binsperobinz, numobins[0], numobins[1], numobins[2], + d_binsize); + RETURN_IF_CUDA_ERROR - int *d_subprob_to_bin = NULL; + int n = numbins[0] * numbins[1] * numbins[2]; + thrust::device_ptr d_ptr(d_binsize); + thrust::device_ptr d_result(d_binstartpts + 1); + thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - int ier; - if ((ier = checkCudaErrors( - cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream)))) - return ier; - calc_bin_size_noghost_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binsize, d_kx, d_ky, - d_kz, d_sortidx); - RETURN_IF_CUDA_ERROR - - int n = numbins[0] * numbins[1] * numbins[2]; - thrust::device_ptr d_ptr(d_binsize); - thrust::device_ptr d_result(d_binstartpts); - thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - calc_inverse_of_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binstartpts, d_sortidx, d_kx, d_ky, - d_kz, d_idxnupts, nf1, nf2, nf3); - RETURN_IF_CUDA_ERROR - /* --------------------------------------------- */ - // Determining Subproblem properties // - /* --------------------------------------------- */ - calc_subprob_3d_v2<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(d_binsize, d_numsubprob, maxsubprobsize, - numbins[0] * numbins[1] * numbins[2]); - RETURN_IF_CUDA_ERROR + if ((ier = checkCudaErrors(cudaMemsetAsync(d_binstartpts, 0, sizeof(int), stream)))) + return ier; - d_ptr = thrust::device_pointer_cast(d_numsubprob); - d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); - thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - int totalnumsubprob; - if (checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)) || - checkCudaErrors( - cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream) - ) - ) - return FINUFFT_ERR_CUDA_FAILURE; - cudaStreamSynchronize(stream); - if(checkCudaErrors(cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))) - return FINUFFT_ERR_CUDA_FAILURE; - - map_b_into_subprob_3d_v2<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, stream>>>( - d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1] * numbins[2]); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_subprob_to_bin); - return FINUFFT_ERR_CUDA_FAILURE; - } + int totalNUpts; + if ((ier = checkCudaErrors(cudaMemcpyAsync(&totalNUpts, &d_binstartpts[n], sizeof(int), + cudaMemcpyDeviceToHost, stream)))) + return ier; + cudaStreamSynchronize(stream); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_idxnupts, totalNUpts * sizeof(int), stream)))) + return ier; - assert(d_subprob_to_bin != NULL); - if (d_plan->subprob_to_bin != NULL) - cudaFree(d_plan->subprob_to_bin); - d_plan->subprob_to_bin = d_subprob_to_bin; - assert(d_plan->subprob_to_bin != nullptr); - d_plan->totalnumsubprob = totalnumsubprob; + calc_inverse_of_global_sort_index_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2], + binsperobinx, binsperobiny, binsperobinz, d_binstartpts, d_sortidx, d_kx, d_ky, + d_kz, d_idxnupts, nf1, nf2, nf3); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_idxnupts); + return FINUFFT_ERR_CUDA_FAILURE; + } + + threadsPerBlock.x = 2; + threadsPerBlock.y = 2; + threadsPerBlock.z = 2; + + blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; + blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; + blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z; + + ghost_bin_pts_index<<>>( + binsperobinx, binsperobiny, binsperobinz, numobins[0], numobins[1], numobins[2], + d_binsize, d_idxnupts, d_binstartpts, M); + err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_idxnupts); + return FINUFFT_ERR_CUDA_FAILURE; + } + + cudaFree(d_plan->idxnupts); + d_plan->idxnupts = d_idxnupts; + + /* --------------------------------------------- */ + // Determining Subproblem properties // + /* --------------------------------------------- */ + n = numobins[0] * numobins[1] * numobins[2]; + calc_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>( + binsperobinx, binsperobiny, binsperobinz, d_binsize, d_numsubprob, maxsubprobsize, + numobins[0] * numobins[1] * numobins[2]); + RETURN_IF_CUDA_ERROR + + n = numobins[0] * numobins[1] * numobins[2]; + d_ptr = thrust::device_pointer_cast(d_numsubprob); + d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); + thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + + if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) + return ier; - return 0; + int totalnumsubprob; + if ((ier = + checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], + sizeof(int), cudaMemcpyDeviceToHost, stream)))) + return ier; + cudaStreamSynchronize(stream); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))) + return ier; + map_b_into_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>( + d_subprob_to_bin, d_subprobstartpts, d_numsubprob, n); + err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_subprob_to_bin); + return FINUFFT_ERR_CUDA_FAILURE; + } + + assert(d_subprob_to_bin != NULL); + cudaFree(d_plan->subprob_to_bin); + d_plan->subprob_to_bin = d_subprob_to_bin; + d_plan->totalnumsubprob = totalnumsubprob; + + return 0; } -template -int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - - // assume that bin_size_x > ns/2; - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int bin_size_z = d_plan->opts.gpu_binsizez; - int numbins[3]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - numbins[2] = ceil((T)nf3 / bin_size_z); - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; +template +int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + + int ns = d_plan->spopts.nspread; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + T sigma = d_plan->spopts.upsampfac; + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + + int obin_size_x = d_plan->opts.gpu_obinsizex; + int obin_size_y = d_plan->opts.gpu_obinsizey; + int obin_size_z = d_plan->opts.gpu_obinsizez; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int bin_size_z = d_plan->opts.gpu_binsizez; + int numobins[3]; + numobins[0] = ceil((T)nf1 / obin_size_x); + numobins[1] = ceil((T)nf2 / obin_size_y); + numobins[2] = ceil((T)nf3 / obin_size_z); + + int binsperobinx, binsperobiny, binsperobinz; + binsperobinx = obin_size_x / bin_size_x + 2; + binsperobiny = obin_size_y / bin_size_y + 2; + binsperobinz = obin_size_z / bin_size_z + 2; + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + int *d_binstartpts = d_plan->binstartpts; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + + int totalnumsubprob = d_plan->totalnumsubprob; + int *d_subprob_to_bin = d_plan->subprob_to_bin; + + size_t sharedplanorysize = + obin_size_x * obin_size_y * obin_size_z * sizeof(cuda_complex); + if (sharedplanorysize > 49152) { + std::cerr << "[cuspread3d_blockgather] error: not enough shared memory" << std::endl; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + + for (int t = 0; t < blksize; t++) { + if (d_plan->opts.gpu_kerevalmeth == 1) { + spread_3d_block_gather<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_binstartpts, obin_size_x, obin_size_y, obin_size_z, + binsperobinx * binsperobiny * binsperobinz, d_subprob_to_bin, d_subprobstartpts, + maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts); + RETURN_IF_CUDA_ERROR + } else { + spread_3d_block_gather<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_binstartpts, obin_size_x, obin_size_y, obin_size_z, + binsperobinx * binsperobiny * binsperobinz, d_subprob_to_bin, d_subprobstartpts, + maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; + return 0; +} - int totalnumsubprob = d_plan->totalnumsubprob; - int *d_subprob_to_bin = d_plan->subprob_to_bin; - - T sigma = d_plan->spopts.upsampfac; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * - (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread3d_subprob] error: not enough shared memory (" << sharedplanorysize << ")" << std::endl; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } +template +int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan) { + auto &stream = d_plan->stream; + + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int bin_size_z = d_plan->opts.gpu_binsizez; + if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) { + std::cerr << "error: invalid binsize (binsizex, binsizey, binsizez) = ("; + std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; + } + + int numbins[3]; + numbins[0] = ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 / bin_size_y); + numbins[2] = ceil((T)nf3 / bin_size_z); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_sortidx = d_plan->sortidx; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + + int *d_subprob_to_bin = NULL; + + int ier; + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream)))) + return ier; + calc_bin_size_noghost_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], + numbins[2], d_binsize, d_kx, d_ky, d_kz, d_sortidx); + RETURN_IF_CUDA_ERROR + + int n = numbins[0] * numbins[1] * numbins[2]; + thrust::device_ptr d_ptr(d_binsize); + thrust::device_ptr d_result(d_binstartpts); + thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + + calc_inverse_of_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], + d_binstartpts, d_sortidx, d_kx, d_ky, d_kz, d_idxnupts, nf1, nf2, nf3); + RETURN_IF_CUDA_ERROR + /* --------------------------------------------- */ + // Determining Subproblem properties // + /* --------------------------------------------- */ + calc_subprob_3d_v2<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + d_binsize, d_numsubprob, maxsubprobsize, numbins[0] * numbins[1] * numbins[2]); + RETURN_IF_CUDA_ERROR + + d_ptr = thrust::device_pointer_cast(d_numsubprob); + d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); + thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + int totalnumsubprob; + if (checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)) || + checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], + sizeof(int), cudaMemcpyDeviceToHost, stream))) + return FINUFFT_ERR_CUDA_FAILURE; + cudaStreamSynchronize(stream); + if (checkCudaErrors( + cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))) + return FINUFFT_ERR_CUDA_FAILURE; + + map_b_into_subprob_3d_v2<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, + stream>>>(d_subprob_to_bin, d_subprobstartpts, d_numsubprob, + numbins[0] * numbins[1] * numbins[2]); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_subprob_to_bin); + return FINUFFT_ERR_CUDA_FAILURE; + } + + assert(d_subprob_to_bin != NULL); + if (d_plan->subprob_to_bin != NULL) cudaFree(d_plan->subprob_to_bin); + d_plan->subprob_to_bin = d_subprob_to_bin; + assert(d_plan->subprob_to_bin != nullptr); + d_plan->totalnumsubprob = totalnumsubprob; + + return 0; +} - for (int t = 0; t < blksize; t++) { - if (d_plan->opts.gpu_kerevalmeth) { - spread_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, - d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts); - RETURN_IF_CUDA_ERROR - } else { - spread_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, - d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts); - RETURN_IF_CUDA_ERROR - } +template +int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + + // assume that bin_size_x > ns/2; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int bin_size_z = d_plan->opts.gpu_binsizez; + int numbins[3]; + numbins[0] = ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 / bin_size_y); + numbins[2] = ceil((T)nf3 / bin_size_z); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + + int totalnumsubprob = d_plan->totalnumsubprob; + int *d_subprob_to_bin = d_plan->subprob_to_bin; + + T sigma = d_plan->spopts.upsampfac; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * + (bin_size_y + 2 * ceil(ns / 2.0)) * + (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); + if (sharedplanorysize > 49152) { + std::cerr << "[cuspread3d_subprob] error: not enough shared memory (" + << sharedplanorysize << ")" << std::endl; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + + for (int t = 0; t < blksize; t++) { + if (d_plan->opts.gpu_kerevalmeth) { + spread_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, + bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, + numbins[0], numbins[1], numbins[2], d_idxnupts); + RETURN_IF_CUDA_ERROR + } else { + spread_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, + bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, + numbins[0], numbins[1], numbins[2], d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } template int cuspread3d(cufinufft_plan_t *d_plan, int blksize); template int cuspread3d(cufinufft_plan_t *d_plan, int blksize); -template int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); +template int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/common.cu b/src/cuda/common.cu index a83688693..a87628a38 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -25,39 +25,42 @@ using std::max; cnufftspread's real symmetric kernel. */ // a , f are intermediate results from function onedim_fseries_kernel_precomp() // (see cufinufft/contrib/common.cpp for description) -template -__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, cuDoubleComplex *a, T *fwkerhalf1, - T *fwkerhalf2, T *fwkerhalf3, int ns) { - T J2 = ns / 2.0; - int q = (int)(2 + 3.0 * J2); - int nf; - cuDoubleComplex *at = a + threadIdx.y * MAX_NQUAD; - T *ft = f + threadIdx.y * MAX_NQUAD; - T *oarr; - if (threadIdx.y == 0) { - oarr = fwkerhalf1; - nf = nf1; - } else if (threadIdx.y == 1) { - oarr = fwkerhalf2; - nf = nf2; - } else { - oarr = fwkerhalf3; - nf = nf3; - } - - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nf / 2 + 1; i += blockDim.x * gridDim.x) { - int brk = 0.5 + i; - T x = 0.0; - for (int n = 0; n < q; n++) { - x += ft[n] * 2 * (pow(cabs(at[n]), brk) * cos(brk * carg(at[n]))); - } - oarr[i] = x; +template +__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, + cuDoubleComplex *a, T *fwkerhalf1, T *fwkerhalf2, + T *fwkerhalf3, int ns) { + T J2 = ns / 2.0; + int q = (int)(2 + 3.0 * J2); + int nf; + cuDoubleComplex *at = a + threadIdx.y * MAX_NQUAD; + T *ft = f + threadIdx.y * MAX_NQUAD; + T *oarr; + if (threadIdx.y == 0) { + oarr = fwkerhalf1; + nf = nf1; + } else if (threadIdx.y == 1) { + oarr = fwkerhalf2; + nf = nf2; + } else { + oarr = fwkerhalf3; + nf = nf3; + } + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nf / 2 + 1; + i += blockDim.x * gridDim.x) { + int brk = 0.5 + i; + T x = 0.0; + for (int n = 0; n < q; n++) { + x += ft[n] * 2 * (pow(cabs(at[n]), brk) * cos(brk * carg(at[n]))); } + oarr[i] = x; + } } -template -int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, cuDoubleComplex *d_a, T *d_fwkerhalf1, - T *d_fwkerhalf2, T *d_fwkerhalf3, int ns, cudaStream_t stream) +template +int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, + cuDoubleComplex *d_a, T *d_fwkerhalf1, T *d_fwkerhalf2, + T *d_fwkerhalf3, int ns, cudaStream_t stream) /* wrapper for approximation of Fourier series of real symmetric spreading kernel. @@ -65,44 +68,43 @@ int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, cuDoubleC Melody Shih 2/20/22 */ { - int nout = max(max(nf1 / 2 + 1, nf2 / 2 + 1), nf3 / 2 + 1); + int nout = max(max(nf1 / 2 + 1, nf2 / 2 + 1), nf3 / 2 + 1); - dim3 threadsPerBlock(16, dim); - dim3 numBlocks((nout + 16 - 1) / 16, 1); + dim3 threadsPerBlock(16, dim); + dim3 numBlocks((nout + 16 - 1) / 16, 1); - fseries_kernel_compute<<>>(nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, - d_fwkerhalf2, d_fwkerhalf3, ns); - RETURN_IF_CUDA_ERROR + fseries_kernel_compute<<>>( + nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns); + RETURN_IF_CUDA_ERROR - return 0; + return 0; } -template +template int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts) // Set up the spreader parameters given eps, and pass across various nufft // options. Report status of setup_spreader. Barnett 10/30/17 { - int ier = setup_spreader(spopts, eps, (T)opts.upsampfac, opts.gpu_kerevalmeth); - return ier; + int ier = setup_spreader(spopts, eps, (T)opts.upsampfac, opts.gpu_kerevalmeth); + return ier; } -void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts, CUFINUFFT_BIGINT *nf, - CUFINUFFT_BIGINT bs) +void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts, + CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT bs) // type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts // and requested number of Fourier modes ms. { - *nf = (CUFINUFFT_BIGINT)(opts.upsampfac * ms); - if (*nf < 2 * spopts.nspread) - *nf = 2 * spopts.nspread; // otherwise spread fails - if (*nf < MAX_NF) { // otherwise will fail anyway - if (opts.gpu_method == 4) // expensive at huge nf - *nf = utils::next235beven(*nf, bs); - else - *nf = utils::next235beven(*nf, 1); - } + *nf = (CUFINUFFT_BIGINT)(opts.upsampfac * ms); + if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails + if (*nf < MAX_NF) { // otherwise will fail anyway + if (opts.gpu_method == 4) // expensive at huge nf + *nf = utils::next235beven(*nf, bs); + else + *nf = utils::next235beven(*nf, 1); + } } -template +template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts) /* Approximates exact Fourier series coeffs of cnufftspread's real symmetric @@ -129,10 +131,10 @@ void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opt Melody 2/20/22 separate into precomp & comp functions defined below. */ { - T f[MAX_NQUAD]; - std::complex a[MAX_NQUAD]; - onedim_fseries_kernel_precomp(nf, f, a, opts); - onedim_fseries_kernel_compute(nf, f, a, fwkerhalf, opts); + T f[MAX_NQUAD]; + std::complex a[MAX_NQUAD]; + onedim_fseries_kernel_precomp(nf, f, a, opts); + onedim_fseries_kernel_compute(nf, f, a, fwkerhalf, opts); } /* @@ -148,70 +150,81 @@ void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opt f - funciton values at quadrature nodes multiplied with quadrature weights (a, f are provided as the inputs of onedim_fseries_kernel_compute() defined below) */ -template -void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex *a, finufft_spread_opts opts) { - T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD - double z[2 * MAX_NQUAD]; - double w[2 * MAX_NQUAD]; - - finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n - z[n] *= J2; // rescale nodes - f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei - a[n] = exp((T)(2.0 * M_PI) * std::complex(0.0, 1.0) * (T)(nf / 2 - z[n]) / (T)nf); // phase winding rates - } +template +void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex *a, + finufft_spread_opts opts) { + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD + double z[2 * MAX_NQUAD]; + double w[2 * MAX_NQUAD]; + + finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg + // on (0,1) + for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n + z[n] *= J2; // rescale nodes + f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei + a[n] = exp((T)(2.0 * M_PI) * std::complex(0.0, 1.0) * (T)(nf / 2 - z[n]) / + (T)nf); // phase winding rates + } } -template -void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, T *fwkerhalf, - finufft_spread_opts opts) { - T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD - CUFINUFFT_BIGINT nout = nf / 2 + 1; // how many values we're writing to - int nt = std::min(nout, MY_OMP_GET_MAX_THREADS()); // how many chunks - std::vector brk(nt + 1); // start indices for each thread - for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads - brk[t] = (CUFINUFFT_BIGINT)(0.5 + nout * t / (double)nt); +template +void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, + T *fwkerhalf, finufft_spread_opts opts) { + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD + CUFINUFFT_BIGINT nout = nf / 2 + 1; // how many values we're writing to + int nt = std::min(nout, MY_OMP_GET_MAX_THREADS()); // how many chunks + std::vector brk(nt + 1); // start indices for each thread + for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads + brk[t] = (CUFINUFFT_BIGINT)(0.5 + nout * t / (double)nt); #pragma omp parallel - { - int t = MY_OMP_GET_THREAD_NUM(); - if (t < nt) { // could be nt < actual # threads - std::complex aj[MAX_NQUAD]; // phase rotator for this thread - for (int n = 0; n < q; ++n) - aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk - for (CUFINUFFT_BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array - T x = 0.0; // accumulator for answer at this j - for (int n = 0; n < q; ++n) { - x += f[n] * 2 * real(aj[n]); // include the negative freq - aj[n] *= a[n]; // wind the phases - } - fwkerhalf[j] = x; - } + { + int t = MY_OMP_GET_THREAD_NUM(); + if (t < nt) { // could be nt < actual # threads + std::complex aj[MAX_NQUAD]; // phase rotator for this thread + for (int n = 0; n < q; ++n) + aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk + for (CUFINUFFT_BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array + T x = 0.0; // accumulator for answer at this j + for (int n = 0; n < q; ++n) { + x += f[n] * 2 * real(aj[n]); // include the negative freq + aj[n] *= a[n]; // wind the phases } + fwkerhalf[j] = x; + } } + } } -template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, std::complex *a, float *fwkerhalf, +template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, + std::complex *a, float *fwkerhalf, finufft_spread_opts opts); -template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, double *f, std::complex *a, double *fwkerhalf, +template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, double *f, + std::complex *a, double *fwkerhalf, finufft_spread_opts opts); -template int setup_spreader_for_nufft(finufft_spread_opts &spopts, float eps, cufinufft_opts opts); -template int setup_spreader_for_nufft(finufft_spread_opts &spopts, double eps, cufinufft_opts opts); -template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, float *f, std::complex *a, - finufft_spread_opts opts); -template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, double *f, std::complex *a, - finufft_spread_opts opts); -template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, float *d_f, cuDoubleComplex *d_a, - float *d_fwkerhalf1, float *d_fwkerhalf2, float *d_fwkerhalf3, int ns, +template int setup_spreader_for_nufft(finufft_spread_opts &spopts, float eps, + cufinufft_opts opts); +template int setup_spreader_for_nufft(finufft_spread_opts &spopts, double eps, + cufinufft_opts opts); +template void onedim_fseries_kernel_precomp( + CUFINUFFT_BIGINT nf, float *f, std::complex *a, finufft_spread_opts opts); +template void onedim_fseries_kernel_precomp( + CUFINUFFT_BIGINT nf, double *f, std::complex *a, finufft_spread_opts opts); +template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, float *d_f, + cuDoubleComplex *d_a, float *d_fwkerhalf1, + float *d_fwkerhalf2, float *d_fwkerhalf3, int ns, cudaStream_t stream); -template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, double *d_f, cuDoubleComplex *d_a, - double *d_fwkerhalf1, double *d_fwkerhalf2, double *d_fwkerhalf3, int ns, +template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, double *d_f, + cuDoubleComplex *d_a, double *d_fwkerhalf1, + double *d_fwkerhalf2, double *d_fwkerhalf3, int ns, cudaStream_t stream); -template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf, finufft_spread_opts opts); -template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf, finufft_spread_opts opts); +template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf, + finufft_spread_opts opts); +template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf, + finufft_spread_opts opts); } // namespace common } // namespace cufinufft diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index 60cdd4482..a81e88780 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -7,76 +7,75 @@ #include inline bool is_invalid_mode_array(int dim, const int64_t *modes64, int32_t modes32[3]) { - int64_t tot_size = 1; - for (int i = 0; i < dim; ++i) { - if (modes64[i] > std::numeric_limits::max()) - return true; - if (modes64[i] <= 0) - return true; - modes32[i] = modes64[i]; - tot_size *= modes64[i]; - } - for (int i = dim; i < 3; ++i) - modes32[i] = 1; - - return tot_size > std::numeric_limits::max(); + int64_t tot_size = 1; + for (int i = 0; i < dim; ++i) { + if (modes64[i] > std::numeric_limits::max()) return true; + if (modes64[i] <= 0) return true; + modes32[i] = modes64[i]; + tot_size *= modes64[i]; + } + for (int i = dim; i < 3; ++i) modes32[i] = 1; + + return tot_size > std::numeric_limits::max(); } extern "C" { -int cufinufftf_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf, float tol, - cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts) { - if (dim < 1 || dim > 3) { - fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); - return FINUFFT_ERR_DIM_NOTVALID; - } - - int nmodes32[3]; - if (is_invalid_mode_array(dim, nmodes, nmodes32)) - return FINUFFT_ERR_NDATA_NOTVALID; - - return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol, (cufinufft_plan_t **)d_plan_ptr, - opts); +int cufinufftf_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf, + float tol, cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts) { + if (dim < 1 || dim > 3) { + fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); + return FINUFFT_ERR_DIM_NOTVALID; + } + + int nmodes32[3]; + if (is_invalid_mode_array(dim, nmodes, nmodes32)) return FINUFFT_ERR_NDATA_NOTVALID; + + return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol, + (cufinufft_plan_t **)d_plan_ptr, opts); } -int cufinufft_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf, double tol, - cufinufft_plan *d_plan_ptr, cufinufft_opts *opts) { - if (dim < 1 || dim > 3) { - fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); - return FINUFFT_ERR_DIM_NOTVALID; - } +int cufinufft_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf, + double tol, cufinufft_plan *d_plan_ptr, cufinufft_opts *opts) { + if (dim < 1 || dim > 3) { + fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); + return FINUFFT_ERR_DIM_NOTVALID; + } - int nmodes32[3]; - if (is_invalid_mode_array(dim, nmodes, nmodes32)) - return FINUFFT_ERR_NDATA_NOTVALID; + int nmodes32[3]; + if (is_invalid_mode_array(dim, nmodes, nmodes32)) return FINUFFT_ERR_NDATA_NOTVALID; - return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol, (cufinufft_plan_t **)d_plan_ptr, - opts); + return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol, + (cufinufft_plan_t **)d_plan_ptr, opts); } -int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, int N, float *d_s, - float *d_t, float *d_u) { - return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, (cufinufft_plan_t *)d_plan); +int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, + int N, float *d_s, float *d_t, float *d_u) { + return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, + (cufinufft_plan_t *)d_plan); } -int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, int N, double *d_s, - double *d_t, double *d_u) { - return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, (cufinufft_plan_t *)d_plan); +int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, + int N, double *d_s, double *d_t, double *d_u) { + return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, + (cufinufft_plan_t *)d_plan); } -int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_c, cuFloatComplex *d_fk) { - return cufinufft_execute_impl(d_c, d_fk, (cufinufft_plan_t *)d_plan); +int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_c, + cuFloatComplex *d_fk) { + return cufinufft_execute_impl(d_c, d_fk, (cufinufft_plan_t *)d_plan); } -int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, cuda_complex *d_fk) { - return cufinufft_execute_impl(d_c, d_fk, (cufinufft_plan_t *)d_plan); +int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, + cuda_complex *d_fk) { + return cufinufft_execute_impl(d_c, d_fk, (cufinufft_plan_t *)d_plan); } int cufinufftf_destroy(cufinufftf_plan d_plan) { - return cufinufft_destroy_impl((cufinufft_plan_t *)d_plan); + return cufinufft_destroy_impl((cufinufft_plan_t *)d_plan); } int cufinufft_destroy(cufinufft_plan d_plan) { - return cufinufft_destroy_impl((cufinufft_plan_t *)d_plan); + return cufinufft_destroy_impl((cufinufft_plan_t *)d_plan); } void cufinufft_default_opts(cufinufft_opts *opts) @@ -96,30 +95,30 @@ void cufinufft_default_opts(cufinufft_opts *opts) Melody Shih 07/25/19; Barnett 2/5/21. */ { - opts->upsampfac = 2.0; + opts->upsampfac = 2.0; - /* following options are for gpu */ - opts->gpu_sort = 1; // access nupts in an ordered way for nupts driven method + /* following options are for gpu */ + opts->gpu_sort = 1; // access nupts in an ordered way for nupts driven method - opts->gpu_maxsubprobsize = 1024; - opts->gpu_obinsizex = -1; - opts->gpu_obinsizey = -1; - opts->gpu_obinsizez = -1; + opts->gpu_maxsubprobsize = 1024; + opts->gpu_obinsizex = -1; + opts->gpu_obinsizey = -1; + opts->gpu_obinsizez = -1; - opts->gpu_binsizex = -1; - opts->gpu_binsizey = -1; - opts->gpu_binsizez = -1; + opts->gpu_binsizex = -1; + opts->gpu_binsizey = -1; + opts->gpu_binsizez = -1; - opts->gpu_spreadinterponly = 0; // default to do the whole nufft + opts->gpu_spreadinterponly = 0; // default to do the whole nufft - opts->gpu_maxbatchsize = 0; // Heuristically set - opts->gpu_stream = cudaStreamDefault; + opts->gpu_maxbatchsize = 0; // Heuristically set + opts->gpu_stream = cudaStreamDefault; - opts->gpu_kerevalmeth = 1; // Horner + opts->gpu_kerevalmeth = 1; // Horner - opts->gpu_method = 0; // Auto method (2 for type 1, 2 for type 2). + opts->gpu_method = 0; // Auto method (2 for type 1, 2 for type 2). - // By default, only use device 0 - opts->gpu_device_id = 0; + // By default, only use device 0 + opts->gpu_device_id = 0; } } diff --git a/src/cuda/deconvolve_wrapper.cu b/src/cuda/deconvolve_wrapper.cu index ffb65b8da..efdd656c7 100644 --- a/src/cuda/deconvolve_wrapper.cu +++ b/src/cuda/deconvolve_wrapper.cu @@ -12,102 +12,114 @@ namespace deconvolve { /* Kernel for copying fw to fk with amplication by prefac/ker */ // Note: assume modeord=0: CMCL-compatible mode ordering in fk (from -N/2 up // to N/2-1) -template -__global__ void deconvolve_1d(int ms, int nf1, cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) { - int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2; - - T kervalue = fwkerhalf1[abs(i - ms / 2)]; - fk[i].x = fw[w1].x / kervalue; - fk[i].y = fw[w1].y / kervalue; - } +template +__global__ void deconvolve_1d(int ms, int nf1, cuda_complex *fw, cuda_complex *fk, + T *fwkerhalf1) { + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; + i += blockDim.x * gridDim.x) { + int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2; + + T kervalue = fwkerhalf1[abs(i - ms / 2)]; + fk[i].x = fw[w1].x / kervalue; + fk[i].y = fw[w1].y / kervalue; + } } -template -__global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1, - T *fwkerhalf2) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) { - int k1 = i % ms; - int k2 = i / ms; - int outidx = k1 + k2 * ms; - int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; - int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; - int inidx = w1 + w2 * nf1; - - T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)]; - fk[outidx].x = fw[inidx].x / kervalue; - fk[outidx].y = fw[inidx].y / kervalue; - } +template +__global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex *fw, + cuda_complex *fk, T *fwkerhalf1, T *fwkerhalf2) { + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; + i += blockDim.x * gridDim.x) { + int k1 = i % ms; + int k2 = i / ms; + int outidx = k1 + k2 * ms; + int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; + int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; + int inidx = w1 + w2 * nf1; + + T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)]; + fk[outidx].x = fw[inidx].x / kervalue; + fk[outidx].y = fw[inidx].y / kervalue; + } } -template -__global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex *fw, - cuda_complex *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) { - int k1 = i % ms; - int k2 = (i / ms) % mt; - int k3 = (i / ms / mt); - int outidx = k1 + k2 * ms + k3 * ms * mt; - int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; - int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; - int w3 = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2; - int inidx = w1 + w2 * nf1 + w3 * nf1 * nf2; - - T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)] * fwkerhalf3[abs(k3 - mu / 2)]; - fk[outidx].x = fw[inidx].x / kervalue; - fk[outidx].y = fw[inidx].y / kervalue; - } +template +__global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, + cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1, + T *fwkerhalf2, T *fwkerhalf3) { + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; + i += blockDim.x * gridDim.x) { + int k1 = i % ms; + int k2 = (i / ms) % mt; + int k3 = (i / ms / mt); + int outidx = k1 + k2 * ms + k3 * ms * mt; + int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; + int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; + int w3 = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2; + int inidx = w1 + w2 * nf1 + w3 * nf1 * nf2; + + T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)] * + fwkerhalf3[abs(k3 - mu / 2)]; + fk[outidx].x = fw[inidx].x / kervalue; + fk[outidx].y = fw[inidx].y / kervalue; + } } /* Kernel for copying fk to fw with same amplication */ -template -__global__ void amplify_1d(int ms, int nf1, cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) { - int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2; - - T kervalue = fwkerhalf1[abs(i - ms / 2)]; - fw[w1].x = fk[i].x / kervalue; - fw[w1].y = fk[i].y / kervalue; - } +template +__global__ void amplify_1d(int ms, int nf1, cuda_complex *fw, cuda_complex *fk, + T *fwkerhalf1) { + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; + i += blockDim.x * gridDim.x) { + int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2; + + T kervalue = fwkerhalf1[abs(i - ms / 2)]; + fw[w1].x = fk[i].x / kervalue; + fw[w1].y = fk[i].y / kervalue; + } } -template -__global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1, - T *fwkerhalf2) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) { - int k1 = i % ms; - int k2 = i / ms; - int inidx = k1 + k2 * ms; - int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; - int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; - int outidx = w1 + w2 * nf1; - - T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)]; - fw[outidx].x = fk[inidx].x / kervalue; - fw[outidx].y = fk[inidx].y / kervalue; - } +template +__global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex *fw, + cuda_complex *fk, T *fwkerhalf1, T *fwkerhalf2) { + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; + i += blockDim.x * gridDim.x) { + int k1 = i % ms; + int k2 = i / ms; + int inidx = k1 + k2 * ms; + int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; + int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; + int outidx = w1 + w2 * nf1; + + T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)]; + fw[outidx].x = fk[inidx].x / kervalue; + fw[outidx].y = fk[inidx].y / kervalue; + } } -template -__global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex *fw, cuda_complex *fk, - T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) { - int k1 = i % ms; - int k2 = (i / ms) % mt; - int k3 = (i / ms / mt); - int inidx = k1 + k2 * ms + k3 * ms * mt; - int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; - int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; - int w3 = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2; - int outidx = w1 + w2 * nf1 + w3 * nf1 * nf2; - - T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)] * fwkerhalf3[abs(k3 - mu / 2)]; - fw[outidx].x = fk[inidx].x / kervalue; - fw[outidx].y = fk[inidx].y / kervalue; - } +template +__global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, + cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1, + T *fwkerhalf2, T *fwkerhalf3) { + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; + i += blockDim.x * gridDim.x) { + int k1 = i % ms; + int k2 = (i / ms) % mt; + int k3 = (i / ms / mt); + int inidx = k1 + k2 * ms + k3 * ms * mt; + int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; + int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; + int w3 = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2; + int outidx = w1 + w2 * nf1 + w3 * nf1 * nf2; + + T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)] * + fwkerhalf3[abs(k3 - mu / 2)]; + fw[outidx].x = fk[inidx].x / kervalue; + fw[outidx].y = fk[inidx].y / kervalue; + } } -template +template int cudeconvolve1d(cufinufft_plan_t *d_plan, int blksize) /* wrapper for deconvolution & amplication in 1D. @@ -115,29 +127,30 @@ int cudeconvolve1d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 11/21/21 */ { - auto &stream = d_plan->stream; - - int ms = d_plan->ms; - int nf1 = d_plan->nf1; - int nmodes = ms; - int maxbatchsize = d_plan->maxbatchsize; - - if (d_plan->spopts.spread_direction == 1) { - for (int t = 0; t < blksize; t++) { - deconvolve_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1); - } - } else { - checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * sizeof(cuda_complex), stream)); - for (int t = 0; t < blksize; t++) { - amplify_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1); - } + auto &stream = d_plan->stream; + + int ms = d_plan->ms; + int nf1 = d_plan->nf1; + int nmodes = ms; + int maxbatchsize = d_plan->maxbatchsize; + + if (d_plan->spopts.spread_direction == 1) { + for (int t = 0; t < blksize; t++) { + deconvolve_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + ms, nf1, d_plan->fw + t * nf1, d_plan->fk + t * nmodes, d_plan->fwkerhalf1); + } + } else { + checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, maxbatchsize * nf1 * sizeof(cuda_complex), stream)); + for (int t = 0; t < blksize; t++) { + amplify_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + ms, nf1, d_plan->fw + t * nf1, d_plan->fk + t * nmodes, d_plan->fwkerhalf1); } - return 0; + } + return 0; } -template +template int cudeconvolve2d(cufinufft_plan_t *d_plan, int blksize) /* wrapper for deconvolution & amplication in 2D. @@ -145,33 +158,34 @@ int cudeconvolve2d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 07/25/19 */ { - auto &stream = d_plan->stream; - - int ms = d_plan->ms; - int mt = d_plan->mt; - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nmodes = ms * mt; - int maxbatchsize = d_plan->maxbatchsize; - - if (d_plan->spopts.spread_direction == 1) { - for (int t = 0; t < blksize; t++) { - deconvolve_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1, - d_plan->fwkerhalf2); - } - } else { - checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex), stream)); - for (int t = 0; t < blksize; t++) { - amplify_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1, - d_plan->fwkerhalf2); - } + auto &stream = d_plan->stream; + + int ms = d_plan->ms; + int mt = d_plan->mt; + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int nmodes = ms * mt; + int maxbatchsize = d_plan->maxbatchsize; + + if (d_plan->spopts.spread_direction == 1) { + for (int t = 0; t < blksize; t++) { + deconvolve_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes, + d_plan->fwkerhalf1, d_plan->fwkerhalf2); } - return 0; + } else { + checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex), stream)); + for (int t = 0; t < blksize; t++) { + amplify_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes, + d_plan->fwkerhalf1, d_plan->fwkerhalf2); + } + } + return 0; } -template +template int cudeconvolve3d(cufinufft_plan_t *d_plan, int blksize) /* wrapper for deconvolution & amplication in 3D. @@ -179,32 +193,34 @@ int cudeconvolve3d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 07/25/19 */ { - auto &stream = d_plan->stream; - - int ms = d_plan->ms; - int mt = d_plan->mt; - int mu = d_plan->mu; - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nf3 = d_plan->nf3; - int nmodes = ms * mt * mu; - int maxbatchsize = d_plan->maxbatchsize; - if (d_plan->spopts.spread_direction == 1) { - for (int t = 0; t < blksize; t++) { - deconvolve_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( - ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes, - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3); - } - } else { - checkCudaErrors( - cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex), stream)); - for (int t = 0; t < blksize; t++) { - amplify_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( - ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes, - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3); - } + auto &stream = d_plan->stream; + + int ms = d_plan->ms; + int mt = d_plan->mt; + int mu = d_plan->mu; + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int nf3 = d_plan->nf3; + int nmodes = ms * mt * mu; + int maxbatchsize = d_plan->maxbatchsize; + if (d_plan->spopts.spread_direction == 1) { + for (int t = 0; t < blksize; t++) { + deconvolve_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, + d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->fwkerhalf2, + d_plan->fwkerhalf3); + } + } else { + checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex), stream)); + for (int t = 0; t < blksize; t++) { + amplify_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, + d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->fwkerhalf2, + d_plan->fwkerhalf3); } - return 0; + } + return 0; } template int cudeconvolve1d(cufinufft_plan_t *d_plan, int blksize); diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index a00fa526e..ea2170b9b 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -11,7 +11,7 @@ namespace cufinufft { namespace memtransfer { -template +template int allocgpumem1d_plan(cufinufft_plan_t *d_plan) /* wrapper for gpu memory allocation in "plan" stage. @@ -19,53 +19,60 @@ int allocgpumem1d_plan(cufinufft_plan_t *d_plan) Melody Shih 11/21/21 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - - int ier; - int nf1 = d_plan->nf1; - int maxbatchsize = d_plan->maxbatchsize; - - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort) { - int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, numbins * sizeof(int), stream)))) - goto finalize; - } - } break; - case 2: { - int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->numsubprob, numbins * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, numbins * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->subprobstartpts, (numbins + 1) * sizeof(int), stream)))) - goto finalize; - } break; - default: - std::cerr << "err: invalid method " << std::endl; - } - - if (!d_plan->opts.gpu_spreadinterponly) { - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fw, maxbatchsize * nf1 * sizeof(cuda_complex), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream)))) - goto finalize; + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + + int ier; + int nf1 = d_plan->nf1; + int maxbatchsize = d_plan->maxbatchsize; + + switch (d_plan->opts.gpu_method) { + case 1: { + if (d_plan->opts.gpu_sort) { + int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binsize, numbins * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binstartpts, numbins * sizeof(int), stream)))) + goto finalize; } + } break; + case 2: { + int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->numsubprob, numbins * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binsize, numbins * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binstartpts, numbins * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->subprobstartpts, (numbins + 1) * sizeof(int), stream)))) + goto finalize; + } break; + default: + std::cerr << "err: invalid method " << std::endl; + } + + if (!d_plan->opts.gpu_spreadinterponly) { + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->fw, maxbatchsize * nf1 * sizeof(cuda_complex), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream)))) + goto finalize; + } finalize: - if (ier) - freegpumemory(d_plan); + if (ier) freegpumemory(d_plan); - return ier; + return ier; } -template +template int allocgpumem1d_nupts(cufinufft_plan_t *d_plan) /* wrapper for gpu memory allocation in "setNUpts" stage. @@ -73,41 +80,43 @@ int allocgpumem1d_nupts(cufinufft_plan_t *d_plan) Melody Shih 11/21/21 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; - - int M = d_plan->M; - CUDA_FREE_AND_NULL(d_plan->sortidx, stream); - CUDA_FREE_AND_NULL(d_plan->idxnupts, stream); - - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort && - (ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) - goto finalize; - } break; - case 2: { - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) - goto finalize; - } break; - default: - std::cerr << "[allocgpumem1d_nupts] error: invalid method\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + int ier; + + int M = d_plan->M; + CUDA_FREE_AND_NULL(d_plan->sortidx, stream); + CUDA_FREE_AND_NULL(d_plan->idxnupts, stream); + + switch (d_plan->opts.gpu_method) { + case 1: { + if (d_plan->opts.gpu_sort && (ier = checkCudaErrors(cudaMallocAsync( + &d_plan->sortidx, M * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) + goto finalize; + } break; + case 2: { + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) + goto finalize; + if ((ier = + checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) + goto finalize; + } break; + default: + std::cerr << "[allocgpumem1d_nupts] error: invalid method\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } finalize: - if (ier) - freegpumemory(d_plan); + if (ier) freegpumemory(d_plan); - return ier; + return ier; } -template +template int allocgpumem2d_plan(cufinufft_plan_t *d_plan) /* wrapper for gpu memory allocation in "plan" stage. @@ -115,66 +124,70 @@ int allocgpumem2d_plan(cufinufft_plan_t *d_plan) Melody Shih 07/25/19 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; - - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int maxbatchsize = d_plan->maxbatchsize; - - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort) { - int numbins[2]; - numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex); - numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey); - if ((ier = - checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins[0] * numbins[1] * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors( - cudaMallocAsync(&d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int), stream)))) - goto finalize; - } - } break; - case 2: { - int64_t numbins[2]; - numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex); - numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey); - if ((ier = - checkCudaErrors(cudaMallocAsync(&d_plan->numsubprob, numbins[0] * numbins[1] * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins[0] * numbins[1] * sizeof(int), stream)))) - goto finalize; - if ((ier = - checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors( - cudaMallocAsync(&d_plan->subprobstartpts, (numbins[0] * numbins[1] + 1) * sizeof(int), stream)))) - goto finalize; - } break; - default: - std::cerr << "[allocgpumem2d_plan] error: invalid method\n"; - } - - if (!d_plan->opts.gpu_spreadinterponly) { - if ((ier = checkCudaErrors( - cudaMallocAsync(&d_plan->fw, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T), stream)))) - goto finalize; + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + int ier; + + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int maxbatchsize = d_plan->maxbatchsize; + + switch (d_plan->opts.gpu_method) { + case 1: { + if (d_plan->opts.gpu_sort) { + int numbins[2]; + numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex); + numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey); + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->binsize, numbins[0] * numbins[1] * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int), stream)))) + goto finalize; } + } break; + case 2: { + int64_t numbins[2]; + numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex); + numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey); + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->numsubprob, numbins[0] * numbins[1] * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->binsize, numbins[0] * numbins[1] * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int), stream)))) + goto finalize; + if ((ier = + checkCudaErrors(cudaMallocAsync(&d_plan->subprobstartpts, + (numbins[0] * numbins[1] + 1) * sizeof(int), + stream)))) + goto finalize; + } break; + default: + std::cerr << "[allocgpumem2d_plan] error: invalid method\n"; + } + + if (!d_plan->opts.gpu_spreadinterponly) { + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->fw, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T), stream)))) + goto finalize; + } finalize: - if (ier) - freegpumemory(d_plan); + if (ier) freegpumemory(d_plan); - return ier; + return ier; } -template +template int allocgpumem2d_nupts(cufinufft_plan_t *d_plan) /* wrapper for gpu memory allocation in "setNUpts" stage. @@ -182,41 +195,43 @@ int allocgpumem2d_nupts(cufinufft_plan_t *d_plan) Melody Shih 07/25/19 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; - - const int M = d_plan->M; - - CUDA_FREE_AND_NULL(d_plan->sortidx, stream); - CUDA_FREE_AND_NULL(d_plan->idxnupts, stream); - - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort && - (ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) - goto finalize; - } break; - case 2: { - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) - goto finalize; - } break; - default: - std::cerr << "[allocgpumem2d_nupts] error: invalid method\n"; - } + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + int ier; + + const int M = d_plan->M; + + CUDA_FREE_AND_NULL(d_plan->sortidx, stream); + CUDA_FREE_AND_NULL(d_plan->idxnupts, stream); + + switch (d_plan->opts.gpu_method) { + case 1: { + if (d_plan->opts.gpu_sort && (ier = checkCudaErrors(cudaMallocAsync( + &d_plan->sortidx, M * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) + goto finalize; + } break; + case 2: { + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) + goto finalize; + if ((ier = + checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) + goto finalize; + } break; + default: + std::cerr << "[allocgpumem2d_nupts] error: invalid method\n"; + } finalize: - if (ier) - freegpumemory(d_plan); + if (ier) freegpumemory(d_plan); - return ier; + return ier; } -template +template int allocgpumem3d_plan(cufinufft_plan_t *d_plan) /* wrapper for gpu memory allocation in "plan" stage. @@ -224,89 +239,104 @@ int allocgpumem3d_plan(cufinufft_plan_t *d_plan) Melody Shih 07/25/19 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; - - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nf3 = d_plan->nf3; - int maxbatchsize = d_plan->maxbatchsize; - - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort) { - const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) * - ceil((T)nf2 / d_plan->opts.gpu_binsizey) * - ceil((T)nf3 / d_plan->opts.gpu_binsizez); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, nbins_tot * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, nbins_tot * sizeof(int), stream)))) - goto finalize; - } - } break; - case 2: { - const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) * ceil((T)nf2 / d_plan->opts.gpu_binsizey) * - ceil((T)nf3 / d_plan->opts.gpu_binsizez); - - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->numsubprob, nbins_tot * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, nbins_tot * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, nbins_tot * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->subprobstartpts, (nbins_tot + 1) * sizeof(int), stream)))) - goto finalize; - } break; - case 4: { - const int numobins[3] = {(int)ceil((T)nf1 / d_plan->opts.gpu_obinsizex), - (int)ceil((T)nf2 / d_plan->opts.gpu_obinsizey), - (int)ceil((T)nf3 / d_plan->opts.gpu_obinsizez)}; - - const int binsperobins[3] = {d_plan->opts.gpu_obinsizex / d_plan->opts.gpu_binsizex, - d_plan->opts.gpu_obinsizey / d_plan->opts.gpu_binsizey, - d_plan->opts.gpu_obinsizez / d_plan->opts.gpu_binsizez}; - - const int numbins[3] = {numobins[0] * (binsperobins[0] + 2), numobins[1] * (binsperobins[1] + 2), - numobins[2] * (binsperobins[2] + 2)}; - - const int64_t numobins_tot = numobins[0] * numobins[1] * numobins[2]; - const int64_t numbins_tot = numbins[0] * numbins[1] * numbins[2]; - - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->numsubprob, numobins_tot * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins_tot * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, (numbins_tot + 1) * sizeof(int), stream)))) - goto finalize; - if ((ier = - checkCudaErrors(cudaMallocAsync(&d_plan->subprobstartpts, (numobins_tot + 1) * sizeof(int), stream)))) - goto finalize; - } break; - default: - std::cerr << "[allocgpumem3d_plan] error: invalid method\n"; - } - - if (!d_plan->opts.gpu_spreadinterponly) { - if ((ier = checkCudaErrors( - cudaMallocAsync(&d_plan->fw, maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf3, (nf3 / 2 + 1) * sizeof(T), stream)))) - goto finalize; + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + int ier; + + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int nf3 = d_plan->nf3; + int maxbatchsize = d_plan->maxbatchsize; + + switch (d_plan->opts.gpu_method) { + case 1: { + if (d_plan->opts.gpu_sort) { + const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) * + ceil((T)nf2 / d_plan->opts.gpu_binsizey) * + ceil((T)nf3 / d_plan->opts.gpu_binsizez); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binsize, nbins_tot * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binstartpts, nbins_tot * sizeof(int), stream)))) + goto finalize; } + } break; + case 2: { + const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) * + ceil((T)nf2 / d_plan->opts.gpu_binsizey) * + ceil((T)nf3 / d_plan->opts.gpu_binsizez); + + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->numsubprob, nbins_tot * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binsize, nbins_tot * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binstartpts, nbins_tot * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->subprobstartpts, (nbins_tot + 1) * sizeof(int), stream)))) + goto finalize; + } break; + case 4: { + const int numobins[3] = {(int)ceil((T)nf1 / d_plan->opts.gpu_obinsizex), + (int)ceil((T)nf2 / d_plan->opts.gpu_obinsizey), + (int)ceil((T)nf3 / d_plan->opts.gpu_obinsizez)}; + + const int binsperobins[3] = {d_plan->opts.gpu_obinsizex / d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_obinsizey / d_plan->opts.gpu_binsizey, + d_plan->opts.gpu_obinsizez / d_plan->opts.gpu_binsizez}; + + const int numbins[3] = {numobins[0] * (binsperobins[0] + 2), + numobins[1] * (binsperobins[1] + 2), + numobins[2] * (binsperobins[2] + 2)}; + + const int64_t numobins_tot = numobins[0] * numobins[1] * numobins[2]; + const int64_t numbins_tot = numbins[0] * numbins[1] * numbins[2]; + + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->numsubprob, numobins_tot * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binsize, numbins_tot * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->binstartpts, (numbins_tot + 1) * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->subprobstartpts, (numobins_tot + 1) * sizeof(int), stream)))) + goto finalize; + } break; + default: + std::cerr << "[allocgpumem3d_plan] error: invalid method\n"; + } + + if (!d_plan->opts.gpu_spreadinterponly) { + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fw, + maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex), + stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fwkerhalf3, (nf3 / 2 + 1) * sizeof(T), stream)))) + goto finalize; + } finalize: - if (ier) - freegpumemory(d_plan); + if (ier) freegpumemory(d_plan); - return ier; + return ier; } -template +template int allocgpumem3d_nupts(cufinufft_plan_t *d_plan) /* wrapper for gpu memory allocation in "setNUpts" stage. @@ -314,44 +344,47 @@ int allocgpumem3d_nupts(cufinufft_plan_t *d_plan) Melody Shih 07/25/19 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; - int M = d_plan->M; - - CUDA_FREE_AND_NULL(d_plan->sortidx, stream); - CUDA_FREE_AND_NULL(d_plan->idxnupts, stream) - - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort && - ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream))))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) - goto finalize; - } break; - case 2: { - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) - goto finalize; - } break; - case 4: { - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) - goto finalize; - } break; - default: - std::cerr << "[allocgpumem3d_nupts] error: invalid method\n"; - } + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + int ier; + int M = d_plan->M; + + CUDA_FREE_AND_NULL(d_plan->sortidx, stream); + CUDA_FREE_AND_NULL(d_plan->idxnupts, stream) + + switch (d_plan->opts.gpu_method) { + case 1: { + if (d_plan->opts.gpu_sort && ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->sortidx, M * sizeof(int), stream))))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) + goto finalize; + } break; + case 2: { + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) + goto finalize; + if ((ier = + checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) + goto finalize; + } break; + case 4: { + if ((ier = + checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) + goto finalize; + } break; + default: + std::cerr << "[allocgpumem3d_nupts] error: invalid method\n"; + } finalize: - if (ier) - freegpumemory(d_plan); + if (ier) freegpumemory(d_plan); - return ier; + return ier; } -template +template void freegpumemory(cufinufft_plan_t *d_plan) /* wrapper for freeing gpu memory. @@ -359,24 +392,24 @@ void freegpumemory(cufinufft_plan_t *d_plan) Melody Shih 11/21/21 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - - CUDA_FREE_AND_NULL(d_plan->fw, stream); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream); - - CUDA_FREE_AND_NULL(d_plan->idxnupts, stream); - CUDA_FREE_AND_NULL(d_plan->sortidx, stream); - CUDA_FREE_AND_NULL(d_plan->numsubprob, stream); - CUDA_FREE_AND_NULL(d_plan->binsize, stream); - CUDA_FREE_AND_NULL(d_plan->binstartpts, stream); - CUDA_FREE_AND_NULL(d_plan->subprob_to_bin, stream); - CUDA_FREE_AND_NULL(d_plan->subprobstartpts, stream); - - CUDA_FREE_AND_NULL(d_plan->numnupts, stream); - CUDA_FREE_AND_NULL(d_plan->numsubprob, stream); + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + + CUDA_FREE_AND_NULL(d_plan->fw, stream); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream); + + CUDA_FREE_AND_NULL(d_plan->idxnupts, stream); + CUDA_FREE_AND_NULL(d_plan->sortidx, stream); + CUDA_FREE_AND_NULL(d_plan->numsubprob, stream); + CUDA_FREE_AND_NULL(d_plan->binsize, stream); + CUDA_FREE_AND_NULL(d_plan->binstartpts, stream); + CUDA_FREE_AND_NULL(d_plan->subprob_to_bin, stream); + CUDA_FREE_AND_NULL(d_plan->subprobstartpts, stream); + + CUDA_FREE_AND_NULL(d_plan->numnupts, stream); + CUDA_FREE_AND_NULL(d_plan->numsubprob, stream); } template int allocgpumem1d_plan(cufinufft_plan_t *d_plan); diff --git a/src/cuda/precision_independent.cu b/src/cuda/precision_independent.cu index 1ab2865e0..66cc5ca69 100644 --- a/src/cuda/precision_independent.cu +++ b/src/cuda/precision_independent.cu @@ -18,216 +18,237 @@ __device__ RT carg(const CT &z) { return (RT)atan2(ipart(z), rpart(z)); } // pol __device__ RT cabs(const CT &z) { return (RT)cuCabs(z); } /* Common Kernels from spreadinterp3d */ -__host__ __device__ int calc_global_index(int xidx, int yidx, int zidx, int onx, int ony, int onz, int bnx, int bny, - int bnz) { - int oix, oiy, oiz; - oix = xidx / bnx; - oiy = yidx / bny; - oiz = zidx / bnz; - return (oix + oiy * onx + oiz * ony * onx) * (bnx * bny * bnz) + - (xidx % bnx + yidx % bny * bnx + zidx % bnz * bny * bnx); +__host__ __device__ int calc_global_index(int xidx, int yidx, int zidx, int onx, int ony, + int onz, int bnx, int bny, int bnz) { + int oix, oiy, oiz; + oix = xidx / bnx; + oiy = yidx / bny; + oiz = zidx / bnz; + return (oix + oiy * onx + oiz * ony * onx) * (bnx * bny * bnz) + + (xidx % bnx + yidx % bny * bnx + zidx % bnz * bny * bnx); } -__device__ int calc_global_index_v2(int xidx, int yidx, int zidx, int nbinx, int nbiny, int nbinz) { - return xidx + yidx * nbinx + zidx * nbinx * nbiny; +__device__ int calc_global_index_v2(int xidx, int yidx, int zidx, int nbinx, int nbiny, + int nbinz) { + return xidx + yidx * nbinx + zidx * nbinx * nbiny; } /* spreadinterp 1d */ -__global__ void calc_subprob_1d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize); - } +__global__ void calc_subprob_1d(int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize); + } } -__global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - for (int j = 0; j < d_numsubprob[i]; j++) { - d_subprob_to_bin[d_subprobstartpts[i] + j] = i; - } +__global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstartpts, + int *d_numsubprob, int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + for (int j = 0; j < d_numsubprob[i]; j++) { + d_subprob_to_bin[d_subprobstartpts[i] + j] = i; } + } } __global__ void trivial_global_sort_index_1d(int M, int *index) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - index[i] = i; - } + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + index[i] = i; + } } /* spreadinterp 2d */ -__global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize); - } +__global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize); + } } -__global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - for (int j = 0; j < d_numsubprob[i]; j++) { - d_subprob_to_bin[d_subprobstartpts[i] + j] = i; - } +__global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts, + int *d_numsubprob, int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + for (int j = 0; j < d_numsubprob[i]; j++) { + d_subprob_to_bin[d_subprobstartpts[i] + j] = i; } + } } __global__ void trivial_global_sort_index_2d(int M, int *index) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - index[i] = i; - } + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + index[i] = i; + } } /* spreadinterp3d */ -__global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize); - } +__global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize); + } } -__global__ void map_b_into_subprob_3d_v2(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, - int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - for (int j = 0; j < d_numsubprob[i]; j++) { - d_subprob_to_bin[d_subprobstartpts[i] + j] = i; - } +__global__ void map_b_into_subprob_3d_v2(int *d_subprob_to_bin, int *d_subprobstartpts, + int *d_numsubprob, int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + for (int j = 0; j < d_numsubprob[i]; j++) { + d_subprob_to_bin[d_subprobstartpts[i] + j] = i; } + } } -__global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz, int *bin_size, - int *num_subprob, int maxsubprobsize, int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - int numnupts = 0; - int binsperobin = binsperobinx * binsperobiny * binsperobinz; - for (int b = 0; b < binsperobin; b++) { - numnupts += bin_size[binsperobin * i + b]; - } - num_subprob[i] = ceil(numnupts / (float)maxsubprobsize); +__global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz, + int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + int numnupts = 0; + int binsperobin = binsperobinx * binsperobiny * binsperobinz; + for (int b = 0; b < binsperobin; b++) { + numnupts += bin_size[binsperobin * i + b]; } + num_subprob[i] = ceil(numnupts / (float)maxsubprobsize); + } } -__global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts, int *d_numsubprob, - int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - for (int j = 0; j < d_numsubprob[i]; j++) { - d_subprob_to_obin[d_subprobstartpts[i] + j] = i; - } +__global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts, + int *d_numsubprob, int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + for (int j = 0; j < d_numsubprob[i]; j++) { + d_subprob_to_obin[d_subprobstartpts[i] + j] = i; } + } } __global__ void trivial_global_sort_index_3d(int M, int *index) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - index[i] = i; - } + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + index[i] = i; + } } -__global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny, - int nobinz, int *binsize) { - int binx = threadIdx.x + blockIdx.x * blockDim.x; - int biny = threadIdx.y + blockIdx.y * blockDim.y; - int binz = threadIdx.z + blockIdx.z * blockDim.z; - - int nbinx = nobinx * binsperobinx; - int nbiny = nobiny * binsperobiny; - int nbinz = nobinz * binsperobinz; - - if (binx < nbinx && biny < nbiny && binz < nbinz) { - int binidx = - calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, binsperobinz); - int i, j, k; - i = binx; - j = biny; - k = binz; - if (binx % binsperobinx == 0) { - i = binx - 2; - i = i < 0 ? i + nbinx : i; - } - if (binx % binsperobinx == binsperobinx - 1) { - i = binx + 2; - i = (i >= nbinx) ? i - nbinx : i; - } - if (biny % binsperobiny == 0) { - j = biny - 2; - j = j < 0 ? j + nbiny : j; - } - if (biny % binsperobiny == binsperobiny - 1) { - j = biny + 2; - j = (j >= nbiny) ? j - nbiny : j; - } - if (binz % binsperobinz == 0) { - k = binz - 2; - k = k < 0 ? k + nbinz : k; - } - if (binz % binsperobinz == binsperobinz - 1) { - k = binz + 2; - k = (k >= nbinz) ? k - nbinz : k; - } - int idxtoupdate = calc_global_index(i, j, k, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, binsperobinz); - if (idxtoupdate != binidx) { - binsize[binidx] = binsize[idxtoupdate]; - } - } -} +__global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz, + int nobinx, int nobiny, int nobinz, int *binsize) { + int binx = threadIdx.x + blockIdx.x * blockDim.x; + int biny = threadIdx.y + blockIdx.y * blockDim.y; + int binz = threadIdx.z + blockIdx.z * blockDim.z; -__global__ void ghost_bin_pts_index(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny, - int nobinz, int *binsize, int *index, int *binstartpts, int M) { - int binx = threadIdx.x + blockIdx.x * blockDim.x; - int biny = threadIdx.y + blockIdx.y * blockDim.y; - int binz = threadIdx.z + blockIdx.z * blockDim.z; - int nbinx = nobinx * binsperobinx; - int nbiny = nobiny * binsperobiny; - int nbinz = nobinz * binsperobinz; + int nbinx = nobinx * binsperobinx; + int nbiny = nobiny * binsperobiny; + int nbinz = nobinz * binsperobinz; + if (binx < nbinx && biny < nbiny && binz < nbinz) { + int binidx = calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, + binsperobiny, binsperobinz); int i, j, k; - int w = 0; - int box[3]; - if (binx < nbinx && biny < nbiny && binz < nbinz) { - box[0] = box[1] = box[2] = 0; - i = binx; - j = biny; - k = binz; - int binidx = - calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, binsperobinz); - if (binx % binsperobinx == 0) { - i = binx - 2; - box[0] = (i < 0); - i = i < 0 ? i + nbinx : i; - w = 1; - } - if (binx % binsperobinx == binsperobinx - 1) { - i = binx + 2; - box[0] = (i > nbinx) * 2; - i = (i > nbinx) ? i - nbinx : i; - w = 1; - } - if (biny % binsperobiny == 0) { - j = biny - 2; - box[1] = (j < 0); - j = j < 0 ? j + nbiny : j; - w = 1; - } - if (biny % binsperobiny == binsperobiny - 1) { - j = biny + 2; - box[1] = (j > nbiny) * 2; - j = (j > nbiny) ? j - nbiny : j; - w = 1; - } - if (binz % binsperobinz == 0) { - k = binz - 2; - box[2] = (k < 0); - k = k < 0 ? k + nbinz : k; - w = 1; - } - if (binz % binsperobinz == binsperobinz - 1) { - k = binz + 2; - box[2] = (k > nbinz) * 2; - k = (k > nbinz) ? k - nbinz : k; - w = 1; - } - int corbinidx = calc_global_index(i, j, k, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, binsperobinz); - if (w == 1) { - for (int n = 0; n < binsize[binidx]; n++) { - index[binstartpts[binidx] + n] = - M * (box[0] + box[1] * 3 + box[2] * 9) + index[binstartpts[corbinidx] + n]; - } - } + i = binx; + j = biny; + k = binz; + if (binx % binsperobinx == 0) { + i = binx - 2; + i = i < 0 ? i + nbinx : i; + } + if (binx % binsperobinx == binsperobinx - 1) { + i = binx + 2; + i = (i >= nbinx) ? i - nbinx : i; + } + if (biny % binsperobiny == 0) { + j = biny - 2; + j = j < 0 ? j + nbiny : j; + } + if (biny % binsperobiny == binsperobiny - 1) { + j = biny + 2; + j = (j >= nbiny) ? j - nbiny : j; + } + if (binz % binsperobinz == 0) { + k = binz - 2; + k = k < 0 ? k + nbinz : k; + } + if (binz % binsperobinz == binsperobinz - 1) { + k = binz + 2; + k = (k >= nbinz) ? k - nbinz : k; + } + int idxtoupdate = calc_global_index(i, j, k, nobinx, nobiny, nobinz, binsperobinx, + binsperobiny, binsperobinz); + if (idxtoupdate != binidx) { + binsize[binidx] = binsize[idxtoupdate]; + } + } +} + +__global__ void ghost_bin_pts_index(int binsperobinx, int binsperobiny, int binsperobinz, + int nobinx, int nobiny, int nobinz, int *binsize, + int *index, int *binstartpts, int M) { + int binx = threadIdx.x + blockIdx.x * blockDim.x; + int biny = threadIdx.y + blockIdx.y * blockDim.y; + int binz = threadIdx.z + blockIdx.z * blockDim.z; + int nbinx = nobinx * binsperobinx; + int nbiny = nobiny * binsperobiny; + int nbinz = nobinz * binsperobinz; + + int i, j, k; + int w = 0; + int box[3]; + if (binx < nbinx && biny < nbiny && binz < nbinz) { + box[0] = box[1] = box[2] = 0; + i = binx; + j = biny; + k = binz; + int binidx = calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, + binsperobiny, binsperobinz); + if (binx % binsperobinx == 0) { + i = binx - 2; + box[0] = (i < 0); + i = i < 0 ? i + nbinx : i; + w = 1; + } + if (binx % binsperobinx == binsperobinx - 1) { + i = binx + 2; + box[0] = (i > nbinx) * 2; + i = (i > nbinx) ? i - nbinx : i; + w = 1; + } + if (biny % binsperobiny == 0) { + j = biny - 2; + box[1] = (j < 0); + j = j < 0 ? j + nbiny : j; + w = 1; + } + if (biny % binsperobiny == binsperobiny - 1) { + j = biny + 2; + box[1] = (j > nbiny) * 2; + j = (j > nbiny) ? j - nbiny : j; + w = 1; + } + if (binz % binsperobinz == 0) { + k = binz - 2; + box[2] = (k < 0); + k = k < 0 ? k + nbinz : k; + w = 1; + } + if (binz % binsperobinz == binsperobinz - 1) { + k = binz + 2; + box[2] = (k > nbinz) * 2; + k = (k > nbinz) ? k - nbinz : k; + w = 1; + } + int corbinidx = calc_global_index(i, j, k, nobinx, nobiny, nobinz, binsperobinx, + binsperobiny, binsperobinz); + if (w == 1) { + for (int n = 0; n < binsize[binidx]; n++) { + index[binstartpts[binidx] + n] = + M * (box[0] + box[1] * 3 + box[2] * 9) + index[binstartpts[corbinidx] + n]; + } } + } } } // namespace common diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index f129f73b7..5a1c9a08e 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -13,7 +13,7 @@ namespace cufinufft { namespace spreadinterp { -template +template int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth) // Initializes spreader kernel parameters given desired NUFFT tolerance eps, // upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), and ker eval meth @@ -22,70 +22,74 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet // Must call before any kernel evals done. // Returns: 0 success, 1, warning, >1 failure (see error codes in utils.h) { - if (upsampfac != 2.0) { // nonstandard sigma - if (kerevalmeth == 1) { - fprintf(stderr, "[%s] nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n", __func__, - upsampfac); - return FINUFFT_ERR_HORNER_WRONG_BETA; - } - if (upsampfac <= 1.0) { - fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac); - return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; - } - // calling routine must abort on above errors, since opts is garbage! - if (upsampfac > 4.0) - fprintf(stderr, "[%s] warning: upsampfac=%.3g is too large to be beneficial!\n", __func__, upsampfac); + if (upsampfac != 2.0) { // nonstandard sigma + if (kerevalmeth == 1) { + fprintf(stderr, + "[%s] nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n", + __func__, upsampfac); + return FINUFFT_ERR_HORNER_WRONG_BETA; } + if (upsampfac <= 1.0) { + fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac); + return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; + } + // calling routine must abort on above errors, since opts is garbage! + if (upsampfac > 4.0) + fprintf(stderr, "[%s] warning: upsampfac=%.3g is too large to be beneficial!\n", + __func__, upsampfac); + } - // defaults... (user can change after this function called) - opts.spread_direction = 1; // user should always set to 1 or 2 as desired - opts.upsampfac = upsampfac; + // defaults... (user can change after this function called) + opts.spread_direction = 1; // user should always set to 1 or 2 as desired + opts.upsampfac = upsampfac; - // as in FINUFFT v2.0, allow too-small-eps by truncating to eps_mach... - int ier = 0; + // as in FINUFFT v2.0, allow too-small-eps by truncating to eps_mach... + int ier = 0; - constexpr T EPSILON = std::numeric_limits::epsilon(); - if (eps < EPSILON) { - fprintf(stderr, "setup_spreader: warning, increasing tol=%.3g to eps_mach=%.3g.\n", (double)eps, - (double)EPSILON); - eps = EPSILON; - ier = FINUFFT_WARN_EPS_TOO_SMALL; - } + constexpr T EPSILON = std::numeric_limits::epsilon(); + if (eps < EPSILON) { + fprintf(stderr, "setup_spreader: warning, increasing tol=%.3g to eps_mach=%.3g.\n", + (double)eps, (double)EPSILON); + eps = EPSILON; + ier = FINUFFT_WARN_EPS_TOO_SMALL; + } - // Set kernel width w (aka ns) and ES kernel beta parameter, in opts... - int ns = std::ceil(-log10(eps / (T)10.0)); // 1 digit per power of ten - if (upsampfac != 2.0) // override ns for custom sigma - ns = std::ceil(-log(eps) / (T(M_PI) * sqrt(1 - 1 / upsampfac))); // formula, gamma=1 - ns = std::max(2, ns); // we don't have ns=1 version yet - if (ns > MAX_NSPREAD) { // clip to match allocated arrays - fprintf(stderr, "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; clipping to max %d.\n", - __func__, upsampfac, (double)eps, ns, MAX_NSPREAD); - ns = MAX_NSPREAD; - ier = FINUFFT_WARN_EPS_TOO_SMALL; - } - opts.nspread = ns; - opts.ES_halfwidth = (T)ns / 2; // constants to help ker eval (except Horner) - opts.ES_c = 4.0 / (T)(ns * ns); + // Set kernel width w (aka ns) and ES kernel beta parameter, in opts... + int ns = std::ceil(-log10(eps / (T)10.0)); // 1 digit per power of ten + if (upsampfac != 2.0) // override ns for custom sigma + ns = std::ceil(-log(eps) / (T(M_PI) * sqrt(1 - 1 / upsampfac))); // formula, gamma=1 + ns = std::max(2, ns); // we don't have ns=1 version yet + if (ns > MAX_NSPREAD) { // clip to match allocated arrays + fprintf(stderr, + "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; " + "clipping to max %d.\n", + __func__, upsampfac, (double)eps, ns, MAX_NSPREAD); + ns = MAX_NSPREAD; + ier = FINUFFT_WARN_EPS_TOO_SMALL; + } + opts.nspread = ns; + opts.ES_halfwidth = (T)ns / 2; // constants to help ker eval (except Horner) + opts.ES_c = 4.0 / (T)(ns * ns); - T betaoverns = 2.30; // gives decent betas for default sigma=2.0 - if (ns == 2) - betaoverns = 2.20; // some small-width tweaks... - if (ns == 3) - betaoverns = 2.26; - if (ns == 4) - betaoverns = 2.38; - if (upsampfac != 2.0) { // again, override beta for custom sigma - T gamma = 0.97; // must match devel/gen_all_horner_C_code.m - betaoverns = gamma * T(M_PI) * (1 - 1 / (2 * upsampfac)); // formula based on cutoff - } - opts.ES_beta = betaoverns * (T)ns; // set the kernel beta parameter - // fprintf(stderr,"setup_spreader: sigma=%.6f, chose ns=%d beta=%.6f\n",(double)upsampfac,ns,(double)opts.ES_beta); - // // user hasn't set debug yet - return ier; + T betaoverns = 2.30; // gives decent betas for default sigma=2.0 + if (ns == 2) betaoverns = 2.20; // some small-width tweaks... + if (ns == 3) betaoverns = 2.26; + if (ns == 4) betaoverns = 2.38; + if (upsampfac != 2.0) { // again, override beta for custom sigma + T gamma = 0.97; // must match devel/gen_all_horner_C_code.m + betaoverns = gamma * T(M_PI) * (1 - 1 / (2 * upsampfac)); // formula based on cutoff + } + opts.ES_beta = betaoverns * (T)ns; // set the kernel beta parameter + // fprintf(stderr,"setup_spreader: sigma=%.6f, chose ns=%d + // beta=%.6f\n",(double)upsampfac,ns,(double)opts.ES_beta); + // // user hasn't set debug yet + return ier; } -template int setup_spreader(finufft_spread_opts &opts, float eps, float upsampfac, int kerevalmeth); -template int setup_spreader(finufft_spread_opts &opts, double eps, double upsampfac, int kerevalmeth); +template int setup_spreader(finufft_spread_opts &opts, float eps, float upsampfac, + int kerevalmeth); +template int setup_spreader(finufft_spread_opts &opts, double eps, double upsampfac, + int kerevalmeth); template float evaluate_kernel(float x, const finufft_spread_opts &opts); template double evaluate_kernel(double x, const finufft_spread_opts &opts); diff --git a/src/cuda/utils.cpp b/src/cuda/utils.cpp index 1c10f3453..9c3003cb8 100644 --- a/src/cuda/utils.cpp +++ b/src/cuda/utils.cpp @@ -9,23 +9,18 @@ CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b) // changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n. // added condition about b Melody 05/31/20 { - if (n <= 2) - return 2; - if (n % 2 == 1) - n += 1; // even - CUFINUFFT_BIGINT nplus = n - 2; // to cancel out the +=2 at start of loop - CUFINUFFT_BIGINT numdiv = 2; // a dummy that is >1 - while ((numdiv > 1) || (nplus % b != 0)) { - nplus += 2; // stays even - numdiv = nplus; - while (numdiv % 2 == 0) - numdiv /= 2; // remove all factors of 2,3,5... - while (numdiv % 3 == 0) - numdiv /= 3; - while (numdiv % 5 == 0) - numdiv /= 5; - } - return nplus; + if (n <= 2) return 2; + if (n % 2 == 1) n += 1; // even + CUFINUFFT_BIGINT nplus = n - 2; // to cancel out the +=2 at start of loop + CUFINUFFT_BIGINT numdiv = 2; // a dummy that is >1 + while ((numdiv > 1) || (nplus % b != 0)) { + nplus += 2; // stays even + numdiv = nplus; + while (numdiv % 2 == 0) numdiv /= 2; // remove all factors of 2,3,5... + while (numdiv % 3 == 0) numdiv /= 3; + while (numdiv % 5 == 0) numdiv /= 5; + } + return nplus; } // ----------------------- helpers for timing (always stay double prec)... @@ -35,19 +30,19 @@ void CNTime::start() { gettimeofday(&initial, 0); } double CNTime::restart() // Barnett changed to returning in sec { - double delta = this->elapsedsec(); - this->start(); - return delta; + double delta = this->elapsedsec(); + this->start(); + return delta; } double CNTime::elapsedsec() // returns answers as double, in seconds, to microsec accuracy. Barnett 5/22/18 { - struct timeval now; - gettimeofday(&now, 0); - double nowsec = (double)now.tv_sec + 1e-6 * now.tv_usec; - double initialsec = (double)initial.tv_sec + 1e-6 * initial.tv_usec; - return nowsec - initialsec; + struct timeval now; + gettimeofday(&now, 0); + double nowsec = (double)now.tv_sec + 1e-6 * now.tv_usec; + double initialsec = (double)initial.tv_sec + 1e-6 * initial.tv_usec; + return nowsec - initialsec; } } // namespace utils diff --git a/src/finufft.cpp b/src/finufft.cpp index 5b33ef126..8b9c6006b 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -4,19 +4,19 @@ // private headers for lib build // (must come after finufft.h which clobbers FINUFFT* macros) #include +#include +#include #include #include -#include -#include -#include +#include "../contrib/legendre_rule_fast.h" #include +#include #include #include #include #include #include -#include "../contrib/legendre_rule_fast.h" using namespace std; using namespace finufft; @@ -24,7 +24,6 @@ using namespace finufft::utils; using namespace finufft::spreadinterp; using namespace finufft::quadrature; - /* Computational core for FINUFFT. Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus @@ -86,18 +85,16 @@ Design notes for guru interface implementation: state apart from that associated with FFTW (and the did_fftw_init). */ - - // ---------- local math routines (were in common.cpp; no need now): -------- namespace finufft { - namespace common { +namespace common { - // Technically global state... - // Needs to be static to avoid name collision with SINGLE/DOUBLE - static std::mutex fftw_lock; +// Technically global state... +// Needs to be static to avoid name collision with SINGLE/DOUBLE +static std::mutex fftw_lock; - // We macro because it has no FLT args but gets compiled for both prec's... +// We macro because it has no FLT args but gets compiled for both prec's... #ifdef SINGLE #define SET_NF_TYPE12 set_nf_type12f #else @@ -108,18 +105,22 @@ int SET_NF_TYPE12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, BIGI // and requested number of Fourier modes ms. Returns 0 if success, else an // error code if nf was unreasonably big (& tell the world). { - *nf = (BIGINT)(opts.upsampfac*ms); // manner of rounding not crucial - if (*nf<2*spopts.nspread) *nf=2*spopts.nspread; // otherwise spread fails - if (*nf=0) // overrides + spopts.debug = opts.spread_debug; + spopts.sort = opts.spread_sort; // could make dim or CPU choices here? + spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0) + spopts.chkbnds = opts.chkbnds; + spopts.nthreads = opts.nthreads; // 0 passed in becomes omp max by here + if (opts.spread_nthr_atomic >= 0) // overrides spopts.atomic_threshold = opts.spread_nthr_atomic; - if (opts.spread_max_sp_size>0) // overrides + if (opts.spread_max_sp_size > 0) // overrides spopts.max_subproblem_size = opts.spread_max_sp_size; - if (opts.chkbnds != 1) // deprecated default value hardcoded here - fprintf(stderr, "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n",__func__); + if (opts.chkbnds != 1) // deprecated default value hardcoded here + fprintf(stderr, + "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n", + __func__); return ier; -} +} void set_nhg_type3(FLT S, FLT X, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf, FLT *h, FLT *gam) + BIGINT *nf, FLT *h, FLT *gam) /* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), for type 3 only. Inputs: @@ -156,26 +159,27 @@ void set_nhg_type3(FLT S, FLT X, finufft_opts opts, finufft_spread_opts spopts, New logic 6/12/17 */ { - int nss = spopts.nspread + 1; // since ns may be odd - FLT Xsafe=X, Ssafe=S; // may be tweaked locally - if (X==0.0) // logic ensures XS>=1, handle X=0 a/o S=0 - if (S==0.0) { - Xsafe=1.0; - Ssafe=1.0; - } else Xsafe = max(Xsafe, 1/S); + int nss = spopts.nspread + 1; // since ns may be odd + FLT Xsafe = X, Ssafe = S; // may be tweaked locally + if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 + if (S == 0.0) { + Xsafe = 1.0; + Ssafe = 1.0; + } else + Xsafe = max(Xsafe, 1 / S); else - Ssafe = max(Ssafe, 1/X); + Ssafe = max(Ssafe, 1 / X); // use the safe X and S... - FLT nfd = 2.0*opts.upsampfac*Ssafe*Xsafe/PI + nss; - if (!isfinite(nfd)) nfd=0.0; // use FLT to catch inf + FLT nfd = 2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss; + if (!isfinite(nfd)) nfd = 0.0; // use FLT to catch inf *nf = (BIGINT)nfd; - //printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); - // catch too small nf, and nan or +-inf, otherwise spread fails... - if (*nf<2*spopts.nspread) *nf=2*spopts.nspread; - if (*nf a[MAX_NQUAD]; - for (int n=0;n brk(nt+1); // start indices for each thread - for (int t=0; t<=nt; ++t) // split nout mode indices btw threads - brk[t] = (BIGINT)(0.5 + nout*t/(double)nt); + BIGINT nout = nf / 2 + 1; // how many values we're writing to + int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks + std::vector brk(nt + 1); // start indices for each thread + for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads + brk[t] = (BIGINT)(0.5 + nout * t / (double)nt); #pragma omp parallel num_threads(nt) - { // each thread gets own chunk to do + { // each thread gets own chunk to do int t = MY_OMP_GET_THREAD_NUM(); - std::complex aj[MAX_NQUAD]; // phase rotator for this thread - for (int n=0;n aj[MAX_NQUAD]; // phase rotator for this thread + for (int n = 0; n < q; ++n) + aj[n] = pow(a[n], (FLT)brk[t]); // init phase factors for chunk + for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array + FLT x = 0.0; // accumulator for answer at this j + for (int n = 0; n < q; ++n) { + x += f[n] * 2 * real(aj[n]); // include the negative freq + aj[n] *= a[n]; // wind the phases } fwkerhalf[j] = x; } @@ -259,28 +263,29 @@ void onedim_nuft_kernel(BIGINT nk, FLT *k, FLT *phihat, finufft_spread_opts opts Barnett 2/8/17. openmp since cos slow 2/9/17 */ { - FLT J2 = opts.nspread/2.0; // J/2, half-width of ker z-support + FLT J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q=(int)(2 + 2.0*J2); // > pi/2 ratio. cannot exceed MAX_NQUAD - if (opts.debug) printf("q (# ker FT quadr pts) = %d\n",q); - FLT f[MAX_NQUAD]; double z[2*MAX_NQUAD],w[2*MAX_NQUAD]; // glr needs double - legendre_compute_glr(2*q,z,w); // only half the nodes used, eg on (0,1) - for (int n=0;n pi/2 ratio. cannot exceed MAX_NQUAD + if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); + FLT f[MAX_NQUAD]; + double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double + legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) + for (int n = 0; n < q; ++n) { + z[n] *= (FLT)J2; // quadr nodes for [0,J/2] + f[n] = J2 * (FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // w/ quadr weights + // printf("f[%d] = %.3g\n",n,f[n]); } #pragma omp parallel for num_threads(opts.nthreads) - for (BIGINT j=0;jfwBatch, using the same set of @@ -433,19 +446,19 @@ int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX* cBatch) // omp_sets_nested deprecated, so don't use; assume not nested for 2 to work. // But when nthr_outer=1 here, omp par inside the loop sees all threads... #ifdef _OPENMP - int nthr_outer = p->opts.spread_thread==1 ? 1 : batchSize; + int nthr_outer = p->opts.spread_thread == 1 ? 1 : batchSize; #endif #pragma omp parallel for num_threads(nthr_outer) - for (int i=0; ifwBatch + i*p->nf; // start of i'th fw array in wkspace - CPX *ci = cBatch + i*p->nj; // start of i'th c array in cBatch - spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (FLT*)fwi, p->nj, - p->X, p->Y, p->Z, (FLT*)ci, p->spopts, p->didSort); + for (int i = 0; i < batchSize; i++) { + FFTW_CPX *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + CPX *ci = cBatch + i * p->nj; // start of i'th c array in cBatch + spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (FLT *)fwi, p->nj, p->X, + p->Y, p->Z, (FLT *)ci, p->spopts, p->didSort); } return 0; } -int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch) +int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX *fkBatch) /* Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch into each output array fk in fkBatch. @@ -459,29 +472,25 @@ int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch) { // since deconvolveshuffle?d are single-thread, omp par seems to help here... #pragma omp parallel for num_threads(batchSize) - for (int i=0; ifwBatch + i*p->nf; // start of i'th fw array in wkspace - CPX *fki = fkBatch + i*p->N; // start of i'th fk array in fkBatch - + for (int i = 0; i < batchSize; i++) { + FFTW_CPX *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + CPX *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch + // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... if (p->dim == 1) - deconvolveshuffle1d(p->spopts.spread_direction, 1.0, p->phiHat1, - p->ms, (FLT *)fki, + deconvolveshuffle1d(p->spopts.spread_direction, 1.0, p->phiHat1, p->ms, (FLT *)fki, p->nf1, fwi, p->opts.modeord); else if (p->dim == 2) - deconvolveshuffle2d(p->spopts.spread_direction,1.0, p->phiHat1, - p->phiHat2, p->ms, p->mt, (FLT *)fki, - p->nf1, p->nf2, fwi, p->opts.modeord); + deconvolveshuffle2d(p->spopts.spread_direction, 1.0, p->phiHat1, p->phiHat2, p->ms, + p->mt, (FLT *)fki, p->nf1, p->nf2, fwi, p->opts.modeord); else - deconvolveshuffle3d(p->spopts.spread_direction, 1.0, p->phiHat1, - p->phiHat2, p->phiHat3, p->ms, p->mt, p->mu, - (FLT *)fki, p->nf1, p->nf2, p->nf3, - fwi, p->opts.modeord); + deconvolveshuffle3d(p->spopts.spread_direction, 1.0, p->phiHat1, p->phiHat2, + p->phiHat3, p->ms, p->mt, p->mu, (FLT *)fki, p->nf1, p->nf2, + p->nf3, fwi, p->opts.modeord); } return 0; } - // since this func is local only, we macro its name here... #ifdef SINGLE #define GRIDSIZE_FOR_FFTW gridsize_for_fftwf @@ -489,21 +498,20 @@ int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch) #define GRIDSIZE_FOR_FFTW gridsize_for_fftw #endif -int* GRIDSIZE_FOR_FFTW(FINUFFT_PLAN p){ -// local helper func returns a new int array of length dim, extracted from -// the finufft plan, that fftw_plan_many_dft needs as its 2nd argument. - int* nf; - if(p->dim == 1){ - nf = new int[1]; +int *GRIDSIZE_FOR_FFTW(FINUFFT_PLAN p) { + // local helper func returns a new int array of length dim, extracted from + // the finufft plan, that fftw_plan_many_dft needs as its 2nd argument. + int *nf; + if (p->dim == 1) { + nf = new int[1]; nf[0] = (int)p->nf1; - } - else if (p->dim == 2){ - nf = new int[2]; + } else if (p->dim == 2) { + nf = new int[2]; nf[0] = (int)p->nf2; - nf[1] = (int)p->nf1; - } // fftw enforced row major ordering, ie dims are backwards ordered - else{ - nf = new int[3]; + nf[1] = (int)p->nf1; + } // fftw enforced row major ordering, ie dims are backwards ordered + else { + nf = new int[3]; nf[0] = (int)p->nf3; nf[1] = (int)p->nf2; nf[2] = (int)p->nf1; @@ -511,17 +519,12 @@ int* GRIDSIZE_FOR_FFTW(FINUFFT_PLAN p){ return nf; } - - } // namespace -} // namespace - - - +} // namespace common +} // namespace finufft // --------------- rest is the 5 user guru (plan) interface drivers: --------- // (not namespaced since have safe names finufft{f}_* ) -using namespace finufft::common; // accesses routines defined above - +using namespace finufft::common; // accesses routines defined above // Marco Barbone: 5.8.2024 // These are user-facing. @@ -540,26 +543,26 @@ void FINUFFT_DEFAULT_OPTS(finufft_opts *o) o->modeord = 0; o->chkbnds = 1; - o->debug = 0; + o->debug = 0; o->spread_debug = 0; - o->showwarn = 1; + o->showwarn = 1; - o->nthreads = 0; - o->fftw = FFTW_ESTIMATE; // - o->spread_sort = 2; + o->nthreads = 0; + o->fftw = FFTW_ESTIMATE; // + o->spread_sort = 2; o->spread_kerevalmeth = 1; - o->spread_kerpad = 1; - o->upsampfac = 0.0; - o->spread_thread = 0; - o->maxbatchsize = 0; + o->spread_kerpad = 1; + o->upsampfac = 0.0; + o->spread_thread = 0; + o->maxbatchsize = 0; o->spread_nthr_atomic = -1; o->spread_max_sp_size = 0; // sphinx tag (don't remove): @defopts_end } // PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP -int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, - int ntrans, FLT tol, FINUFFT_PLAN *pp, finufft_opts* opts) +int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, FLT tol, + FINUFFT_PLAN *pp, finufft_opts *opts) // Populates the fields of finufft_plan which is pointed to by "pp". // opts is ptr to a finufft_opts to set options, or NULL to use defaults. // For some of the fields (if "auto" selected) here choose the actual setting. @@ -567,596 +570,649 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, // evaluates spreading kernel coefficients, and instantiates the fftw_plan { FINUFFT_PLAN p; - p = new FINUFFT_PLAN_S; // allocate fresh plan struct - *pp = p; // pass out plan as ptr to plan struct + p = new FINUFFT_PLAN_S; // allocate fresh plan struct + *pp = p; // pass out plan as ptr to plan struct - if (opts==NULL) // use default opts + if (opts == NULL) // use default opts FINUFFT_DEFAULT_OPTS(&(p->opts)); - else // or read from what's passed in - p->opts = *opts; // keep a deep copy; changing *opts now has no effect - - if (p->opts.debug) // do a hello world - printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n",__func__); - - if((type!=1)&&(type!=2)&&(type!=3)) { - fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n",__func__,type); + else // or read from what's passed in + p->opts = *opts; // keep a deep copy; changing *opts now has no effect + + if (p->opts.debug) // do a hello world + printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n", + __func__); + + if ((type != 1) && (type != 2) && (type != 3)) { + fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type); return FINUFFT_ERR_TYPE_NOTVALID; } - if((dim!=1)&&(dim!=2)&&(dim!=3)) { - fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n",__func__,dim); + if ((dim != 1) && (dim != 2) && (dim != 3)) { + fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); return FINUFFT_ERR_DIM_NOTVALID; } - if (ntrans<1) { - fprintf(stderr,"[%s] ntrans (%d) should be at least 1.\n",__func__,ntrans); + if (ntrans < 1) { + fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans); return FINUFFT_ERR_NTRANS_NOTVALID; } - + // get stuff from args... - p->type = type; - p->dim = dim; - p->ntrans = ntrans; - p->tol = tol; - p->fftSign = (iflag>=0) ? 1 : -1; // clean up flag input + p->type = type; + p->dim = dim; + p->ntrans = ntrans; + p->tol = tol; + p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input // choose overall # threads... #ifdef _OPENMP int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); - int nthr = ompmaxnthr; // default: use as many as OMP gives us + int nthr = ompmaxnthr; // default: use as many as OMP gives us // (the above could be set, or suggested set, to 1 for small enough problems...) - if (p->opts.nthreads>0) { - nthr = p->opts.nthreads; // user override, now without limit + if (p->opts.nthreads > 0) { + nthr = p->opts.nthreads; // user override, now without limit if (p->opts.showwarn && (nthr > ompmaxnthr)) - fprintf(stderr,"%s warning: using opts.nthreads=%d, more than the %d OpenMP claims available; note large nthreads can be slower.\n",__func__,nthr,ompmaxnthr); + fprintf(stderr, + "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims " + "available; note large nthreads can be slower.\n", + __func__, nthr, ompmaxnthr); } #else - int nthr = 1; // always 1 thread (avoid segfault) - if (p->opts.nthreads>1) - fprintf(stderr,"%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n",__func__,p->opts.nthreads); + int nthr = 1; // always 1 thread (avoid segfault) + if (p->opts.nthreads > 1) + fprintf(stderr, + "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n", + __func__, p->opts.nthreads); #endif - p->opts.nthreads = nthr; // store actual # thr planned for + p->opts.nthreads = nthr; // store actual # thr planned for // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...) - + // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick) - if (p->opts.maxbatchsize==0) { // logic to auto-set best batchsize - p->nbatch = 1+(ntrans-1)/nthr; // min # batches poss - p->batchSize = 1+(ntrans-1)/p->nbatch; // then cut # thr in each b - } else { // batchSize override by user - p->batchSize = min(p->opts.maxbatchsize,ntrans); - p->nbatch = 1+(ntrans-1)/p->batchSize; // resulting # batches + if (p->opts.maxbatchsize == 0) { // logic to auto-set best batchsize + p->nbatch = 1 + (ntrans - 1) / nthr; // min # batches poss + p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b + } else { // batchSize override by user + p->batchSize = min(p->opts.maxbatchsize, ntrans); + p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches } - if (p->opts.spread_thread==0) - p->opts.spread_thread=2; // our auto choice - if (p->opts.spread_thread!=1 && p->opts.spread_thread!=2) { - fprintf(stderr,"[%s] illegal opts.spread_thread!\n",__func__); + if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice + if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) { + fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__); return FINUFFT_ERR_SPREAD_THREAD_NOTVALID; } - if (type!=3) { // read in user Fourier mode array sizes... + if (type != 3) { // read in user Fourier mode array sizes... p->ms = n_modes[0]; - p->mt = (dim>1) ? n_modes[1] : 1; // leave as 1 for unused dims - p->mu = (dim>2) ? n_modes[2] : 1; - p->N = p->ms*p->mt*p->mu; // N = total # modes + p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims + p->mu = (dim > 2) ? n_modes[2] : 1; + p->N = p->ms * p->mt * p->mu; // N = total # modes } - + // heuristic to choose default upsampfac... (currently two poss) - if (p->opts.upsampfac==0.0) { // indicates auto-choose - p->opts.upsampfac=2.0; // default, and need for tol small - if (tol>=(FLT)1E-9) { // the tol sigma=5/4 can reach - if (type==3) // could move to setpts, more known? - p->opts.upsampfac=1.25; // faster b/c smaller RAM & FFT - else if ((dim==1 && p->N>10000000) || (dim==2 && p->N>300000) || (dim==3 && p->N>3000000)) // type 1,2 heuristic cutoffs, double, typ tol, 12-core xeon - p->opts.upsampfac=1.25; + if (p->opts.upsampfac == 0.0) { // indicates auto-choose + p->opts.upsampfac = 2.0; // default, and need for tol small + if (tol >= (FLT)1E-9) { // the tol sigma=5/4 can reach + if (type == 3) // could move to setpts, more known? + p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT + else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || + (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, typ + // tol, 12-core xeon + p->opts.upsampfac = 1.25; } if (p->opts.debug > 1) - printf("[%s] set auto upsampfac=%.2f\n",__func__,p->opts.upsampfac); + printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac); } // use opts to choose and write into plan's spread options... int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim); - if (ier>1) // proceed if success or warning + if (ier > 1) // proceed if success or warning return ier; // set others as defaults (or unallocated for arrays)... - p->X = NULL; p->Y = NULL; p->Z = NULL; - p->phiHat1 = NULL; p->phiHat2 = NULL; p->phiHat3 = NULL; - p->nf1 = 1; p->nf2 = 1; p->nf3 = 1; // crucial to leave as 1 for unused dims - p->sortIndices = NULL; // used in all three types - + p->X = NULL; + p->Y = NULL; + p->Z = NULL; + p->phiHat1 = NULL; + p->phiHat2 = NULL; + p->phiHat3 = NULL; + p->nf1 = 1; + p->nf2 = 1; + p->nf3 = 1; // crucial to leave as 1 for unused dims + p->sortIndices = NULL; // used in all three types + // ------------------------ types 1,2: planning needed --------------------- - if (type==1 || type==2) { + if (type == 1 || type == 2) { - int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) - // Note: batchSize not used since might be only 1. + int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) + // Note: batchSize not used since might be only 1. // Now place FFTW initialization in a lock, courtesy of OMP. Makes FINUFFT // thread-safe (can be called inside OMP) { - static bool did_fftw_init = false; // the only global state of FINUFFT + static bool did_fftw_init = false; // the only global state of FINUFFT std::lock_guard lock(fftw_lock); if (!did_fftw_init) { - FFTW_INIT(); // setup FFTW global state; should only do once - did_fftw_init = true; // ensure other FINUFFT threads don't clash + FFTW_INIT(); // setup FFTW global state; should only do once + did_fftw_init = true; // ensure other FINUFFT threads don't clash } } p->spopts.spread_direction = type; - if (p->opts.showwarn) { // user warn round-off error... - if (EPSILON*p->ms>1.0) - fprintf(stderr,"%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n",__func__,(double)(EPSILON*p->ms)); - if (EPSILON*p->mt>1.0) - fprintf(stderr,"%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n",__func__,(double)(EPSILON*p->mt)); - if (EPSILON*p->mu>1.0) - fprintf(stderr,"%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n",__func__,(double)(EPSILON*p->mu)); + if (p->opts.showwarn) { // user warn round-off error... + if (EPSILON * p->ms > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->ms)); + if (EPSILON * p->mt > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mt)); + if (EPSILON * p->mu > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mu)); } - + // determine fine grid sizes, sanity check.. int nfier = SET_NF_TYPE12(p->ms, p->opts, p->spopts, &(p->nf1)); - if (nfier) return nfier; // nf too big; we're done - p->phiHat1 = (FLT*)malloc(sizeof(FLT)*(p->nf1/2 + 1)); + if (nfier) return nfier; // nf too big; we're done + p->phiHat1 = (FLT *)malloc(sizeof(FLT) * (p->nf1 / 2 + 1)); if (dim > 1) { nfier = SET_NF_TYPE12(p->mt, p->opts, p->spopts, &(p->nf2)); if (nfier) return nfier; - p->phiHat2 = (FLT*)malloc(sizeof(FLT)*(p->nf2/2 + 1)); + p->phiHat2 = (FLT *)malloc(sizeof(FLT) * (p->nf2 / 2 + 1)); } if (dim > 2) { - nfier = SET_NF_TYPE12(p->mu, p->opts, p->spopts, &(p->nf3)); + nfier = SET_NF_TYPE12(p->mu, p->opts, p->spopts, &(p->nf3)); if (nfier) return nfier; - p->phiHat3 = (FLT*)malloc(sizeof(FLT)*(p->nf3/2 + 1)); + p->phiHat3 = (FLT *)malloc(sizeof(FLT) * (p->nf3 / 2 + 1)); } if (p->opts.debug) { // "long long" here is to avoid warnings with printf... - printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) (nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d batchSize=%d ", __func__, - dim, type, (long long)p->ms,(long long)p->mt, - (long long) p->mu, (long long)p->nf1,(long long)p->nf2, - (long long)p->nf3, ntrans, nthr, p->batchSize); - if (p->batchSize==1) // spread_thread has no effect in this case + printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) (nf1,nf2,nf3)=(%lld,%lld,%lld)\n " + " ntrans=%d nthr=%d batchSize=%d ", + __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, + (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, + p->batchSize); + if (p->batchSize == 1) // spread_thread has no effect in this case printf("\n"); else printf(" spread_thread=%d\n", p->opts.spread_thread); } // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim - CNTime timer; timer.start(); + CNTime timer; + timer.start(); onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts); - if (dim>1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); - if (dim>2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); - if (p->opts.debug) printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n",__func__,p->spopts.nspread, timer.elapsedsec()); + if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); + if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); + if (p->opts.debug) + printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread, + timer.elapsedsec()); timer.restart(); - p->nf = p->nf1*p->nf2*p->nf3; // fine grid total number of points + p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",__func__); + fprintf(stderr, + "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); return FINUFFT_ERR_MAXNALLOC; } p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // the big workspace - if (p->opts.debug) printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__,(double)1E-09*sizeof(CPX)*p->nf*p->batchSize, timer.elapsedsec()); - if(!p->fwBatch) { // we don't catch all such mallocs, just this big one - fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n",__func__); - free(p->phiHat1); free(p->phiHat2); free(p->phiHat3); + if (p->opts.debug) + printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, + (double)1E-09 * sizeof(CPX) * p->nf * p->batchSize, timer.elapsedsec()); + if (!p->fwBatch) { // we don't catch all such mallocs, just this big one + fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", + __func__); + free(p->phiHat1); + free(p->phiHat2); + free(p->phiHat3); return FINUFFT_ERR_ALLOC; } - - timer.restart(); // plan the FFTW + + timer.restart(); // plan the FFTW int *ns = GRIDSIZE_FOR_FFTW(p); - // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, idist, ot, onembed, ostride, odist, sign, flags + // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, idist, + // ot, onembed, ostride, odist, sign, flags { std::lock_guard lock(fftw_lock); // FFTW_PLAN_TH sets all future fftw_plan calls to use nthr_fft threads. - // FIXME: Since this might override what the user wants for fftw, we'd like to set it - // just for our one plan and then revert to the user value. Unfortunately - // fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and there isn't a convenient - // mechanism to probe the version + // FIXME: Since this might override what the user wants for fftw, we'd like to set + // it just for our one plan and then revert to the user value. Unfortunately + // fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and there isn't a + // convenient mechanism to probe the version FFTW_PLAN_TH(nthr_fft); - p->fftwPlan = FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, p->fwBatch, NULL, 1, p->nf, p->fwBatch, NULL, 1, p->nf, - p->fftSign, p->opts.fftw); + p->fftwPlan = + FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, p->fwBatch, NULL, 1, p->nf, + p->fwBatch, NULL, 1, p->nf, p->fftSign, p->opts.fftw); } - if (p->opts.debug) printf("[%s] FFTW plan (mode %d, nthr=%d):\t%.3g s\n", __func__,p->opts.fftw, nthr_fft, timer.elapsedsec()); - delete []ns; - - } else { // -------------------------- type 3 (no planning) ------------ + if (p->opts.debug) + printf("[%s] FFTW plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, + nthr_fft, timer.elapsedsec()); + delete[] ns; + + } else { // -------------------------- type 3 (no planning) ------------ - if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n",__func__,dim,type,ntrans); + if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); // in case destroy occurs before setpts, need safe dummy ptrs/plans... - p->CpBatch = NULL; - p->fwBatch = NULL; - p->Sp = NULL; p->Tp = NULL; p->Up = NULL; - p->prephase = NULL; - p->deconv = NULL; + p->CpBatch = NULL; + p->fwBatch = NULL; + p->Sp = NULL; + p->Tp = NULL; + p->Up = NULL; + p->prephase = NULL; + p->deconv = NULL; p->innerT2plan = NULL; // Type 3 will call finufft_makeplan for type 2; no need to init FFTW // Note we don't even know nj or nk yet, so can't do anything else! } - return ier; // report setup_spreader status (could be warning) + return ier; // report setup_spreader status (could be warning) } - // SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS -int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT* xj, FLT* yj, FLT* zj, - BIGINT nk, FLT* s, FLT* t, FLT* u) +int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT nk, + FLT *s, FLT *t, FLT *u) /* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for spreading. (The last 4 arguments are ignored.) For type 3: allocates internal working arrays, scales/centers the NU points and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. */ { - int d = p->dim; // abbrev for spatial dim - CNTime timer; timer.start(); - p->nj = nj; // the user only now chooses how many NU (x,y,z) pts - if (nj<0) { - fprintf(stderr,"[%s] nj (%lld) cannot be negative!\n",__func__,(long long)nj); + int d = p->dim; // abbrev for spatial dim + CNTime timer; + timer.start(); + p->nj = nj; // the user only now chooses how many NU (x,y,z) pts + if (nj < 0) { + fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj); return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nj>MAX_NU_PTS) { - fprintf(stderr,"[%s] nj (%lld) exceeds MAX_NU_PTS\n",__func__,(long long)nj); + } else if (nj > MAX_NU_PTS) { + fprintf(stderr, "[%s] nj (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nj); return FINUFFT_ERR_NUM_NU_PTS_INVALID; } - - if (p->type!=3) { // ------------------ TYPE 1,2 SETPTS ------------------- - // (all we can do is check and maybe bin-sort the NU pts) - p->X = xj; // plan must keep pointers to user's fixed NU pts - p->Y = yj; - p->Z = zj; + + if (p->type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- + // (all we can do is check and maybe bin-sort the NU pts) + p->X = xj; // plan must keep pointers to user's fixed NU pts + p->Y = yj; + p->Z = zj; int ier = spreadcheck(p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug>1) printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, timer.elapsedsec()); - if (ier) // no warnings allowed here - return ier; + if (p->opts.debug > 1) + printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, + timer.elapsedsec()); + if (ier) // no warnings allowed here + return ier; timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts calls causing memory leak. - // We don't know it is the same size as before, so we have to malloc each time. + // Free sortIndices if it has been allocated before in case of repeated setpts calls + // causing memory leak. We don't know it is the same size as before, so we have to + // malloc each time. if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT)*p->nj); + p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); if (!p->sortIndices) { - fprintf(stderr,"[%s] failed to allocate sortIndices!\n",__func__); + fprintf(stderr, "[%s] failed to allocate sortIndices!\n", __func__); return FINUFFT_ERR_SPREAD_ALLOC; } - p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug) printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__,p->didSort, timer.elapsedsec()); + p->didSort = + indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); + if (p->opts.debug) + printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, + timer.elapsedsec()); - - } else { // ------------------------- TYPE 3 SETPTS ----------------------- - // (here we can precompute pre/post-phase factors and plan the t2) + } else { // ------------------------- TYPE 3 SETPTS ----------------------- + // (here we can precompute pre/post-phase factors and plan the t2) - if (nk<0) { - fprintf(stderr,"[%s] nk (%lld) cannot be negative!\n",__func__,(long long)nk); + if (nk < 0) { + fprintf(stderr, "[%s] nk (%lld) cannot be negative!\n", __func__, (long long)nk); return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nk>MAX_NU_PTS) { - fprintf(stderr,"[%s] nk (%lld) exceeds MAX_NU_PTS\n",__func__,(long long)nk); + } else if (nk > MAX_NU_PTS) { + fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk); return FINUFFT_ERR_NUM_NU_PTS_INVALID; } - p->nk = nk; // user set # targ freq pts - p->S = s; // keep pointers to user's input target pts - p->T = t; - p->U = u; + p->nk = nk; // user set # targ freq pts + p->S = s; // keep pointers to user's input target pts + p->T = t; + p->U = u; // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... - FLT S1,S2,S3; // get half-width X, center C, which contains {x_j}... - arraywidcen(nj,xj,&(p->t3P.X1),&(p->t3P.C1)); - arraywidcen(nk,s,&S1,&(p->t3P.D1)); // same D, S, but for {s_k} - set_nhg_type3(S1,p->t3P.X1,p->opts,p->spopts, - &(p->nf1),&(p->t3P.h1),&(p->t3P.gam1)); // applies twist i) - p->t3P.C2 = 0.0; // their defaults if dim 2 unused, etc + FLT S1, S2, S3; // get half-width X, center C, which contains {x_j}... + arraywidcen(nj, xj, &(p->t3P.X1), &(p->t3P.C1)); + arraywidcen(nk, s, &S1, &(p->t3P.D1)); // same D, S, but for {s_k} + set_nhg_type3(S1, p->t3P.X1, p->opts, p->spopts, &(p->nf1), &(p->t3P.h1), + &(p->t3P.gam1)); // applies twist i) + p->t3P.C2 = 0.0; // their defaults if dim 2 unused, etc p->t3P.D2 = 0.0; - if (d>1) { - arraywidcen(nj,yj,&(p->t3P.X2),&(p->t3P.C2)); // {y_j} - arraywidcen(nk,t,&S2,&(p->t3P.D2)); // {t_k} - set_nhg_type3(S2,p->t3P.X2,p->opts,p->spopts,&(p->nf2), - &(p->t3P.h2),&(p->t3P.gam2)); - } + if (d > 1) { + arraywidcen(nj, yj, &(p->t3P.X2), &(p->t3P.C2)); // {y_j} + arraywidcen(nk, t, &S2, &(p->t3P.D2)); // {t_k} + set_nhg_type3(S2, p->t3P.X2, p->opts, p->spopts, &(p->nf2), &(p->t3P.h2), + &(p->t3P.gam2)); + } p->t3P.C3 = 0.0; p->t3P.D3 = 0.0; - if (d>2) { - arraywidcen(nj,zj,&(p->t3P.X3),&(p->t3P.C3)); // {z_j} - arraywidcen(nk,u,&S3,&(p->t3P.D3)); // {u_k} - set_nhg_type3(S3,p->t3P.X3,p->opts,p->spopts, - &(p->nf3),&(p->t3P.h3),&(p->t3P.gam3)); + if (d > 2) { + arraywidcen(nj, zj, &(p->t3P.X3), &(p->t3P.C3)); // {z_j} + arraywidcen(nk, u, &S3, &(p->t3P.D3)); // {u_k} + set_nhg_type3(S3, p->t3P.X3, p->opts, p->spopts, &(p->nf3), &(p->t3P.h3), + &(p->t3P.gam3)); } - if (p->opts.debug) { // report on choices of shifts, centers, etc... - printf("\tM=%lld N=%lld\n",(long long)nj,(long long)nk); - printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld\t\n", p->t3P.X1, p->t3P.C1,S1, p->t3P.D1, p->t3P.gam1,(long long) p->nf1); - if (d>1) - printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld\n",p->t3P.X2, p->t3P.C2,S2, p->t3P.D2, p->t3P.gam2,(long long) p->nf2); - if (d>2) - printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld\n", p->t3P.X3, p->t3P.C3,S3, p->t3P.D3, p->t3P.gam3,(long long) p->nf3); + if (p->opts.debug) { // report on choices of shifts, centers, etc... + printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld\t\n", p->t3P.X1, + p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1); + if (d > 1) + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld\n", p->t3P.X2, + p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2); + if (d > 2) + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld\n", p->t3P.X3, + p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3); } - p->nf = p->nf1*p->nf2*p->nf3; // fine grid total number of points + p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",__func__); + fprintf(stderr, + "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); return FINUFFT_ERR_MAXNALLOC; } - if (p->fwBatch) - FFTW_FR(p->fwBatch); + if (p->fwBatch) FFTW_FR(p->fwBatch); p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // maybe big workspace // (note FFTW_ALLOC is not needed over malloc, but matches its type) - if(p->CpBatch) free(p->CpBatch); - p->CpBatch = (CPX*)malloc(sizeof(CPX) * nj*p->batchSize); // batch c' work - if (p->opts.debug) printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, (double)1E-09*sizeof(CPX)*(p->nf+nj)*p->batchSize, timer.elapsedsec()); - if(!p->fwBatch || !p->CpBatch) { - fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n",__func__); - return FINUFFT_ERR_ALLOC; + if (p->CpBatch) free(p->CpBatch); + p->CpBatch = (CPX *)malloc(sizeof(CPX) * nj * p->batchSize); // batch c' work + if (p->opts.debug) + printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, + (double)1E-09 * sizeof(CPX) * (p->nf + nj) * p->batchSize, + timer.elapsedsec()); + if (!p->fwBatch || !p->CpBatch) { + fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); + return FINUFFT_ERR_ALLOC; } - //printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); + // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... - if(p->X) free(p->X); - if(p->Sp) free(p->Sp); - p->X = (FLT*)malloc(sizeof(FLT)*nj); - p->Sp = (FLT*)malloc(sizeof(FLT)*nk); - if (d>1) { - if(p->Y) free(p->Y); - if(p->Tp) free(p->Tp); - p->Y = (FLT*)malloc(sizeof(FLT)*nj); - p->Tp = (FLT*)malloc(sizeof(FLT)*nk); + if (p->X) free(p->X); + if (p->Sp) free(p->Sp); + p->X = (FLT *)malloc(sizeof(FLT) * nj); + p->Sp = (FLT *)malloc(sizeof(FLT) * nk); + if (d > 1) { + if (p->Y) free(p->Y); + if (p->Tp) free(p->Tp); + p->Y = (FLT *)malloc(sizeof(FLT) * nj); + p->Tp = (FLT *)malloc(sizeof(FLT) * nk); } - if (d>2) { - if(p->Z) free(p->Z); - if(p->Up) free(p->Up); - p->Z = (FLT*)malloc(sizeof(FLT)*nj); - p->Up = (FLT*)malloc(sizeof(FLT)*nk); + if (d > 2) { + if (p->Z) free(p->Z); + if (p->Up) free(p->Up); + p->Z = (FLT *)malloc(sizeof(FLT) * nj); + p->Up = (FLT *)malloc(sizeof(FLT) * nk); } // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... - FLT ig1 = 1.0/p->t3P.gam1, ig2=0.0, ig3=0.0; // "reciprocal-math" optim - if (d>1) - ig2 = 1.0/p->t3P.gam2; - if (d>2) - ig3 = 1.0/p->t3P.gam3; + FLT ig1 = 1.0 / p->t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim + if (d > 1) ig2 = 1.0 / p->t3P.gam2; + if (d > 2) ig3 = 1.0 / p->t3P.gam3; #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j=0;jX[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j - if (d>1) // (ok to do inside loop because of branch predict) - p->Y[j] = (yj[j]- p->t3P.C2) * ig2; // rescale y_j - if (d>2) - p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j + for (BIGINT j = 0; j < nj; ++j) { + p->X[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j + if (d > 1) // (ok to do inside loop because of branch predict) + p->Y[j] = (yj[j] - p->t3P.C2) * ig2; // rescale y_j + if (d > 2) p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j } // set up prephase array... - CPX imasign = (p->fftSign>=0) ? IMA : -IMA; // +-i - if(p->prephase) free(p->prephase); - p->prephase = (CPX*)malloc(sizeof(CPX)*nj); - if (p->t3P.D1!=0.0 || p->t3P.D2!=0.0 || p->t3P.D3!=0.0) { + CPX imasign = (p->fftSign >= 0) ? IMA : -IMA; // +-i + if (p->prephase) free(p->prephase); + p->prephase = (CPX *)malloc(sizeof(CPX) * nj); + if (p->t3P.D1 != 0.0 || p->t3P.D2 != 0.0 || p->t3P.D3 != 0.0) { #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j=0;jt3P.D1*xj[j]; - if (d>1) - phase += p->t3P.D2*yj[j]; - if (d>2) - phase += p->t3P.D3*zj[j]; - p->prephase[j] = cos(phase)+imasign*sin(phase); // Euler e^{+-i.phase} + for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs + FLT phase = p->t3P.D1 * xj[j]; + if (d > 1) phase += p->t3P.D2 * yj[j]; + if (d > 2) phase += p->t3P.D3 * zj[j]; + p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} } } else - for (BIGINT j=0;jprephase[j] = (CPX)1.0; // *** or keep flag so no mult in exec?? - - // rescale the target s_k etc to s'_k etc... + for (BIGINT j = 0; j < nj; ++j) + p->prephase[j] = (CPX)1.0; // *** or keep flag so no mult in exec?? + + // rescale the target s_k etc to s'_k etc... #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k=0;kSp[k] = p->t3P.h1*p->t3P.gam1*(s[k]- p->t3P.D1); // so |s'_k| < pi/R - if (d>1) - p->Tp[k] = p->t3P.h2*p->t3P.gam2*(t[k]- p->t3P.D2); // so |t'_k| < pi/R - if (d>2) - p->Up[k] = p->t3P.h3*p->t3P.gam3*(u[k]- p->t3P.D3); // so |u'_k| < pi/R + for (BIGINT k = 0; k < nk; ++k) { + p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1); // so |s'_k| < pi/R + if (d > 1) + p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < pi/R + if (d > 2) + p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < pi/R } - + // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... // (exploits that FT separates because kernel is prod of 1D funcs) - if(p->deconv) free(p->deconv); - p->deconv = (CPX*)malloc(sizeof(CPX)*nk); - FLT *phiHatk1 = (FLT*)malloc(sizeof(FLT)*nk); // don't confuse w/ p->phiHat - onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 + if (p->deconv) free(p->deconv); + p->deconv = (CPX *)malloc(sizeof(CPX) * nk); + FLT *phiHatk1 = (FLT *)malloc(sizeof(FLT) * nk); // don't confuse w/ p->phiHat + onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 FLT *phiHatk2 = NULL, *phiHatk3 = NULL; - if (d>1) { - phiHatk2 = (FLT*)malloc(sizeof(FLT)*nk); - onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 + if (d > 1) { + phiHatk2 = (FLT *)malloc(sizeof(FLT) * nk); + onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 } - if (d>2) { - phiHatk3 = (FLT*)malloc(sizeof(FLT)*nk); - onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 + if (d > 2) { + phiHatk3 = (FLT *)malloc(sizeof(FLT) * nk); + onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 } - int Cfinite = isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan or inf if M=0, no input NU pts - int Cnonzero = p->t3P.C1!=0.0 || p->t3P.C2!=0.0 || p->t3P.C3!=0.0; // cen + int Cfinite = + isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan + // or inf if + // M=0, no + // input NU pts + int Cnonzero = p->t3P.C1 != 0.0 || p->t3P.C2 != 0.0 || p->t3P.C3 != 0.0; // cen #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k=0;k1) - phiHat *= phiHatk2[k]; - if (d>2) - phiHat *= phiHatk3[k]; + if (d > 1) phiHat *= phiHatk2[k]; + if (d > 2) phiHat *= phiHatk3[k]; p->deconv[k] = (CPX)(1.0 / phiHat); if (Cfinite && Cnonzero) { FLT phase = (s[k] - p->t3P.D1) * p->t3P.C1; - if (d>1) - phase += (t[k] - p->t3P.D2) * p->t3P.C2; - if (d>2) - phase += (u[k] - p->t3P.D3) * p->t3P.C3; - p->deconv[k] *= cos(phase)+imasign*sin(phase); // Euler e^{+-i.phase} + if (d > 1) phase += (t[k] - p->t3P.D2) * p->t3P.C2; + if (d > 2) phase += (u[k] - p->t3P.D3) * p->t3P.C3; + p->deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} } } - free(phiHatk1); free(phiHatk2); free(phiHatk3); // done w/ deconv fill - if (p->opts.debug) printf("[%s t3] phase & deconv factors:\t%.3g s\n",__func__,timer.elapsedsec()); + free(phiHatk1); + free(phiHatk2); + free(phiHatk3); // done w/ deconv fill + if (p->opts.debug) + printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts calls causing memory leak. - // We don't know it is the same size as before, so we have to malloc each time. + // Free sortIndices if it has been allocated before in case of repeated setpts calls + // causing memory leak. We don't know it is the same size as before, so we have to + // malloc each time. if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT)*p->nj); + p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); if (!p->sortIndices) { - fprintf(stderr,"[%s t3] failed to allocate sortIndices!\n",__func__); + fprintf(stderr, "[%s t3] failed to allocate sortIndices!\n", __func__); return FINUFFT_ERR_SPREAD_ALLOC; } - p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, p->Z, p->spopts); - if (p->opts.debug) printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n",__func__, p->didSort, timer.elapsedsec()); - + p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, + p->Z, p->spopts); + if (p->opts.debug) + printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, + timer.elapsedsec()); + // Plan and setpts once, for the (repeated) inner type 2 finufft call... timer.restart(); - BIGINT t2nmodes[] = {p->nf1,p->nf2,p->nf3}; // t2 input is actually fw - finufft_opts t2opts = p->opts; // deep copy, since not ptrs - t2opts.modeord = 0; // needed for correct t3! - t2opts.debug = max(0,p->opts.debug-1); // don't print as much detail - t2opts.spread_debug = max(0,p->opts.spread_debug-1); - t2opts.showwarn = 0; // so don't see warnings 2x + BIGINT t2nmodes[] = {p->nf1, p->nf2, p->nf3}; // t2 input is actually fw + finufft_opts t2opts = p->opts; // deep copy, since not ptrs + t2opts.modeord = 0; // needed for correct t3! + t2opts.debug = max(0, p->opts.debug - 1); // don't print as much detail + t2opts.spread_debug = max(0, p->opts.spread_debug - 1); + t2opts.showwarn = 0; // so don't see warnings 2x // (...could vary other t2opts here?) - if(p->innerT2plan) FINUFFT_DESTROY(p->innerT2plan); + if (p->innerT2plan) FINUFFT_DESTROY(p->innerT2plan); int ier = FINUFFT_MAKEPLAN(2, d, t2nmodes, p->fftSign, p->batchSize, p->tol, &p->innerT2plan, &t2opts); - if (ier>1) { // if merely warning, still proceed - fprintf(stderr,"[%s t3]: inner type 2 plan creation failed with ier=%d!\n",__func__,ier); + if (ier > 1) { // if merely warning, still proceed + fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", + __func__, ier); return ier; } - ier = FINUFFT_SETPTS(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, NULL); // note nk = # output points (not nj) - if (ier>1) { - fprintf(stderr,"[%s t3]: inner type 2 setpts failed, ier=%d!\n",__func__,ier); + ier = FINUFFT_SETPTS(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, + NULL); // note nk = # output points (not nj) + if (ier > 1) { + fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); return ier; } - if (p->opts.debug) printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__,timer.elapsedsec()); - + if (p->opts.debug) + printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec()); } return 0; } // ............ end setpts .................................................. - // EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE -int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){ -/* See ../docs/cguru.doc for current documentation. - - For given (stack of) weights cj or coefficients fk, performs NUFFTs with - existing (sorted) NU pts and existing plan. - For type 1 and 3: cj is input, fk is output. - For type 2: fk is input, cj is output. - Performs spread/interp, pre/post deconvolve, and fftw_execute as appropriate - for each of the 3 types. - For cases of ntrans>1, performs work in blocks of size up to batchSize. - Return value 0 (no error diagnosis yet). - Barnett 5/20/20, based on Malleo 2019. -*/ - CNTime timer; timer.start(); - - if (p->type!=3){ // --------------------- TYPE 1,2 EXEC ------------------ - - double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing +int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { + /* See ../docs/cguru.doc for current documentation. + + For given (stack of) weights cj or coefficients fk, performs NUFFTs with + existing (sorted) NU pts and existing plan. + For type 1 and 3: cj is input, fk is output. + For type 2: fk is input, cj is output. + Performs spread/interp, pre/post deconvolve, and fftw_execute as appropriate + for each of the 3 types. + For cases of ntrans>1, performs work in blocks of size up to batchSize. + Return value 0 (no error diagnosis yet). + Barnett 5/20/20, based on Malleo 2019. + */ + CNTime timer; + timer.start(); + + if (p->type != 3) { // --------------------- TYPE 1,2 EXEC ------------------ + + double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing if (p->opts.debug) - printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, p->nbatch, p->batchSize); - - for (int b=0; b*p->batchSize < p->ntrans; b++) { // .....loop b over batches + printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, + p->nbatch, p->batchSize); + + for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches // current batch is either batchSize, or possibly truncated if last one - int thisBatchSize = min(p->ntrans - b*p->batchSize, p->batchSize); - int bB = b*p->batchSize; // index of vector, since batchsizes same - CPX* cjb = cj + bB*p->nj; // point to batch of weights - CPX* fkb = fk + bB*p->N; // point to batch of mode coeffs - if (p->opts.debug>1) printf("[%s] start batch %d (size %d):\n",__func__, b,thisBatchSize); - + int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); + int bB = b * p->batchSize; // index of vector, since batchsizes same + CPX *cjb = cj + bB * p->nj; // point to batch of weights + CPX *fkb = fk + bB * p->N; // point to batch of mode coeffs + if (p->opts.debug > 1) + printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize); + // STEP 1: (varies by type) timer.restart(); - if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid + if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid spreadinterpSortedBatch(thisBatchSize, p, cjb); t_sprint += timer.elapsedsec(); - } else { // type 2: amplify Fourier coeffs fk into 0-padded fw + } else { // type 2: amplify Fourier coeffs fk into 0-padded fw deconvolveBatch(thisBatchSize, p, fkb); t_deconv += timer.elapsedsec(); } - + // STEP 2: call the pre-planned FFT on this batch timer.restart(); - FFTW_EX(p->fftwPlan); // if thisBatchSizefftwPlan); // if thisBatchSizeopts.debug>1) - printf("\tFFTW exec:\t\t%.3g s\n", timer.elapsedsec()); - + if (p->opts.debug > 1) printf("\tFFTW exec:\t\t%.3g s\n", timer.elapsedsec()); + // STEP 3: (varies by type) - timer.restart(); - if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk + timer.restart(); + if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk deconvolveBatch(thisBatchSize, p, fkb); t_deconv += timer.elapsedsec(); - } else { // type 2: interpolate unif fw grid to NU target pts + } else { // type 2: interpolate unif fw grid to NU target pts spreadinterpSortedBatch(thisBatchSize, p, cjb); - t_sprint += timer.elapsedsec(); + t_sprint += timer.elapsedsec(); } - } // ........end b loop - - if (p->opts.debug) { // report total times in their natural order... - if(p->type == 1) { - printf("[%s] done. tot spread:\t\t%.3g s\n",__func__,t_sprint); + } // ........end b loop + + if (p->opts.debug) { // report total times in their natural order... + if (p->type == 1) { + printf("[%s] done. tot spread:\t\t%.3g s\n", __func__, t_sprint); printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); printf(" tot deconvolve:\t\t\t%.3g s\n", t_deconv); } else { - printf("[%s] done. tot deconvolve:\t\t%.3g s\n",__func__,t_deconv); + printf("[%s] done. tot deconvolve:\t\t%.3g s\n", __func__, t_deconv); printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); - printf(" tot interp:\t\t\t%.3g s\n",t_sprint); + printf(" tot interp:\t\t\t%.3g s\n", t_sprint); } } } - else { // ----------------------------- TYPE 3 EXEC --------------------- + else { // ----------------------------- TYPE 3 EXEC --------------------- - //for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug - - double t_pre=0.0, t_spr=0.0, t_t2=0.0, t_deconv=0.0; // accumulated timings + // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long + // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug + + double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, t_deconv = 0.0; // accumulated timings if (p->opts.debug) - printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n",__func__,p->ntrans, p->nbatch, p->batchSize); + printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, + p->nbatch, p->batchSize); - for (int b=0; b*p->batchSize < p->ntrans; b++) { // .....loop b over batches + for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches // batching and pointers to this batch, identical to t1,2 above... - int thisBatchSize = min(p->ntrans - b*p->batchSize, p->batchSize); - int bB = b*p->batchSize; - CPX* cjb = cj + bB*p->nj; // batch of input strengths - CPX* fkb = fk + bB*p->nk; // batch of output strengths - if (p->opts.debug>1) printf("[%s t3] start batch %d (size %d):\n",__func__,b,thisBatchSize); - + int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); + int bB = b * p->batchSize; + CPX *cjb = cj + bB * p->nj; // batch of input strengths + CPX *fkb = fk + bB * p->nk; // batch of output strengths + if (p->opts.debug > 1) + printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize); + // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch... timer.restart(); -#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize? - for (int i=0; inj; - for (BIGINT j=0;jnj;++j) - p->CpBatch[ioff+j] = p->prephase[j] * cjb[ioff+j]; +#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize? + for (int i = 0; i < thisBatchSize; i++) { + BIGINT ioff = i * p->nj; + for (BIGINT j = 0; j < p->nj; ++j) + p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j]; } - t_pre += timer.elapsedsec(); - + t_pre += timer.elapsedsec(); + // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid... timer.restart(); - p->spopts.spread_direction = 1; // spread - spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed + p->spopts.spread_direction = 1; // spread + spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed t_spr += timer.elapsedsec(); - //for (int j=0;jnf1;++j) printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]); // debug - + // for (int j=0;jnf1;++j) + // printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]); // debug + // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... timer.restart(); // illegal possible shrink of ntrans *after* plan for smaller last batch: - p->innerT2plan->ntrans = thisBatchSize; // do not try this at home! + p->innerT2plan->ntrans = thisBatchSize; // do not try this at home! /* (alarming that FFTW not shrunk, but safe, because t2's fwBatch array still the same size, as Andrea explained; just wastes a few flops) */ - FINUFFT_EXECUTE(p->innerT2plan, fkb, (CPX*)(p->fwBatch)); + FINUFFT_EXECUTE(p->innerT2plan, fkb, (CPX *)(p->fwBatch)); t_t2 += timer.elapsedsec(); // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... timer.restart(); #pragma omp parallel for num_threads(p->opts.nthreads) - for (int i=0; ink; - for (BIGINT k=0;knk;++k) - fkb[ioff+k] *= p->deconv[k]; + for (int i = 0; i < thisBatchSize; i++) { + BIGINT ioff = i * p->nk; + for (BIGINT k = 0; k < p->nk; ++k) fkb[ioff + k] *= p->deconv[k]; } t_deconv += timer.elapsedsec(); - } // ........end b loop + } // ........end b loop - if (p->opts.debug) { // report total times in their natural order... - printf("[%s t3] done. tot prephase:\t\t%.3g s\n",__func__,t_pre); - printf(" tot spread:\t\t\t%.3g s\n",t_spr); + if (p->opts.debug) { // report total times in their natural order... + printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre); + printf(" tot spread:\t\t\t%.3g s\n", t_spr); printf(" tot type 2:\t\t\t%.3g s\n", t_t2); printf(" tot deconvolve:\t\t%.3g s\n", t_deconv); - } + } } - //for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug - - return 0; -} + // for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long + // int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug + return 0; +} // DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD int FINUFFT_DESTROY(FINUFFT_PLAN p) @@ -1165,12 +1221,12 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p) // Thus either each thing free'd here is guaranteed to be NULL or correctly // allocated. { - if (!p) // NULL ptr, so not a ptr to a plan, report error + if (!p) // NULL ptr, so not a ptr to a plan, report error return 1; FFTW_FR(p->fwBatch); // free the big FFTW (or t3 spread) working array free(p->sortIndices); - if (p->type==1 || p->type==2) { + if (p->type == 1 || p->type == 2) { { std::lock_guard lock(fftw_lock); FFTW_DE(p->fftwPlan); @@ -1178,14 +1234,18 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p) free(p->phiHat1); free(p->phiHat2); free(p->phiHat3); - } else { // free the stuff alloc for type 3 only - FINUFFT_DESTROY(p->innerT2plan); // if NULL, ignore its error code + } else { // free the stuff alloc for type 3 only + FINUFFT_DESTROY(p->innerT2plan); // if NULL, ignore its error code free(p->CpBatch); - free(p->Sp); free(p->Tp); free(p->Up); - free(p->X); free(p->Y); free(p->Z); + free(p->Sp); + free(p->Tp); + free(p->Up); + free(p->X); + free(p->Y); + free(p->Z); free(p->prephase); free(p->deconv); } delete p; - return 0; // success + return 0; // success } diff --git a/src/ker_horner_allw_loop_constexpr.h b/src/ker_horner_allw_loop_constexpr.h index 6de0540e9..25a791ddb 100644 --- a/src/ker_horner_allw_loop_constexpr.h +++ b/src/ker_horner_allw_loop_constexpr.h @@ -5,228 +5,909 @@ template constexpr std::array, nc> get_horner_coeffs() noexcept { - if constexpr (w == 2) { - return std::array, nc> {{ - {4.5147043243215315E+01, 4.5147043243215300E+01}, - {5.7408070938221300E+01, -5.7408070938221293E+01}, - {-1.8395117920046484E+00, -1.8395117920046560E+00}, - {-2.0382426253182082E+01, 2.0382426253182086E+01}, - {-2.0940804433577420E+00, -2.0940804433577389E+00} - }}; - } else if constexpr (w == 3) { - return std::array, nc> {{ - {1.5653991189315119E+02, 8.8006872410780295E+02, 1.5653991189967152E+02}, - {3.1653018869611077E+02, 7.4325702843759617E-14, -3.1653018868907071E+02}, - {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117119E+02}, - {-1.5357716116473156E+01, 9.5071486252033243E-15, 1.5357716122720193E+01}, - {-3.7757583061523668E+01, 5.3222970968867315E+01, -3.7757583054647384E+01}, - {-3.9654011076088804E+00, 1.8062124448285358E-13, 3.9654011139270540E+00} - }}; - } else if constexpr (w == 4) { - return std::array, nc> {{ - {5.4284366850213200E+02, 1.0073871433088398E+04, 1.0073871433088396E+04, 5.4284366850213223E+02}, - {1.4650917259256939E+03, 6.1905285583602863E+03, -6.1905285583602881E+03, -1.4650917259256937E+03}, - {1.4186910680718345E+03, -1.3995339862725591E+03, -1.3995339862725598E+03, 1.4186910680718347E+03}, - {5.1133995502497419E+02, -1.4191608683682996E+03, 1.4191608683682998E+03, -5.1133995502497424E+02}, - {-4.8293622641174039E+01, 3.9393732546135226E+01, 3.9393732546135816E+01, -4.8293622641174061E+01}, - {-7.8386867802392288E+01, 1.4918904800408930E+02, -1.4918904800408751E+02, 7.8386867802392359E+01}, - {-1.0039212571700894E+01, 5.0626747735616746E+00, 5.0626747735625512E+00, -1.0039212571700640E+01} - }}; - } else if constexpr (w == 5) { - return std::array, nc> {{ - {9.9223677575398392E+02, 3.7794697666613320E+04, 9.8715771010760494E+04, 3.7794697666613283E+04, 9.9223677575398403E+02}, - {3.0430174925083825E+03, 3.7938404259811403E+04, -1.1842989705877139E-11, -3.7938404259811381E+04, -3.0430174925083829E+03}, - {3.6092689177271222E+03, 7.7501368899498666E+03, -2.2704627332475000E+04, 7.7501368899498730E+03, 3.6092689177271218E+03}, - {1.9990077310495396E+03, -3.8875294641277296E+03, 9.7116927320010791E-12, 3.8875294641277369E+03, -1.9990077310495412E+03}, - {4.0071733590403869E+02, -1.5861137916762602E+03, 2.3839858699098645E+03, -1.5861137916762643E+03, 4.0071733590403909E+02}, - {-9.1301168206167262E+01, 1.2316471075214675E+02, 2.0698495299948402E-11, -1.2316471075214508E+02, 9.1301168206167233E+01}, - {-5.5339722671223846E+01, 1.1960590540261879E+02, -1.5249941358311668E+02, 1.1960590540262307E+02, -5.5339722671223605E+01}, - {-3.3762488150353924E+00, 2.2839981872948751E+00, 7.1884725699454154E-12, -2.2839981872943818E+00, 3.3762488150341459E+00} - }}; - } else if constexpr (w == 6) { - return std::array, nc> {{ - {2.0553833234911876E+03, 1.5499537739913128E+05, 8.1177907023291115E+05, 8.1177907023291173E+05, 1.5499537739913136E+05, 2.0553833235005691E+03}, - {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917674E+05, -3.1559612614917627E+05, -2.0581923258843317E+05, -7.1269776034341394E+03}, - {1.0023404568475091E+04, 9.0916650498360192E+04, -1.0095927514054619E+05, -1.0095927514054628E+05, 9.0916650498360177E+04, 1.0023404568484635E+04}, - {7.2536109410387417E+03, 4.8347162752602981E+03, -5.0512736602018522E+04, 5.0512736602018478E+04, -4.8347162752603008E+03, -7.2536109410297540E+03}, - {2.7021878300949752E+03, -7.8773465553972646E+03, 5.2105876478342780E+03, 5.2105876478343343E+03, -7.8773465553972710E+03, 2.7021878301048723E+03}, - {3.2120291706547636E+02, -1.8229189469936762E+03, 3.7928113414429808E+03, -3.7928113414427025E+03, 1.8229189469937312E+03, -3.2120291705638243E+02}, - {-1.2051267090537374E+02, 2.2400507411399673E+02, -1.2506575852541796E+02, -1.2506575852521925E+02, 2.2400507411398695E+02, -1.2051267089640181E+02}, - {-4.5977202613350237E+01, 1.1536880606853076E+02, -1.7819720186493959E+02, 1.7819720186497622E+02, -1.1536880606854736E+02, 4.5977202622148909E+01}, - {-1.5631081288842275E+00, 7.1037430591266115E-01, -6.9838401121429056E-02, -6.9838401186476856E-02, 7.1037430589285400E-01, -1.5631081203754575E+00} - }}; - } else if constexpr(w==7) { - return std::array, nc> {{ - {3.9948351830487481E+03, 5.4715865608590771E+05, 5.0196413492771760E+06, 9.8206709220713247E+06, 5.0196413492771825E+06, 5.4715865608590783E+05, 3.9948351830642519E+03}, - {1.5290160332974696E+04, 8.7628248584320408E+05, 3.4421061790934438E+06, -2.6908159596373561E-10, -3.4421061790934461E+06, -8.7628248584320408E+05, -1.5290160332958067E+04}, - {2.4458227486779251E+04, 5.3904618484139396E+05, 2.4315566181017534E+05, -1.6133959371974322E+06, 2.4315566181017453E+05, 5.3904618484139396E+05, 2.4458227486795113E+04}, - {2.1166189345881645E+04, 1.3382732160223130E+05, -3.3113450969689694E+05, 6.9013724510092140E-10, 3.3113450969689724E+05, -1.3382732160223136E+05, -2.1166189345866893E+04}, - {1.0542795672344864E+04, -7.0739172265098678E+03, -6.5563293056049893E+04, 1.2429734005960064E+05, -6.5563293056049602E+04, -7.0739172265098332E+03, 1.0542795672361213E+04}, - {2.7903491906228419E+03, -1.0975382873973093E+04, 1.3656979541144799E+04, 7.7346408577822045E-10, -1.3656979541143772E+04, 1.0975382873973256E+04, -2.7903491906078298E+03}, - {1.6069721418053300E+02, -1.5518707872251393E+03, 4.3634273936642621E+03, -5.9891976420595174E+03, 4.3634273936642730E+03, -1.5518707872251064E+03, 1.6069721419533221E+02}, - {-1.2289277373867256E+02, 2.8583630927743314E+02, -2.8318194617327981E+02, 6.9043515551118249E-10, 2.8318194617392436E+02, -2.8583630927760140E+02, 1.2289277375319763E+02}, - {-3.2270164914249058E+01, 9.1892112257581346E+01, -1.6710678096334209E+02, 2.0317049305432383E+02, -1.6710678096383771E+02, 9.1892112257416159E+01, -3.2270164900224913E+01}, - {-1.4761409685186277E-01, -9.1862771280377487E-01, 1.2845147741777752E+00, 5.6547359492808854E-10, -1.2845147728310689E+00, 9.1862771293147971E-01, 1.4761410890866353E-01} - }}; - } else if constexpr(w==8) { - return std::array, nc> {{ - {7.3898000697447915E+03, 1.7297637497600035E+06, 2.5578341605285794E+07, 8.4789650417103335E+07, 8.4789650417103350E+07, 2.5578341605285816E+07, 1.7297637497600049E+06, 7.3898000697447915E+03}, - {3.0719636811267599E+04, 3.1853145713323927E+06, 2.3797981861403696E+07, 2.4569731244678464E+07, -2.4569731244678471E+07, -2.3797981861403704E+07, -3.1853145713323941E+06, -3.0719636811267606E+04}, - {5.4488498478251728E+04, 2.4101183255475131E+06, 6.4554051283428287E+06, -8.9200440393090546E+06, -8.9200440393090583E+06, 6.4554051283428324E+06, 2.4101183255475126E+06, 5.4488498478251728E+04}, - {5.3926359802542116E+04, 9.0469037926849292E+05, -6.0897036277696118E+05, -3.0743852105799988E+06, 3.0743852105800058E+06, 6.0897036277696711E+05, -9.0469037926849339E+05, -5.3926359802542138E+04}, - {3.2444118016247590E+04, 1.3079802224392134E+05, -5.8652889370129269E+05, 4.2333306008151924E+05, 4.2333306008152053E+05, -5.8652889370128722E+05, 1.3079802224392109E+05, 3.2444118016247590E+04}, - {1.1864306345505294E+04, -2.2700360645707988E+04, -5.0713607251414309E+04, 1.8308704458211688E+05, -1.8308704458210632E+05, 5.0713607251413123E+04, 2.2700360645707628E+04, -1.1864306345505294E+04}, - {2.2812256770903232E+03, -1.1569135767377773E+04, 2.0942387020798891E+04, -1.1661592834945191E+04, -1.1661592834940149E+04, 2.0942387020801420E+04, -1.1569135767377924E+04, 2.2812256770903286E+03}, - {8.5503535636821422E+00, -9.7513976461238224E+02, 3.8242995179171526E+03, -6.9201295567267280E+03, 6.9201295567248662E+03, -3.8242995179155446E+03, 9.7513976461209836E+02, -8.5503535637013552E+00}, - {-1.0230637348345023E+02, 2.8246898554269114E+02, -3.8638201738139219E+02, 1.9106407993320320E+02, 1.9106407993289886E+02, -3.8638201738492717E+02, 2.8246898554219217E+02, -1.0230637348345138E+02}, - {-1.9200143062947848E+01, 6.1692257626706223E+01, -1.2981109187842989E+02, 1.8681284210471688E+02, -1.8681284209654376E+02, 1.2981109187880142E+02, -6.1692257626845532E+01, 1.9200143062947120E+01}, - {3.7894993760177598E-01, -1.7334408836731494E+00, 2.5271184057877303E+00, -1.2600963971824484E+00, -1.2600963917834651E+00, 2.5271184069685657E+00, -1.7334408840526812E+00, 3.7894993760636758E-01} - }}; - } else if constexpr(w==9) { - return std::array, nc> {{ - {1.3136365370186100E+04, 5.0196413492771806E+06, 1.1303327711722563E+08, 5.8225443924996686E+08, 9.7700272582690656E+08, 5.8225443924996758E+08, 1.1303327711722568E+08, 5.0196413492772207E+06, 1.3136365370186135E+04}, - {5.8623313038274340E+04, 1.0326318537280345E+07, 1.2898448324824864E+08, 3.0522863709830385E+08, -3.9398045056223735E-08, -3.0522863709830391E+08, -1.2898448324824864E+08, -1.0326318537280388E+07, -5.8623313038274347E+04}, - {1.1335001341875963E+05, 9.0726133144784812E+06, 5.3501544534038112E+07, -2.6789524644146336E+05, -1.2483923718899371E+08, -2.6789524644172983E+05, 5.3501544534038112E+07, 9.0726133144785129E+06, 1.1335001341875960E+05}, - {1.2489113703229747E+05, 4.3035547171861930E+06, 6.3021978510598792E+06, -2.6014941986659057E+07, 6.0417403157325170E-08, 2.6014941986659389E+07, -6.3021978510598652E+06, -4.3035547171862079E+06, -1.2489113703229751E+05}, - {8.6425493435991244E+04, 1.0891182836653308E+06, -2.0713033564200639E+06, -2.8994941183506218E+06, 7.5905338661205899E+06, -2.8994941183505375E+06, -2.0713033564200667E+06, 1.0891182836653353E+06, 8.6425493435991288E+04}, - {3.8657354724013814E+04, 7.9936390113331305E+04, -7.0458265546791907E+05, 1.0151095605715880E+06, 1.2138090419648379E-07, -1.0151095605717725E+06, 7.0458265546794771E+05, -7.9936390113331567E+04, -3.8657354724013821E+04}, - {1.0779131453134638E+04, -3.3466718311300596E+04, -1.3245366619006139E+04, 1.8238470515353698E+05, -2.9285656292977190E+05, 1.8238470515350526E+05, -1.3245366619000662E+04, -3.3466718311299621E+04, 1.0779131453134616E+04}, - {1.4992527030548456E+03, -9.7024371533891372E+03, 2.3216330734057381E+04, -2.3465262819040818E+04, 5.3299736484284360E-08, 2.3465262819251962E+04, -2.3216330734049119E+04, 9.7024371533890644E+03, -1.4992527030548747E+03}, - {-7.9857427421129714E+01, -4.0585588534807385E+02, 2.6054813773472697E+03, -6.1806593581075495E+03, 8.0679596874001718E+03, -6.1806593581869265E+03, 2.6054813773147021E+03, -4.0585588535363172E+02, -7.9857427421126204E+01}, - {-7.1572272057937070E+01, 2.2785637019511205E+02, -3.9109820765665262E+02, 3.3597424711470910E+02, 1.0596763818009852E-07, -3.3597424723359080E+02, 3.9109820766854079E+02, -2.2785637019009673E+02, 7.1572272057939983E+01}, - {-9.8886360698074700E+00, 3.5359026949867051E+01, -8.5251867715709949E+01, 1.4285748012617628E+02, -1.6935269668779691E+02, 1.4285748010331625E+02, -8.5251867711661305E+01, 3.5359026944299828E+01, -9.8886360698207305E+00} - }}; - } else if constexpr(w==10) { - return std::array, nc> {{ - {2.2594586605749264E+04, 1.3595989066786593E+07, 4.4723032442444897E+08, 3.3781755837397518E+09, 8.6836783895849819E+09, 8.6836783895849762E+09, 3.3781755837397494E+09, 4.4723032442444897E+08, 1.3595989066786474E+07, 2.2594586605749344E+04,}, - {1.0729981697645642E+05, 3.0651490267742988E+07, 5.9387966085130465E+08, 2.4434902657508330E+09, 2.0073077861288922E+09, -2.0073077861288943E+09, -2.4434902657508330E+09, -5.9387966085130453E+08, -3.0651490267742816E+07, -1.0729981697645638E+05,}, - {2.2340399734184606E+05, 3.0258214643190462E+07, 3.1512411458738232E+08, 4.3618276932319808E+08, -7.8178848450497293E+08, -7.8178848450497019E+08, 4.3618276932319826E+08, 3.1512411458738232E+08, 3.0258214643190313E+07, 2.2340399734184548E+05,}, - {2.6917433004353486E+05, 1.6875651476661228E+07, 7.4664745481963441E+07, -9.5882157211118385E+07, -2.0622994435532519E+08, 2.0622994435532743E+08, 9.5882157211118177E+07, -7.4664745481963515E+07, -1.6875651476661161E+07, -2.6917433004353428E+05,}, - {2.0818422772177903E+05, 5.6084730690362519E+06, 1.4435118192351763E+06, -4.0063869969544649E+07, 3.2803674392747045E+07, 3.2803674392746095E+07, -4.0063869969546899E+07, 1.4435118192351642E+06, 5.6084730690362034E+06, 2.0818422772177853E+05,}, - {1.0781139496011091E+05, 9.9202615851199068E+05, -3.3266265543962116E+06, -4.8557049011479173E+05, 1.0176155522772279E+07, -1.0176155522772269E+07, 4.8557049011678610E+05, 3.3266265543963453E+06, -9.9202615851196018E+05, -1.0781139496011072E+05,}, - {3.7380102688153558E+04, 1.2716675000355666E+04, -6.2163527451774501E+05, 1.4157962667184104E+06, -8.4419693137680157E+05, -8.4419693137743860E+05, 1.4157962667189445E+06, -6.2163527451771160E+05, 1.2716675000340010E+04, 3.7380102688153442E+04,}, - {8.1238936393894646E+03, -3.4872365530450072E+04, 2.3913680325196314E+04, 1.2428850301830019E+05, -3.2158255329716846E+05, 3.2158255329951923E+05, -1.2428850301867779E+05, -2.3913680325277423E+04, 3.4872365530457188E+04, -8.1238936393894255E+03,}, - {7.8515926628982663E+02, -6.6607899119372642E+03, 2.0167398338513311E+04, -2.8951401344519112E+04, 1.4622828142848679E+04, 1.4622828143544031E+04, -2.8951401346900999E+04, 2.0167398338398041E+04, -6.6607899119505255E+03, 7.8515926628967964E+02,}, - {-1.0147176570537010E+02, -3.5304284185385157E+01, 1.3576976854876134E+03, -4.3921059353471856E+03, 7.3232085271125388E+03, -7.3232085273978546E+03, 4.3921059367737662E+03, -1.3576976854043962E+03, 3.5304284185385157E+01, 1.0147176570550941E+02,}, - {-4.3161545259389186E+01, 1.5498490981579428E+02, -3.1771250774232175E+02, 3.7215448796427023E+02, -1.7181762832770994E+02, -1.7181763036843782E+02, 3.7215448789408123E+02, -3.1771250773692140E+02, 1.5498490982186786E+02, -4.3161545259547800E+01,}, - {-4.2916172038214198E+00, 1.7402146071148604E+01, -4.7947588069135868E+01, 9.2697698088029625E+01, -1.2821427596894478E+02, 1.2821427705670308E+02, -9.2697698297776569E+01, 4.7947588093524907E+01, -1.7402146074502035E+01, 4.2916172038452141E+00,} - }}; - } else if constexpr(w==11) { - return std::array, nc> {{ - {3.7794653219809625E+04, 3.4782300224660739E+07, 1.6188020733727551E+09, 1.7196758809615005E+10, 6.3754384857724617E+10, 9.7196447559193497E+10, 6.3754384857724617E+10, 1.7196758809614998E+10, 1.6188020733727560E+09, 3.4782300224660769E+07, 3.7794653219808984E+04,}, - {1.8969206922085886E+05, 8.4769319065313652E+07, 2.4230555767723408E+09, 1.5439732722639101E+10, 2.7112836839612309E+10, 2.5609833368650835E-06, -2.7112836839612328E+10, -1.5439732722639105E+10, -2.4230555767723408E+09, -8.4769319065313682E+07, -1.8969206922085711E+05,}, - {4.2138380313901440E+05, 9.2050522922791913E+07, 1.5259983101266613E+09, 4.7070559561237173E+09, -1.2448027572952359E+09, -1.0161446790279301E+10, -1.2448027572952316E+09, 4.7070559561237268E+09, 1.5259983101266615E+09, 9.2050522922791913E+07, 4.2138380313901149E+05,}, - {5.4814313598122005E+05, 5.8085130777589552E+07, 4.9484006166551048E+08, 1.6222124676640952E+08, -2.0440440381345339E+09, 9.1416457449079640E-06, 2.0440440381345336E+09, -1.6222124676640788E+08, -4.9484006166551071E+08, -5.8085130777589560E+07, -5.4814313598121714E+05,}, - {4.6495183529254980E+05, 2.3067199578027144E+07, 6.9832590192482382E+07, -2.2024799260683522E+08, -1.2820270942588677E+08, 5.1017181199129778E+08, -1.2820270942588474E+08, -2.2024799260683942E+08, 6.9832590192482322E+07, 2.3067199578027155E+07, 4.6495183529254742E+05,}, - {2.7021781043532980E+05, 5.6764510325100143E+06, -5.5650761736748898E+06, -3.9907385617900200E+07, 7.2453390663687646E+07, 1.2300109686762266E-05, -7.2453390663684472E+07, 3.9907385617899075E+07, 5.5650761736749066E+06, -5.6764510325099993E+06, -2.7021781043532846E+05,}, - {1.0933249308680627E+05, 6.9586821127987828E+05, -3.6860240321937902E+06, 2.7428169457736355E+06, 8.3392008440593518E+06, -1.6402201025046850E+07, 8.3392008440698013E+06, 2.7428169457778852E+06, -3.6860240321937371E+06, 6.9586821127989423E+05, 1.0933249308680571E+05,}, - {3.0203516161820498E+04, -3.6879059542768438E+04, -4.1141031216788280E+05, 1.4111389975267777E+06, -1.5914376635331670E+06, 9.4095582602103753E-06, 1.5914376635379130E+06, -1.4111389975247320E+06, 4.1141031216776522E+05, 3.6879059542750314E+04, -3.0203516161820549E+04,}, - {5.1670143574922731E+03, -2.8613147115372190E+04, 4.3560195427081359E+04, 4.8438679582765450E+04, -2.5856630639231802E+05, 3.7994883866738499E+05, -2.5856630640319458E+05, 4.8438679579510936E+04, 4.3560195426766244E+04, -2.8613147115376054E+04, 5.1670143574922913E+03,}, - {3.0888018539740131E+02, -3.7949446187471626E+03, 1.4313303204988082E+04, -2.6681600235594462E+04, 2.3856005166166615E+04, 8.6424601730164351E-06, -2.3856005155895236E+04, 2.6681600234453199E+04, -1.4313303205083188E+04, 3.7949446187583080E+03, -3.0888018539728523E+02,}, - {-8.3747489794189363E+01, 1.1948077479405792E+02, 4.8528498015072080E+02, -2.5024391114755094E+03, 5.3511195318669425E+03, -6.7655484107390166E+03, 5.3511195362291774E+03, -2.5024391131167667E+03, 4.8528498019392708E+02, 1.1948077480620087E+02, -8.3747489794426258E+01,}, - {-2.2640047135517630E+01, 9.0840898563949466E+01, -2.1597187544386938E+02, 3.1511229111443720E+02, -2.4856617998395282E+02, 6.1683918215190516E-06, 2.4856618439352349E+02, -3.1511228757800421E+02, 2.1597187557069353E+02, -9.0840898570046704E+01, 2.2640047135565219E+01,}, - {-1.6306382886201207E+00, 7.3325946591320434E+00, -2.3241017682854558E+01, 5.1715494398901185E+01, -8.2673000279130790E+01, 9.6489719151212370E+01, -8.2673010381149226E+01, 5.1715494328769353E+01, -2.3241018024860580E+01, 7.3325946448852415E+00, -1.6306382886460551E+00,} - }}; - } else if constexpr(w==12) { - return std::array, nc> {{ - {6.1722991679852908E+04, 8.4789650417103648E+07, 5.4431675199498701E+09, 7.8788892335272232E+10, 4.0355760945670044E+11, 8.8071481911347949E+11, 8.8071481911347961E+11, 4.0355760945670044E+11, 7.8788892335272430E+10, 5.4431675199498835E+09, 8.4789650417103708E+07, 6.1722991679871957E+04}, - {3.2561466099406168E+05, 2.2112758120210618E+08, 8.9911609880089817E+09, 8.3059508064200943E+10, 2.3965569143469864E+11, 1.6939286803305212E+11, -1.6939286803305203E+11, -2.3965569143469864E+11, -8.3059508064201080E+10, -8.9911609880089989E+09, -2.2112758120210618E+08, -3.2561466099404311E+05}, - {7.6621098001581512E+05, 2.6026568260310286E+08, 6.4524338253008652E+09, 3.3729904113826820E+10, 2.8555202212474091E+10, -6.8998572040731537E+10, -6.8998572040731445E+10, 2.8555202212474079E+10, 3.3729904113826824E+10, 6.4524338253008757E+09, 2.6026568260310274E+08, 7.6621098001583829E+05}, - {1.0657807616803218E+06, 1.8144472126890984E+08, 2.5524827004349842E+09, 5.2112383911371660E+09, -1.0268350564014645E+10, -1.4763245309081306E+10, 1.4763245309081314E+10, 1.0268350564014671E+10, -5.2112383911371059E+09, -2.5524827004349871E+09, -1.8144472126890984E+08, -1.0657807616803099E+06}, - {9.7829638830158755E+05, 8.2222351241519913E+07, 5.5676911894064474E+08, -4.8739037675427330E+08, -2.7153428193078227E+09, 2.5627633609246106E+09, 2.5627633609246163E+09, -2.7153428193078651E+09, -4.8739037675430620E+08, 5.5676911894064546E+08, 8.2222351241519868E+07, 9.7829638830161188E+05}, - {6.2536876825114002E+05, 2.4702814073680203E+07, 4.1488431554846466E+07, -2.9274790542418826E+08, 1.0742154109191516E+08, 6.2185168968032193E+08, -6.2185168968012476E+08, -1.0742154109184742E+08, 2.9274790542423087E+08, -4.1488431554843128E+07, -2.4702814073680237E+07, -6.2536876825112454E+05}, - {2.8527714307528478E+05, 4.6266378435690766E+06, -1.0665598090790771E+07, -2.6048960239891130E+07, 9.1597254427317813E+07, -5.9794495983264342E+07, -5.9794495983220413E+07, 9.1597254427343085E+07, -2.6048960239921503E+07, -1.0665598090794146E+07, 4.6266378435690673E+06, 2.8527714307530399E+05}, - {9.2873647411234080E+04, 3.6630046787425119E+05, -3.1271047224730137E+06, 4.8612412939252760E+06, 3.3820440907796426E+06, -1.6880127953704204E+07, 1.6880127953756198E+07, -3.3820440907614031E+06, -4.8612412938993908E+06, 3.1271047224752530E+06, -3.6630046787425695E+05, -9.2873647411217215E+04}, - {2.0817947751046438E+04, -5.5660303410315042E+04, -1.9519783923444615E+05, 1.0804817251338551E+06, -1.8264985852555393E+06, 9.7602844968061335E+05, 9.7602844962902542E+05, -1.8264985852963410E+06, 1.0804817251124913E+06, -1.9519783923503032E+05, -5.5660303410363231E+04, 2.0817947751063632E+04}, - {2.7986023314783361E+03, -1.9404411093655592E+04, 4.3922625000519314E+04, -7.6450317451901383E+03, -1.5273911974273989E+05, 3.3223441458516393E+05, -3.3223441441930021E+05, 1.5273911979752057E+05, 7.6450317512768806E+03, -4.3922624998141677E+04, 1.9404411093637758E+04, -2.7986023314644049E+03}, - {6.7849020474048089E+01, -1.7921351308204744E+03, 8.4980694686552797E+03, -1.9742624859769410E+04, 2.4620674845030797E+04, -1.1676544851227827E+04, -1.1676544869194569E+04, 2.4620674845030626E+04, -1.9742624831436660E+04, 8.4980694630406069E+03, -1.7921351308312935E+03, 6.7849020488592075E+01}, - {-5.4577020998836872E+01, 1.3637112867242237E+02, 4.5513616580246023E+01, -1.1174001367986359E+03, 3.2018769312434206E+03, -5.0580351396215219E+03, 5.0580351683422405E+03, -3.2018769242193171E+03, 1.1174000998831286E+03, -4.5513609243969356E+01, -1.3637112867730119E+02, 5.4577021011726984E+01}, - {-1.0538365872268786E+01, 4.6577222488645518E+01, -1.2606964198473415E+02, 2.1881091668968099E+02, -2.3273399614976032E+02, 1.0274275204276027E+02, 1.0274270265494516E+02, -2.3273401859852868E+02, 2.1881091865396468E+02, -1.2606964777237258E+02, 4.6577222453584369E+01, -1.0538365860573146E+01}, - {-4.6087004144309118E-01, 2.5969759128998060E+00, -9.6946932216381381E+00, 2.4990041962121211E+01, -4.6013909139329137E+01, 6.2056985032913090E+01, -6.2056925855365186E+01, 4.6013921000662158E+01, -2.4990037445376750E+01, 9.6946954085586885E+00, -2.5969759201692755E+00, 4.6087004744129911E-01} - }}; - } else if constexpr(w==13) { - return std::array, nc> {{ - {9.8715725867495363E+04, 1.9828875496808097E+08, 1.7196758809614983E+10, 3.3083776881353577E+11, 2.2668873993375439E+12, 6.7734720591167568E+12, 9.6695220682534785E+12, 6.7734720591167432E+12, 2.2668873993375430E+12, 3.3083776881353503E+11, 1.7196758809614998E+10, 1.9828875496807891E+08, 9.8715725867496090E+04}, - {5.4491110456935549E+05, 5.4903670125539351E+08, 3.0879465445278183E+10, 3.9588436413399969E+11, 1.6860562536749778E+12, 2.4256447893117891E+12, -5.5583944938791784E-05, -2.4256447893117847E+12, -1.6860562536749768E+12, -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538898E+08, -5.4491110456935526E+05}, - {1.3504711883426071E+06, 6.9286979077463162E+08, 2.4618123595484577E+10, 1.9493985627722607E+11, 3.9422703517046350E+11, -1.8678883613919861E+11, -8.5538079834550110E+11, -1.8678883613919730E+11, 3.9422703517046375E+11, 1.9493985627722589E+11, 2.4618123595484566E+10, 6.9286979077462614E+08, 1.3504711883426069E+06}, - {1.9937206140846491E+06, 5.2512029493765980E+08, 1.1253303793811750E+10, 4.6205527735932152E+10, -1.1607472377983305E+10, -1.6305241755642313E+11, 3.5385440504350348E-04, 1.6305241755642365E+11, 1.1607472377982582E+10, -4.6205527735932213E+10, -1.1253303793811750E+10, -5.2512029493765628E+08, -1.9937206140846489E+06}, - {1.9607419630386413E+06, 2.6425362558103892E+08, 3.1171259341747193E+09, 2.9839860297839913E+09, -1.9585031917561897E+10, -5.0666917387065792E+09, 3.6568794485480583E+10, -5.0666917387057562E+09, -1.9585031917561817E+10, 2.9839860297838497E+09, 3.1171259341747184E+09, 2.6425362558103728E+08, 1.9607419630386417E+06}, - {1.3593773865640305E+06, 9.1556445104158267E+07, 4.7074012944133747E+08, -1.1192579335657008E+09, -2.1090780087868555E+09, 5.2270306737951984E+09, 5.6467240041521856E-04, -5.2270306737934217E+09, 2.1090780087880819E+09, 1.1192579335658383E+09, -4.7074012944133127E+08, -9.1556445104157984E+07, -1.3593773865640305E+06}, - {6.8417206432039209E+05, 2.1561705510027152E+07, 7.5785249893055111E+06, -2.7456096030221754E+08, 3.4589095671054310E+08, 4.0256106808894646E+08, -1.0074306926603404E+09, 4.0256106809081393E+08, 3.4589095670997137E+08, -2.7456096030236483E+08, 7.5785249893030487E+06, 2.1561705510027405E+07, 6.8417206432039209E+05}, - {2.5248269397037517E+05, 3.0985559672616189E+06, -1.1816517087616559E+07, -8.2958498770184973E+06, 8.0546642347355247E+07, -1.0594657799485898E+08, 2.1816722293163801E-04, 1.0594657799424352E+08, -8.0546642347497791E+07, 8.2958498771036500E+06, 1.1816517087615721E+07, -3.0985559672621777E+06, -2.5248269397037517E+05}, - {6.7530100970876694E+04, 1.2373362326658823E+05, -2.1245597183281910E+06, 5.1047323238754412E+06, -1.4139444405488928E+06, -1.1818267555096827E+07, 2.0121548578624789E+07, -1.1818267557079868E+07, -1.4139444401348191E+06, 5.1047323236516044E+06, -2.1245597183309775E+06, 1.2373362326702787E+05, 6.7530100970876316E+04}, - {1.2421368748961073E+04, -5.0576243647011936E+04, -4.8878193436902722E+04, 6.5307896872028301E+05, -1.5497610127060430E+06, 1.5137725917321201E+06, 4.1615986404011299E-04, -1.5137725918538549E+06, 1.5497610130469005E+06, -6.5307896856811445E+05, 4.8878193438804832E+04, 5.0576243646433126E+04, -1.2421368748961073E+04}, - {1.2904654687550299E+03, -1.1169946055009055E+04, 3.3275109713863385E+04, -3.1765222274236821E+04, -5.9810982085323274E+04, 2.2355863038592847E+05, -3.1083591705219547E+05, 2.2355863445202672E+05, -5.9810982721084511E+04, -3.1765222464963932E+04, 3.3275109714208855E+04, -1.1169946054555618E+04, 1.2904654687545376E+03}, - {-1.9043622268674213E+01, -6.8296542209516542E+02, 4.2702512274202591E+03, -1.2165497317825058E+04, 1.9423733298269544E+04, -1.6010024066956401E+04, 3.4018642874429026E-04, 1.6010021599471667E+04, -1.9423732817821805E+04, 1.2165497483905752E+04, -4.2702512286689680E+03, 6.8296542153908558E+02, 1.9043622268312891E+01}, - {-3.0093984465361217E+01, 9.8972865724808671E+01, -9.7437038666761538E+01, -3.5079928405373198E+02, 1.5699250566648977E+03, -3.1287439837941820E+03, 3.8692196309709061E+03, -3.1287462825615335E+03, 1.5699252631958864E+03, -3.5079944793112952E+02, -9.7437041893750632E+01, 9.8972866189610414E+01, -3.0093984465884773E+01}, - {-4.3050286009489040E+00, 2.1108975724659501E+01, -6.4297198812570272E+01, 1.2922884632277874E+02, -1.6991812716212596E+02, 1.2655005901719436E+02, 9.2483537895948854E-05, -1.2655066232531748E+02, 1.6991805207569072E+02, -1.2922893667436634E+02, 6.4297198424711908E+01, -2.1108976207523057E+01, 4.3050286009485790E+00}, - {-1.0957333716725008E-01, 7.2949317004436565E-01, -3.4300816058693728E+00, 1.0470054474579324E+01, -2.2292134950656113E+01, 3.4570827323582719E+01, -3.9923523442753932E+01, 3.4573264959502886E+01, -2.2292358612963266E+01, 1.0470042004916014E+01, -3.4300810538570281E+00, 7.2949352113279253E-01, -1.0957333740315604E-01} - }}; - } else if constexpr(w==14) { - return std::array, nc> {{ - {1.5499533202966207E+05, 4.4723032442444688E+08, 5.1495083701694740E+10, 1.2904576022918071E+12, 1.1534950432785506E+13, 4.5650102198520484E+13, 8.8830582190032641E+13, 8.8830582190032641E+13, 4.5650102198520492E+13, 1.1534950432785527E+13, 1.2904576022918074E+12, 5.1495083701695107E+10, 4.4723032442444855E+08, 1.5499533202970232E+05}, - {8.9188339002980455E+05, 1.3065352538728635E+09, 9.9400185225815567E+10, 1.7136059013402405E+12, 1.0144146621675832E+13, 2.3034036018490715E+13, 1.4630967270448871E+13, -1.4630967270448855E+13, -2.3034036018490719E+13, -1.0144146621675846E+13, -1.7136059013402405E+12, -9.9400185225815964E+10, -1.3065352538728662E+09, -8.9188339002979454E+05}, - {2.3170473769379663E+06, 1.7532505043698256E+09, 8.6523535958354309E+10, 9.7455289065487354E+11, 3.2977972139362314E+12, 1.7874626001697781E+12, -6.1480918082633916E+12, -6.1480918082633975E+12, 1.7874626001697690E+12, 3.2977972139362285E+12, 9.7455289065487329E+11, 8.6523535958354630E+10, 1.7532505043698275E+09, 2.3170473769380399E+06}, - {3.6089249230396422E+06, 1.4278058213962190E+09, 4.4296625537022423E+10, 2.9466624630419781E+11, 3.1903621584503235E+11, -9.8834691411254565E+11, -1.1072264714919226E+12, 1.1072264714919316E+12, 9.8834691411255151E+11, -3.1903621584503467E+11, -2.9466624630419769E+11, -4.4296625537022621E+10, -1.4278058213962219E+09, -3.6089249230396664E+06}, - {3.7733555140851745E+06, 7.8376718099107409E+08, 1.4443117772349569E+10, 4.3197433307418671E+10, -7.6585042240585556E+10, -1.8569640140763062E+11, 2.0385335192657199E+11, 2.0385335192656519E+11, -1.8569640140762662E+11, -7.6585042240580856E+10, 4.3197433307418686E+10, 1.4443117772349669E+10, 7.8376718099107552E+08, 3.7733555140852560E+06}, - {2.8079157920112358E+06, 3.0340753492383724E+08, 2.9498136661747241E+09, -6.2820200387919831E+08, -2.2372008390623215E+10, 1.5217518660584890E+10, 4.0682590266891922E+10, -4.0682590266869431E+10, -1.5217518660582748E+10, 2.2372008390625935E+10, 6.2820200387968791E+08, -2.9498136661747637E+09, -3.0340753492383808E+08, -2.8079157920112377E+06}, - {1.5361613559533111E+06, 8.3513615594416574E+07, 3.0077547202708024E+08, -1.3749596754067802E+09, -6.6733027297557127E+08, 5.9590333632819109E+09, -4.3025685566870070E+09, -4.3025685566872711E+09, 5.9590333632806673E+09, -6.6733027297523963E+08, -1.3749596754067125E+09, 3.0077547202709383E+08, 8.3513615594416171E+07, 1.5361613559533576E+06}, - {6.2759409419592959E+05, 1.5741723594963098E+07, -1.5632610223406436E+07, -1.9294824907078514E+08, 4.4643806532434595E+08, 1.5178998385244830E+07, -9.6771139891725647E+08, 9.6771139892509627E+08, -1.5178998381042883E+07, -4.4643806533176166E+08, 1.9294824907065383E+08, 1.5632610223392555E+07, -1.5741723594963137E+07, -6.2759409419590747E+05}, - {1.9151404903933613E+05, 1.7156606891563335E+06, -9.7733523156688716E+06, 4.2982266233154163E+06, 5.1660907884347722E+07, -1.1279400211155911E+08, 6.4701089573962681E+07, 6.4701089571562663E+07, -1.1279400211012064E+08, 5.1660907891220264E+07, 4.2982266233826512E+06, -9.7733523157112263E+06, 1.7156606891560503E+06, 1.9151404903936724E+05}, - {4.2715272622845026E+04, -2.2565910611953568E+03, -1.1769776156959014E+06, 4.0078399907813077E+06, -3.8951858063335596E+06, -5.0944610754510267E+06, 1.6765992446914168E+07, -1.6765992426657490E+07, 5.0944610781778870E+06, 3.8951858062361716E+06, -4.0078399907326135E+06, 1.1769776157141617E+06, 2.2565910606306688E+03, -4.2715272622820135E+04}, - {6.4806786522793900E+03, -3.5474227032974472E+04, 1.8237100709385861E+04, 3.0934714629696816E+05, -1.0394703931686131E+06, 1.4743920333143482E+06, -7.3356882447856572E+05, -7.3356882916658197E+05, 1.4743920305501707E+06, -1.0394703929917105E+06, 3.0934714631908614E+05, 1.8237100665157792E+04, -3.5474227033406372E+04, 6.4806786523010323E+03}, - {4.9913632908459954E+02, -5.5416668524952684E+03, 2.0614058717617296E+04, -3.2285139072943130E+04, -5.3099550821623425E+03, 1.1559000502166932E+05, -2.2569743259261423E+05, 2.2569743616896842E+05, -1.1559000130545651E+05, 5.3099543129458480E+03, 3.2285139142872020E+04, -2.0614058670790018E+04, 5.5416668533342381E+03, -4.9913632906195977E+02}, - {-3.3076333188134086E+01, -1.8970588563697331E+02, 1.8160423493164808E+03, -6.3715703355644328E+03, 1.2525624574329036E+04, -1.4199806452802783E+04, 6.4441892296909591E+03, 6.4441909537524216E+03, -1.4199808176873401E+04, 1.2525626154733827E+04, -6.3715704433222418E+03, 1.8160422729911850E+03, -1.8970588700495102E+02, -3.3076333168231550E+01}, - {-1.4394533627743886E+01, 5.7000699089242815E+01, -1.0101142663923416E+02, -3.2954197414395189E+01, 6.1417879182394654E+02, -1.6177283846697430E+03, 2.4593386157454975E+03, -2.4593322941165261E+03, 1.6177291239900730E+03, -6.1417952013923764E+02, 3.2954100943010943E+01, 1.0101142710333265E+02, -5.7000699100179844E+01, 1.4394533639240331E+01}, - {-1.5925952284027161E+00, 8.5113930215357829E+00, -2.8993523187012922E+01, 6.6373454994590404E+01, -1.0329574518449559E+02, 1.0280184257681817E+02, -4.3896094875192006E+01, -4.3899302208087086E+01, 1.0280039795628096E+02, -1.0329511291885207E+02, 6.6373435700858948E+01, -2.8993536490606409E+01, 8.5113924808491728E+00, -1.5925952194145006E+00}, - {1.5984868520881029E-02, 1.2876175212962959E-01, -9.8358742969175483E-01, 3.7711523389360830E+00, -9.4305498095765508E+00, 1.6842854581416674E+01, -2.2308566502972713E+01, 2.2308940200151390E+01, -1.6841512668820517E+01, 9.4313524091989347E+00, -3.7710716543179599E+00, 9.8361025494556609E-01, -1.2876100566420701E-01, -1.5984859433053292E-02} - }}; - } else if constexpr(w==15) { - return std::array, nc> {{ - {2.3939707792241839E+05, 9.7700272582690191E+08, 1.4715933396485257E+11, 4.7242424833337158E+12, 5.3987426629953594E+13, 2.7580474290566078E+14, 7.0693378336533400E+14, 9.6196578554477775E+14, 7.0693378336533400E+14, 2.7580474290566125E+14, 5.3987426629953766E+13, 4.7242424833337246E+12, 1.4715933396485263E+11, 9.7700272582690215E+08, 2.3939707792242285E+05}, - {1.4314487885226035E+06, 2.9961416925358453E+09, 3.0273361232748438E+11, 6.8507333793903584E+12, 5.4192702756911000E+13, 1.7551587948105309E+14, 2.1874615668430150E+14, 3.4316191014053393E-02, -2.1874615668430150E+14, -1.7551587948105334E+14, -5.4192702756911180E+13, -6.8507333793903701E+12, -3.0273361232748438E+11, -2.9961416925358458E+09, -1.4314487885226049E+06}, - {3.8829497354762917E+06, 4.2473082696966448E+09, 2.8414312556015540E+11, 4.3688281331121411E+12, 2.1823119508000543E+13, 3.2228098609392094E+13, -2.1833085454691789E+13, -7.3750710225100812E+13, -2.1833085454691820E+13, 3.2228098609392055E+13, 2.1823119508000594E+13, 4.3688281331121479E+12, 2.8414312556015527E+11, 4.2473082696966434E+09, 3.8829497354762889E+06}, - {6.3495763451755755E+06, 3.6841035003733950E+09, 1.5965774278321045E+11, 1.5630338683778201E+12, 3.8749058615819268E+12, -2.7319740087723574E+12, -1.3233342822865402E+13, 6.1642230420317079E-02, 1.3233342822865449E+13, 2.7319740087723975E+12, -3.8749058615819365E+12, -1.5630338683778203E+12, -1.5965774278321042E+11, -3.6841035003733935E+09, -6.3495763451755764E+06}, - {7.0146619045520434E+06, 2.1782897863065763E+09, 5.8897780310148087E+10, 3.1953009601770325E+11, 4.0651527029737198E+08, -1.6379148273276064E+12, -1.1568753137013029E+11, 2.7451653250460508E+12, -1.1568753137012485E+11, -1.6379148273277261E+12, 4.0651527029819238E+08, 3.1953009601770361E+11, 5.8897780310148087E+10, 2.1782897863065763E+09, 7.0146619045520443E+06}, - {5.5580012413990172E+06, 9.2345162185944164E+08, 1.4522950934020109E+10, 2.7025952371212009E+10, -1.2304576967641914E+11, -1.0116752717202786E+11, 3.8517418245458325E+11, 1.0918347404432817E-01, -3.8517418245444312E+11, 1.0116752717221135E+11, 1.2304576967643665E+11, -2.7025952371214943E+10, -1.4522950934020079E+10, -9.2345162185944211E+08, -5.5580012413990181E+06}, - {3.2693972344231778E+06, 2.8610260147425205E+08, 2.2348528403750563E+09, -3.4574515574242272E+09, -1.7480626463583939E+10, 3.1608597465540653E+10, 1.9879262560072273E+10, -6.6148013553772224E+10, 1.9879262560085339E+10, 3.1608597465515747E+10, -1.7480626463576942E+10, -3.4574515574198236E+09, 2.2348528403750110E+09, 2.8610260147425193E+08, 3.2693972344231787E+06}, - {1.4553539959296256E+06, 6.4136842048384041E+07, 1.3622336582062906E+08, -1.2131510424644001E+09, 6.4322366984221375E+08, 4.5078753872047586E+09, -7.1689413746930647E+09, 3.2906916833662987E-02, 7.1689413746724453E+09, -4.5078753875009747E+09, -6.4322366985365331E+08, 1.2131510424608817E+09, -1.3622336582067037E+08, -6.4136842048384242E+07, -1.4553539959296256E+06}, - {4.9358776531681651E+05, 9.7772970960585065E+06, -2.3511574237987626E+07, -1.0142613816641946E+08, 3.9421144218035364E+08, -2.8449115593052310E+08, -5.7549243243741119E+08, 1.1608781631182449E+09, -5.7549243240763104E+08, -2.8449115600447333E+08, 3.9421144214381480E+08, -1.0142613816429654E+08, -2.3511574237995699E+07, 9.7772970960588697E+06, 4.9358776531681546E+05}, - {1.2660319987326677E+05, 7.7519511328119377E+05, -6.5244610661450895E+06, 9.0878257488052379E+06, 2.3116605621149920E+07, -8.7079594462079599E+07, 9.5542733739275128E+07, 6.0548970733798724E-02, -9.5542733661364838E+07, 8.7079594608550951E+07, -2.3116605559600785E+07, -9.0878257522138134E+06, 6.5244610661298726E+06, -7.7519511328133650E+05, -1.2660319987326639E+05}, - {2.3793325531458529E+04, -4.2305332803808597E+04, -5.2884156985535356E+05, 2.5307340127864038E+06, -4.0404175271559842E+06, -1.7519992360184138E+05, 1.0146438805818636E+07, -1.5828545480742473E+07, 1.0146438778928882E+07, -1.7520004389869148E+05, -4.0404175770437294E+06, 2.5307340149977510E+06, -5.2884156989405944E+05, -4.2305332803937294E+04, 2.3793325531459184E+04}, - {2.9741655196834722E+03, -2.0687056403786246E+04, 3.3295507799709936E+04, 1.0661145730323243E+05, -5.6644238105382060E+05, 1.0874811616841732E+06, -9.6561270266008016E+05, 1.5626594062671070E-02, 9.6561272951271443E+05, -1.0874812528712249E+06, 5.6644243308078672E+05, -1.0661145838213131E+05, -3.3295507812197495E+04, 2.0687056403630129E+04, -2.9741655196846405E+03}, - {1.5389176594899303E+02, -2.3864418511494741E+03, 1.0846266954249364E+04, -2.2940053396478714E+04, 1.4780106121058996E+04, 4.2663651769852157E+04, -1.3047648013242516E+05, 1.7468401314164279E+05, -1.3047645484607235E+05, 4.2663541429144650E+04, 1.4780036296018619E+04, -2.2940053180976502E+04, 1.0846266927315819E+04, -2.3864418517113058E+03, 1.5389176594779781E+02}, - {-2.3857631312588978E+01, -1.9651606133609231E+01, 6.4183083829803820E+02, -2.8648433109641578E+03, 6.8249243722518859E+03, -9.7944325124827701E+03, 7.6177757600121276E+03, 1.8034307737205296E-02, -7.6177559127722052E+03, 9.7944326623113047E+03, -6.8249058342322496E+03, 2.8648407117981119E+03, -6.4183085438795774E+02, 1.9651605969778377E+01, 2.3857631312809222E+01}, - {-6.1348505739169541E+00, 2.7872915855267404E+01, -6.5819942538871970E+01, 5.1366231962952028E+01, 1.7213955398158618E+02, -6.9658621010000411E+02, 1.3192236112353403E+03, -1.6054106225233884E+03, 1.3192031991952242E+03, -6.9663961216547739E+02, 1.7211403815802629E+02, 5.1367579954366171E+01, -6.5819957939661379E+01, 2.7872915947616441E+01, -6.1348505735855374E+00}, - {-4.9671584513490097E-01, 3.0617550953446115E+00, -1.1650665638578070E+01, 3.0081586723089057E+01, -5.4028356726202020E+01, 6.6077203078498044E+01, -4.7145500171928198E+01, 4.2118837140985958E-03, 4.7167106663349848E+01, -6.6048394423269173E+01, 5.4062906728994193E+01, -3.0081603709324451E+01, 1.1650672008416343E+01, -3.0617551285208524E+00, 4.9671584437353217E-01}, - {4.3460786767313729E-03, -1.3199600771767199E-02, -1.9412688562910244E-01, 1.1329433700669471E+00, -3.4442045795063887E+00, 7.1737626956468912E+00, -1.1098109271625262E+01, 1.2385772358881393E+01, -1.1101471316239516E+01, 7.0913926025978853E+00, -3.4845491148773502E+00, 1.1323523856621058E+00, -1.9414904754428672E-01, -1.3200165079792004E-02, 4.3460782759443158E-03} - }}; - } else if constexpr(w==16) { - return std::array, nc> {{ - {3.6434551345570839E+05, 2.0744705928579483E+09, 4.0355760945669995E+11, 1.6364575388763029E+13, 2.3514830376056538E+14, 1.5192201717462528E+15, 4.9956173084674090E+15, 8.9287666945127360E+15, 8.9287666945127390E+15, 4.9956173084674090E+15, 1.5192201717462528E+15, 2.3514830376056538E+14, 1.6364575388763035E+13, 4.0355760945670026E+11, 2.0744705928579524E+09, 3.6434551345571183E+05}, - {2.2576246485480359E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, 2.5606844387131066E+13, 2.6313738449330153E+14, 1.1495095100701460E+15, 2.1932582707747560E+15, 1.2860244365132595E+15, -1.2860244365132600E+15, -2.1932582707747578E+15, -1.1495095100701465E+15, -2.6313738449330159E+14, -2.5606844387131062E+13, -8.7873753526056299E+11, -6.6499571180086451E+09, -2.2576246485480373E+06}, - {6.3730995546265077E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, 1.7953384130753688E+13, 1.2398425545001662E+14, 3.0749346493041262E+14, 1.0259777520247159E+14, -5.5291976457534325E+14, -5.5291976457534325E+14, 1.0259777520247186E+14, 3.0749346493041219E+14, 1.2398425545001659E+14, 1.7953384130753676E+13, 8.8097248605448950E+11, 9.9060026035198040E+09, 6.3730995546265030E+06}, - {1.0896915393078227E+07, 9.0890343524593849E+09, 5.3565169504010010E+11, 7.3004206720038701E+12, 2.9692333044160066E+13, 1.6051737468109549E+13, -9.1273329108089906E+13, -8.5999306918502953E+13, 8.5999306918502422E+13, 9.1273329108089984E+13, -1.6051737468109510E+13, -2.9692333044160082E+13, -7.3004206720038701E+12, -5.3565169504010022E+11, -9.0890343524593849E+09, -1.0896915393078227E+07}, - {1.2655725616100594E+07, 5.7342804054544210E+09, 2.1822836608899570E+11, 1.8300700858999690E+12, 2.7770431049857676E+12, -8.5034969223852568E+12, -1.2846668467423438E+13, 1.6519076896571838E+13, 1.6519076896572182E+13, -1.2846668467423555E+13, -8.5034969223850703E+12, 2.7770431049857896E+12, 1.8300700858999678E+12, 2.1822836608899567E+11, 5.7342804054544210E+09, 1.2655725616100591E+07}, - {1.0609303958036326E+07, 2.6255609052371716E+09, 6.1673589426039413E+10, 2.6044432099085333E+11, -3.5431628074578204E+11, -1.6077602129636348E+12, 1.5534405614728977E+12, 2.8019935380857432E+12, -2.8019935380841978E+12, -1.5534405614724106E+12, 1.6077602129635625E+12, 3.5431628074580896E+11, -2.6044432099084848E+11, -6.1673589426039429E+10, -2.6255609052371716E+09, -1.0609303958036322E+07}, - {6.6544809363384582E+06, 8.9490403680928326E+08, 1.1882638725190845E+10, 8.1552898137823076E+09, -1.2575562817886868E+11, 2.7074695075907585E+10, 3.9453789461955023E+11, -3.1679644857468066E+11, -3.1679644857392346E+11, 3.9453789461966650E+11, 2.7074695075992649E+10, -1.2575562817884555E+11, 8.1552898137788668E+09, 1.1882638725190889E+10, 8.9490403680928278E+08, 6.6544809363384554E+06}, - {3.1906872142825006E+06, 2.2785946180651775E+08, 1.3744578972809248E+09, -4.3997172592883167E+09, -9.2011130754043922E+09, 3.4690551711832901E+10, -9.4227043395047741E+09, -5.9308465070198639E+10, 5.9308465069336540E+10, 9.4227043396350136E+09, -3.4690551711738396E+10, 9.2011130753567543E+09, 4.3997172592879610E+09, -1.3744578972813025E+09, -2.2785946180651844E+08, -3.1906872142825015E+06}, - {1.1821527096621769E+06, 4.2281234059839502E+07, 2.8723226058712766E+07, -8.3553955857628822E+08, 1.2447304828823066E+09, 2.1955280943585949E+09, -7.0514195726908512E+09, 4.3745141239718714E+09, 4.3745141233600502E+09, -7.0514195728029747E+09, 2.1955280943510208E+09, 1.2447304828590808E+09, -8.3553955857879233E+08, 2.8723226058761366E+07, 4.2281234059838109E+07, 1.1821527096621762E+06}, - {3.3854610744280310E+05, 5.2176984975081543E+06, -2.0677283565079328E+07, -3.5831818968518838E+07, 2.6599346106412742E+08, -3.7992777977357000E+08, -1.3426914417466179E+08, 9.1752051229224503E+08, -9.1752051129499328E+08, 1.3426914497246322E+08, 3.7992777991069216E+08, -2.6599346104854536E+08, 3.5831818968908392E+07, 2.0677283564896725E+07, -5.2176984975075833E+06, -3.3854610744279937E+05}, - {7.3893334077310064E+04, 2.6983804209559254E+05, -3.6415998561101072E+06, 8.4025485849181097E+06, 4.9278860779345948E+06, -5.1437033846752726E+07, 8.7603898676325440E+07, -4.6199498412402093E+07, -4.6199498208604209E+07, 8.7603898435731798E+07, -5.1437033863736227E+07, 4.9278861005789889E+06, 8.4025485831489991E+06, -3.6415998560990733E+06, 2.6983804209473461E+05, 7.3893334077307401E+04}, - {1.1778892113375481E+04, -4.0077190108724200E+04, -1.8372552175909068E+05, 1.3262878399160223E+06, -2.9738539927520575E+06, 1.9493509709529271E+06, 4.1881949951139782E+06, -1.1066749616505133E+07, 1.1066749327519676E+07, -4.1881946843906553E+06, -1.9493507810665092E+06, 2.9738539818831389E+06, -1.3262878384774840E+06, 1.8372552162922107E+05, 4.0077190107319519E+04, -1.1778892113376129E+04}, - {1.2019749667923656E+03, -1.0378455844500613E+04, 2.6333352653155256E+04, 1.7117060106301305E+04, -2.5133287443653666E+05, 6.4713914262131555E+05, -8.1634942572553246E+05, 3.8623935281825601E+05, 3.8623876433339820E+05, -8.1634960962672008E+05, 6.4713900469564367E+05, -2.5133289627502396E+05, 1.7117057951236206E+04, 2.6333352581335013E+04, -1.0378455846609291E+04, 1.2019749667911419E+03}, - {3.1189837632471693E+01, -8.9083493807061564E+02, 4.9454293649337906E+03, -1.3124693635095375E+04, 1.5834784331991095E+04, 6.9607870364081436E+03, -5.9789871879430451E+04, 1.0841726514394575E+05, -1.0841709685990328E+05, 5.9790206615067997E+04, -6.9607049368128291E+03, -1.5834783935893831E+04, 1.3124692974990443E+04, -4.9454295091588992E+03, 8.9083493794871868E+02, -3.1189837631106176E+01}, - {-1.2975319073401824E+01, 1.8283698218710011E+01, 1.7684015393859755E+02, -1.1059917445033070E+03, 3.1998168298121523E+03, -5.5988200120063057E+03, 5.9248751921324047E+03, -2.5990022806343668E+03, -2.5990962125709430E+03, 5.9247537039895724E+03, -5.5988835070734467E+03, 3.1998292349030621E+03, -1.1059926481090836E+03, 1.7684013881079576E+02, 1.8283698123134819E+01, -1.2975319073977776E+01}, - {-2.3155118729954247E+00, 1.1938503634469159E+01, -3.4150562973753665E+01, 4.8898615554511437E+01, 1.5853185548633874E+01, -2.4272678107130790E+02, 6.0151276286907887E+02, -8.8751856926690448E+02, 8.8742942550355474E+02, -6.0136491467620624E+02, 2.4282489356694586E+02, -1.5850195971204462E+01, -4.8897392545563044E+01, 3.4150562973753665E+01, -1.1938504430698943E+01, 2.3155118723150525E+00}, - {-1.5401723686076832E-01, 9.8067823888634464E-01, -4.1900843552415639E+00, 1.2150534299778382E+01, -2.4763139606227178E+01, 3.6068014621628578E+01, -3.4346647779134791E+01, 1.3259903958585387E+01, 1.2937147675617604E+01, -3.4454233206790519E+01, 3.6027670086257579E+01, -2.4769863695455662E+01, 1.2149431128889342E+01, -4.1901615115388706E+00, 9.8067695636810759E-01, -1.5401723756214594E-01}, - {1.1808835093099178E-02, -2.5444299558662394E-02, -1.5661344238792723E-04, 2.5820071204205225E-01, -1.0930950485268096E+00, 2.6408492552008669E+00, -4.4415763059111955E+00, 6.8227366238712817E+00, -6.8186662643534008E+00, 4.4887924763186051E+00, -2.6327085361651021E+00, 1.0918739406714428E+00, -2.5844238963842503E-01, 1.2680123888735934E-04, 2.5444206395526567E-02, -1.1808834826225629E-02} - }}; - } else { - static_assert(w >= 2, "w must be >= 2"); - static_assert(w <= 16, "w must be <= 16"); - return {}; - } + if constexpr (w == 2) { + return std::array, nc>{ + {{4.5147043243215315E+01, 4.5147043243215300E+01}, + {5.7408070938221300E+01, -5.7408070938221293E+01}, + {-1.8395117920046484E+00, -1.8395117920046560E+00}, + {-2.0382426253182082E+01, 2.0382426253182086E+01}, + {-2.0940804433577420E+00, -2.0940804433577389E+00}}}; + } else if constexpr (w == 3) { + return std::array, nc>{ + {{1.5653991189315119E+02, 8.8006872410780295E+02, 1.5653991189967152E+02}, + {3.1653018869611077E+02, 7.4325702843759617E-14, -3.1653018868907071E+02}, + {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117119E+02}, + {-1.5357716116473156E+01, 9.5071486252033243E-15, 1.5357716122720193E+01}, + {-3.7757583061523668E+01, 5.3222970968867315E+01, -3.7757583054647384E+01}, + {-3.9654011076088804E+00, 1.8062124448285358E-13, 3.9654011139270540E+00}}}; + } else if constexpr (w == 4) { + return std::array, nc>{ + {{5.4284366850213200E+02, 1.0073871433088398E+04, 1.0073871433088396E+04, + 5.4284366850213223E+02}, + {1.4650917259256939E+03, 6.1905285583602863E+03, -6.1905285583602881E+03, + -1.4650917259256937E+03}, + {1.4186910680718345E+03, -1.3995339862725591E+03, -1.3995339862725598E+03, + 1.4186910680718347E+03}, + {5.1133995502497419E+02, -1.4191608683682996E+03, 1.4191608683682998E+03, + -5.1133995502497424E+02}, + {-4.8293622641174039E+01, 3.9393732546135226E+01, 3.9393732546135816E+01, + -4.8293622641174061E+01}, + {-7.8386867802392288E+01, 1.4918904800408930E+02, -1.4918904800408751E+02, + 7.8386867802392359E+01}, + {-1.0039212571700894E+01, 5.0626747735616746E+00, 5.0626747735625512E+00, + -1.0039212571700640E+01}}}; + } else if constexpr (w == 5) { + return std::array, nc>{ + {{9.9223677575398392E+02, 3.7794697666613320E+04, 9.8715771010760494E+04, + 3.7794697666613283E+04, 9.9223677575398403E+02}, + {3.0430174925083825E+03, 3.7938404259811403E+04, -1.1842989705877139E-11, + -3.7938404259811381E+04, -3.0430174925083829E+03}, + {3.6092689177271222E+03, 7.7501368899498666E+03, -2.2704627332475000E+04, + 7.7501368899498730E+03, 3.6092689177271218E+03}, + {1.9990077310495396E+03, -3.8875294641277296E+03, 9.7116927320010791E-12, + 3.8875294641277369E+03, -1.9990077310495412E+03}, + {4.0071733590403869E+02, -1.5861137916762602E+03, 2.3839858699098645E+03, + -1.5861137916762643E+03, 4.0071733590403909E+02}, + {-9.1301168206167262E+01, 1.2316471075214675E+02, 2.0698495299948402E-11, + -1.2316471075214508E+02, 9.1301168206167233E+01}, + {-5.5339722671223846E+01, 1.1960590540261879E+02, -1.5249941358311668E+02, + 1.1960590540262307E+02, -5.5339722671223605E+01}, + {-3.3762488150353924E+00, 2.2839981872948751E+00, 7.1884725699454154E-12, + -2.2839981872943818E+00, 3.3762488150341459E+00}}}; + } else if constexpr (w == 6) { + return std::array, nc>{ + {{2.0553833234911876E+03, 1.5499537739913128E+05, 8.1177907023291115E+05, + 8.1177907023291173E+05, 1.5499537739913136E+05, 2.0553833235005691E+03}, + {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917674E+05, + -3.1559612614917627E+05, -2.0581923258843317E+05, -7.1269776034341394E+03}, + {1.0023404568475091E+04, 9.0916650498360192E+04, -1.0095927514054619E+05, + -1.0095927514054628E+05, 9.0916650498360177E+04, 1.0023404568484635E+04}, + {7.2536109410387417E+03, 4.8347162752602981E+03, -5.0512736602018522E+04, + 5.0512736602018478E+04, -4.8347162752603008E+03, -7.2536109410297540E+03}, + {2.7021878300949752E+03, -7.8773465553972646E+03, 5.2105876478342780E+03, + 5.2105876478343343E+03, -7.8773465553972710E+03, 2.7021878301048723E+03}, + {3.2120291706547636E+02, -1.8229189469936762E+03, 3.7928113414429808E+03, + -3.7928113414427025E+03, 1.8229189469937312E+03, -3.2120291705638243E+02}, + {-1.2051267090537374E+02, 2.2400507411399673E+02, -1.2506575852541796E+02, + -1.2506575852521925E+02, 2.2400507411398695E+02, -1.2051267089640181E+02}, + {-4.5977202613350237E+01, 1.1536880606853076E+02, -1.7819720186493959E+02, + 1.7819720186497622E+02, -1.1536880606854736E+02, 4.5977202622148909E+01}, + {-1.5631081288842275E+00, 7.1037430591266115E-01, -6.9838401121429056E-02, + -6.9838401186476856E-02, 7.1037430589285400E-01, -1.5631081203754575E+00}}}; + } else if constexpr (w == 7) { + return std::array, nc>{ + {{3.9948351830487481E+03, 5.4715865608590771E+05, 5.0196413492771760E+06, + 9.8206709220713247E+06, 5.0196413492771825E+06, 5.4715865608590783E+05, + 3.9948351830642519E+03}, + {1.5290160332974696E+04, 8.7628248584320408E+05, 3.4421061790934438E+06, + -2.6908159596373561E-10, -3.4421061790934461E+06, -8.7628248584320408E+05, + -1.5290160332958067E+04}, + {2.4458227486779251E+04, 5.3904618484139396E+05, 2.4315566181017534E+05, + -1.6133959371974322E+06, 2.4315566181017453E+05, 5.3904618484139396E+05, + 2.4458227486795113E+04}, + {2.1166189345881645E+04, 1.3382732160223130E+05, -3.3113450969689694E+05, + 6.9013724510092140E-10, 3.3113450969689724E+05, -1.3382732160223136E+05, + -2.1166189345866893E+04}, + {1.0542795672344864E+04, -7.0739172265098678E+03, -6.5563293056049893E+04, + 1.2429734005960064E+05, -6.5563293056049602E+04, -7.0739172265098332E+03, + 1.0542795672361213E+04}, + {2.7903491906228419E+03, -1.0975382873973093E+04, 1.3656979541144799E+04, + 7.7346408577822045E-10, -1.3656979541143772E+04, 1.0975382873973256E+04, + -2.7903491906078298E+03}, + {1.6069721418053300E+02, -1.5518707872251393E+03, 4.3634273936642621E+03, + -5.9891976420595174E+03, 4.3634273936642730E+03, -1.5518707872251064E+03, + 1.6069721419533221E+02}, + {-1.2289277373867256E+02, 2.8583630927743314E+02, -2.8318194617327981E+02, + 6.9043515551118249E-10, 2.8318194617392436E+02, -2.8583630927760140E+02, + 1.2289277375319763E+02}, + {-3.2270164914249058E+01, 9.1892112257581346E+01, -1.6710678096334209E+02, + 2.0317049305432383E+02, -1.6710678096383771E+02, 9.1892112257416159E+01, + -3.2270164900224913E+01}, + {-1.4761409685186277E-01, -9.1862771280377487E-01, 1.2845147741777752E+00, + 5.6547359492808854E-10, -1.2845147728310689E+00, 9.1862771293147971E-01, + 1.4761410890866353E-01}}}; + } else if constexpr (w == 8) { + return std::array, nc>{ + {{7.3898000697447915E+03, 1.7297637497600035E+06, 2.5578341605285794E+07, + 8.4789650417103335E+07, 8.4789650417103350E+07, 2.5578341605285816E+07, + 1.7297637497600049E+06, 7.3898000697447915E+03}, + {3.0719636811267599E+04, 3.1853145713323927E+06, 2.3797981861403696E+07, + 2.4569731244678464E+07, -2.4569731244678471E+07, -2.3797981861403704E+07, + -3.1853145713323941E+06, -3.0719636811267606E+04}, + {5.4488498478251728E+04, 2.4101183255475131E+06, 6.4554051283428287E+06, + -8.9200440393090546E+06, -8.9200440393090583E+06, 6.4554051283428324E+06, + 2.4101183255475126E+06, 5.4488498478251728E+04}, + {5.3926359802542116E+04, 9.0469037926849292E+05, -6.0897036277696118E+05, + -3.0743852105799988E+06, 3.0743852105800058E+06, 6.0897036277696711E+05, + -9.0469037926849339E+05, -5.3926359802542138E+04}, + {3.2444118016247590E+04, 1.3079802224392134E+05, -5.8652889370129269E+05, + 4.2333306008151924E+05, 4.2333306008152053E+05, -5.8652889370128722E+05, + 1.3079802224392109E+05, 3.2444118016247590E+04}, + {1.1864306345505294E+04, -2.2700360645707988E+04, -5.0713607251414309E+04, + 1.8308704458211688E+05, -1.8308704458210632E+05, 5.0713607251413123E+04, + 2.2700360645707628E+04, -1.1864306345505294E+04}, + {2.2812256770903232E+03, -1.1569135767377773E+04, 2.0942387020798891E+04, + -1.1661592834945191E+04, -1.1661592834940149E+04, 2.0942387020801420E+04, + -1.1569135767377924E+04, 2.2812256770903286E+03}, + {8.5503535636821422E+00, -9.7513976461238224E+02, 3.8242995179171526E+03, + -6.9201295567267280E+03, 6.9201295567248662E+03, -3.8242995179155446E+03, + 9.7513976461209836E+02, -8.5503535637013552E+00}, + {-1.0230637348345023E+02, 2.8246898554269114E+02, -3.8638201738139219E+02, + 1.9106407993320320E+02, 1.9106407993289886E+02, -3.8638201738492717E+02, + 2.8246898554219217E+02, -1.0230637348345138E+02}, + {-1.9200143062947848E+01, 6.1692257626706223E+01, -1.2981109187842989E+02, + 1.8681284210471688E+02, -1.8681284209654376E+02, 1.2981109187880142E+02, + -6.1692257626845532E+01, 1.9200143062947120E+01}, + {3.7894993760177598E-01, -1.7334408836731494E+00, 2.5271184057877303E+00, + -1.2600963971824484E+00, -1.2600963917834651E+00, 2.5271184069685657E+00, + -1.7334408840526812E+00, 3.7894993760636758E-01}}}; + } else if constexpr (w == 9) { + return std::array, nc>{ + {{1.3136365370186100E+04, 5.0196413492771806E+06, 1.1303327711722563E+08, + 5.8225443924996686E+08, 9.7700272582690656E+08, 5.8225443924996758E+08, + 1.1303327711722568E+08, 5.0196413492772207E+06, 1.3136365370186135E+04}, + {5.8623313038274340E+04, 1.0326318537280345E+07, 1.2898448324824864E+08, + 3.0522863709830385E+08, -3.9398045056223735E-08, -3.0522863709830391E+08, + -1.2898448324824864E+08, -1.0326318537280388E+07, -5.8623313038274347E+04}, + {1.1335001341875963E+05, 9.0726133144784812E+06, 5.3501544534038112E+07, + -2.6789524644146336E+05, -1.2483923718899371E+08, -2.6789524644172983E+05, + 5.3501544534038112E+07, 9.0726133144785129E+06, 1.1335001341875960E+05}, + {1.2489113703229747E+05, 4.3035547171861930E+06, 6.3021978510598792E+06, + -2.6014941986659057E+07, 6.0417403157325170E-08, 2.6014941986659389E+07, + -6.3021978510598652E+06, -4.3035547171862079E+06, -1.2489113703229751E+05}, + {8.6425493435991244E+04, 1.0891182836653308E+06, -2.0713033564200639E+06, + -2.8994941183506218E+06, 7.5905338661205899E+06, -2.8994941183505375E+06, + -2.0713033564200667E+06, 1.0891182836653353E+06, 8.6425493435991288E+04}, + {3.8657354724013814E+04, 7.9936390113331305E+04, -7.0458265546791907E+05, + 1.0151095605715880E+06, 1.2138090419648379E-07, -1.0151095605717725E+06, + 7.0458265546794771E+05, -7.9936390113331567E+04, -3.8657354724013821E+04}, + {1.0779131453134638E+04, -3.3466718311300596E+04, -1.3245366619006139E+04, + 1.8238470515353698E+05, -2.9285656292977190E+05, 1.8238470515350526E+05, + -1.3245366619000662E+04, -3.3466718311299621E+04, 1.0779131453134616E+04}, + {1.4992527030548456E+03, -9.7024371533891372E+03, 2.3216330734057381E+04, + -2.3465262819040818E+04, 5.3299736484284360E-08, 2.3465262819251962E+04, + -2.3216330734049119E+04, 9.7024371533890644E+03, -1.4992527030548747E+03}, + {-7.9857427421129714E+01, -4.0585588534807385E+02, 2.6054813773472697E+03, + -6.1806593581075495E+03, 8.0679596874001718E+03, -6.1806593581869265E+03, + 2.6054813773147021E+03, -4.0585588535363172E+02, -7.9857427421126204E+01}, + {-7.1572272057937070E+01, 2.2785637019511205E+02, -3.9109820765665262E+02, + 3.3597424711470910E+02, 1.0596763818009852E-07, -3.3597424723359080E+02, + 3.9109820766854079E+02, -2.2785637019009673E+02, 7.1572272057939983E+01}, + {-9.8886360698074700E+00, 3.5359026949867051E+01, -8.5251867715709949E+01, + 1.4285748012617628E+02, -1.6935269668779691E+02, 1.4285748010331625E+02, + -8.5251867711661305E+01, 3.5359026944299828E+01, -9.8886360698207305E+00}}}; + } else if constexpr (w == 10) { + return std::array, nc>{{{ + 2.2594586605749264E+04, + 1.3595989066786593E+07, + 4.4723032442444897E+08, + 3.3781755837397518E+09, + 8.6836783895849819E+09, + 8.6836783895849762E+09, + 3.3781755837397494E+09, + 4.4723032442444897E+08, + 1.3595989066786474E+07, + 2.2594586605749344E+04, + }, + { + 1.0729981697645642E+05, + 3.0651490267742988E+07, + 5.9387966085130465E+08, + 2.4434902657508330E+09, + 2.0073077861288922E+09, + -2.0073077861288943E+09, + -2.4434902657508330E+09, + -5.9387966085130453E+08, + -3.0651490267742816E+07, + -1.0729981697645638E+05, + }, + { + 2.2340399734184606E+05, + 3.0258214643190462E+07, + 3.1512411458738232E+08, + 4.3618276932319808E+08, + -7.8178848450497293E+08, + -7.8178848450497019E+08, + 4.3618276932319826E+08, + 3.1512411458738232E+08, + 3.0258214643190313E+07, + 2.2340399734184548E+05, + }, + { + 2.6917433004353486E+05, + 1.6875651476661228E+07, + 7.4664745481963441E+07, + -9.5882157211118385E+07, + -2.0622994435532519E+08, + 2.0622994435532743E+08, + 9.5882157211118177E+07, + -7.4664745481963515E+07, + -1.6875651476661161E+07, + -2.6917433004353428E+05, + }, + { + 2.0818422772177903E+05, + 5.6084730690362519E+06, + 1.4435118192351763E+06, + -4.0063869969544649E+07, + 3.2803674392747045E+07, + 3.2803674392746095E+07, + -4.0063869969546899E+07, + 1.4435118192351642E+06, + 5.6084730690362034E+06, + 2.0818422772177853E+05, + }, + { + 1.0781139496011091E+05, + 9.9202615851199068E+05, + -3.3266265543962116E+06, + -4.8557049011479173E+05, + 1.0176155522772279E+07, + -1.0176155522772269E+07, + 4.8557049011678610E+05, + 3.3266265543963453E+06, + -9.9202615851196018E+05, + -1.0781139496011072E+05, + }, + { + 3.7380102688153558E+04, + 1.2716675000355666E+04, + -6.2163527451774501E+05, + 1.4157962667184104E+06, + -8.4419693137680157E+05, + -8.4419693137743860E+05, + 1.4157962667189445E+06, + -6.2163527451771160E+05, + 1.2716675000340010E+04, + 3.7380102688153442E+04, + }, + { + 8.1238936393894646E+03, + -3.4872365530450072E+04, + 2.3913680325196314E+04, + 1.2428850301830019E+05, + -3.2158255329716846E+05, + 3.2158255329951923E+05, + -1.2428850301867779E+05, + -2.3913680325277423E+04, + 3.4872365530457188E+04, + -8.1238936393894255E+03, + }, + { + 7.8515926628982663E+02, + -6.6607899119372642E+03, + 2.0167398338513311E+04, + -2.8951401344519112E+04, + 1.4622828142848679E+04, + 1.4622828143544031E+04, + -2.8951401346900999E+04, + 2.0167398338398041E+04, + -6.6607899119505255E+03, + 7.8515926628967964E+02, + }, + { + -1.0147176570537010E+02, + -3.5304284185385157E+01, + 1.3576976854876134E+03, + -4.3921059353471856E+03, + 7.3232085271125388E+03, + -7.3232085273978546E+03, + 4.3921059367737662E+03, + -1.3576976854043962E+03, + 3.5304284185385157E+01, + 1.0147176570550941E+02, + }, + { + -4.3161545259389186E+01, + 1.5498490981579428E+02, + -3.1771250774232175E+02, + 3.7215448796427023E+02, + -1.7181762832770994E+02, + -1.7181763036843782E+02, + 3.7215448789408123E+02, + -3.1771250773692140E+02, + 1.5498490982186786E+02, + -4.3161545259547800E+01, + }, + { + -4.2916172038214198E+00, + 1.7402146071148604E+01, + -4.7947588069135868E+01, + 9.2697698088029625E+01, + -1.2821427596894478E+02, + 1.2821427705670308E+02, + -9.2697698297776569E+01, + 4.7947588093524907E+01, + -1.7402146074502035E+01, + 4.2916172038452141E+00, + }}}; + } else if constexpr (w == 11) { + return std::array, nc>{{{ + 3.7794653219809625E+04, + 3.4782300224660739E+07, + 1.6188020733727551E+09, + 1.7196758809615005E+10, + 6.3754384857724617E+10, + 9.7196447559193497E+10, + 6.3754384857724617E+10, + 1.7196758809614998E+10, + 1.6188020733727560E+09, + 3.4782300224660769E+07, + 3.7794653219808984E+04, + }, + { + 1.8969206922085886E+05, + 8.4769319065313652E+07, + 2.4230555767723408E+09, + 1.5439732722639101E+10, + 2.7112836839612309E+10, + 2.5609833368650835E-06, + -2.7112836839612328E+10, + -1.5439732722639105E+10, + -2.4230555767723408E+09, + -8.4769319065313682E+07, + -1.8969206922085711E+05, + }, + { + 4.2138380313901440E+05, + 9.2050522922791913E+07, + 1.5259983101266613E+09, + 4.7070559561237173E+09, + -1.2448027572952359E+09, + -1.0161446790279301E+10, + -1.2448027572952316E+09, + 4.7070559561237268E+09, + 1.5259983101266615E+09, + 9.2050522922791913E+07, + 4.2138380313901149E+05, + }, + { + 5.4814313598122005E+05, + 5.8085130777589552E+07, + 4.9484006166551048E+08, + 1.6222124676640952E+08, + -2.0440440381345339E+09, + 9.1416457449079640E-06, + 2.0440440381345336E+09, + -1.6222124676640788E+08, + -4.9484006166551071E+08, + -5.8085130777589560E+07, + -5.4814313598121714E+05, + }, + { + 4.6495183529254980E+05, + 2.3067199578027144E+07, + 6.9832590192482382E+07, + -2.2024799260683522E+08, + -1.2820270942588677E+08, + 5.1017181199129778E+08, + -1.2820270942588474E+08, + -2.2024799260683942E+08, + 6.9832590192482322E+07, + 2.3067199578027155E+07, + 4.6495183529254742E+05, + }, + { + 2.7021781043532980E+05, + 5.6764510325100143E+06, + -5.5650761736748898E+06, + -3.9907385617900200E+07, + 7.2453390663687646E+07, + 1.2300109686762266E-05, + -7.2453390663684472E+07, + 3.9907385617899075E+07, + 5.5650761736749066E+06, + -5.6764510325099993E+06, + -2.7021781043532846E+05, + }, + { + 1.0933249308680627E+05, + 6.9586821127987828E+05, + -3.6860240321937902E+06, + 2.7428169457736355E+06, + 8.3392008440593518E+06, + -1.6402201025046850E+07, + 8.3392008440698013E+06, + 2.7428169457778852E+06, + -3.6860240321937371E+06, + 6.9586821127989423E+05, + 1.0933249308680571E+05, + }, + { + 3.0203516161820498E+04, + -3.6879059542768438E+04, + -4.1141031216788280E+05, + 1.4111389975267777E+06, + -1.5914376635331670E+06, + 9.4095582602103753E-06, + 1.5914376635379130E+06, + -1.4111389975247320E+06, + 4.1141031216776522E+05, + 3.6879059542750314E+04, + -3.0203516161820549E+04, + }, + { + 5.1670143574922731E+03, + -2.8613147115372190E+04, + 4.3560195427081359E+04, + 4.8438679582765450E+04, + -2.5856630639231802E+05, + 3.7994883866738499E+05, + -2.5856630640319458E+05, + 4.8438679579510936E+04, + 4.3560195426766244E+04, + -2.8613147115376054E+04, + 5.1670143574922913E+03, + }, + { + 3.0888018539740131E+02, + -3.7949446187471626E+03, + 1.4313303204988082E+04, + -2.6681600235594462E+04, + 2.3856005166166615E+04, + 8.6424601730164351E-06, + -2.3856005155895236E+04, + 2.6681600234453199E+04, + -1.4313303205083188E+04, + 3.7949446187583080E+03, + -3.0888018539728523E+02, + }, + { + -8.3747489794189363E+01, + 1.1948077479405792E+02, + 4.8528498015072080E+02, + -2.5024391114755094E+03, + 5.3511195318669425E+03, + -6.7655484107390166E+03, + 5.3511195362291774E+03, + -2.5024391131167667E+03, + 4.8528498019392708E+02, + 1.1948077480620087E+02, + -8.3747489794426258E+01, + }, + { + -2.2640047135517630E+01, + 9.0840898563949466E+01, + -2.1597187544386938E+02, + 3.1511229111443720E+02, + -2.4856617998395282E+02, + 6.1683918215190516E-06, + 2.4856618439352349E+02, + -3.1511228757800421E+02, + 2.1597187557069353E+02, + -9.0840898570046704E+01, + 2.2640047135565219E+01, + }, + { + -1.6306382886201207E+00, + 7.3325946591320434E+00, + -2.3241017682854558E+01, + 5.1715494398901185E+01, + -8.2673000279130790E+01, + 9.6489719151212370E+01, + -8.2673010381149226E+01, + 5.1715494328769353E+01, + -2.3241018024860580E+01, + 7.3325946448852415E+00, + -1.6306382886460551E+00, + }}}; + } else if constexpr (w == 12) { + return std::array, nc>{ + {{6.1722991679852908E+04, 8.4789650417103648E+07, 5.4431675199498701E+09, + 7.8788892335272232E+10, 4.0355760945670044E+11, 8.8071481911347949E+11, + 8.8071481911347961E+11, 4.0355760945670044E+11, 7.8788892335272430E+10, + 5.4431675199498835E+09, 8.4789650417103708E+07, 6.1722991679871957E+04}, + {3.2561466099406168E+05, 2.2112758120210618E+08, 8.9911609880089817E+09, + 8.3059508064200943E+10, 2.3965569143469864E+11, 1.6939286803305212E+11, + -1.6939286803305203E+11, -2.3965569143469864E+11, -8.3059508064201080E+10, + -8.9911609880089989E+09, -2.2112758120210618E+08, -3.2561466099404311E+05}, + {7.6621098001581512E+05, 2.6026568260310286E+08, 6.4524338253008652E+09, + 3.3729904113826820E+10, 2.8555202212474091E+10, -6.8998572040731537E+10, + -6.8998572040731445E+10, 2.8555202212474079E+10, 3.3729904113826824E+10, + 6.4524338253008757E+09, 2.6026568260310274E+08, 7.6621098001583829E+05}, + {1.0657807616803218E+06, 1.8144472126890984E+08, 2.5524827004349842E+09, + 5.2112383911371660E+09, -1.0268350564014645E+10, -1.4763245309081306E+10, + 1.4763245309081314E+10, 1.0268350564014671E+10, -5.2112383911371059E+09, + -2.5524827004349871E+09, -1.8144472126890984E+08, -1.0657807616803099E+06}, + {9.7829638830158755E+05, 8.2222351241519913E+07, 5.5676911894064474E+08, + -4.8739037675427330E+08, -2.7153428193078227E+09, 2.5627633609246106E+09, + 2.5627633609246163E+09, -2.7153428193078651E+09, -4.8739037675430620E+08, + 5.5676911894064546E+08, 8.2222351241519868E+07, 9.7829638830161188E+05}, + {6.2536876825114002E+05, 2.4702814073680203E+07, 4.1488431554846466E+07, + -2.9274790542418826E+08, 1.0742154109191516E+08, 6.2185168968032193E+08, + -6.2185168968012476E+08, -1.0742154109184742E+08, 2.9274790542423087E+08, + -4.1488431554843128E+07, -2.4702814073680237E+07, -6.2536876825112454E+05}, + {2.8527714307528478E+05, 4.6266378435690766E+06, -1.0665598090790771E+07, + -2.6048960239891130E+07, 9.1597254427317813E+07, -5.9794495983264342E+07, + -5.9794495983220413E+07, 9.1597254427343085E+07, -2.6048960239921503E+07, + -1.0665598090794146E+07, 4.6266378435690673E+06, 2.8527714307530399E+05}, + {9.2873647411234080E+04, 3.6630046787425119E+05, -3.1271047224730137E+06, + 4.8612412939252760E+06, 3.3820440907796426E+06, -1.6880127953704204E+07, + 1.6880127953756198E+07, -3.3820440907614031E+06, -4.8612412938993908E+06, + 3.1271047224752530E+06, -3.6630046787425695E+05, -9.2873647411217215E+04}, + {2.0817947751046438E+04, -5.5660303410315042E+04, -1.9519783923444615E+05, + 1.0804817251338551E+06, -1.8264985852555393E+06, 9.7602844968061335E+05, + 9.7602844962902542E+05, -1.8264985852963410E+06, 1.0804817251124913E+06, + -1.9519783923503032E+05, -5.5660303410363231E+04, 2.0817947751063632E+04}, + {2.7986023314783361E+03, -1.9404411093655592E+04, 4.3922625000519314E+04, + -7.6450317451901383E+03, -1.5273911974273989E+05, 3.3223441458516393E+05, + -3.3223441441930021E+05, 1.5273911979752057E+05, 7.6450317512768806E+03, + -4.3922624998141677E+04, 1.9404411093637758E+04, -2.7986023314644049E+03}, + {6.7849020474048089E+01, -1.7921351308204744E+03, 8.4980694686552797E+03, + -1.9742624859769410E+04, 2.4620674845030797E+04, -1.1676544851227827E+04, + -1.1676544869194569E+04, 2.4620674845030626E+04, -1.9742624831436660E+04, + 8.4980694630406069E+03, -1.7921351308312935E+03, 6.7849020488592075E+01}, + {-5.4577020998836872E+01, 1.3637112867242237E+02, 4.5513616580246023E+01, + -1.1174001367986359E+03, 3.2018769312434206E+03, -5.0580351396215219E+03, + 5.0580351683422405E+03, -3.2018769242193171E+03, 1.1174000998831286E+03, + -4.5513609243969356E+01, -1.3637112867730119E+02, 5.4577021011726984E+01}, + {-1.0538365872268786E+01, 4.6577222488645518E+01, -1.2606964198473415E+02, + 2.1881091668968099E+02, -2.3273399614976032E+02, 1.0274275204276027E+02, + 1.0274270265494516E+02, -2.3273401859852868E+02, 2.1881091865396468E+02, + -1.2606964777237258E+02, 4.6577222453584369E+01, -1.0538365860573146E+01}, + {-4.6087004144309118E-01, 2.5969759128998060E+00, -9.6946932216381381E+00, + 2.4990041962121211E+01, -4.6013909139329137E+01, 6.2056985032913090E+01, + -6.2056925855365186E+01, 4.6013921000662158E+01, -2.4990037445376750E+01, + 9.6946954085586885E+00, -2.5969759201692755E+00, 4.6087004744129911E-01}}}; + } else if constexpr (w == 13) { + return std::array, nc>{ + {{9.8715725867495363E+04, 1.9828875496808097E+08, 1.7196758809614983E+10, + 3.3083776881353577E+11, 2.2668873993375439E+12, 6.7734720591167568E+12, + 9.6695220682534785E+12, 6.7734720591167432E+12, 2.2668873993375430E+12, + 3.3083776881353503E+11, 1.7196758809614998E+10, 1.9828875496807891E+08, + 9.8715725867496090E+04}, + {5.4491110456935549E+05, 5.4903670125539351E+08, 3.0879465445278183E+10, + 3.9588436413399969E+11, 1.6860562536749778E+12, 2.4256447893117891E+12, + -5.5583944938791784E-05, -2.4256447893117847E+12, -1.6860562536749768E+12, + -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538898E+08, + -5.4491110456935526E+05}, + {1.3504711883426071E+06, 6.9286979077463162E+08, 2.4618123595484577E+10, + 1.9493985627722607E+11, 3.9422703517046350E+11, -1.8678883613919861E+11, + -8.5538079834550110E+11, -1.8678883613919730E+11, 3.9422703517046375E+11, + 1.9493985627722589E+11, 2.4618123595484566E+10, 6.9286979077462614E+08, + 1.3504711883426069E+06}, + {1.9937206140846491E+06, 5.2512029493765980E+08, 1.1253303793811750E+10, + 4.6205527735932152E+10, -1.1607472377983305E+10, -1.6305241755642313E+11, + 3.5385440504350348E-04, 1.6305241755642365E+11, 1.1607472377982582E+10, + -4.6205527735932213E+10, -1.1253303793811750E+10, -5.2512029493765628E+08, + -1.9937206140846489E+06}, + {1.9607419630386413E+06, 2.6425362558103892E+08, 3.1171259341747193E+09, + 2.9839860297839913E+09, -1.9585031917561897E+10, -5.0666917387065792E+09, + 3.6568794485480583E+10, -5.0666917387057562E+09, -1.9585031917561817E+10, + 2.9839860297838497E+09, 3.1171259341747184E+09, 2.6425362558103728E+08, + 1.9607419630386417E+06}, + {1.3593773865640305E+06, 9.1556445104158267E+07, 4.7074012944133747E+08, + -1.1192579335657008E+09, -2.1090780087868555E+09, 5.2270306737951984E+09, + 5.6467240041521856E-04, -5.2270306737934217E+09, 2.1090780087880819E+09, + 1.1192579335658383E+09, -4.7074012944133127E+08, -9.1556445104157984E+07, + -1.3593773865640305E+06}, + {6.8417206432039209E+05, 2.1561705510027152E+07, 7.5785249893055111E+06, + -2.7456096030221754E+08, 3.4589095671054310E+08, 4.0256106808894646E+08, + -1.0074306926603404E+09, 4.0256106809081393E+08, 3.4589095670997137E+08, + -2.7456096030236483E+08, 7.5785249893030487E+06, 2.1561705510027405E+07, + 6.8417206432039209E+05}, + {2.5248269397037517E+05, 3.0985559672616189E+06, -1.1816517087616559E+07, + -8.2958498770184973E+06, 8.0546642347355247E+07, -1.0594657799485898E+08, + 2.1816722293163801E-04, 1.0594657799424352E+08, -8.0546642347497791E+07, + 8.2958498771036500E+06, 1.1816517087615721E+07, -3.0985559672621777E+06, + -2.5248269397037517E+05}, + {6.7530100970876694E+04, 1.2373362326658823E+05, -2.1245597183281910E+06, + 5.1047323238754412E+06, -1.4139444405488928E+06, -1.1818267555096827E+07, + 2.0121548578624789E+07, -1.1818267557079868E+07, -1.4139444401348191E+06, + 5.1047323236516044E+06, -2.1245597183309775E+06, 1.2373362326702787E+05, + 6.7530100970876316E+04}, + {1.2421368748961073E+04, -5.0576243647011936E+04, -4.8878193436902722E+04, + 6.5307896872028301E+05, -1.5497610127060430E+06, 1.5137725917321201E+06, + 4.1615986404011299E-04, -1.5137725918538549E+06, 1.5497610130469005E+06, + -6.5307896856811445E+05, 4.8878193438804832E+04, 5.0576243646433126E+04, + -1.2421368748961073E+04}, + {1.2904654687550299E+03, -1.1169946055009055E+04, 3.3275109713863385E+04, + -3.1765222274236821E+04, -5.9810982085323274E+04, 2.2355863038592847E+05, + -3.1083591705219547E+05, 2.2355863445202672E+05, -5.9810982721084511E+04, + -3.1765222464963932E+04, 3.3275109714208855E+04, -1.1169946054555618E+04, + 1.2904654687545376E+03}, + {-1.9043622268674213E+01, -6.8296542209516542E+02, 4.2702512274202591E+03, + -1.2165497317825058E+04, 1.9423733298269544E+04, -1.6010024066956401E+04, + 3.4018642874429026E-04, 1.6010021599471667E+04, -1.9423732817821805E+04, + 1.2165497483905752E+04, -4.2702512286689680E+03, 6.8296542153908558E+02, + 1.9043622268312891E+01}, + {-3.0093984465361217E+01, 9.8972865724808671E+01, -9.7437038666761538E+01, + -3.5079928405373198E+02, 1.5699250566648977E+03, -3.1287439837941820E+03, + 3.8692196309709061E+03, -3.1287462825615335E+03, 1.5699252631958864E+03, + -3.5079944793112952E+02, -9.7437041893750632E+01, 9.8972866189610414E+01, + -3.0093984465884773E+01}, + {-4.3050286009489040E+00, 2.1108975724659501E+01, -6.4297198812570272E+01, + 1.2922884632277874E+02, -1.6991812716212596E+02, 1.2655005901719436E+02, + 9.2483537895948854E-05, -1.2655066232531748E+02, 1.6991805207569072E+02, + -1.2922893667436634E+02, 6.4297198424711908E+01, -2.1108976207523057E+01, + 4.3050286009485790E+00}, + {-1.0957333716725008E-01, 7.2949317004436565E-01, -3.4300816058693728E+00, + 1.0470054474579324E+01, -2.2292134950656113E+01, 3.4570827323582719E+01, + -3.9923523442753932E+01, 3.4573264959502886E+01, -2.2292358612963266E+01, + 1.0470042004916014E+01, -3.4300810538570281E+00, 7.2949352113279253E-01, + -1.0957333740315604E-01}}}; + } else if constexpr (w == 14) { + return std::array, nc>{ + {{1.5499533202966207E+05, 4.4723032442444688E+08, 5.1495083701694740E+10, + 1.2904576022918071E+12, 1.1534950432785506E+13, 4.5650102198520484E+13, + 8.8830582190032641E+13, 8.8830582190032641E+13, 4.5650102198520492E+13, + 1.1534950432785527E+13, 1.2904576022918074E+12, 5.1495083701695107E+10, + 4.4723032442444855E+08, 1.5499533202970232E+05}, + {8.9188339002980455E+05, 1.3065352538728635E+09, 9.9400185225815567E+10, + 1.7136059013402405E+12, 1.0144146621675832E+13, 2.3034036018490715E+13, + 1.4630967270448871E+13, -1.4630967270448855E+13, -2.3034036018490719E+13, + -1.0144146621675846E+13, -1.7136059013402405E+12, -9.9400185225815964E+10, + -1.3065352538728662E+09, -8.9188339002979454E+05}, + {2.3170473769379663E+06, 1.7532505043698256E+09, 8.6523535958354309E+10, + 9.7455289065487354E+11, 3.2977972139362314E+12, 1.7874626001697781E+12, + -6.1480918082633916E+12, -6.1480918082633975E+12, 1.7874626001697690E+12, + 3.2977972139362285E+12, 9.7455289065487329E+11, 8.6523535958354630E+10, + 1.7532505043698275E+09, 2.3170473769380399E+06}, + {3.6089249230396422E+06, 1.4278058213962190E+09, 4.4296625537022423E+10, + 2.9466624630419781E+11, 3.1903621584503235E+11, -9.8834691411254565E+11, + -1.1072264714919226E+12, 1.1072264714919316E+12, 9.8834691411255151E+11, + -3.1903621584503467E+11, -2.9466624630419769E+11, -4.4296625537022621E+10, + -1.4278058213962219E+09, -3.6089249230396664E+06}, + {3.7733555140851745E+06, 7.8376718099107409E+08, 1.4443117772349569E+10, + 4.3197433307418671E+10, -7.6585042240585556E+10, -1.8569640140763062E+11, + 2.0385335192657199E+11, 2.0385335192656519E+11, -1.8569640140762662E+11, + -7.6585042240580856E+10, 4.3197433307418686E+10, 1.4443117772349669E+10, + 7.8376718099107552E+08, 3.7733555140852560E+06}, + {2.8079157920112358E+06, 3.0340753492383724E+08, 2.9498136661747241E+09, + -6.2820200387919831E+08, -2.2372008390623215E+10, 1.5217518660584890E+10, + 4.0682590266891922E+10, -4.0682590266869431E+10, -1.5217518660582748E+10, + 2.2372008390625935E+10, 6.2820200387968791E+08, -2.9498136661747637E+09, + -3.0340753492383808E+08, -2.8079157920112377E+06}, + {1.5361613559533111E+06, 8.3513615594416574E+07, 3.0077547202708024E+08, + -1.3749596754067802E+09, -6.6733027297557127E+08, 5.9590333632819109E+09, + -4.3025685566870070E+09, -4.3025685566872711E+09, 5.9590333632806673E+09, + -6.6733027297523963E+08, -1.3749596754067125E+09, 3.0077547202709383E+08, + 8.3513615594416171E+07, 1.5361613559533576E+06}, + {6.2759409419592959E+05, 1.5741723594963098E+07, -1.5632610223406436E+07, + -1.9294824907078514E+08, 4.4643806532434595E+08, 1.5178998385244830E+07, + -9.6771139891725647E+08, 9.6771139892509627E+08, -1.5178998381042883E+07, + -4.4643806533176166E+08, 1.9294824907065383E+08, 1.5632610223392555E+07, + -1.5741723594963137E+07, -6.2759409419590747E+05}, + {1.9151404903933613E+05, 1.7156606891563335E+06, -9.7733523156688716E+06, + 4.2982266233154163E+06, 5.1660907884347722E+07, -1.1279400211155911E+08, + 6.4701089573962681E+07, 6.4701089571562663E+07, -1.1279400211012064E+08, + 5.1660907891220264E+07, 4.2982266233826512E+06, -9.7733523157112263E+06, + 1.7156606891560503E+06, 1.9151404903936724E+05}, + {4.2715272622845026E+04, -2.2565910611953568E+03, -1.1769776156959014E+06, + 4.0078399907813077E+06, -3.8951858063335596E+06, -5.0944610754510267E+06, + 1.6765992446914168E+07, -1.6765992426657490E+07, 5.0944610781778870E+06, + 3.8951858062361716E+06, -4.0078399907326135E+06, 1.1769776157141617E+06, + 2.2565910606306688E+03, -4.2715272622820135E+04}, + {6.4806786522793900E+03, -3.5474227032974472E+04, 1.8237100709385861E+04, + 3.0934714629696816E+05, -1.0394703931686131E+06, 1.4743920333143482E+06, + -7.3356882447856572E+05, -7.3356882916658197E+05, 1.4743920305501707E+06, + -1.0394703929917105E+06, 3.0934714631908614E+05, 1.8237100665157792E+04, + -3.5474227033406372E+04, 6.4806786523010323E+03}, + {4.9913632908459954E+02, -5.5416668524952684E+03, 2.0614058717617296E+04, + -3.2285139072943130E+04, -5.3099550821623425E+03, 1.1559000502166932E+05, + -2.2569743259261423E+05, 2.2569743616896842E+05, -1.1559000130545651E+05, + 5.3099543129458480E+03, 3.2285139142872020E+04, -2.0614058670790018E+04, + 5.5416668533342381E+03, -4.9913632906195977E+02}, + {-3.3076333188134086E+01, -1.8970588563697331E+02, 1.8160423493164808E+03, + -6.3715703355644328E+03, 1.2525624574329036E+04, -1.4199806452802783E+04, + 6.4441892296909591E+03, 6.4441909537524216E+03, -1.4199808176873401E+04, + 1.2525626154733827E+04, -6.3715704433222418E+03, 1.8160422729911850E+03, + -1.8970588700495102E+02, -3.3076333168231550E+01}, + {-1.4394533627743886E+01, 5.7000699089242815E+01, -1.0101142663923416E+02, + -3.2954197414395189E+01, 6.1417879182394654E+02, -1.6177283846697430E+03, + 2.4593386157454975E+03, -2.4593322941165261E+03, 1.6177291239900730E+03, + -6.1417952013923764E+02, 3.2954100943010943E+01, 1.0101142710333265E+02, + -5.7000699100179844E+01, 1.4394533639240331E+01}, + {-1.5925952284027161E+00, 8.5113930215357829E+00, -2.8993523187012922E+01, + 6.6373454994590404E+01, -1.0329574518449559E+02, 1.0280184257681817E+02, + -4.3896094875192006E+01, -4.3899302208087086E+01, 1.0280039795628096E+02, + -1.0329511291885207E+02, 6.6373435700858948E+01, -2.8993536490606409E+01, + 8.5113924808491728E+00, -1.5925952194145006E+00}, + {1.5984868520881029E-02, 1.2876175212962959E-01, -9.8358742969175483E-01, + 3.7711523389360830E+00, -9.4305498095765508E+00, 1.6842854581416674E+01, + -2.2308566502972713E+01, 2.2308940200151390E+01, -1.6841512668820517E+01, + 9.4313524091989347E+00, -3.7710716543179599E+00, 9.8361025494556609E-01, + -1.2876100566420701E-01, -1.5984859433053292E-02}}}; + } else if constexpr (w == 15) { + return std::array, nc>{ + {{2.3939707792241839E+05, 9.7700272582690191E+08, 1.4715933396485257E+11, + 4.7242424833337158E+12, 5.3987426629953594E+13, 2.7580474290566078E+14, + 7.0693378336533400E+14, 9.6196578554477775E+14, 7.0693378336533400E+14, + 2.7580474290566125E+14, 5.3987426629953766E+13, 4.7242424833337246E+12, + 1.4715933396485263E+11, 9.7700272582690215E+08, 2.3939707792242285E+05}, + {1.4314487885226035E+06, 2.9961416925358453E+09, 3.0273361232748438E+11, + 6.8507333793903584E+12, 5.4192702756911000E+13, 1.7551587948105309E+14, + 2.1874615668430150E+14, 3.4316191014053393E-02, -2.1874615668430150E+14, + -1.7551587948105334E+14, -5.4192702756911180E+13, -6.8507333793903701E+12, + -3.0273361232748438E+11, -2.9961416925358458E+09, -1.4314487885226049E+06}, + {3.8829497354762917E+06, 4.2473082696966448E+09, 2.8414312556015540E+11, + 4.3688281331121411E+12, 2.1823119508000543E+13, 3.2228098609392094E+13, + -2.1833085454691789E+13, -7.3750710225100812E+13, -2.1833085454691820E+13, + 3.2228098609392055E+13, 2.1823119508000594E+13, 4.3688281331121479E+12, + 2.8414312556015527E+11, 4.2473082696966434E+09, 3.8829497354762889E+06}, + {6.3495763451755755E+06, 3.6841035003733950E+09, 1.5965774278321045E+11, + 1.5630338683778201E+12, 3.8749058615819268E+12, -2.7319740087723574E+12, + -1.3233342822865402E+13, 6.1642230420317079E-02, 1.3233342822865449E+13, + 2.7319740087723975E+12, -3.8749058615819365E+12, -1.5630338683778203E+12, + -1.5965774278321042E+11, -3.6841035003733935E+09, -6.3495763451755764E+06}, + {7.0146619045520434E+06, 2.1782897863065763E+09, 5.8897780310148087E+10, + 3.1953009601770325E+11, 4.0651527029737198E+08, -1.6379148273276064E+12, + -1.1568753137013029E+11, 2.7451653250460508E+12, -1.1568753137012485E+11, + -1.6379148273277261E+12, 4.0651527029819238E+08, 3.1953009601770361E+11, + 5.8897780310148087E+10, 2.1782897863065763E+09, 7.0146619045520443E+06}, + {5.5580012413990172E+06, 9.2345162185944164E+08, 1.4522950934020109E+10, + 2.7025952371212009E+10, -1.2304576967641914E+11, -1.0116752717202786E+11, + 3.8517418245458325E+11, 1.0918347404432817E-01, -3.8517418245444312E+11, + 1.0116752717221135E+11, 1.2304576967643665E+11, -2.7025952371214943E+10, + -1.4522950934020079E+10, -9.2345162185944211E+08, -5.5580012413990181E+06}, + {3.2693972344231778E+06, 2.8610260147425205E+08, 2.2348528403750563E+09, + -3.4574515574242272E+09, -1.7480626463583939E+10, 3.1608597465540653E+10, + 1.9879262560072273E+10, -6.6148013553772224E+10, 1.9879262560085339E+10, + 3.1608597465515747E+10, -1.7480626463576942E+10, -3.4574515574198236E+09, + 2.2348528403750110E+09, 2.8610260147425193E+08, 3.2693972344231787E+06}, + {1.4553539959296256E+06, 6.4136842048384041E+07, 1.3622336582062906E+08, + -1.2131510424644001E+09, 6.4322366984221375E+08, 4.5078753872047586E+09, + -7.1689413746930647E+09, 3.2906916833662987E-02, 7.1689413746724453E+09, + -4.5078753875009747E+09, -6.4322366985365331E+08, 1.2131510424608817E+09, + -1.3622336582067037E+08, -6.4136842048384242E+07, -1.4553539959296256E+06}, + {4.9358776531681651E+05, 9.7772970960585065E+06, -2.3511574237987626E+07, + -1.0142613816641946E+08, 3.9421144218035364E+08, -2.8449115593052310E+08, + -5.7549243243741119E+08, 1.1608781631182449E+09, -5.7549243240763104E+08, + -2.8449115600447333E+08, 3.9421144214381480E+08, -1.0142613816429654E+08, + -2.3511574237995699E+07, 9.7772970960588697E+06, 4.9358776531681546E+05}, + {1.2660319987326677E+05, 7.7519511328119377E+05, -6.5244610661450895E+06, + 9.0878257488052379E+06, 2.3116605621149920E+07, -8.7079594462079599E+07, + 9.5542733739275128E+07, 6.0548970733798724E-02, -9.5542733661364838E+07, + 8.7079594608550951E+07, -2.3116605559600785E+07, -9.0878257522138134E+06, + 6.5244610661298726E+06, -7.7519511328133650E+05, -1.2660319987326639E+05}, + {2.3793325531458529E+04, -4.2305332803808597E+04, -5.2884156985535356E+05, + 2.5307340127864038E+06, -4.0404175271559842E+06, -1.7519992360184138E+05, + 1.0146438805818636E+07, -1.5828545480742473E+07, 1.0146438778928882E+07, + -1.7520004389869148E+05, -4.0404175770437294E+06, 2.5307340149977510E+06, + -5.2884156989405944E+05, -4.2305332803937294E+04, 2.3793325531459184E+04}, + {2.9741655196834722E+03, -2.0687056403786246E+04, 3.3295507799709936E+04, + 1.0661145730323243E+05, -5.6644238105382060E+05, 1.0874811616841732E+06, + -9.6561270266008016E+05, 1.5626594062671070E-02, 9.6561272951271443E+05, + -1.0874812528712249E+06, 5.6644243308078672E+05, -1.0661145838213131E+05, + -3.3295507812197495E+04, 2.0687056403630129E+04, -2.9741655196846405E+03}, + {1.5389176594899303E+02, -2.3864418511494741E+03, 1.0846266954249364E+04, + -2.2940053396478714E+04, 1.4780106121058996E+04, 4.2663651769852157E+04, + -1.3047648013242516E+05, 1.7468401314164279E+05, -1.3047645484607235E+05, + 4.2663541429144650E+04, 1.4780036296018619E+04, -2.2940053180976502E+04, + 1.0846266927315819E+04, -2.3864418517113058E+03, 1.5389176594779781E+02}, + {-2.3857631312588978E+01, -1.9651606133609231E+01, 6.4183083829803820E+02, + -2.8648433109641578E+03, 6.8249243722518859E+03, -9.7944325124827701E+03, + 7.6177757600121276E+03, 1.8034307737205296E-02, -7.6177559127722052E+03, + 9.7944326623113047E+03, -6.8249058342322496E+03, 2.8648407117981119E+03, + -6.4183085438795774E+02, 1.9651605969778377E+01, 2.3857631312809222E+01}, + {-6.1348505739169541E+00, 2.7872915855267404E+01, -6.5819942538871970E+01, + 5.1366231962952028E+01, 1.7213955398158618E+02, -6.9658621010000411E+02, + 1.3192236112353403E+03, -1.6054106225233884E+03, 1.3192031991952242E+03, + -6.9663961216547739E+02, 1.7211403815802629E+02, 5.1367579954366171E+01, + -6.5819957939661379E+01, 2.7872915947616441E+01, -6.1348505735855374E+00}, + {-4.9671584513490097E-01, 3.0617550953446115E+00, -1.1650665638578070E+01, + 3.0081586723089057E+01, -5.4028356726202020E+01, 6.6077203078498044E+01, + -4.7145500171928198E+01, 4.2118837140985958E-03, 4.7167106663349848E+01, + -6.6048394423269173E+01, 5.4062906728994193E+01, -3.0081603709324451E+01, + 1.1650672008416343E+01, -3.0617551285208524E+00, 4.9671584437353217E-01}, + {4.3460786767313729E-03, -1.3199600771767199E-02, -1.9412688562910244E-01, + 1.1329433700669471E+00, -3.4442045795063887E+00, 7.1737626956468912E+00, + -1.1098109271625262E+01, 1.2385772358881393E+01, -1.1101471316239516E+01, + 7.0913926025978853E+00, -3.4845491148773502E+00, 1.1323523856621058E+00, + -1.9414904754428672E-01, -1.3200165079792004E-02, 4.3460782759443158E-03}}}; + } else if constexpr (w == 16) { + return std::array, nc>{ + {{3.6434551345570839E+05, 2.0744705928579483E+09, 4.0355760945669995E+11, + 1.6364575388763029E+13, 2.3514830376056538E+14, 1.5192201717462528E+15, + 4.9956173084674090E+15, 8.9287666945127360E+15, 8.9287666945127390E+15, + 4.9956173084674090E+15, 1.5192201717462528E+15, 2.3514830376056538E+14, + 1.6364575388763035E+13, 4.0355760945670026E+11, 2.0744705928579524E+09, + 3.6434551345571183E+05}, + {2.2576246485480359E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, + 2.5606844387131066E+13, 2.6313738449330153E+14, 1.1495095100701460E+15, + 2.1932582707747560E+15, 1.2860244365132595E+15, -1.2860244365132600E+15, + -2.1932582707747578E+15, -1.1495095100701465E+15, -2.6313738449330159E+14, + -2.5606844387131062E+13, -8.7873753526056299E+11, -6.6499571180086451E+09, + -2.2576246485480373E+06}, + {6.3730995546265077E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, + 1.7953384130753688E+13, 1.2398425545001662E+14, 3.0749346493041262E+14, + 1.0259777520247159E+14, -5.5291976457534325E+14, -5.5291976457534325E+14, + 1.0259777520247186E+14, 3.0749346493041219E+14, 1.2398425545001659E+14, + 1.7953384130753676E+13, 8.8097248605448950E+11, 9.9060026035198040E+09, + 6.3730995546265030E+06}, + {1.0896915393078227E+07, 9.0890343524593849E+09, 5.3565169504010010E+11, + 7.3004206720038701E+12, 2.9692333044160066E+13, 1.6051737468109549E+13, + -9.1273329108089906E+13, -8.5999306918502953E+13, 8.5999306918502422E+13, + 9.1273329108089984E+13, -1.6051737468109510E+13, -2.9692333044160082E+13, + -7.3004206720038701E+12, -5.3565169504010022E+11, -9.0890343524593849E+09, + -1.0896915393078227E+07}, + {1.2655725616100594E+07, 5.7342804054544210E+09, 2.1822836608899570E+11, + 1.8300700858999690E+12, 2.7770431049857676E+12, -8.5034969223852568E+12, + -1.2846668467423438E+13, 1.6519076896571838E+13, 1.6519076896572182E+13, + -1.2846668467423555E+13, -8.5034969223850703E+12, 2.7770431049857896E+12, + 1.8300700858999678E+12, 2.1822836608899567E+11, 5.7342804054544210E+09, + 1.2655725616100591E+07}, + {1.0609303958036326E+07, 2.6255609052371716E+09, 6.1673589426039413E+10, + 2.6044432099085333E+11, -3.5431628074578204E+11, -1.6077602129636348E+12, + 1.5534405614728977E+12, 2.8019935380857432E+12, -2.8019935380841978E+12, + -1.5534405614724106E+12, 1.6077602129635625E+12, 3.5431628074580896E+11, + -2.6044432099084848E+11, -6.1673589426039429E+10, -2.6255609052371716E+09, + -1.0609303958036322E+07}, + {6.6544809363384582E+06, 8.9490403680928326E+08, 1.1882638725190845E+10, + 8.1552898137823076E+09, -1.2575562817886868E+11, 2.7074695075907585E+10, + 3.9453789461955023E+11, -3.1679644857468066E+11, -3.1679644857392346E+11, + 3.9453789461966650E+11, 2.7074695075992649E+10, -1.2575562817884555E+11, + 8.1552898137788668E+09, 1.1882638725190889E+10, 8.9490403680928278E+08, + 6.6544809363384554E+06}, + {3.1906872142825006E+06, 2.2785946180651775E+08, 1.3744578972809248E+09, + -4.3997172592883167E+09, -9.2011130754043922E+09, 3.4690551711832901E+10, + -9.4227043395047741E+09, -5.9308465070198639E+10, 5.9308465069336540E+10, + 9.4227043396350136E+09, -3.4690551711738396E+10, 9.2011130753567543E+09, + 4.3997172592879610E+09, -1.3744578972813025E+09, -2.2785946180651844E+08, + -3.1906872142825015E+06}, + {1.1821527096621769E+06, 4.2281234059839502E+07, 2.8723226058712766E+07, + -8.3553955857628822E+08, 1.2447304828823066E+09, 2.1955280943585949E+09, + -7.0514195726908512E+09, 4.3745141239718714E+09, 4.3745141233600502E+09, + -7.0514195728029747E+09, 2.1955280943510208E+09, 1.2447304828590808E+09, + -8.3553955857879233E+08, 2.8723226058761366E+07, 4.2281234059838109E+07, + 1.1821527096621762E+06}, + {3.3854610744280310E+05, 5.2176984975081543E+06, -2.0677283565079328E+07, + -3.5831818968518838E+07, 2.6599346106412742E+08, -3.7992777977357000E+08, + -1.3426914417466179E+08, 9.1752051229224503E+08, -9.1752051129499328E+08, + 1.3426914497246322E+08, 3.7992777991069216E+08, -2.6599346104854536E+08, + 3.5831818968908392E+07, 2.0677283564896725E+07, -5.2176984975075833E+06, + -3.3854610744279937E+05}, + {7.3893334077310064E+04, 2.6983804209559254E+05, -3.6415998561101072E+06, + 8.4025485849181097E+06, 4.9278860779345948E+06, -5.1437033846752726E+07, + 8.7603898676325440E+07, -4.6199498412402093E+07, -4.6199498208604209E+07, + 8.7603898435731798E+07, -5.1437033863736227E+07, 4.9278861005789889E+06, + 8.4025485831489991E+06, -3.6415998560990733E+06, 2.6983804209473461E+05, + 7.3893334077307401E+04}, + {1.1778892113375481E+04, -4.0077190108724200E+04, -1.8372552175909068E+05, + 1.3262878399160223E+06, -2.9738539927520575E+06, 1.9493509709529271E+06, + 4.1881949951139782E+06, -1.1066749616505133E+07, 1.1066749327519676E+07, + -4.1881946843906553E+06, -1.9493507810665092E+06, 2.9738539818831389E+06, + -1.3262878384774840E+06, 1.8372552162922107E+05, 4.0077190107319519E+04, + -1.1778892113376129E+04}, + {1.2019749667923656E+03, -1.0378455844500613E+04, 2.6333352653155256E+04, + 1.7117060106301305E+04, -2.5133287443653666E+05, 6.4713914262131555E+05, + -8.1634942572553246E+05, 3.8623935281825601E+05, 3.8623876433339820E+05, + -8.1634960962672008E+05, 6.4713900469564367E+05, -2.5133289627502396E+05, + 1.7117057951236206E+04, 2.6333352581335013E+04, -1.0378455846609291E+04, + 1.2019749667911419E+03}, + {3.1189837632471693E+01, -8.9083493807061564E+02, 4.9454293649337906E+03, + -1.3124693635095375E+04, 1.5834784331991095E+04, 6.9607870364081436E+03, + -5.9789871879430451E+04, 1.0841726514394575E+05, -1.0841709685990328E+05, + 5.9790206615067997E+04, -6.9607049368128291E+03, -1.5834783935893831E+04, + 1.3124692974990443E+04, -4.9454295091588992E+03, 8.9083493794871868E+02, + -3.1189837631106176E+01}, + {-1.2975319073401824E+01, 1.8283698218710011E+01, 1.7684015393859755E+02, + -1.1059917445033070E+03, 3.1998168298121523E+03, -5.5988200120063057E+03, + 5.9248751921324047E+03, -2.5990022806343668E+03, -2.5990962125709430E+03, + 5.9247537039895724E+03, -5.5988835070734467E+03, 3.1998292349030621E+03, + -1.1059926481090836E+03, 1.7684013881079576E+02, 1.8283698123134819E+01, + -1.2975319073977776E+01}, + {-2.3155118729954247E+00, 1.1938503634469159E+01, -3.4150562973753665E+01, + 4.8898615554511437E+01, 1.5853185548633874E+01, -2.4272678107130790E+02, + 6.0151276286907887E+02, -8.8751856926690448E+02, 8.8742942550355474E+02, + -6.0136491467620624E+02, 2.4282489356694586E+02, -1.5850195971204462E+01, + -4.8897392545563044E+01, 3.4150562973753665E+01, -1.1938504430698943E+01, + 2.3155118723150525E+00}, + {-1.5401723686076832E-01, 9.8067823888634464E-01, -4.1900843552415639E+00, + 1.2150534299778382E+01, -2.4763139606227178E+01, 3.6068014621628578E+01, + -3.4346647779134791E+01, 1.3259903958585387E+01, 1.2937147675617604E+01, + -3.4454233206790519E+01, 3.6027670086257579E+01, -2.4769863695455662E+01, + 1.2149431128889342E+01, -4.1901615115388706E+00, 9.8067695636810759E-01, + -1.5401723756214594E-01}, + {1.1808835093099178E-02, -2.5444299558662394E-02, -1.5661344238792723E-04, + 2.5820071204205225E-01, -1.0930950485268096E+00, 2.6408492552008669E+00, + -4.4415763059111955E+00, 6.8227366238712817E+00, -6.8186662643534008E+00, + 4.4887924763186051E+00, -2.6327085361651021E+00, 1.0918739406714428E+00, + -2.5844238963842503E-01, 1.2680123888735934E-04, 2.5444206395526567E-02, + -1.1808834826225629E-02}}}; + } else { + static_assert(w >= 2, "w must be >= 2"); + static_assert(w <= 16, "w must be <= 16"); + return {}; + } }; - - diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp index e07e76c02..8e55eab4a 100644 --- a/src/simpleinterfaces.cpp +++ b/src/simpleinterfaces.cpp @@ -1,8 +1,8 @@ // public header #include // private headers -#include #include +#include using namespace std; /* --------------------------------------------------------------------------- @@ -18,281 +18,280 @@ using namespace std; --------------------------------------------------------------------------- */ - // Helper layer ........................................................... namespace finufft { - namespace common { +namespace common { -int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT* xj, - FLT *yj, FLT *zj, CPX* cj,int iflag, FLT eps, - BIGINT *n_modes, BIGINT nk, FLT *s, FLT *t, FLT *u, - CPX* fk, finufft_opts *popts) +int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT *xj, FLT *yj, + FLT *zj, CPX *cj, int iflag, FLT eps, BIGINT *n_modes, BIGINT nk, + FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *popts) // Helper layer between simple interfaces (with opts) and the guru functions. // Author: Andrea Malleo, 2019. { FINUFFT_PLAN plan; - int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes, iflag, n_transf, eps, - &plan, popts); // popts (ptr to opts) can be NULL - if (ier>1) { // since 1 (a warning) still allows proceeding... + int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes, iflag, n_transf, eps, &plan, + popts); // popts + // (ptr + // to + // opts) + // can + // be + // NULL + if (ier > 1) { // since 1 (a warning) still allows proceeding... fprintf(stderr, "FINUFFT invokeGuru: plan error (ier=%d)!\n", ier); delete plan; return ier; } int ier2 = FINUFFT_SETPTS(plan, nj, xj, yj, zj, nk, s, t, u); - if (ier2>1) { - fprintf(stderr,"FINUFFT invokeGuru: setpts error (ier=%d)!\n", ier2); + if (ier2 > 1) { + fprintf(stderr, "FINUFFT invokeGuru: setpts error (ier=%d)!\n", ier2); FINUFFT_DESTROY(plan); return ier2; } int ier3 = FINUFFT_EXECUTE(plan, cj, fk); - if (ier3>1) { - fprintf(stderr,"FINUFFT invokeGuru: execute error (ier=%d)!\n", ier3); + if (ier3 > 1) { + fprintf(stderr, "FINUFFT invokeGuru: execute error (ier=%d)!\n", ier3); FINUFFT_DESTROY(plan); return ier3; } FINUFFT_DESTROY(plan); - return max(max(ier,ier2),ier3); // in case any one gave a (positive!) warning + return max(max(ier, ier2), ier3); // in case any one gave a (positive!) warning } - } // namespace -} // namespace +} // namespace common +} // namespace finufft using namespace finufft::common; - // Dimension 1111111111111111111111111111111111111111111111111111111111111111 -int FINUFFT1D1(BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps,BIGINT ms, - CPX* fk, finufft_opts *opts) +int FINUFFT1D1(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, + finufft_opts *opts) // Type-1 1D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[]={ms,1,1}; - int n_dims = 1; - int n_transf = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, - iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, 1, 1}; + int n_dims = 1; + int n_transf = 1; + int type = 1; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, + eps, n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT1D1MANY(int n_transf, BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps, - BIGINT ms, CPX* fk, finufft_opts *opts) +int FINUFFT1D1MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, + BIGINT ms, CPX *fk, finufft_opts *opts) // Type-1 1D complex nonuniform FFT for many vectors. See ../docs/usage.rst { - BIGINT n_modes[]={ms,1,1}; - int n_dims = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, - iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, 1, 1}; + int n_dims = 1; + int type = 1; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, + eps, n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT1D2(BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps,BIGINT ms, - CPX* fk, finufft_opts *opts) +int FINUFFT1D2(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, + finufft_opts *opts) // Type-2 1D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[]={ms,1,1}; - int n_dims = 1; - int n_transf = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, - iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, 1, 1}; + int n_dims = 1; + int n_transf = 1; + int type = 2; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, + eps, n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT1D2MANY(int n_transf, BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps,BIGINT ms, - CPX* fk, finufft_opts *opts) +int FINUFFT1D2MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, + BIGINT ms, CPX *fk, finufft_opts *opts) // Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[]={ms,1,1}; - int n_dims = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, - iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, 1, 1}; + int n_dims = 1; + int type = 2; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, + eps, n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT1D3(BIGINT nj,FLT* xj,CPX* cj,int iflag, FLT eps, BIGINT nk, FLT* s, CPX* fk, finufft_opts *opts) +int FINUFFT1D3(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT nk, FLT *s, + CPX *fk, finufft_opts *opts) // Type-3 1D complex nonuniform FFT. See ../docs/usage.rst { - int n_dims = 1; + int n_dims = 1; int n_transf = 1; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, - iflag, eps, NULL, nk, s, NULL, NULL, fk, opts); + int type = 3; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, + eps, NULL, nk, s, NULL, NULL, fk, opts); return ier; } -int FINUFFT1D3MANY(int n_transf, BIGINT nj,FLT* xj,CPX* cj,int iflag, FLT eps, BIGINT nk, FLT* s, CPX* fk, finufft_opts *opts) - // Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst +int FINUFFT1D3MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, + BIGINT nk, FLT *s, CPX *fk, finufft_opts *opts) +// Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { int n_dims = 1; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, - iflag, eps, NULL, nk, s, NULL, NULL, fk, opts); + int type = 3; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, + eps, NULL, nk, s, NULL, NULL, fk, opts); return ier; } - // Dimension 22222222222222222222222222222222222222222222222222222222222222222 -int FINUFFT2D1(BIGINT nj,FLT* xj,FLT *yj,CPX* cj,int iflag, - FLT eps, BIGINT ms, BIGINT mt, CPX* fk, finufft_opts* opts) +int FINUFFT2D1(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, + BIGINT mt, CPX *fk, finufft_opts *opts) // Type-1 2D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,1}; - int n_dims = 2; - int n_transf = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, - iflag, eps, n_modes, 0, NULL, NULL, NULL,fk, opts); + BIGINT n_modes[] = {ms, mt, 1}; + int n_dims = 2; + int n_transf = 1; + int type = 1; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT2D1MANY(int n_transf, BIGINT nj, FLT* xj, FLT *yj, CPX* c, - int iflag, FLT eps, BIGINT ms, BIGINT mt, CPX* fk, - finufft_opts *opts) +int FINUFFT2D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps, + BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts) // Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,1}; - int n_dims = 2; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj,NULL, c, - iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, 1}; + int n_dims = 2; + int type = 1; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT2D2(BIGINT nj,FLT* xj,FLT *yj,CPX* cj,int iflag,FLT eps, - BIGINT ms, BIGINT mt, CPX* fk, finufft_opts *opts) +int FINUFFT2D2(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, + BIGINT mt, CPX *fk, finufft_opts *opts) // Type-2 2D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,1}; - int n_dims = 2; - int n_transf = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, 1}; + int n_dims = 2; + int n_transf = 1; + int type = 2; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT2D2MANY(int n_transf, BIGINT nj, FLT* xj, FLT *yj, CPX* c, int iflag, - FLT eps, BIGINT ms, BIGINT mt, CPX* fk, finufft_opts *opts) +int FINUFFT2D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps, + BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts) // Type-2 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,1}; - int n_dims = 2; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, 1}; + int n_dims = 2; + int type = 2; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT2D3(BIGINT nj,FLT* xj,FLT* yj,CPX* cj,int iflag, FLT eps, BIGINT nk, FLT* s, FLT *t, CPX* fk, finufft_opts *opts) +int FINUFFT2D3(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT nk, + FLT *s, FLT *t, CPX *fk, finufft_opts *opts) // Type-3 2D complex nonuniform FFT. See ../docs/usage.rst { - int n_dims = 2; - int type = 3; + int n_dims = 2; + int type = 3; int n_transf = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj,iflag, eps, NULL, nk, s,t,NULL, fk, opts); + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, + NULL, nk, s, t, NULL, fk, opts); return ier; } -int FINUFFT2D3MANY(int n_transf, BIGINT nj,FLT* xj,FLT* yj,CPX* cj,int iflag, FLT eps, BIGINT nk, FLT* s, FLT *t, CPX* fk, finufft_opts *opts) +int FINUFFT2D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, + BIGINT nk, FLT *s, FLT *t, CPX *fk, finufft_opts *opts) // Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { int n_dims = 2; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj,iflag, eps, NULL, nk, s,t,NULL, fk, opts); + int type = 3; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, + NULL, nk, s, t, NULL, fk, opts); return ier; } - - // Dimension 3333333333333333333333333333333333333333333333333333333333333333 -int FINUFFT3D1(BIGINT nj,FLT* xj,FLT *yj,FLT *zj,CPX* cj,int iflag, - FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX* fk, - finufft_opts *opts) +int FINUFFT3D1(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, + BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) // Type-1 3D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,mu}; - int n_dims = 3; - int n_transf = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, mu}; + int n_dims = 3; + int n_transf = 1; + int type = 1; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } - -int FINUFFT3D1MANY(int n_transf, BIGINT nj,FLT* xj,FLT *yj,FLT *zj,CPX* cj, - int iflag, FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX* fk, - finufft_opts *opts) +int FINUFFT3D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, + FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) // Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,mu}; - int n_dims = 3; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, mu}; + int n_dims = 3; + int type = 1; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT3D2(BIGINT nj,FLT* xj,FLT *yj,FLT *zj,CPX* cj, - int iflag,FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, - CPX* fk, finufft_opts *opts) +int FINUFFT3D2(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, + BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) // Type-2 3D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,mu}; - int n_dims = 3; - int n_transf = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, mu}; + int n_dims = 3; + int n_transf = 1; + int type = 2; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT3D2MANY(int n_transf, BIGINT nj,FLT* xj,FLT *yj,FLT *zj,CPX* cj, - int iflag,FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, - CPX* fk, finufft_opts *opts) +int FINUFFT3D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, + FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) // Type-2 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,mu}; - n_modes[0] = ms; - n_modes[1] = mt; - n_modes[2] = mu; - int n_dims = 3; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, mu}; + n_modes[0] = ms; + n_modes[1] = mt; + n_modes[2] = mu; + int n_dims = 3; + int type = 2; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT3D3(BIGINT nj,FLT* xj,FLT* yj,FLT *zj, CPX* cj, - int iflag, FLT eps, BIGINT nk, FLT* s, FLT *t, - FLT *u, CPX* fk, finufft_opts *opts) +int FINUFFT3D3(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, + BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *opts) // Type-3 3D complex nonuniform FFT. See ../docs/usage.rst { - int n_dims = 3; + int n_dims = 3; int n_transf = 1; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, - eps, NULL, nk, s ,t ,u, fk, opts); + int type = 3; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, + NULL, nk, s, t, u, fk, opts); return ier; } -int FINUFFT3D3MANY(int n_transf, BIGINT nj,FLT* xj,FLT* yj,FLT *zj, CPX* cj, - int iflag, FLT eps, BIGINT nk, FLT* s, FLT *t, - FLT *u, CPX* fk, finufft_opts *opts) +int FINUFFT3D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, + FLT eps, BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk, + finufft_opts *opts) // Type-3 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { int n_dims = 3; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, - eps, NULL, nk, s ,t ,u, fk, opts); + int type = 3; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, + NULL, nk, s, t, u, fk, opts); return ier; } diff --git a/src/utils.cpp b/src/utils.cpp index 92f4035eb..8df6ed665 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -7,80 +7,80 @@ #include "finufft/defs.h" namespace finufft { - namespace utils { +namespace utils { // ------------ complex array utils --------------------------------- -FLT relerrtwonorm(BIGINT n, CPX* a, CPX* b) +FLT relerrtwonorm(BIGINT n, CPX *a, CPX *b) // ||a-b||_2 / ||a||_2 { FLT err = 0.0, nrm = 0.0; - for (BIGINT m=0; mnrm) nrm = aa; + for (BIGINT m = 0; m < n; ++m) { + FLT aa = real(conj(a[m]) * a[m]); + if (aa > nrm) nrm = aa; } return sqrt(nrm); } // ------------ real array utils --------------------------------- -void arrayrange(BIGINT n, FLT* a, FLT *lo, FLT *hi) +void arrayrange(BIGINT n, FLT *a, FLT *lo, FLT *hi) // With a a length-n array, writes out min(a) to lo and max(a) to hi, // so that all a values lie in [lo,hi]. // If n==0, lo and hi are not finite. { - *lo = INFINITY; *hi = -INFINITY; - for (BIGINT m=0; m*hi) *hi = a[m]; + *lo = INFINITY; + *hi = -INFINITY; + for (BIGINT m = 0; m < n; ++m) { + if (a[m] < *lo) *lo = a[m]; + if (a[m] > *hi) *hi = a[m]; } } -void arraywidcen(BIGINT n, FLT* a, FLT *w, FLT *c) +void arraywidcen(BIGINT n, FLT *a, FLT *w, FLT *c) // Writes out w = half-width and c = center of an interval enclosing all a[n]'s // Only chooses a nonzero center if this increases w by less than fraction // ARRAYWIDCEN_GROWFRAC defined in defs.h. // This prevents rephasings which don't grow nf by much. 6/8/17 // If n==0, w and c are not finite. { - FLT lo,hi; - arrayrange(n,a,&lo,&hi); - *w = (hi-lo)/2; - *c = (hi+lo)/2; - if (std::abs(*c) -#include "finufft/utils_precindep.h" #include "finufft/defs.h" +#include "finufft/utils_precindep.h" using namespace std; namespace finufft { - namespace utils { +namespace utils { BIGINT next235even(BIGINT n) // finds even integer not less than n, with prime factors no larger than 5 // (ie, "smooth"). Adapted from fortran in hellskitchen. Barnett 2/9/17 // changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n. { - if (n<=2) return 2; - if (n%2 == 1) n+=1; // even - BIGINT nplus = n-2; // to cancel out the +=2 at start of loop - BIGINT numdiv = 2; // a dummy that is >1 - while (numdiv>1) { - nplus += 2; // stays even + if (n <= 2) return 2; + if (n % 2 == 1) n += 1; // even + BIGINT nplus = n - 2; // to cancel out the +=2 at start of loop + BIGINT numdiv = 2; // a dummy that is >1 + while (numdiv > 1) { + nplus += 2; // stays even numdiv = nplus; - while (numdiv%2 == 0) numdiv /= 2; // remove all factors of 2,3,5... - while (numdiv%3 == 0) numdiv /= 3; - while (numdiv%5 == 0) numdiv /= 5; + while (numdiv % 2 == 0) numdiv /= 2; // remove all factors of 2,3,5... + while (numdiv % 3 == 0) numdiv /= 3; + while (numdiv % 5 == 0) numdiv /= 5; } return nplus; } // ----------------------- helpers for timing (always stay double prec) ------ - -void CNTime::start() -{ + +void CNTime::start() { initial = std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()).count()*1e-6; + std::chrono::steady_clock::now().time_since_epoch()) + .count() * + 1e-6; } double CNTime::restart() @@ -51,12 +52,12 @@ double CNTime::elapsedsec() // returns answers as double, in seconds, to microsec accuracy. Barnett 5/22/18 { std::uint64_t now = std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()).count(); - const double nowsec = now*1e-6; + std::chrono::steady_clock::now().time_since_epoch()) + .count(); + const double nowsec = now * 1e-6; return nowsec - initial; } - // -------------------------- openmp helpers ------------------------------- int get_num_threads_parallel_block() // return how many threads an omp parallel block would use. @@ -72,19 +73,18 @@ int get_num_threads_parallel_block() return nth_used; } - // ---------- thread-safe rand number generator for Windows platform --------- // (note this is used by macros in defs.h, and supplied in linux/macosx) #ifdef _WIN32 int rand_r(unsigned int *seedp) // Libin Lu, 6/18/20 { - std::random_device rd; - std::default_random_engine generator(rd()); - std::uniform_int_distribution distribution(0,RAND_MAX); - return distribution(generator); + std::random_device rd; + std::default_random_engine generator(rd()); + std::uniform_int_distribution distribution(0, RAND_MAX); + return distribution(generator); } #endif - } // namespace -} // namespace +} // namespace utils +} // namespace finufft diff --git a/test/basicpassfail.cpp b/test/basicpassfail.cpp index c3648d878..c44487925 100644 --- a/test/basicpassfail.cpp +++ b/test/basicpassfail.cpp @@ -6,40 +6,39 @@ // Simplified from Amit Moscovitz and example1d1. Barnett 11/1/18. // Using vectors and default opts, 2/29/20; dual-prec lib 7/3/20. -int main() -{ - BIGINT M = 1e3, N = 1e3; // defaults: M = # srcs, N = # modes out - double tol = 1e-5; // req tol, covers both single & double prec cases - int isign = +1; // exponential sign for NUFFT - static const CPX I = CPX(0.0,1.0); // imaginary unit. Note: avoid (CPX) cast - std::vector F(N); // alloc output mode coeffs +int main() { + BIGINT M = 1e3, N = 1e3; // defaults: M = # srcs, N = # modes out + double tol = 1e-5; // req tol, covers both single & double prec cases + int isign = +1; // exponential sign for NUFFT + static const CPX I = CPX(0.0, 1.0); // imaginary unit. Note: avoid (CPX) cast + std::vector F(N); // alloc output mode coeffs // Make the input data.................................... - srand(42); // seed - std::vector x(M); // NU pts locs - std::vector c(M); // strengths - for (BIGINT j=0; j x(M); // NU pts locs + std::vector c(M); // strengths + for (BIGINT j = 0; j < M; ++j) { + x[j] = M_PI * (2 * ((FLT)rand() / (FLT)RAND_MAX) - 1); // uniform random in [-pi,pi) + c[j] = 2 * ((FLT)rand() / (FLT)RAND_MAX) - 1 + + I * (2 * ((FLT)rand() / (FLT)RAND_MAX) - 1); } // Run it (NULL = default opts) ....................................... - int ier = FINUFFT1D1(M,&x[0],&c[0],isign,tol,N,&F[0],NULL); - if (ier!=0) { - printf("basicpassfail: finufft1d1 error (ier=%d)!",ier); + int ier = FINUFFT1D1(M, &x[0], &c[0], isign, tol, N, &F[0], NULL); + if (ier != 0) { + printf("basicpassfail: finufft1d1 error (ier=%d)!", ier); exit(ier); } // Check correct math for a single mode................... - BIGINT n = (BIGINT)(0.37*N); // choose some mode near the top (N/2) - CPX Ftest = CPX(0.0,0.0); // crude exact answer & error check... - for (BIGINT j=0; jFinfnrm) Finfnrm=aF; + BIGINT n = (BIGINT)(0.37 * N); // choose some mode near the top (N/2) + CPX Ftest = CPX(0.0, 0.0); // crude exact answer & error check... + for (BIGINT j = 0; j < M; ++j) Ftest += c[j] * exp((FLT)isign * I * (FLT)n * x[j]); + BIGINT nout = n + N / 2; // index in output array for freq mode n + FLT Finfnrm = 0.0; // compute inf norm of F... + for (int m = 0; m < N; ++m) { + FLT aF = abs(F[m]); // note C++ abs complex type, not C fabs(f) + if (aF > Finfnrm) Finfnrm = aF; } - FLT relerr = abs(F[nout] - Ftest)/Finfnrm; - //printf("requested tol %.3g: rel err for one mode %.3g\n",tol,relerr); - return (std::isnan(relerr) || relerr > 10.0*tol); // true reports failure + FLT relerr = abs(F[nout] - Ftest) / Finfnrm; + // printf("requested tol %.3g: rel err for one mode %.3g\n",tol,relerr); + return (std::isnan(relerr) || relerr > 10.0 * tol); // true reports failure } diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu index bb2d96758..05b62025e 100644 --- a/test/cuda/cufinufft1d_test.cu +++ b/test/cuda/cufinufft1d_test.cu @@ -16,190 +16,193 @@ using cufinufft::utils::infnorm; -template +template int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) { - std::cout << std::scientific << std::setprecision(3); - int ier; - - thrust::host_vector x(M); - thrust::host_vector> c(M); - thrust::host_vector> fk(N1); - - thrust::device_vector d_x(M); - thrust::device_vector> d_c(M); - thrust::device_vector> d_fk(N1); - - std::default_random_engine eng(1); - std::uniform_real_distribution dist11(-1, 1); - auto randm11 = [&eng, &dist11]() { return dist11(eng); }; - - // Making data + std::cout << std::scientific << std::setprecision(3); + int ier; + + thrust::host_vector x(M); + thrust::host_vector> c(M); + thrust::host_vector> fk(N1); + + thrust::device_vector d_x(M); + thrust::device_vector> d_c(M); + thrust::device_vector> d_fk(N1); + + std::default_random_engine eng(1); + std::uniform_real_distribution dist11(-1, 1); + auto randm11 = [&eng, &dist11]() { + return dist11(eng); + }; + + // Making data + for (int i = 0; i < M; i++) { + x[i] = M_PI * randm11(); // x in [-pi,pi) + } + if (type == 1) { for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) + c[i].real(randm11()); + c[i].imag(randm11()); } - if (type == 1) { - for (int i = 0; i < M; i++) { - c[i].real(randm11()); - c[i].imag(randm11()); - } - } else if (type == 2) { - for (int i = 0; i < N1; i++) { - fk[i].real(randm11()); - fk[i].imag(randm11()); - } - } else { - std::cerr << "Invalid type " << type << " supplied\n"; - return 1; + } else if (type == 2) { + for (int i = 0; i < N1; i++) { + fk[i].real(randm11()); + fk[i].imag(randm11()); } - - d_x = x; - if (type == 1) - d_c = c; - else if (type == 2) - d_fk = fk; - - cudaEvent_t start, stop; - float milliseconds = 0; - float totaltime = 0; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - // warm up CUFFT (is slow, takes around 0.2 sec... ) - cudaEventRecord(start); - { - int nf1 = 1; - cufftHandle fftplan; - cufftPlan1d(&fftplan, nf1, cufft_type(), 1); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); - - // now to the test... - cufinufft_plan_t *dplan; - const int dim = 1; - - // Here we setup our own opts, for gpu_method. - cufinufft_opts opts; - cufinufft_default_opts(&opts); - - opts.gpu_method = method; - opts.gpu_maxbatchsize = 1; - - int nmodes[3] = {N1, 1, 1}; - int ntransf = 1; - cudaEventRecord(start); - - ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); - if (ier != 0) { - printf("err: cufinufft1d_plan\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_setpts_impl(M, d_x.data().get(), NULL, NULL, 0, NULL, NULL, NULL, dplan); - - if (ier != 0) { - printf("err: cufinufft_setpts\n"); - return ier; - } - - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), dplan); - - if (ier != 0) { - printf("err: cufinufft1d_exec\n"); - return ier; - } - - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - float exec_ms = milliseconds; - printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_destroy_impl(dplan); - if (ier != 0) { - printf("err %d: cufinufft1d_destroy\n", ier); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); - - printf("[Method %d] %d U pts to %d NU pts in %.3g s: %.3g NU pts/s\n", opts.gpu_method, N1, M, - totaltime / 1000, M / totaltime * 1000); - printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000); - - T rel_error = std::numeric_limits::max(); - if (type == 1) { - fk = d_fk; - int nt1 = 0.37 * N1; // choose some mode index to check - thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); - for (int j = 0; j < M; ++j) - Ft += c[j] * exp(J * (nt1 * x[j])); // crude direct - int it = N1 / 2 + nt1; // index in complex F as 1d array - - rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex *)fk.data()); - printf("[gpu ] one mode: rel err in F[%d] is %.3g\n", nt1, rel_error); - } else if (type == 2) { - c = d_c; - - int jt = M / 2; // check arbitrary choice of one targ pt - thrust::complex J = thrust::complex(0, iflag); - thrust::complex ct = thrust::complex(0, 0); - int m = 0; - for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) - ct += fk[m++] * exp(J * (m1 * x[jt])); // crude direct - rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); - printf("[gpu ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error); - } - - return std::isnan(rel_error) || rel_error > checktol; + } else { + std::cerr << "Invalid type " << type << " supplied\n"; + return 1; + } + + d_x = x; + if (type == 1) + d_c = c; + else if (type == 2) + d_fk = fk; + + cudaEvent_t start, stop; + float milliseconds = 0; + float totaltime = 0; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // warm up CUFFT (is slow, takes around 0.2 sec... ) + cudaEventRecord(start); + { + int nf1 = 1; + cufftHandle fftplan; + cufftPlan1d(&fftplan, nf1, cufft_type(), 1); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); + + // now to the test... + cufinufft_plan_t *dplan; + const int dim = 1; + + // Here we setup our own opts, for gpu_method. + cufinufft_opts opts; + cufinufft_default_opts(&opts); + + opts.gpu_method = method; + opts.gpu_maxbatchsize = 1; + + int nmodes[3] = {N1, 1, 1}; + int ntransf = 1; + cudaEventRecord(start); + + ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); + if (ier != 0) { + printf("err: cufinufft1d_plan\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_setpts_impl(M, d_x.data().get(), NULL, NULL, 0, NULL, NULL, NULL, + dplan); + + if (ier != 0) { + printf("err: cufinufft_setpts\n"); + return ier; + } + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), + (cuda_complex *)d_fk.data().get(), dplan); + + if (ier != 0) { + printf("err: cufinufft1d_exec\n"); + return ier; + } + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + float exec_ms = milliseconds; + printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_destroy_impl(dplan); + if (ier != 0) { + printf("err %d: cufinufft1d_destroy\n", ier); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); + + printf("[Method %d] %d U pts to %d NU pts in %.3g s: %.3g NU pts/s\n", + opts.gpu_method, N1, M, totaltime / 1000, M / totaltime * 1000); + printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000); + + T rel_error = std::numeric_limits::max(); + if (type == 1) { + fk = d_fk; + int nt1 = 0.37 * N1; // choose some mode index to check + thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); + for (int j = 0; j < M; ++j) Ft += c[j] * exp(J * (nt1 * x[j])); // crude direct + int it = N1 / 2 + nt1; // index in complex F as 1d array + + rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex *)fk.data()); + printf("[gpu ] one mode: rel err in F[%d] is %.3g\n", nt1, rel_error); + } else if (type == 2) { + c = d_c; + + int jt = M / 2; // check arbitrary choice of one targ pt + thrust::complex J = thrust::complex(0, iflag); + thrust::complex ct = thrust::complex(0, 0); + int m = 0; + for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += fk[m++] * exp(J * (m1 * x[jt])); // crude direct + rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); + printf("[gpu ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error); + } + + return std::isnan(rel_error) || rel_error > checktol; } int main(int argc, char *argv[]) { - if (argc != 8) { - fprintf(stderr, "Usage: cufinufft1d_test method type N1 M tol checktol prec\n" - "Arguments:\n" - " method: One of\n" - " 1: nupts driven\n" - " type: Type of transform (1, 2)\n" - " N1: Number of fourier modes\n" - " M: The number of non-uniform points\n" - " tol: NUFFT tolerance\n" - " checktol: relative error to pass test\n" - " precision: f or d\n"); - return 1; - } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int M = atof(argv[4]); - const double tol = atof(argv[5]); - const double checktol = atof(argv[6]); - const int iflag = 1; - const char prec = argv[7][0]; - if (prec == 'f') - return run_test(method, type, N1, M, tol, checktol, iflag); - else if (prec == 'd') - return run_test(method, type, N1, M, tol, checktol, iflag); - else - return -1; + if (argc != 8) { + fprintf(stderr, "Usage: cufinufft1d_test method type N1 M tol checktol prec\n" + "Arguments:\n" + " method: One of\n" + " 1: nupts driven\n" + " type: Type of transform (1, 2)\n" + " N1: Number of fourier modes\n" + " M: The number of non-uniform points\n" + " tol: NUFFT tolerance\n" + " checktol: relative error to pass test\n" + " precision: f or d\n"); + return 1; + } + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int M = atof(argv[4]); + const double tol = atof(argv[5]); + const double checktol = atof(argv[6]); + const int iflag = 1; + const char prec = argv[7][0]; + if (prec == 'f') + return run_test(method, type, N1, M, tol, checktol, iflag); + else if (prec == 'd') + return run_test(method, type, N1, M, tol, checktol, iflag); + else + return -1; } diff --git a/test/cuda/cufinufft2d1nupts_test.cu b/test/cuda/cufinufft2d1nupts_test.cu index 409c42625..6817712df 100644 --- a/test/cuda/cufinufft2d1nupts_test.cu +++ b/test/cuda/cufinufft2d1nupts_test.cu @@ -18,207 +18,213 @@ using cufinufft::utils::infnorm; -template -int run_test(int method) { - int N1 = 100; - int N2 = 100; - int N = N1 * N2; - int M1 = N1 * N2; - int M2 = 2 * N1 * N2; - - T tol = 1e-5; - int iflag = 1; - - std::cout << std::scientific << std::setprecision(3); - int ier; - - thrust::host_vector x1(M1), y1(M1); - thrust::host_vector> c1(M1), fk1(N1 * N2); - thrust::device_vector d_x1(M1), d_y1(M1); - thrust::device_vector> d_c1(M1), d_fk1(N1 * N2); - - thrust::host_vector x2(M2), y2(M2); - thrust::host_vector> c2(M2), fk2(N1 * N2); - thrust::device_vector d_x2(M2), d_y2(M2); - thrust::device_vector> d_c2(M2), d_fk2(N1 * N2); - - std::default_random_engine eng(1); - std::uniform_real_distribution dist11(-1, 1); - auto randm11 = [&eng, &dist11]() { return dist11(eng); }; - - // Making data - for (int i = 0; i < M1; i++) { - x1[i] = M_PI * randm11(); // x in [-pi,pi) - y1[i] = M_PI * randm11(); - c1[i].real(randm11()); - c1[i].imag(randm11()); - } - - for (int i = 0; i < M2; i++) { - x2[i] = M_PI * randm11(); // x in [-pi,pi) - y2[i] = M_PI * randm11(); - c2[i].real(randm11()); - c2[i].imag(randm11()); - } - - d_x1 = x1; - d_y1 = y1; - d_c1 = c1; - d_x2 = x2; - d_y2 = y2; - d_c2 = c2; - - cudaEvent_t start, stop; - float milliseconds = 0; - float totaltime = 0; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - // warm up CUFFT (is slow, takes around 0.2 sec... ) - cudaEventRecord(start); - { - int nf1 = 1; - cufftHandle fftplan; - cufftPlan1d(&fftplan, nf1, cufft_type(), 1); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); - - // now to our tests... - cufinufft_plan_t *dplan; - int dim = 2; - int type = 1; - - // Here we setup our own opts, for gpu_method. - cufinufft_opts opts; - cufinufft_default_opts(&opts); - - opts.gpu_method = method; - opts.gpu_maxbatchsize = 1; - - int nmodes[3]; - int ntransf = 1; - - nmodes[0] = N1; - nmodes[1] = N2; - nmodes[2] = 1; - cudaEventRecord(start); - ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); - if (ier != 0) { - printf("err: cufinufft2d_plan\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_setpts_impl(M1, d_x1.data().get(), d_y1.data().get(), NULL, 0, NULL, NULL, NULL, dplan); - if (ier != 0) { - printf("err: cufinufft_setpts (set 1)\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft setNUpts (set 1):\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_execute_impl((cuda_complex *)d_c1.data().get(), (cuda_complex *)d_fk1.data().get(), dplan); - - if (ier != 0) { - printf("err: cufinufft2d1_exec (set 1)\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - float exec_ms = milliseconds; - printf("[time ] cufinufft exec (set 1):\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_setpts_impl(M2, d_x2.data().get(), d_y2.data().get(), NULL, 0, NULL, NULL, NULL, dplan); - if (ier != 0) { - printf("err: cufinufft_setpts (set 2)\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft setNUpts (set 2):\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_execute_impl((cuda_complex *)d_c2.data().get(), (cuda_complex *)d_fk2.data().get(), dplan); - if (ier != 0) { - printf("err: cufinufft2d1_exec (set 2)\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - exec_ms += milliseconds; - printf("[time ] cufinufft exec (set 2):\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_destroy_impl(dplan); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); - - fk1 = d_fk1; - fk2 = d_fk2; - - printf("[Method %d] (%d+%d) NU pts to %d U pts in %.3g s: %.3g NU pts/s\n", opts.gpu_method, M1, M2, N1 * N2, - totaltime / 1000, (M1 + M2) / totaltime * 1000); - printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", (M1 + M2) / exec_ms * 1000); - - int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check - thrust::complex Ft(0, 0), J(0, iflag); - for (int j = 0; j < M1; ++j) - Ft += c1[j] * exp(J * (nt1 * x1[j] + nt2 * y1[j])); // crude direct - int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array - - printf("[gpu ] one mode: rel err in F[%d,%d] is %.3g (set 1)\n", (int)nt1, (int)nt2, - abs(Ft - fk1[it]) / infnorm(N, (std::complex *)fk1.data())); - Ft = thrust::complex(0, 0); - for (int j = 0; j < M2; ++j) - Ft += c2[j] * exp(J * (nt1 * x2[j] + nt2 * y2[j])); // crude direct - printf("[gpu ] one mode: rel err in F[%d,%d] is %.3g (set 2)\n", (int)nt1, (int)nt2, - abs(Ft - fk2[it]) / infnorm(N, (std::complex *)fk2.data())); - - return 0; +template int run_test(int method) { + int N1 = 100; + int N2 = 100; + int N = N1 * N2; + int M1 = N1 * N2; + int M2 = 2 * N1 * N2; + + T tol = 1e-5; + int iflag = 1; + + std::cout << std::scientific << std::setprecision(3); + int ier; + + thrust::host_vector x1(M1), y1(M1); + thrust::host_vector> c1(M1), fk1(N1 * N2); + thrust::device_vector d_x1(M1), d_y1(M1); + thrust::device_vector> d_c1(M1), d_fk1(N1 * N2); + + thrust::host_vector x2(M2), y2(M2); + thrust::host_vector> c2(M2), fk2(N1 * N2); + thrust::device_vector d_x2(M2), d_y2(M2); + thrust::device_vector> d_c2(M2), d_fk2(N1 * N2); + + std::default_random_engine eng(1); + std::uniform_real_distribution dist11(-1, 1); + auto randm11 = [&eng, &dist11]() { + return dist11(eng); + }; + + // Making data + for (int i = 0; i < M1; i++) { + x1[i] = M_PI * randm11(); // x in [-pi,pi) + y1[i] = M_PI * randm11(); + c1[i].real(randm11()); + c1[i].imag(randm11()); + } + + for (int i = 0; i < M2; i++) { + x2[i] = M_PI * randm11(); // x in [-pi,pi) + y2[i] = M_PI * randm11(); + c2[i].real(randm11()); + c2[i].imag(randm11()); + } + + d_x1 = x1; + d_y1 = y1; + d_c1 = c1; + d_x2 = x2; + d_y2 = y2; + d_c2 = c2; + + cudaEvent_t start, stop; + float milliseconds = 0; + float totaltime = 0; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // warm up CUFFT (is slow, takes around 0.2 sec... ) + cudaEventRecord(start); + { + int nf1 = 1; + cufftHandle fftplan; + cufftPlan1d(&fftplan, nf1, cufft_type(), 1); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); + + // now to our tests... + cufinufft_plan_t *dplan; + int dim = 2; + int type = 1; + + // Here we setup our own opts, for gpu_method. + cufinufft_opts opts; + cufinufft_default_opts(&opts); + + opts.gpu_method = method; + opts.gpu_maxbatchsize = 1; + + int nmodes[3]; + int ntransf = 1; + + nmodes[0] = N1; + nmodes[1] = N2; + nmodes[2] = 1; + cudaEventRecord(start); + ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); + if (ier != 0) { + printf("err: cufinufft2d_plan\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_setpts_impl(M1, d_x1.data().get(), d_y1.data().get(), NULL, 0, NULL, + NULL, NULL, dplan); + if (ier != 0) { + printf("err: cufinufft_setpts (set 1)\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft setNUpts (set 1):\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_execute_impl((cuda_complex *)d_c1.data().get(), + (cuda_complex *)d_fk1.data().get(), dplan); + + if (ier != 0) { + printf("err: cufinufft2d1_exec (set 1)\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + float exec_ms = milliseconds; + printf("[time ] cufinufft exec (set 1):\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_setpts_impl(M2, d_x2.data().get(), d_y2.data().get(), NULL, 0, NULL, + NULL, NULL, dplan); + if (ier != 0) { + printf("err: cufinufft_setpts (set 2)\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft setNUpts (set 2):\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_execute_impl((cuda_complex *)d_c2.data().get(), + (cuda_complex *)d_fk2.data().get(), dplan); + if (ier != 0) { + printf("err: cufinufft2d1_exec (set 2)\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + exec_ms += milliseconds; + printf("[time ] cufinufft exec (set 2):\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_destroy_impl(dplan); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); + + fk1 = d_fk1; + fk2 = d_fk2; + + printf("[Method %d] (%d+%d) NU pts to %d U pts in %.3g s: %.3g NU pts/s\n", + opts.gpu_method, M1, M2, N1 * N2, totaltime / 1000, + (M1 + M2) / totaltime * 1000); + printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", (M1 + M2) / exec_ms * 1000); + + int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check + thrust::complex Ft(0, 0), J(0, iflag); + for (int j = 0; j < M1; ++j) + Ft += c1[j] * exp(J * (nt1 * x1[j] + nt2 * y1[j])); // crude direct + int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array + + printf("[gpu ] one mode: rel err in F[%d,%d] is %.3g (set 1)\n", (int)nt1, (int)nt2, + abs(Ft - fk1[it]) / infnorm(N, (std::complex *)fk1.data())); + Ft = thrust::complex(0, 0); + for (int j = 0; j < M2; ++j) + Ft += c2[j] * exp(J * (nt1 * x2[j] + nt2 * y2[j])); // crude direct + printf("[gpu ] one mode: rel err in F[%d,%d] is %.3g (set 2)\n", (int)nt1, (int)nt2, + abs(Ft - fk2[it]) / infnorm(N, (std::complex *)fk2.data())); + + return 0; } int main(int argc, char *argv[]) { - if (argc < 3) { - fprintf(stderr, "Usage: cufinufft2d1nupts_test method\n" - "Arguments:\n" - " method: One of\n" - " 1: nupts driven,\n" - " 2: sub-problem, or\n" - " precision: f or d\n"); - return 1; - } - int method; - sscanf(argv[1], "%d", &method); - char prec = argv[2][0]; - - if (prec == 'f') - return run_test(method); - else if (prec == 'd') - return run_test(method); - else - fprintf(stderr, "Invalid precision supplied: %s\n", argv[2]); - + if (argc < 3) { + fprintf(stderr, "Usage: cufinufft2d1nupts_test method\n" + "Arguments:\n" + " method: One of\n" + " 1: nupts driven,\n" + " 2: sub-problem, or\n" + " precision: f or d\n"); return 1; + } + int method; + sscanf(argv[1], "%d", &method); + char prec = argv[2][0]; + + if (prec == 'f') + return run_test(method); + else if (prec == 'd') + return run_test(method); + else + fprintf(stderr, "Invalid precision supplied: %s\n", argv[2]); + + return 1; } diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu index 371b44b2f..2ce430eb6 100644 --- a/test/cuda/cufinufft2d_test.cu +++ b/test/cuda/cufinufft2d_test.cu @@ -17,189 +17,194 @@ using cufinufft::utils::infnorm; -template +template int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int iflag) { - std::cout << std::scientific << std::setprecision(3); - - thrust::host_vector x(M), y(M); - thrust::host_vector> c(M), fk(N1 * N2); - - thrust::device_vector d_x(M), d_y(M); - thrust::device_vector> d_c(M), d_fk(N1 * N2); - - std::default_random_engine eng(1); - std::uniform_real_distribution dist11(-1, 1); - auto randm11 = [&eng, &dist11]() { return dist11(eng); }; - - // Making data + std::cout << std::scientific << std::setprecision(3); + + thrust::host_vector x(M), y(M); + thrust::host_vector> c(M), fk(N1 * N2); + + thrust::device_vector d_x(M), d_y(M); + thrust::device_vector> d_c(M), d_fk(N1 * N2); + + std::default_random_engine eng(1); + std::uniform_real_distribution dist11(-1, 1); + auto randm11 = [&eng, &dist11]() { + return dist11(eng); + }; + + // Making data + for (int i = 0; i < M; i++) { + x[i] = M_PI * randm11(); // x in [-pi,pi) + y[i] = M_PI * randm11(); + } + if (type == 1) { for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) - y[i] = M_PI * randm11(); - } - if (type == 1) { - for (int i = 0; i < M; i++) { - c[i].real(randm11()); - c[i].imag(randm11()); - } - } else if (type == 2) { - for (int i = 0; i < N1 * N2; i++) { - fk[i].real(randm11()); - fk[i].imag(randm11()); - } - } else { - std::cerr << "Invalid type " << type << " supplied\n"; - return 1; - } - - d_x = x; - d_y = y; - if (type == 1) - d_c = c; - else if (type == 2) - d_fk = fk; - - cudaEvent_t start, stop; - float milliseconds = 0; - float totaltime = 0; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - // warm up CUFFT (is slow, takes around 0.2 sec... ) - cudaEventRecord(start); - { - int nf1 = 1; - cufftHandle fftplan; - cufftPlan1d(&fftplan, nf1, cufft_type(), 1); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); - - // now to our tests... - cufinufft_plan_t *dplan; - const int dim = 2; - - // Here we setup our own opts, for gpu_method. - cufinufft_opts opts; - cufinufft_default_opts(&opts); - - opts.gpu_method = method; - opts.gpu_maxbatchsize = 1; - - int nmodes[3] = {N1, N2, 1}; - int ntransf = 1; - cudaEventRecord(start); - int ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); - if (ier != 0) { - printf("err: cufinufft2d_plan\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), nullptr, 0, nullptr, nullptr, nullptr, dplan); - if (ier != 0) { - printf("err: cufinufft_setpts\n"); - return ier; + c[i].real(randm11()); + c[i].imag(randm11()); } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), dplan); - if (ier != 0) { - printf("err: cufinufft2d1_exec\n"); - return ier; + } else if (type == 2) { + for (int i = 0; i < N1 * N2; i++) { + fk[i].real(randm11()); + fk[i].imag(randm11()); } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - float exec_ms = milliseconds; - printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_destroy_impl(dplan); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); - - if (type == 1) - fk = d_fk; - else if (type == 2) - c = d_c; - - printf("[Method %d] %d NU pts to %d U pts in %.3g s: %.3g NU pts/s\n", opts.gpu_method, M, N1 * N2, - totaltime / 1000, M / totaltime * 1000); - printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000); - - T rel_error = std::numeric_limits::max(); - if (type == 1) { - const int nt1 = 0.37 * N1; - const int nt2 = 0.26 * N2; // choose some mode index to check - thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); - for (int j = 0; j < M; ++j) - Ft += c[j] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct - const int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array - - rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex *)fk.data()); - printf("[gpu ] one mode: rel err in F[%d,%d] is %.3g\n", nt1, nt2, rel_error); - } else if (type == 2) { - int jt = M / 2; // check arbitrary choice of one targ pt - thrust::complex J = thrust::complex(0, iflag); - thrust::complex ct = thrust::complex(0, 0); - - int m = 0; - for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F - for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) - ct += fk[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct - - rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); - printf("[gpu ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error); - } - - return std::isnan(rel_error) || rel_error > checktol; + } else { + std::cerr << "Invalid type " << type << " supplied\n"; + return 1; + } + + d_x = x; + d_y = y; + if (type == 1) + d_c = c; + else if (type == 2) + d_fk = fk; + + cudaEvent_t start, stop; + float milliseconds = 0; + float totaltime = 0; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // warm up CUFFT (is slow, takes around 0.2 sec... ) + cudaEventRecord(start); + { + int nf1 = 1; + cufftHandle fftplan; + cufftPlan1d(&fftplan, nf1, cufft_type(), 1); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); + + // now to our tests... + cufinufft_plan_t *dplan; + const int dim = 2; + + // Here we setup our own opts, for gpu_method. + cufinufft_opts opts; + cufinufft_default_opts(&opts); + + opts.gpu_method = method; + opts.gpu_maxbatchsize = 1; + + int nmodes[3] = {N1, N2, 1}; + int ntransf = 1; + cudaEventRecord(start); + int ier = + cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); + if (ier != 0) { + printf("err: cufinufft2d_plan\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), nullptr, 0, + nullptr, nullptr, nullptr, dplan); + if (ier != 0) { + printf("err: cufinufft_setpts\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), + (cuda_complex *)d_fk.data().get(), dplan); + if (ier != 0) { + printf("err: cufinufft2d1_exec\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + float exec_ms = milliseconds; + printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_destroy_impl(dplan); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); + + if (type == 1) + fk = d_fk; + else if (type == 2) + c = d_c; + + printf("[Method %d] %d NU pts to %d U pts in %.3g s: %.3g NU pts/s\n", + opts.gpu_method, M, N1 * N2, totaltime / 1000, M / totaltime * 1000); + printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000); + + T rel_error = std::numeric_limits::max(); + if (type == 1) { + const int nt1 = 0.37 * N1; + const int nt2 = 0.26 * N2; // choose some mode index to check + thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); + for (int j = 0; j < M; ++j) + Ft += c[j] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct + const int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array + + rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex *)fk.data()); + printf("[gpu ] one mode: rel err in F[%d,%d] is %.3g\n", nt1, nt2, rel_error); + } else if (type == 2) { + int jt = M / 2; // check arbitrary choice of one targ pt + thrust::complex J = thrust::complex(0, iflag); + thrust::complex ct = thrust::complex(0, 0); + + int m = 0; + for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F + for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += fk[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct + + rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); + printf("[gpu ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error); + } + + return std::isnan(rel_error) || rel_error > checktol; } int main(int argc, char *argv[]) { - if (argc != 9) { - fprintf(stderr, "Usage: cufinufft2d1_test method N1 N2 M tol checktol\n" - "Arguments:\n" - " method: One of\n" - " 1: nupts driven,\n" - " 2: sub-problem, or\n" - " type: Type of transform (1, 2)" - " N1, N2: The size of the 2D array\n" - " M: The number of non-uniform points\n" - " tol: NUFFT tolerance\n" - " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); - return 1; - } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int N2 = atof(argv[4]); - const int M = atof(argv[5]); - const double tol = atof(argv[6]); - const double checktol = atof(argv[7]); - const char prec = argv[8][0]; - const int iflag = 1; - - if (prec == 'f') - return run_test(method, type, N1, N2, M, tol, checktol, iflag); - else if (prec == 'd') - return run_test(method, type, N1, N2, M, tol, checktol, iflag); - else - return -1; + if (argc != 9) { + fprintf(stderr, "Usage: cufinufft2d1_test method N1 N2 M tol checktol\n" + "Arguments:\n" + " method: One of\n" + " 1: nupts driven,\n" + " 2: sub-problem, or\n" + " type: Type of transform (1, 2)" + " N1, N2: The size of the 2D array\n" + " M: The number of non-uniform points\n" + " tol: NUFFT tolerance\n" + " checktol: relative error to pass test\n" + " prec: 'f' or 'd' (float/double)\n"); + return 1; + } + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int N2 = atof(argv[4]); + const int M = atof(argv[5]); + const double tol = atof(argv[6]); + const double checktol = atof(argv[7]); + const char prec = argv[8][0]; + const int iflag = 1; + + if (prec == 'f') + return run_test(method, type, N1, N2, M, tol, checktol, iflag); + else if (prec == 'd') + return run_test(method, type, N1, N2, M, tol, checktol, iflag); + else + return -1; } diff --git a/test/cuda/cufinufft2dmany_test.cu b/test/cuda/cufinufft2dmany_test.cu index 96f3cecf3..0a9e45d00 100644 --- a/test/cuda/cufinufft2dmany_test.cu +++ b/test/cuda/cufinufft2dmany_test.cu @@ -17,195 +17,208 @@ using cufinufft::utils::infnorm; -template -int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize, int M, T tol, T checktol, int iflag) { - std::cout << std::scientific << std::setprecision(3); - - int ier; - const int N = N1 * N2; - printf("#modes = %d, #inputs = %d, #NUpts = %d\n", N, ntransf, M); - - thrust::host_vector x(M), y(M); - thrust::host_vector> c(M * ntransf), fk(ntransf * N1 * N2); - - thrust::device_vector d_x(M), d_y(M); - thrust::device_vector> d_c(M * ntransf), d_fk(ntransf * N1 * N2); - - std::default_random_engine eng(1); - std::uniform_real_distribution dist11(-1, 1); - auto randm11 = [&eng, &dist11]() { return dist11(eng); }; - - // Making data - for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) - y[i] = M_PI * randm11(); - } - if (type == 1) { - for (int i = 0; i < ntransf * M; i++) { - c[i].real(randm11()); - c[i].imag(randm11()); - } - } else if (type == 2) { - for (int i = 0; i < ntransf * N1 * N2; i++) { - fk[i].real(randm11()); - fk[i].imag(randm11()); - } - } else { - std::cerr << "Invalid type " << type << " supplied\n"; - return 1; +template +int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize, int M, + T tol, T checktol, int iflag) { + std::cout << std::scientific << std::setprecision(3); + + int ier; + const int N = N1 * N2; + printf("#modes = %d, #inputs = %d, #NUpts = %d\n", N, ntransf, M); + + thrust::host_vector x(M), y(M); + thrust::host_vector> c(M * ntransf), fk(ntransf * N1 * N2); + + thrust::device_vector d_x(M), d_y(M); + thrust::device_vector> d_c(M * ntransf), d_fk(ntransf * N1 * N2); + + std::default_random_engine eng(1); + std::uniform_real_distribution dist11(-1, 1); + auto randm11 = [&eng, &dist11]() { + return dist11(eng); + }; + + // Making data + for (int i = 0; i < M; i++) { + x[i] = M_PI * randm11(); // x in [-pi,pi) + y[i] = M_PI * randm11(); + } + if (type == 1) { + for (int i = 0; i < ntransf * M; i++) { + c[i].real(randm11()); + c[i].imag(randm11()); } - - d_x = x; - d_y = y; - if (type == 1) - d_c = c; - else if (type == 2) - d_fk = fk; - - cudaEvent_t start, stop; - float milliseconds = 0; - double totaltime = 0; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - // warm up CUFFT (is slow, takes around 0.2 sec... ) - cudaEventRecord(start); - { - int nf1 = 1; - cufftHandle fftplan; - cufftPlan1d(&fftplan, nf1, cufft_type(), 1); + } else if (type == 2) { + for (int i = 0; i < ntransf * N1 * N2; i++) { + fk[i].real(randm11()); + fk[i].imag(randm11()); } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); - - // now to the test... - cufinufft_plan_t *dplan; - int dim = 2; - - // Here we setup our own opts, for gpu_method. - cufinufft_opts opts; - cufinufft_default_opts(&opts); - - opts.gpu_method = method; - opts.gpu_maxbatchsize = maxbatchsize; - - int nmodes[3] = {N1, N2, 1}; - cudaEventRecord(start); - ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); - if (ier != 0) { - printf("err: cufinufft2d_plan\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), NULL, 0, NULL, NULL, NULL, dplan); - if (ier != 0) { - printf("err: cufinufft_setpts\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), dplan); - if (ier != 0) { - printf("err: cufinufft2d_exec\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - float exec_ms = milliseconds; - totaltime += milliseconds; - printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_destroy_impl(dplan); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); - - if (type == 1) - fk = d_fk; - else if (type == 2) - c = d_c; - - T rel_error = std::numeric_limits::max(); - if (type == 1) { - int i = ntransf - 1; // // choose some data to check - int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check - thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); - for (int j = 0; j < M; ++j) - Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct - int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array - rel_error = abs(Ft - fk[it + i * N]) / infnorm(N1, (std::complex *)fk.data() + i * N); - printf("[gpu ] %dth data one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2, rel_error); - } else if (type == 2) { - const int t = ntransf - 1; - thrust::complex *fkstart = fk.data() + t * N1 * N2; - const thrust::complex *cstart = c.data() + t * M; - const int jt = M / 2; // check arbitrary choice of one targ pt - const thrust::complex J(0, iflag); - thrust::complex ct(0, 0); - int m = 0; - for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F - for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) - ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct - - rel_error = abs(cstart[jt] - ct) / infnorm(M, (std::complex *)c.data()); - printf("[gpu ] %dth data one targ: rel err in c[%d] is %.3g\n", t, jt, rel_error); - } - - printf("[totaltime] %.3g us, speed %.3g NUpts/s\n", totaltime * 1000, M * ntransf / totaltime * 1000); - printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M * ntransf / exec_ms * 1000); - return std::isnan(rel_error) || rel_error > checktol; + } else { + std::cerr << "Invalid type " << type << " supplied\n"; + return 1; + } + + d_x = x; + d_y = y; + if (type == 1) + d_c = c; + else if (type == 2) + d_fk = fk; + + cudaEvent_t start, stop; + float milliseconds = 0; + double totaltime = 0; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // warm up CUFFT (is slow, takes around 0.2 sec... ) + cudaEventRecord(start); + { + int nf1 = 1; + cufftHandle fftplan; + cufftPlan1d(&fftplan, nf1, cufft_type(), 1); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); + + // now to the test... + cufinufft_plan_t *dplan; + int dim = 2; + + // Here we setup our own opts, for gpu_method. + cufinufft_opts opts; + cufinufft_default_opts(&opts); + + opts.gpu_method = method; + opts.gpu_maxbatchsize = maxbatchsize; + + int nmodes[3] = {N1, N2, 1}; + cudaEventRecord(start); + ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); + if (ier != 0) { + printf("err: cufinufft2d_plan\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), NULL, 0, NULL, + NULL, NULL, dplan); + if (ier != 0) { + printf("err: cufinufft_setpts\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), + (cuda_complex *)d_fk.data().get(), dplan); + if (ier != 0) { + printf("err: cufinufft2d_exec\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + float exec_ms = milliseconds; + totaltime += milliseconds; + printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_destroy_impl(dplan); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); + + if (type == 1) + fk = d_fk; + else if (type == 2) + c = d_c; + + T rel_error = std::numeric_limits::max(); + if (type == 1) { + int i = ntransf - 1; // // choose some data to check + int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check + thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); + for (int j = 0; j < M; ++j) + Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct + int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array + rel_error = + abs(Ft - fk[it + i * N]) / infnorm(N1, (std::complex *)fk.data() + i * N); + printf("[gpu ] %dth data one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2, + rel_error); + } else if (type == 2) { + const int t = ntransf - 1; + thrust::complex *fkstart = fk.data() + t * N1 * N2; + const thrust::complex *cstart = c.data() + t * M; + const int jt = M / 2; // check arbitrary choice of one targ pt + const thrust::complex J(0, iflag); + thrust::complex ct(0, 0); + int m = 0; + for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F + for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct + + rel_error = abs(cstart[jt] - ct) / infnorm(M, (std::complex *)c.data()); + printf("[gpu ] %dth data one targ: rel err in c[%d] is %.3g\n", t, jt, rel_error); + } + + printf("[totaltime] %.3g us, speed %.3g NUpts/s\n", totaltime * 1000, + M * ntransf / totaltime * 1000); + printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", + M * ntransf / exec_ms * 1000); + return std::isnan(rel_error) || rel_error > checktol; } int main(int argc, char *argv[]) { - if (argc != 11) { - fprintf(stderr, "Usage: cufinufft2d1many_test method type N1 N2 ntransf maxbatchsize M tol checktol prec\n" - "Arguments:\n" - " method: One of\n" - " 1: nupts driven,\n" - " 2: sub-problem, or\n" - " type: Type of transform (1, 2)\n" - " N1, N2: The size of the 2D array\n" - " ntransf: Number of inputs\n" - " maxbatchsize: Number of simultaneous transforms (or 0 for default)\n" - " M: The number of non-uniform points\n" - " tol: NUFFT tolerance\n" - " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); - return 1; - } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int N2 = atof(argv[4]); - const int ntransf = atof(argv[5]); - const int maxbatchsize = atoi(argv[6]); - const int M = atoi(argv[7]); - const double tol = atof(argv[8]); - const double checktol = atof(argv[9]); - const char prec = argv[10][0]; - const int iflag = 1; - - if (prec == 'f') - return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, iflag); - else if (prec == 'd') - return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, iflag); - else - return -1; + if (argc != 11) { + fprintf(stderr, + "Usage: cufinufft2d1many_test method type N1 N2 ntransf maxbatchsize M tol " + "checktol prec\n" + "Arguments:\n" + " method: One of\n" + " 1: nupts driven,\n" + " 2: sub-problem, or\n" + " type: Type of transform (1, 2)\n" + " N1, N2: The size of the 2D array\n" + " ntransf: Number of inputs\n" + " maxbatchsize: Number of simultaneous transforms (or 0 for default)\n" + " M: The number of non-uniform points\n" + " tol: NUFFT tolerance\n" + " checktol: relative error to pass test\n" + " prec: 'f' or 'd' (float/double)\n"); + return 1; + } + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int N2 = atof(argv[4]); + const int ntransf = atof(argv[5]); + const int maxbatchsize = atoi(argv[6]); + const int M = atoi(argv[7]); + const double tol = atof(argv[8]); + const double checktol = atof(argv[9]); + const char prec = argv[10][0]; + const int iflag = 1; + + if (prec == 'f') + return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, + iflag); + else if (prec == 'd') + return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, + iflag); + else + return -1; } diff --git a/test/cuda/cufinufft3d_test.cu b/test/cuda/cufinufft3d_test.cu index a882f6715..ddca0fd61 100644 --- a/test/cuda/cufinufft3d_test.cu +++ b/test/cuda/cufinufft3d_test.cu @@ -17,198 +17,210 @@ using cufinufft::utils::infnorm; -template -int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T checktol, int iflag) { - std::cout << std::scientific << std::setprecision(3); - int ier; - - thrust::host_vector x(M), y(M), z(M); - thrust::host_vector> c(M), fk(N1 * N2 * N3); - - thrust::device_vector d_x(M), d_y(M), d_z(M); - thrust::device_vector> d_c(M), d_fk(N1 * N2 * N3); - - std::default_random_engine eng(1); - std::uniform_real_distribution dist11(-1, 1); - auto randm11 = [&eng, &dist11]() { return dist11(eng); }; - - // Making data +template +int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T checktol, + int iflag) { + std::cout << std::scientific << std::setprecision(3); + int ier; + + thrust::host_vector x(M), y(M), z(M); + thrust::host_vector> c(M), fk(N1 * N2 * N3); + + thrust::device_vector d_x(M), d_y(M), d_z(M); + thrust::device_vector> d_c(M), d_fk(N1 * N2 * N3); + + std::default_random_engine eng(1); + std::uniform_real_distribution dist11(-1, 1); + auto randm11 = [&eng, &dist11]() { + return dist11(eng); + }; + + // Making data + for (int i = 0; i < M; i++) { + x[i] = M_PI * randm11(); // x in [-pi,pi) + y[i] = M_PI * randm11(); + z[i] = M_PI * randm11(); + } + if (type == 1) { for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) - y[i] = M_PI * randm11(); - z[i] = M_PI * randm11(); - } - if (type == 1) { - for (int i = 0; i < M; i++) { - c[i].real(randm11()); - c[i].imag(randm11()); - } - } else if (type == 2) { - for (int i = 0; i < N1 * N2 * N3; i++) { - fk[i].real(randm11()); - fk[i].imag(randm11()); - } - } else { - std::cerr << "Invalid type " << type << " supplied\n"; - return 1; - } - - d_x = x; - d_y = y; - d_z = z; - - if (type == 1) - d_c = c; - else if (type == 2) - d_fk = fk; - - cudaEvent_t start, stop; - float milliseconds = 0; - float totaltime = 0; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - // warm up CUFFT (is slow, takes around 0.2 sec... ) - cudaEventRecord(start); - { - int nf1 = 1; - cufftHandle fftplan; - cufftPlan1d(&fftplan, nf1, cufft_type(), 1); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); - - // now to the test... - cufinufft_plan_t *dplan; - int dim = 3; - - // Here we setup our own opts, for gpu_method and gpu_kerevalmeth. - cufinufft_opts opts; - cufinufft_default_opts(&opts); - - opts.gpu_method = method; - opts.gpu_kerevalmeth = 1; - opts.gpu_maxbatchsize = 1; - - int nmodes[3] = {N1, N2, N3}; - int ntransf = 1; - - cudaEventRecord(start); - ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); - if (ier != 0) { - printf("err: cufinufft_makeplan\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), 0, nullptr, nullptr, - nullptr, dplan); - if (ier != 0) { - printf("err: cufinufft_setpts\n"); - return ier; + c[i].real(randm11()); + c[i].imag(randm11()); } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), dplan); - if (ier != 0) { - printf("err: cufinufft_execute\n"); - return ier; + } else if (type == 2) { + for (int i = 0; i < N1 * N2 * N3; i++) { + fk[i].real(randm11()); + fk[i].imag(randm11()); } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - float exec_ms = milliseconds; - printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_destroy_impl(dplan); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); - - if (type == 1) - fk = d_fk; - else if (type == 2) - c = d_c; - - printf("[Method %d] %d NU pts to %d U pts in %.3g s:\t%.3g NU pts/s\n", opts.gpu_method, M, N1 * N2 * N3, - totaltime / 1000, M / totaltime * 1000); - printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000); - - T rel_error = std::numeric_limits::max(); - if (type == 1) { - int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2), nt3 = (int)(0.13 * N3); // choose some mode index to check - thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); - for (int j = 0; j < M; ++j) - Ft += c[j] * exp(J * (nt1 * x[j] + nt2 * y[j] + nt3 * z[j])); // crude direct - - int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2) + N1 * N2 * (N3 / 2 + nt3); // index in complex F as 1d array - rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex *)fk.data()); - printf("[gpu ] one mode: rel err in F[%d,%d,%d] is %.3g\n", nt1, nt2, nt3, rel_error); - } else if (type == 2) { - int jt = M / 2; // check arbitrary choice of one targ pt - thrust::complex J = thrust::complex(0, iflag); - thrust::complex ct = thrust::complex(0, 0); - - int m = 0; - for (int m3 = -(N3 / 2); m3 <= (N3 - 1) / 2; ++m3) // loop in correct order over F - for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F - for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) - ct += fk[m++] * exp(J * (m1 * x[jt] + m2 * y[jt] + m3 * z[jt])); // crude direct - - rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); - printf("[gpu ] one targ: rel err in c[%ld] is %.3g\n", (int64_t)jt, rel_error); - } - - return std::isnan(rel_error) || rel_error > checktol; + } else { + std::cerr << "Invalid type " << type << " supplied\n"; + return 1; + } + + d_x = x; + d_y = y; + d_z = z; + + if (type == 1) + d_c = c; + else if (type == 2) + d_fk = fk; + + cudaEvent_t start, stop; + float milliseconds = 0; + float totaltime = 0; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // warm up CUFFT (is slow, takes around 0.2 sec... ) + cudaEventRecord(start); + { + int nf1 = 1; + cufftHandle fftplan; + cufftPlan1d(&fftplan, nf1, cufft_type(), 1); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); + + // now to the test... + cufinufft_plan_t *dplan; + int dim = 3; + + // Here we setup our own opts, for gpu_method and gpu_kerevalmeth. + cufinufft_opts opts; + cufinufft_default_opts(&opts); + + opts.gpu_method = method; + opts.gpu_kerevalmeth = 1; + opts.gpu_maxbatchsize = 1; + + int nmodes[3] = {N1, N2, N3}; + int ntransf = 1; + + cudaEventRecord(start); + ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); + if (ier != 0) { + printf("err: cufinufft_makeplan\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), + 0, nullptr, nullptr, nullptr, dplan); + if (ier != 0) { + printf("err: cufinufft_setpts\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), + (cuda_complex *)d_fk.data().get(), dplan); + if (ier != 0) { + printf("err: cufinufft_execute\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + float exec_ms = milliseconds; + printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_destroy_impl(dplan); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); + + if (type == 1) + fk = d_fk; + else if (type == 2) + c = d_c; + + printf("[Method %d] %d NU pts to %d U pts in %.3g s:\t%.3g NU pts/s\n", opts.gpu_method, + M, N1 * N2 * N3, totaltime / 1000, M / totaltime * 1000); + printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000); + + T rel_error = std::numeric_limits::max(); + if (type == 1) { + int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2), nt3 = (int)(0.13 * N3); // choose + // some + // mode + // index + // to + // check + thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); + for (int j = 0; j < M; ++j) + Ft += c[j] * exp(J * (nt1 * x[j] + nt2 * y[j] + nt3 * z[j])); // crude direct + + int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2) + N1 * N2 * (N3 / 2 + nt3); // index in + // complex F + // as 1d array + rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex *)fk.data()); + printf("[gpu ] one mode: rel err in F[%d,%d,%d] is %.3g\n", nt1, nt2, nt3, + rel_error); + } else if (type == 2) { + int jt = M / 2; // check arbitrary choice of one targ pt + thrust::complex J = thrust::complex(0, iflag); + thrust::complex ct = thrust::complex(0, 0); + + int m = 0; + for (int m3 = -(N3 / 2); m3 <= (N3 - 1) / 2; ++m3) // loop in correct order over F + for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F + for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += fk[m++] * exp(J * (m1 * x[jt] + m2 * y[jt] + m3 * z[jt])); // crude direct + + rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); + printf("[gpu ] one targ: rel err in c[%ld] is %.3g\n", (int64_t)jt, rel_error); + } + + return std::isnan(rel_error) || rel_error > checktol; } int main(int argc, char *argv[]) { - if (argc < 10) { - fprintf(stderr, "Usage: cufinufft3d1_test method type N1 N2 N3 M tol checktol prec\n" - "Arguments:\n" - " method: One of\n" - " 1: nupts driven,\n" - " 2: sub-problem, or\n" - " 4: block gather.\n" - " type: Type of transform (1, 2)" - " N1, N2, N3: The size of the 3D array\n" - " M: The number of non-uniform points\n" - " tol: NUFFT tolerance\n" - " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); - return 1; - } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int N2 = atof(argv[4]); - const int N3 = atof(argv[5]); - const int M = atof(argv[6]); - const double tol = atof(argv[7]); - const double checktol = atof(argv[8]); - const char prec = argv[9][0]; - const int iflag = 1; - - if (prec == 'f') - return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); - else if (prec == 'd') - return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); - else - return -1; + if (argc < 10) { + fprintf(stderr, "Usage: cufinufft3d1_test method type N1 N2 N3 M tol checktol prec\n" + "Arguments:\n" + " method: One of\n" + " 1: nupts driven,\n" + " 2: sub-problem, or\n" + " 4: block gather.\n" + " type: Type of transform (1, 2)" + " N1, N2, N3: The size of the 3D array\n" + " M: The number of non-uniform points\n" + " tol: NUFFT tolerance\n" + " checktol: relative error to pass test\n" + " prec: 'f' or 'd' (float/double)\n"); + return 1; + } + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int N2 = atof(argv[4]); + const int N3 = atof(argv[5]); + const int M = atof(argv[6]); + const double tol = atof(argv[7]); + const double checktol = atof(argv[8]); + const char prec = argv[9][0]; + const int iflag = 1; + + if (prec == 'f') + return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); + else if (prec == 'd') + return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); + else + return -1; } diff --git a/test/cuda/fseries_kernel_test.cu b/test/cuda/fseries_kernel_test.cu index 7e1a5f728..7f18ee21c 100644 --- a/test/cuda/fseries_kernel_test.cu +++ b/test/cuda/fseries_kernel_test.cu @@ -13,155 +13,146 @@ using namespace cufinufft::common; using namespace cufinufft::spreadinterp; using namespace cufinufft::utils; -template -int run_test(int nf1, int dim, T eps, int gpu, int nf2, int nf3) { +template int run_test(int nf1, int dim, T eps, int gpu, int nf2, int nf3) { - finufft_spread_opts opts; - T *fwkerhalf1, *fwkerhalf2, *fwkerhalf3; - T *d_fwkerhalf1, *d_fwkerhalf2, *d_fwkerhalf3; - checkCudaErrors(cudaMalloc(&d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1))); - if (dim > 1) - checkCudaErrors(cudaMalloc(&d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1))); - if (dim > 2) - checkCudaErrors(cudaMalloc(&d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1))); + finufft_spread_opts opts; + T *fwkerhalf1, *fwkerhalf2, *fwkerhalf3; + T *d_fwkerhalf1, *d_fwkerhalf2, *d_fwkerhalf3; + checkCudaErrors(cudaMalloc(&d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1))); + if (dim > 1) checkCudaErrors(cudaMalloc(&d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1))); + if (dim > 2) checkCudaErrors(cudaMalloc(&d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1))); - int ier = setup_spreader(opts, (T)eps, (T)2.0, 0); + int ier = setup_spreader(opts, (T)eps, (T)2.0, 0); - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); - float milliseconds = 0; - float gputime = 0; - float cputime = 0; + float milliseconds = 0; + float gputime = 0; + float cputime = 0; - CNTime timer; - if (!gpu) { - timer.start(); - fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); - if (dim > 1) - fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1)); - if (dim > 2) - fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); + CNTime timer; + if (!gpu) { + timer.start(); + fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); + if (dim > 1) fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1)); + if (dim > 2) fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); - onedim_fseries_kernel(nf1, fwkerhalf1, opts); - if (dim > 1) - onedim_fseries_kernel(nf2, fwkerhalf2, opts); - if (dim > 2) - onedim_fseries_kernel(nf3, fwkerhalf3, opts); - cputime = timer.elapsedsec(); - cudaEventRecord(start); - { - checkCudaErrors(cudaMemcpy(d_fwkerhalf1, fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), cudaMemcpyHostToDevice)); - if (dim > 1) - checkCudaErrors( - cudaMemcpy(d_fwkerhalf2, fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), cudaMemcpyHostToDevice)); - if (dim > 2) - checkCudaErrors( - cudaMemcpy(d_fwkerhalf3, fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), cudaMemcpyHostToDevice)); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - gputime = milliseconds; - printf("[time ] dim=%d, nf1=%8d, ns=%2d, CPU: %6.2f ms\n", dim, nf1, opts.nspread, gputime + cputime * 1000); - free(fwkerhalf1); - if (dim > 1) - free(fwkerhalf2); - if (dim > 2) - free(fwkerhalf3); - } else { - timer.start(); - std::complex a[dim * MAX_NQUAD]; - T f[dim * MAX_NQUAD]; - onedim_fseries_kernel_precomp(nf1, f, a, opts); - if (dim > 1) - onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, opts); - if (dim > 2) - onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, opts); - cputime = timer.elapsedsec(); + onedim_fseries_kernel(nf1, fwkerhalf1, opts); + if (dim > 1) onedim_fseries_kernel(nf2, fwkerhalf2, opts); + if (dim > 2) onedim_fseries_kernel(nf3, fwkerhalf3, opts); + cputime = timer.elapsedsec(); + cudaEventRecord(start); + { + checkCudaErrors(cudaMemcpy(d_fwkerhalf1, fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), + cudaMemcpyHostToDevice)); + if (dim > 1) + checkCudaErrors(cudaMemcpy(d_fwkerhalf2, fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), + cudaMemcpyHostToDevice)); + if (dim > 2) + checkCudaErrors(cudaMemcpy(d_fwkerhalf3, fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), + cudaMemcpyHostToDevice)); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + gputime = milliseconds; + printf("[time ] dim=%d, nf1=%8d, ns=%2d, CPU: %6.2f ms\n", dim, nf1, opts.nspread, + gputime + cputime * 1000); + free(fwkerhalf1); + if (dim > 1) free(fwkerhalf2); + if (dim > 2) free(fwkerhalf3); + } else { + timer.start(); + std::complex a[dim * MAX_NQUAD]; + T f[dim * MAX_NQUAD]; + onedim_fseries_kernel_precomp(nf1, f, a, opts); + if (dim > 1) onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, opts); + if (dim > 2) + onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, opts); + cputime = timer.elapsedsec(); - cuDoubleComplex *d_a; - T *d_f; - cudaEventRecord(start); - { - checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex))); - checkCudaErrors(cudaMalloc(&d_f, dim * MAX_NQUAD * sizeof(T))); - checkCudaErrors(cudaMemcpy(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice)); - ier = cufserieskernelcompute(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, - opts.nspread, cudaStreamDefault); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - gputime = milliseconds; - printf("[time ] dim=%d, nf1=%8d, ns=%2d, GPU: %6.2f ms\n", dim, nf1, opts.nspread, gputime + cputime * 1000); - cudaFree(d_a); - cudaFree(d_f); + cuDoubleComplex *d_a; + T *d_f; + cudaEventRecord(start); + { + checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex))); + checkCudaErrors(cudaMalloc(&d_f, dim * MAX_NQUAD * sizeof(T))); + checkCudaErrors(cudaMemcpy(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice)); + ier = + cufserieskernelcompute(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, d_fwkerhalf2, + d_fwkerhalf3, opts.nspread, cudaStreamDefault); } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + gputime = milliseconds; + printf("[time ] dim=%d, nf1=%8d, ns=%2d, GPU: %6.2f ms\n", dim, nf1, opts.nspread, + gputime + cputime * 1000); + cudaFree(d_a); + cudaFree(d_f); + } - fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); - if (dim > 1) - fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1)); - if (dim > 2) - fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); + fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); + if (dim > 1) fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1)); + if (dim > 2) fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); - checkCudaErrors(cudaMemcpy(fwkerhalf1, d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), cudaMemcpyDeviceToHost)); - if (dim > 1) - checkCudaErrors(cudaMemcpy(fwkerhalf2, d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), cudaMemcpyDeviceToHost)); - if (dim > 2) - checkCudaErrors(cudaMemcpy(fwkerhalf3, d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), cudaMemcpyDeviceToHost)); - for (int i = 0; i < nf1 / 2 + 1; i++) - printf("%10.8e ", fwkerhalf1[i]); - printf("\n"); - if (dim > 1) - for (int i = 0; i < nf2 / 2 + 1; i++) - printf("%10.8e ", fwkerhalf2[i]); - printf("\n"); - if (dim > 2) - for (int i = 0; i < nf3 / 2 + 1; i++) - printf("%10.8e ", fwkerhalf3[i]); - printf("\n"); + checkCudaErrors(cudaMemcpy(fwkerhalf1, d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), + cudaMemcpyDeviceToHost)); + if (dim > 1) + checkCudaErrors(cudaMemcpy(fwkerhalf2, d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), + cudaMemcpyDeviceToHost)); + if (dim > 2) + checkCudaErrors(cudaMemcpy(fwkerhalf3, d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), + cudaMemcpyDeviceToHost)); + for (int i = 0; i < nf1 / 2 + 1; i++) printf("%10.8e ", fwkerhalf1[i]); + printf("\n"); + if (dim > 1) + for (int i = 0; i < nf2 / 2 + 1; i++) printf("%10.8e ", fwkerhalf2[i]); + printf("\n"); + if (dim > 2) + for (int i = 0; i < nf3 / 2 + 1; i++) printf("%10.8e ", fwkerhalf3[i]); + printf("\n"); - return 0; + return 0; } int main(int argc, char *argv[]) { - if (argc < 3) { - fprintf(stderr, "Usage: onedim_fseries_kernel_test prec nf1 [dim [tol [gpuversion [nf2 [nf3]]]]]\n" - "Arguments:\n" - " prec: 'f' or 'd' (float/double)\n" - " nf1: The size of the upsampled fine grid size in x.\n" - " dim: Dimension of the nuFFT.\n" - " tol: NUFFT tolerance (default 1e-6).\n" - " gpuversion: Use gpu version or not (default True).\n" - " nf2: The size of the upsampled fine grid size in y. (default nf1)\n" - " nf3: The size of the upsampled fine grid size in z. (default nf3)\n"); - return 1; - } - char prec = argv[1][0]; - int nf1 = std::atof(argv[2]); - int dim = 1; - double eps = 1e-6; - int gpu = 1; - int nf2 = nf1; - int nf3 = nf1; - if (argc > 3) - dim = std::atoi(argv[3]); - if (argc > 4) - eps = std::atof(argv[4]); - if (argc > 5) - gpu = std::atoi(argv[5]); - if (argc > 6) - nf2 = std::atoi(argv[6]); - if (argc > 7) - nf3 = std::atoi(argv[7]); + if (argc < 3) { + fprintf(stderr, + "Usage: onedim_fseries_kernel_test prec nf1 [dim [tol [gpuversion [nf2 " + "[nf3]]]]]\n" + "Arguments:\n" + " prec: 'f' or 'd' (float/double)\n" + " nf1: The size of the upsampled fine grid size in x.\n" + " dim: Dimension of the nuFFT.\n" + " tol: NUFFT tolerance (default 1e-6).\n" + " gpuversion: Use gpu version or not (default True).\n" + " nf2: The size of the upsampled fine grid size in y. (default nf1)\n" + " nf3: The size of the upsampled fine grid size in z. (default nf3)\n"); + return 1; + } + char prec = argv[1][0]; + int nf1 = std::atof(argv[2]); + int dim = 1; + double eps = 1e-6; + int gpu = 1; + int nf2 = nf1; + int nf3 = nf1; + if (argc > 3) dim = std::atoi(argv[3]); + if (argc > 4) eps = std::atof(argv[4]); + if (argc > 5) gpu = std::atoi(argv[5]); + if (argc > 6) nf2 = std::atoi(argv[6]); + if (argc > 7) nf3 = std::atoi(argv[7]); - if (prec == 'f') - return run_test(nf1, dim, eps, gpu, nf2, nf3); - else if (prec == 'd') - return run_test(nf1, dim, eps, gpu, nf2, nf3); - else - return -1; + if (prec == 'f') + return run_test(nf1, dim, eps, gpu, nf2, nf3); + else if (prec == 'd') + return run_test(nf1, dim, eps, gpu, nf2, nf3); + else + return -1; } diff --git a/test/directft/dirft1d.cpp b/test/directft/dirft1d.cpp index a52d826c4..5f36d76d7 100644 --- a/test/directft/dirft1d.cpp +++ b/test/directft/dirft1d.cpp @@ -1,14 +1,14 @@ -#include #include +#include #include // This is basically a port of dirft1d.f from CMCL package, except with // the 1/nj prefactors for type-1 removed. -void dirft1d1(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT ms, CPX* f) +void dirft1d1(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT ms, CPX *f) /* Direct computation of 1D type-1 nonuniform FFT. Interface same as finufft1d1. c nj-1 -c f[k1] = SUM c[j] exp(+-i k1 x[j]) +c f[k1] = SUM c[j] exp(+-i k1 x[j]) c j=0 c c for -ms/2 <= k1 <= (ms-1)/2. @@ -17,24 +17,24 @@ c used, otherwise the - sign is used, in the exponential. * Uses C++ complex type and winding trick. Barnett 1/25/17 */ { - BIGINT kmin = -(ms/2); // integer divide - for (BIGINT m=0;m0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); - CPX p = pow(a,(FLT)kmin); // starting phase for most neg freq + BIGINT kmin = -(ms / 2); // integer divide + for (BIGINT m = 0; m < ms; ++m) f[m] = CPX(0, 0); // it knows f is complex type + for (BIGINT j = 0; j < nj; ++j) { + CPX a = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); + CPX p = pow(a, (FLT)kmin); // starting phase for most neg freq CPX cc = c[j]; // no 1/nj prefac - for (BIGINT m=0;m0 the + sign is @@ -42,12 +42,12 @@ c used, otherwise the - sign is used, in the exponential. * Uses C++ complex type and winding trick. Barnett 1/25/17 */ { - BIGINT kmin = -(ms/2); // integer divide - for (BIGINT j=0;j0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); - CPX p = pow(a,(FLT)kmin); // starting phase for most neg freq - CPX cc = CPX(0,0); - for (BIGINT m=0;m 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); + CPX p = pow(a, (FLT)kmin); // starting phase for most neg freq + CPX cc = CPX(0, 0); + for (BIGINT m = 0; m < ms; ++m) { cc += f[m] * p; p *= a; } @@ -55,20 +55,19 @@ c used, otherwise the - sign is used, in the exponential. } } -void dirft1d3(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT nk, FLT* s, CPX* f) +void dirft1d3(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT nk, FLT *s, CPX *f) /* Direct computation of 1D type-3 nonuniform FFT. Interface same as finufft1d3 c nj-1 -c f[k] = SUM c[j] exp(+-i s[k] x[j]) -c j=0 +c f[k] = SUM c[j] exp(+-i s[k] x[j]) +c j=0 c for k = 0, ..., nk-1 c If iflag>0 the + sign is used, otherwise the - sign is used, in the c exponential. Uses C++ complex type. Simple brute force. Barnett 1/25/17 */ { - for (BIGINT k=0;k0) ? IMA*s[k] : -IMA*s[k]; - f[k] = CPX(0,0); - for (BIGINT j=0;j 0) ? IMA * s[k] : -IMA * s[k]; + f[k] = CPX(0, 0); + for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j]); } } diff --git a/test/directft/dirft2d.cpp b/test/directft/dirft2d.cpp index 4f91141f6..c13661549 100644 --- a/test/directft/dirft2d.cpp +++ b/test/directft/dirft2d.cpp @@ -1,11 +1,11 @@ -#include #include +#include #include // This is basically a port of dirft2d.f from CMCL package, except with // the 1/nj prefactors for type-1 removed. -void dirft2d1(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f) +void dirft2d1(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f) /* Direct computation of 2D type-1 nonuniform FFT. Interface same as finufft2d1. c nj-1 c f[k1,k2] = SUM c[j] exp(+-i (k1 x[j] + k2 y[j])) @@ -18,32 +18,32 @@ c used, otherwise the - sign is used, in the exponential. * Uses C++ complex type and winding trick. Barnett 1/26/17 */ { - BIGINT k1min = -(ms/2), k2min = -(mt/2); // integer divide - BIGINT N = ms*mt; // total # output modes - for (BIGINT m=0;m0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); - CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]); - CPX sp1 = pow(a1,(FLT)k1min); // starting phase for most neg k1 freq - CPX p2 = pow(a2,(FLT)k2min); - CPX cc = c[j]; // no 1/nj norm - BIGINT m=0; // output pointer - for (BIGINT m2=0;m2 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); + CPX a2 = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]); + CPX sp1 = pow(a1, (FLT)k1min); // starting phase for most neg k1 freq + CPX p2 = pow(a2, (FLT)k2min); + CPX cc = c[j]; // no 1/nj norm + BIGINT m = 0; // output pointer + for (BIGINT m2 = 0; m2 < mt; ++m2) { + CPX p1 = sp1; // must reset p1 for each inner loop + for (BIGINT m1 = 0; m1 < ms; ++m1) { // ms is fast, mt slow + f[m++] += cc * p1 * p2; + p1 *= a1; } p2 *= a2; } } } -void dirft2d2(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f) +void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f) /* Direct computation of 2D type-2 nonuniform FFT. Interface same as finufft2d2 - c[j] = SUM f[k1,k2] exp(+-i (k1 x[j] + k2 y[j])) - k1,k2 + c[j] = SUM f[k1,k2] exp(+-i (k1 x[j] + k2 y[j])) + k1,k2 for j = 0,...,nj-1 where sum is over -ms/2 <= k1 <= (ms-1)/2, -mt/2 <= k2 <= (mt-1)/2. @@ -54,19 +54,19 @@ void dirft2d2(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX Uses C++ complex type and winding trick. Barnett 1/26/17 */ { - BIGINT k1min = -(ms/2), k2min = -(mt/2); // integer divide - for (BIGINT j=0;j0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); - CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]); - CPX sp1 = pow(a1,(FLT)k1min); - CPX p2 = pow(a2,(FLT)k2min); - CPX cc = CPX(0,0); - BIGINT m=0; // input pointer - for (BIGINT m2=0;m2 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); + CPX a2 = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]); + CPX sp1 = pow(a1, (FLT)k1min); + CPX p2 = pow(a2, (FLT)k2min); + CPX cc = CPX(0, 0); + BIGINT m = 0; // input pointer + for (BIGINT m2 = 0; m2 < mt; ++m2) { CPX p1 = sp1; - for (BIGINT m1=0;m10 the + sign is used, otherwise the - sign is used, in the c exponential. Uses C++ complex type. Simple brute force. Barnett 1/26/17 */ { - for (BIGINT k=0;k0) ? IMA*s[k] : -IMA*s[k]; - CPX tt = (iflag>0) ? IMA*t[k] : -IMA*t[k]; - f[k] = CPX(0,0); - for (BIGINT j=0;j 0) ? IMA * s[k] : -IMA * s[k]; + CPX tt = (iflag > 0) ? IMA * t[k] : -IMA * t[k]; + f[k] = CPX(0, 0); + for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j] + tt * y[j]); } } diff --git a/test/directft/dirft3d.cpp b/test/directft/dirft3d.cpp index 63e002283..452b62471 100644 --- a/test/directft/dirft3d.cpp +++ b/test/directft/dirft3d.cpp @@ -1,11 +1,12 @@ -#include #include +#include #include // This is basically a port of dirft2d.f from CMCL package, except with // the 1/nj prefactors for type-1 removed. -void dirft3d1(BIGINT nj,FLT* x,FLT *y,FLT *z, CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f) +void dirft3d1(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt, + BIGINT mu, CPX *f) /* Direct computation of 3D type-1 nonuniform FFT. Interface same as finufft3d1. c nj-1 c f[k1,k2,k3] = SUM c[j] exp(+-i (k1 x[j] + k2 y[j] + k2 z[j])) @@ -19,38 +20,39 @@ c used, otherwise the - sign is used, in the exponential. * Uses C++ complex type and winding trick. Barnett 2/1/17 */ { - BIGINT k1min = -(ms/2), k2min = -(mt/2), k3min = -(mu/2); // integer divide - BIGINT N = ms*mt*mu; // total # output modes - for (BIGINT m=0;m0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); - CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]); - CPX a3 = (iflag>0) ? exp(IMA*z[j]) : exp(-IMA*z[j]); - CPX sp1 = pow(a1,(FLT)k1min); // starting phase for most neg k1 freq - CPX sp2 = pow(a2,(FLT)k2min); - CPX p3 = pow(a3,(FLT)k3min); - CPX cc = c[j]; // no 1/nj norm - BIGINT m=0; // output pointer - for (BIGINT m3=0;m3 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); + CPX a2 = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]); + CPX a3 = (iflag > 0) ? exp(IMA * z[j]) : exp(-IMA * z[j]); + CPX sp1 = pow(a1, (FLT)k1min); // starting phase for most neg k1 freq + CPX sp2 = pow(a2, (FLT)k2min); + CPX p3 = pow(a3, (FLT)k3min); + CPX cc = c[j]; // no 1/nj norm + BIGINT m = 0; // output pointer + for (BIGINT m3 = 0; m3 < mu; ++m3) { CPX p2 = sp2; - for (BIGINT m2=0;m20) ? exp(IMA*x[j]) : exp(-IMA*x[j]); - CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]); - CPX a3 = (iflag>0) ? exp(IMA*z[j]) : exp(-IMA*z[j]); - CPX sp1 = pow(a1,(FLT)k1min); - CPX sp2 = pow(a2,(FLT)k2min); - CPX p3 = pow(a3,(FLT)k3min); - CPX cc = CPX(0,0); - BIGINT m=0; // input pointer - for (BIGINT m3=0;m3 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); + CPX a2 = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]); + CPX a3 = (iflag > 0) ? exp(IMA * z[j]) : exp(-IMA * z[j]); + CPX sp1 = pow(a1, (FLT)k1min); + CPX sp2 = pow(a2, (FLT)k2min); + CPX p3 = pow(a3, (FLT)k3min); + CPX cc = CPX(0, 0); + BIGINT m = 0; // input pointer + for (BIGINT m3 = 0; m3 < mu; ++m3) { CPX p2 = sp2; - for (BIGINT m2=0;m20 the + sign is used, otherwise the - sign is used, in the c exponential. Uses C++ complex type. Simple brute force. Barnett 2/1/17 */ { - for (BIGINT k=0;k0) ? IMA*s[k] : -IMA*s[k]; - CPX tt = (iflag>0) ? IMA*t[k] : -IMA*t[k]; - CPX uu = (iflag>0) ? IMA*u[k] : -IMA*u[k]; - f[k] = CPX(0,0); - for (BIGINT j=0;j 0) ? IMA * s[k] : -IMA * s[k]; + CPX tt = (iflag > 0) ? IMA * t[k] : -IMA * t[k]; + CPX uu = (iflag > 0) ? IMA * u[k] : -IMA * u[k]; + f[k] = CPX(0, 0); + for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j] + tt * y[j] + uu * z[j]); } } diff --git a/test/dumbinputs.cpp b/test/dumbinputs.cpp index b1e8bc6a9..d48757aee 100644 --- a/test/dumbinputs.cpp +++ b/test/dumbinputs.cpp @@ -6,7 +6,7 @@ Usage (linux): ./dumbinputs{f} 2> /dev/null (since FINUFFT will spit msgs to stderr, to be ignored) - + Pass: exit code 0. (Stdout should indicate passed) Fail: exit code>0. (Stdout may indicate what failed) @@ -24,61 +24,62 @@ Removed the chkbnds case to 1d1, 05/08/2024. Suggested compile: - g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs -lfftw3 -lfftw3_omp -lm - g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputsf -lfftw3 -lfftw3_omp -lm -DSINGLE + g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs + -lfftw3 -lfftw3_omp -lm g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include + ../lib/libfinufft.so -o dumbinputsf -lfftw3 -lfftw3_omp -lm -DSINGLE or if you have built a single-core version: - g++ -std=c++14 dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs -lfftw3 -lm - etc + g++ -std=c++14 dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs -lfftw3 + -lm etc */ // This switches FLT macro from double to float if SINGLE is defined, etc... -#include #include "directft/dirft1d.cpp" #include "directft/dirft2d.cpp" #include "directft/dirft3d.cpp" +#include using namespace std; -using namespace finufft::utils; // for twonorm, etc +using namespace finufft::utils; // for twonorm, etc -int main(int argc, char* argv[]) -{ - int M = 100; // number of nonuniform points - int N = 10; // # modes, keep small, also output NU pts in type 3 +int main(int argc, char *argv[]) { + int M = 100; // number of nonuniform points + int N = 10; // # modes, keep small, also output NU pts in type 3 #ifdef SINGLE - FLT acc = 1e-5; // desired accuracy for NUFFTs (prec-dep) + FLT acc = 1e-5; // desired accuracy for NUFFTs (prec-dep) #else - FLT acc = 1e-8; // desired accuracy for NUFFTs + FLT acc = 1e-8; // desired accuracy for NUFFTs #endif - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); - int NN = N*N*N; // modes F alloc size since we'll go to 3d + int NN = N * N * N; // modes F alloc size since we'll go to 3d // generate some "random" nonuniform points (x) and complex strengths (c): - FLT *x = (FLT *)malloc(sizeof(FLT)*M); - CPX* c = (CPX*)malloc(sizeof(CPX)*M); - for (int j=0; j100*acc) { - printf("1d3 M=1:\tier=%d nrm(err)=%.3g\n",ier,err); + ier = FINUFFT1D3(1, x, c, +1, acc, N, s, F, &opts); // XK prod formally 0 + dirft1d3(1, x, c, +1, N, s, Fe); + for (int k = 0; k < N; ++k) F[k] -= Fe[k]; // acc chk + FLT err = twonorm(N, F) / sqrt((FLT)N); + if (ier || err > 100 * acc) { + printf("1d3 M=1:\tier=%d nrm(err)=%.3g\n", ier, err); return 1; } - ier = FINUFFT1D3(M,x,c,+1,acc,1,s,F,&opts); - dirft1d3(M,x,c,+1,1,s,Fe); - err = abs(F[0]-Fe[0]); - if (ier || err>10*acc) { - printf("1d3 nk=1:\tier=%d err=%.3g\n",ier,err); + ier = FINUFFT1D3(M, x, c, +1, acc, 1, s, F, &opts); + dirft1d3(M, x, c, +1, 1, s, Fe); + err = abs(F[0] - Fe[0]); + if (ier || err > 10 * acc) { + printf("1d3 nk=1:\tier=%d err=%.3g\n", ier, err); return 1; } - ier = FINUFFT1D3(1,x,c,+1,acc,1,s,F,&opts); - dirft1d3(1,x,c,+1,1,s,Fe); - err = abs(F[0]-Fe[0]); - if (ier || err>10*acc) { - printf("1d3 M=nk=1:\tier=%d err=%.3g\n",ier,err); + ier = FINUFFT1D3(1, x, c, +1, acc, 1, s, F, &opts); + dirft1d3(1, x, c, +1, 1, s, Fe); + err = abs(F[0] - Fe[0]); + if (ier || err > 10 * acc) { + printf("1d3 M=nk=1:\tier=%d err=%.3g\n", ier, err); return 1; } - ier = FINUFFT1D3(M,x,c,+1,acc,N,shuge,F,&opts); - if (ier==0) { // any nonzero code accepted here - printf("1d3 XK prod too big:\twrong error code %d\n",ier); + ier = FINUFFT1D3(M, x, c, +1, acc, N, shuge, F, &opts); + if (ier == 0) { // any nonzero code accepted here + printf("1d3 XK prod too big:\twrong error code %d\n", ier); return 1; } - int ndata = 10; // how many multiple vectors to test it on - CPX* cm = (CPX*)malloc(sizeof(CPX)*M*ndata); - CPX* Fm = (CPX*)malloc(sizeof(CPX)*NN*ndata); // the biggest array - for (int j=0; j100*acc) { - printf("1d3many M=1:\tier=%d nrm(err)=%.3g\n",ier,err); + ier = FINUFFT1D3MANY(ndata, 1, x, cm, +1, acc, N, s, Fm, &opts); // XK prod formally 0 + dirft1d3(1, x, c, +1, N, s, Fe); + for (int k = 0; k < N; ++k) Fm[k] -= Fe[k]; // acc chk + err = twonorm(N, Fm) / sqrt((FLT)N); // rms, to 5e-5 abs; check just first trial + if (ier || err > 100 * acc) { + printf("1d3many M=1:\tier=%d nrm(err)=%.3g\n", ier, err); return 1; } - ier = FINUFFT1D3MANY(ndata,M,x,cm,+1,acc,1,s,Fm,&opts); - dirft1d3(M,x,c,+1,1,s,Fe); - err = abs(Fm[0]-Fe[0]); - if (ier || err>10*acc) { - printf("1d3many nk=1:\tier=%d err=%.3g\n",ier,err); + ier = FINUFFT1D3MANY(ndata, M, x, cm, +1, acc, 1, s, Fm, &opts); + dirft1d3(M, x, c, +1, 1, s, Fe); + err = abs(Fm[0] - Fe[0]); + if (ier || err > 10 * acc) { + printf("1d3many nk=1:\tier=%d err=%.3g\n", ier, err); return 1; } - ier = FINUFFT1D3MANY(ndata,1,x,cm,+1,acc,1,s,Fm,&opts); - dirft1d3(1,x,c,+1,1,s,Fe); - err = abs(Fm[0]-Fe[0]); - if (ier || err>10*acc) { - printf("1d3many M=nk=1:\tier=%d err=%.3g\n",ier,err); + ier = FINUFFT1D3MANY(ndata, 1, x, cm, +1, acc, 1, s, Fm, &opts); + dirft1d3(1, x, c, +1, 1, s, Fe); + err = abs(Fm[0] - Fe[0]); + if (ier || err > 10 * acc) { + printf("1d3many M=nk=1:\tier=%d err=%.3g\n", ier, err); return 1; } - ier = FINUFFT1D3MANY(ndata,M,x,cm,+1,acc,N,shuge,Fm,&opts); - if (ier==0) { // any nonzero code accepted here - printf("1d3many XK prod too big:\twrong error code %d\n",ier); + ier = FINUFFT1D3MANY(ndata, M, x, cm, +1, acc, N, shuge, Fm, &opts); + if (ier == 0) { // any nonzero code accepted here + printf("1d3many XK prod too big:\twrong error code %d\n", ier); return 1; } // 2222222222222222222222222222222222222222222222222222222222222222222222222 printf("2D dumb cases.\n"); // (uses y=x, and t=s in type 3) - ier = FINUFFT2D1(M,x,x,c,+1,0,N,N,F,&opts); + ier = FINUFFT2D1(M, x, x, c, +1, 0, N, N, F, &opts); if (ier != FINUFFT_WARN_EPS_TOO_SMALL) { - printf("2d1 tol=0:\twrong err code %d\n",ier); + printf("2d1 tol=0:\twrong err code %d\n", ier); return 1; } - ier = FINUFFT2D1(M,x,x,c,+1,acc,0,0,F,&opts); + ier = FINUFFT2D1(M, x, x, c, +1, acc, 0, 0, F, &opts); if (ier) { - printf("2d1 Ns=Nt=0:\tier=%d\n",ier); + printf("2d1 Ns=Nt=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D1(M,x,x,c,+1,acc,0,N,F,&opts); + ier = FINUFFT2D1(M, x, x, c, +1, acc, 0, N, F, &opts); if (ier) { - printf("2d1 Ns=0,Nt>0:\tier=%d\n",ier); + printf("2d1 Ns=0,Nt>0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D1(M,x,x,c,+1,acc,N,0,F,&opts); + ier = FINUFFT2D1(M, x, x, c, +1, acc, N, 0, F, &opts); if (ier) { - printf("2d1 Ns>0,Nt=0:\tier=%d\n",ier); + printf("2d1 Ns>0,Nt=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D1(0,x,x,c,+1,acc,N,N,F,&opts); - t = twonorm(N,F); - if (ier || t!=0.0) { - printf("2d1 M=0:\tier=%d nrm(F)=%.3g\n",ier,t); + ier = FINUFFT2D1(0, x, x, c, +1, acc, N, N, F, &opts); + t = twonorm(N, F); + if (ier || t != 0.0) { + printf("2d1 M=0:\tier=%d nrm(F)=%.3g\n", ier, t); return 1; } - for (int k=0; k0:\tier=%d nrm(c)=%.3g\n",ier,t); + ier = FINUFFT2D2(M, x, x, c, +1, acc, 0, N, F, &opts); + t = twonorm(M, c); + if (ier || t != 0.0) { + printf("2d2 Ns=0,Nt>0:\tier=%d nrm(c)=%.3g\n", ier, t); return 1; } - ier = FINUFFT2D2(M,x,x,c,+1,acc,N,0,F,&opts); - t = twonorm(M,c); - if (ier || t!=0.0) { - printf("2d2 Ns>0,Nt=0:\tier=%d nrm(c)=%.3g\n",ier,t); + ier = FINUFFT2D2(M, x, x, c, +1, acc, N, 0, F, &opts); + t = twonorm(M, c); + if (ier || t != 0.0) { + printf("2d2 Ns>0,Nt=0:\tier=%d nrm(c)=%.3g\n", ier, t); return 1; } - ier = FINUFFT2D2(0,x,x,c,+1,acc,N,N,F,&opts); + ier = FINUFFT2D2(0, x, x, c, +1, acc, N, N, F, &opts); if (ier) { - printf("2d2 M=0:\tier=%d\n",ier); + printf("2d2 M=0:\tier=%d\n", ier); return ier; } - for (int j=0; j1D since guess that 1D would catch it. if (ier) { - printf("2d3 M=nk=1:\tier=%d\n",ier); + printf("2d3 M=nk=1:\tier=%d\n", ier); return ier; - } - for (int k=0; k0:\tier=%d\n",ier); + printf("2d1many Ns=0,Nt>0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D1MANY(ndata,M,x,x,cm,+1,acc,N,0,Fm,&opts); + ier = FINUFFT2D1MANY(ndata, M, x, x, cm, +1, acc, N, 0, Fm, &opts); if (ier) { - printf("2d1many Ns>0,Nt=0:\tier=%d\n",ier); + printf("2d1many Ns>0,Nt=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D1MANY(ndata,0,x,x,cm,+1,acc,N,N,Fm,&opts); - t = twonorm(N*ndata,Fm); - if (ier || t!=0.0) { - printf("2d1many M=0:\tier=%d nrm(Fm)=%.3g\n",ier,t); + ier = FINUFFT2D1MANY(ndata, 0, x, x, cm, +1, acc, N, N, Fm, &opts); + t = twonorm(N * ndata, Fm); + if (ier || t != 0.0) { + printf("2d1many M=0:\tier=%d nrm(Fm)=%.3g\n", ier, t); return 1; } - for (int k=0; k0:\tier=%d nrm(cm)=%.3g\n", ier,t); + ier = FINUFFT2D2MANY(ndata, M, x, x, cm, +1, acc, 0, N, Fm, &opts); + t = twonorm(M * ndata, cm); + if (ier || t != 0.0) { + printf("2d2many Ns=0,Nt>0:\tier=%d nrm(cm)=%.3g\n", ier, t); return 1; } - ier = FINUFFT2D2MANY(ndata,M,x,x,cm,+1,acc,N,0,Fm,&opts); - t = twonorm(M*ndata,cm); - if (ier || t!=0.0) { - printf("2d2many Ns>0,Nt=0:\tier=%d nrm(cm)=%.3g\n", ier,t); + ier = FINUFFT2D2MANY(ndata, M, x, x, cm, +1, acc, N, 0, Fm, &opts); + t = twonorm(M * ndata, cm); + if (ier || t != 0.0) { + printf("2d2many Ns>0,Nt=0:\tier=%d nrm(cm)=%.3g\n", ier, t); return 1; } - ier = FINUFFT2D2MANY(ndata,0,x,x,cm,+1,acc,N,N,Fm,&opts); + ier = FINUFFT2D2MANY(ndata, 0, x, x, cm, +1, acc, N, N, Fm, &opts); if (ier) { - printf("2d2many M=0:\tier=%d\n",ier); + printf("2d2many M=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D3MANY(0,M,x,x,cm,+1,0,N,s,s,Fm,&opts); + ier = FINUFFT2D3MANY(0, M, x, x, cm, +1, 0, N, s, s, Fm, &opts); if (ier != FINUFFT_ERR_NTRANS_NOTVALID) { - printf("2d3many ndata=0:\twrong err code %d\n",ier); + printf("2d3many ndata=0:\twrong err code %d\n", ier); return 1; } - ier = FINUFFT2D3MANY(ndata,M,x,x,cm,+1,0,N,s,s,Fm,&opts); + ier = FINUFFT2D3MANY(ndata, M, x, x, cm, +1, 0, N, s, s, Fm, &opts); if (ier != FINUFFT_WARN_EPS_TOO_SMALL) { - printf("2d3many tol=0:\twrong err code %d\n",ier); + printf("2d3many tol=0:\twrong err code %d\n", ier); return 1; } - ier = FINUFFT2D3MANY(ndata,M,x,x,cm,+1,acc,0,s,s,Fm,&opts); + ier = FINUFFT2D3MANY(ndata, M, x, x, cm, +1, acc, 0, s, s, Fm, &opts); if (ier) { - printf("2d3many nk=0:\tier=%d\n",ier); + printf("2d3many nk=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D3MANY(ndata,0,x,x,cm,+1,acc,N,s,s,Fm,&opts); - t = twonorm(N,Fm); - if (ier || t!=0.0) { - printf("2d3many M=0:\tier=%d nrm(F)=%.3g\n",ier,t); + ier = FINUFFT2D3MANY(ndata, 0, x, x, cm, +1, acc, N, s, s, Fm, &opts); + t = twonorm(N, Fm); + if (ier || t != 0.0) { + printf("2d3many M=0:\tier=%d nrm(F)=%.3g\n", ier, t); return 1; } - ier = FINUFFT2D3MANY(ndata,1,x,x,cm,+1,acc,N,s,s,Fm,&opts); // XK prod formally 0 + ier = FINUFFT2D3MANY(ndata, 1, x, x, cm, +1, acc, N, s, s, Fm, &opts); // XK prod + // formally 0 // we don't check the M=nk=1 case for >1D since guess that 1D would catch it. if (ier) { - printf("2d3many M=nk=1:\tier=%d\n",ier); + printf("2d3many M=nk=1:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D3MANY(ndata,M,x,x,cm,+1,acc,N,shuge,shuge,Fm,&opts); - if (ier==0) { // any nonzero code accepted here - printf("2d3many XK prod too big:\twrong error code %d\n",ier); + ier = FINUFFT2D3MANY(ndata, M, x, x, cm, +1, acc, N, shuge, shuge, Fm, &opts); + if (ier == 0) { // any nonzero code accepted here + printf("2d3many XK prod too big:\twrong error code %d\n", ier); return 1; } - + // 3333333333333333333333333333333333333333333333333333333333333333333333333 - printf("3D dumb cases.\n"); // z=y=x, and u=t=s in type 3 - ier = FINUFFT3D1(M,x,x,x,c,+1,0,N,N,N,F,&opts); + printf("3D dumb cases.\n"); // z=y=x, and u=t=s in type 3 + ier = FINUFFT3D1(M, x, x, x, c, +1, 0, N, N, N, F, &opts); if (ier != FINUFFT_WARN_EPS_TOO_SMALL) { - printf("3d1 tol=0:\twrong err code %d\n",ier); + printf("3d1 tol=0:\twrong err code %d\n", ier); return 1; } - ier = FINUFFT3D1(M,x,x,x,c,+1,acc,0,0,0,F,&opts); + ier = FINUFFT3D1(M, x, x, x, c, +1, acc, 0, 0, 0, F, &opts); if (ier) { - printf("3d1 Ns=Nt=Nu=0:\tier=%d\n",ier); + printf("3d1 Ns=Nt=Nu=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D1(M,x,x,x,c,+1,acc,0,N,0,F,&opts); + ier = FINUFFT3D1(M, x, x, x, c, +1, acc, 0, N, 0, F, &opts); if (ier) { - printf("3d1 Ns=0,Nt>0,Nu=0:\tier=%d\n",ier); + printf("3d1 Ns=0,Nt>0,Nu=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D1(M,x,x,x,c,+1,acc,N,0,N,F,&opts); + ier = FINUFFT3D1(M, x, x, x, c, +1, acc, N, 0, N, F, &opts); if (ier) { - printf("3d1 Ns>0,Nt=0,Nu>0:\tier=%d\n",ier); + printf("3d1 Ns>0,Nt=0,Nu>0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D1(0,x,x,x,c,+1,acc,N,N,N,F,&opts); - t = twonorm(N,F); - if (ier || t!=0.0) { - printf("3d1 M=0:\tier=%d nrm(F)=%.3g\n",ier,t); + ier = FINUFFT3D1(0, x, x, x, c, +1, acc, N, N, N, F, &opts); + t = twonorm(N, F); + if (ier || t != 0.0) { + printf("3d1 M=0:\tier=%d nrm(F)=%.3g\n", ier, t); return 1; } - for (int k=0; k0,Nt=Nu=0:\tier=%d nrm(c)=%.3g\n",ier,t); + ier = FINUFFT3D2(M, x, x, x, c, +1, acc, N, 0, 0, F, &opts); + t = twonorm(M, c); + if (ier || t != 0.0) { + printf("3d2 Ns>0,Nt=Nu=0:\tier=%d nrm(c)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D2(M,x,x,x,c,+1,acc,0,N,0,F,&opts); - t = twonorm(M,c); - if (ier || t!=0.0) { - printf("3d2 Ns=0,Nt>0,Nu=0:\tier=%d nrm(c)=%.3g\n",ier,t); + ier = FINUFFT3D2(M, x, x, x, c, +1, acc, 0, N, 0, F, &opts); + t = twonorm(M, c); + if (ier || t != 0.0) { + printf("3d2 Ns=0,Nt>0,Nu=0:\tier=%d nrm(c)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D2(M,x,x,x,c,+1,acc,0,0,N,F,&opts); - t = twonorm(M,c); - if (ier || t!=0.0) { - printf("3d2 Ns=Nt=0,Nu>0:\tier=%d nrm(c)=%.3g\n",ier,t); + ier = FINUFFT3D2(M, x, x, x, c, +1, acc, 0, 0, N, F, &opts); + t = twonorm(M, c); + if (ier || t != 0.0) { + printf("3d2 Ns=Nt=0,Nu>0:\tier=%d nrm(c)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D2(0,x,x,x,c,+1,acc,N,N,N,F,&opts); + ier = FINUFFT3D2(0, x, x, x, c, +1, acc, N, N, N, F, &opts); if (ier) { - printf("3d2 M=0:\tier=%d\n",ier); + printf("3d2 M=0:\tier=%d\n", ier); return ier; } - for (int j=0; j1D since guess that 1D would catch it. if (ier) { - printf("3d3 M=nk=1:\tier=%d\n",ier); + printf("3d3 M=nk=1:\tier=%d\n", ier); return ier; } - for (int k=0; k0,Nt=Nu=0:\tier=%d\n",ier); + printf("3d1many Ns>0,Nt=Nu=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D1MANY(ndata,M,x,x,x,cm,+1,acc,0,N,0,Fm,&opts); + ier = FINUFFT3D1MANY(ndata, M, x, x, x, cm, +1, acc, 0, N, 0, Fm, &opts); if (ier) { - printf("3d1many Ns=0,Nt>0,Nu=0:\tier=%d\n",ier); + printf("3d1many Ns=0,Nt>0,Nu=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D1MANY(ndata,M,x,x,x,cm,+1,acc,0,0,N,Fm,&opts); + ier = FINUFFT3D1MANY(ndata, M, x, x, x, cm, +1, acc, 0, 0, N, Fm, &opts); if (ier) { - printf("3d1many Ns=Nt=0,Nu>0:\tier=%d\n",ier); + printf("3d1many Ns=Nt=0,Nu>0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D1MANY(ndata,0,x,x,x,cm,+1,acc,N,N,N,Fm,&opts); - t = twonorm(N*ndata,Fm); - if (ier || t!=0.0) { - printf("3d1many M=0:\tier=%d nrm(Fm)=%.3g\n",ier,t); + ier = FINUFFT3D1MANY(ndata, 0, x, x, x, cm, +1, acc, N, N, N, Fm, &opts); + t = twonorm(N * ndata, Fm); + if (ier || t != 0.0) { + printf("3d1many M=0:\tier=%d nrm(Fm)=%.3g\n", ier, t); return 1; } - for (int k=0; k0,Nt=Nu=0:\tier=%d nrm(cm)=%.3g\n", ier,t); + ier = FINUFFT3D2MANY(ndata, M, x, x, x, cm, +1, acc, N, 0, 0, Fm, &opts); + t = twonorm(M * ndata, cm); + if (ier || t != 0.0) { + printf("3d2many Ns>0,Nt=Nu=0:\tier=%d nrm(cm)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D2MANY(ndata,M,x,x,x,cm,+1,acc,0,N,0,Fm,&opts); - t = twonorm(M*ndata,cm); - if (ier || t!=0.0) { - printf("3d2many Ns=0,Nt>0,Nu=0:\tier=%d nrm(cm)=%.3g\n", ier,t); + ier = FINUFFT3D2MANY(ndata, M, x, x, x, cm, +1, acc, 0, N, 0, Fm, &opts); + t = twonorm(M * ndata, cm); + if (ier || t != 0.0) { + printf("3d2many Ns=0,Nt>0,Nu=0:\tier=%d nrm(cm)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D2MANY(ndata,M,x,x,x,cm,+1,acc,0,0,N,Fm,&opts); - t = twonorm(M*ndata,cm); - if (ier || t!=0.0) { - printf("3d2many Ns=Nt=0,Nu>0:\tier=%d nrm(cm)=%.3g\n", ier,t); + ier = FINUFFT3D2MANY(ndata, M, x, x, x, cm, +1, acc, 0, 0, N, Fm, &opts); + t = twonorm(M * ndata, cm); + if (ier || t != 0.0) { + printf("3d2many Ns=Nt=0,Nu>0:\tier=%d nrm(cm)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D2MANY(ndata,0,x,x,x,cm,+1,acc,N,N,N,Fm,&opts); + ier = FINUFFT3D2MANY(ndata, 0, x, x, x, cm, +1, acc, N, N, N, Fm, &opts); if (ier) { - printf("3d2many M=0:\tier=%d\n",ier); + printf("3d2many M=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D3MANY(0,M,x,x,x,cm,+1,0,N,s,s,s,Fm,&opts); + ier = FINUFFT3D3MANY(0, M, x, x, x, cm, +1, 0, N, s, s, s, Fm, &opts); if (ier != FINUFFT_ERR_NTRANS_NOTVALID) { - printf("3d3many ndata=0:\twrong err code %d\n",ier); + printf("3d3many ndata=0:\twrong err code %d\n", ier); return 1; } - ier = FINUFFT3D3MANY(ndata,M,x,x,x,cm,+1,0,N,s,s,s,Fm,&opts); + ier = FINUFFT3D3MANY(ndata, M, x, x, x, cm, +1, 0, N, s, s, s, Fm, &opts); if (ier != FINUFFT_WARN_EPS_TOO_SMALL) { - printf("3d3many tol=0:\twrong err code %d\n",ier); + printf("3d3many tol=0:\twrong err code %d\n", ier); return 1; } - ier = FINUFFT3D3MANY(ndata,M,x,x,x,cm,+1,acc,0,s,s,s,Fm,&opts); + ier = FINUFFT3D3MANY(ndata, M, x, x, x, cm, +1, acc, 0, s, s, s, Fm, &opts); if (ier) { - printf("3d3many nk=0:\tier=%d\n",ier); + printf("3d3many nk=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D3MANY(ndata,0,x,x,x,cm,+1,acc,N,s,s,s,Fm,&opts); - t = twonorm(N,Fm); - if (ier || t!=0.0) { - printf("3d3many M=0:\tier=%d nrm(F)=%.3g\n",ier,t); + ier = FINUFFT3D3MANY(ndata, 0, x, x, x, cm, +1, acc, N, s, s, s, Fm, &opts); + t = twonorm(N, Fm); + if (ier || t != 0.0) { + printf("3d3many M=0:\tier=%d nrm(F)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D3MANY(ndata,1,x,x,x,cm,+1,acc,N,s,s,s,Fm,&opts); // XK prod formally 0 + ier = FINUFFT3D3MANY(ndata, 1, x, x, x, cm, +1, acc, N, s, s, s, Fm, &opts); // XK prod + // formally + // 0 // we don't check the M=nk=1 case for >1D since guess that 1D would catch it. if (ier) { - printf("3d3many M=nk=1:\tier=%d\n",ier); + printf("3d3many M=nk=1:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D3MANY(ndata,M,x,x,x,cm,+1,acc,N,shuge,shuge,shuge,Fm,&opts); - if (ier==0) { // any nonzero code accepted here - printf("3d3many XK prod too big:\twrong error code %d\n",ier); + ier = FINUFFT3D3MANY(ndata, M, x, x, x, cm, +1, acc, N, shuge, shuge, shuge, Fm, &opts); + if (ier == 0) { // any nonzero code accepted here + printf("3d3many XK prod too big:\twrong error code %d\n", ier); return 1; } - - free(x); free(c); free(F); free(s); free(shuge); free(cm); free(Fm); free(Fe); - + + free(x); + free(c); + free(F); + free(s); + free(shuge); + free(cm); + free(Fm); + free(Fe); + // GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG // some dumb tests for guru interface to induce free() crash in destroy... FINUFFT_PLAN plan; - BIGINT Ns[1] = {0}; // since dim=1, don't have to make length 3 - FINUFFT_MAKEPLAN(1, 1, Ns, +1, 1, acc, &plan, NULL); // type 1, now kill it + BIGINT Ns[1] = {0}; // since dim=1, don't have to make length 3 + FINUFFT_MAKEPLAN(1, 1, Ns, +1, 1, acc, &plan, NULL); // type 1, now kill it FINUFFT_DESTROY(plan); - FINUFFT_MAKEPLAN(3, 1, Ns, +1, 1, acc, &plan, NULL); // type 3, now kill it + FINUFFT_MAKEPLAN(3, 1, Ns, +1, 1, acc, &plan, NULL); // type 3, now kill it FINUFFT_DESTROY(plan); // *** todo: more extensive bad inputs and error catching in guru... - + #ifdef SINGLE printf("dumbinputsf passed.\n"); #else printf("dumbinputs passed.\n"); #endif - + return 0; } diff --git a/test/finufft1d_test.cpp b/test/finufft1d_test.cpp index 8dd345b1a..29c0c540f 100644 --- a/test/finufft1d_test.cpp +++ b/test/finufft1d_test.cpp @@ -4,120 +4,128 @@ using namespace std; using namespace finufft::utils; -const char* help[]={ - "Tester for FINUFFT in 1d, all 3 types, either precision.", - "", - "Usage: finufft1d_test Nmodes Nsrc [tol [debug [spread_sort [upsampfac [errfail]]]]]", - "\teg:\tfinufft1d_test 1e6 1e6 1e-6 1 2 2.0 1e-5", - "\tnotes:\tif errfail present, exit code 1 if any error > errfail", - NULL}; +const char *help[] = { + "Tester for FINUFFT in 1d, all 3 types, either precision.", + "", + "Usage: finufft1d_test Nmodes Nsrc [tol [debug [spread_sort [upsampfac [errfail]]]]]", + "\teg:\tfinufft1d_test 1e6 1e6 1e-6 1 2 2.0 1e-5", + "\tnotes:\tif errfail present, exit code 1 if any error > errfail", + NULL}; // Barnett 1/22/17 onwards -int main(int argc, char* argv[]) -{ - BIGINT M, N; // M = # srcs, N = # modes out - double w, tol = 1e-6; // default +int main(int argc, char *argv[]) { + BIGINT M, N; // M = # srcs, N = # modes out + double w, tol = 1e-6; // default double err, errfail = INFINITY, errmax = 0; - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); // put defaults in opts + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); // put defaults in opts // opts.fftw = FFTW_MEASURE; // change from usual FFTW_ESTIMATE - int isign = +1; // choose which exponential sign to test - if (argc<3 || argc>8) { - for (int i=0; help[i]; ++i) - fprintf(stderr,"%s\n",help[i]); + int isign = +1; // choose which exponential sign to test + if (argc < 3 || argc > 8) { + for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]); return 2; } - sscanf(argv[1],"%lf",&w); N = (BIGINT)w; - sscanf(argv[2],"%lf",&w); M = (BIGINT)w; - if (argc>3) sscanf(argv[3],"%lf",&tol); - if (argc>4) sscanf(argv[4],"%d",&opts.debug); - opts.spread_debug = (opts.debug>1) ? 1 : 0; // see output from spreader - if (argc>5) sscanf(argv[5],"%d",&opts.spread_sort); - if (argc>6) { sscanf(argv[6],"%lf",&w); opts.upsampfac=(FLT)w; } - if (argc>7) sscanf(argv[7],"%lf",&errfail); - + sscanf(argv[1], "%lf", &w); + N = (BIGINT)w; + sscanf(argv[2], "%lf", &w); + M = (BIGINT)w; + if (argc > 3) sscanf(argv[3], "%lf", &tol); + if (argc > 4) sscanf(argv[4], "%d", &opts.debug); + opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader + if (argc > 5) sscanf(argv[5], "%d", &opts.spread_sort); + if (argc > 6) { + sscanf(argv[6], "%lf", &w); + opts.upsampfac = (FLT)w; + } + if (argc > 7) sscanf(argv[7], "%lf", &errfail); + cout << scientific << setprecision(15); - FLT *x = (FLT*)malloc(sizeof(FLT)*M); // NU pts - CPX* c = (CPX*)malloc(sizeof(CPX)*M); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N); // mode ampls + FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts + CPX *c = (CPX *)malloc(sizeof(CPX) * M); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N); // mode ampls #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s -#pragma omp for schedule(static,TEST_RANDCHUNK) // static => non-stochastic - for (BIGINT j=0; j non-stochastic + for (BIGINT j = 0; j < M; ++j) { + x[j] = PI * randm11r(&se); // fills [-pi,pi) c[j] = crandm11r(&se); } } - //for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + CNTime timer; + timer.start(); + int ier = FINUFFT1D1(M, x, c, isign, tol, N, F, &opts); + // for (int j=0;j 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t%lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n",(long long)M,(long long)N,t,M/t); + printf("\t%lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", (long long)M, + (long long)N, t, M / t); - BIGINT nt = (BIGINT)(0.37*N); // check arb choice of mode near the top (N/2) -//#pragma omp declare reduction (cmplxadd:CPX:omp_out=omp_out+omp_in) initializer(omp_priv={0.0,0.0}) // only for openmp v 4.0! - //#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:Ft) + BIGINT nt = (BIGINT)(0.37 * N); // check arb choice of mode near the top (N/2) + // #pragma omp declare reduction (cmplxadd:CPX:omp_out=omp_out+omp_in) + // initializer(omp_priv={0.0,0.0}) // only for openmp v 4.0! #pragma omp parallel for + // schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:Ft) FLT Ftr = 0.0, Fti = 0.0; -#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT1D2(M, x, c, isign, tol, N, F, &opts); + // cout<<"c:\n"; for (int j=0;j 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t%lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",(long long)N,(long long)M,t,M/t); + printf("\t%lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", (long long)N, + (long long)M, t, M / t); - BIGINT jt = M/2; // check arbitrary choice of one targ pt - CPX ct = CPX(0,0); - BIGINT m=0, k0 = N/2; // index shift in fk's = mag of most neg freq - //#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:ct) - for (BIGINT m1=-k0; m1<=(N-1)/2; ++m1) - ct += F[m++] * exp(IMA*((FLT)(isign*m1))*x[jt]); // crude direct - err = abs(ct-c[jt])/infnorm(M,c); - errmax = max(err,errmax); - printf("\tone targ: rel err in c[%lld] is %.3g\n",(long long)jt,err); - if (((int64_t)M)*N<=TEST_BIGPROB) { // also full direct eval - CPX* ct = (CPX*)malloc(sizeof(CPX)*M); - dirft1d2(M,x,ct,isign,N,F); - err = relerrtwonorm(M,ct,c); - errmax = max(err,errmax); - printf("\tdirft1d: rel l2-err of result c is %.3g\n",err); - //cout<<"c/ct:\n"; for (int j=0;j0) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT1D3(M, x, c, isign, tol, N, s, F, &opts); + t = timer.elapsedsec(); + if (ier > 0) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",(long long)M,(long long)N,t,(M+N)/t); + printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", (long long)M, + (long long)N, t, (M + N) / t); - BIGINT kt = N/2; // check arbitrary choice of one targ pt - Ftr = 0.0; - Fti = 0.0; -#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti) - for (BIGINT j=0; jerrfail); + free(x); + free(c); + free(F); + free(s); + return (errmax > errfail); } diff --git a/test/finufft1dmany_test.cpp b/test/finufft1dmany_test.cpp index 581c52c2d..e17cbb65e 100644 --- a/test/finufft1dmany_test.cpp +++ b/test/finufft1dmany_test.cpp @@ -4,164 +4,172 @@ using namespace std; using namespace finufft::utils; -const char* help[]={ - "Tester for FINUFFT in 1d, vectorized, all 3 types, either precision.", - "", - "Usage: finufft1dmany_test ntrans Nmodes Nsrc [tol [debug [spread_thread [maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]", - "\teg:\tfinufft1dmany_test 100 1e3 1e4 1e-6 1 0 0 2 0.0 1e-5", - "\tnotes:\tif errfail present, exit code 1 if any error > errfail", - NULL}; +const char *help[] = { + "Tester for FINUFFT in 1d, vectorized, all 3 types, either precision.", + "", + "Usage: finufft1dmany_test ntrans Nmodes Nsrc [tol [debug [spread_thread " + "[maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]", + "\teg:\tfinufft1dmany_test 100 1e3 1e4 1e-6 1 0 0 2 0.0 1e-5", + "\tnotes:\tif errfail present, exit code 1 if any error > errfail", + NULL}; // Malleo 2019 based on Shih 2018. Tidied, extra args, Barnett 5/25/20 onwards -int main(int argc, char* argv[]) -{ - BIGINT M, N; // M = # srcs, N = # modes - int ntransf; // # of vectors for "many" interface - double w, tol = 1e-6; // default +int main(int argc, char *argv[]) { + BIGINT M, N; // M = # srcs, N = # modes + int ntransf; // # of vectors for "many" interface + double w, tol = 1e-6; // default double err, errfail = INFINITY, errmax = 0; - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); // opts.fftw = FFTW_MEASURE; // change from usual FFTW_ESTIMATE - int isign = +1; // choose which exponential sign to test - if (argc<4 || argc>11) { - for (int i=0; help[i]; ++i) - fprintf(stderr,"%s\n",help[i]); + int isign = +1; // choose which exponential sign to test + if (argc < 4 || argc > 11) { + for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]); return 2; } - sscanf(argv[1],"%lf",&w); ntransf = (int)w; - sscanf(argv[2],"%lf",&w); N = (BIGINT)w; - sscanf(argv[3],"%lf",&w); M = (BIGINT)w; - if (argc>4) sscanf(argv[4],"%lf",&tol); - if (argc>5) sscanf(argv[5],"%d",&opts.debug); - opts.spread_debug = (opts.debug>1) ? 1 : 0; // see output from spreader - if (argc>6) sscanf(argv[6],"%d",&opts.spread_thread); - if (argc>7) sscanf(argv[7],"%d",&opts.maxbatchsize); - if (argc>8) sscanf(argv[8],"%d",&opts.spread_sort); - if (argc>9) { sscanf(argv[9],"%lf",&w); opts.upsampfac=(FLT)w; } - if (argc>10) sscanf(argv[10],"%lf",&errfail); + sscanf(argv[1], "%lf", &w); + ntransf = (int)w; + sscanf(argv[2], "%lf", &w); + N = (BIGINT)w; + sscanf(argv[3], "%lf", &w); + M = (BIGINT)w; + if (argc > 4) sscanf(argv[4], "%lf", &tol); + if (argc > 5) sscanf(argv[5], "%d", &opts.debug); + opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader + if (argc > 6) sscanf(argv[6], "%d", &opts.spread_thread); + if (argc > 7) sscanf(argv[7], "%d", &opts.maxbatchsize); + if (argc > 8) sscanf(argv[8], "%d", &opts.spread_sort); + if (argc > 9) { + sscanf(argv[9], "%lf", &w); + opts.upsampfac = (FLT)w; + } + if (argc > 10) sscanf(argv[10], "%lf", &errfail); cout << scientific << setprecision(15); - - FLT* x = (FLT*)malloc(sizeof(FLT)*M); // NU pts x coords - CPX* c = (CPX*)malloc(sizeof(CPX)*M*ntransf); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N*ntransf); // mode ampls + + FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts x coords + CPX *c = (CPX *)malloc(sizeof(CPX) * M * ntransf); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N * ntransf); // mode ampls #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + CNTime timer; + timer.start(); + int ier = FINUFFT1D1MANY(ntransf, M, x, c, isign, tol, N, F, &opts); + double ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N,ti,ntransf*M/ti); - - int i = (ntransf-1); // choose a trial to check - BIGINT nt1 = (BIGINT)(0.37*N); // choose some mode index to check - CPX Ft = CPX(0,0), J = IMA*(FLT)isign; - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N,t,ntransf*M/t); - printf("\t\t\tspeedup \t T_FINUFFT1D1 / T_finufft1d1many = %.3g\n", t/ti); - - // Check consistency (worst over the ntransf) + printf("%d of: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)M, (long long)N, t, ntransf * M / t); + printf("\t\t\tspeedup \t T_FINUFFT1D1 / T_finufft1d1many = %.3g\n", t / ti); + + // Check consistency (worst over the ntransf) double maxerror = 0.0; for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(N,F_1d1+k*N,F+k*N)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(N, F_1d1 + k * N, F + k * N)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n", maxerror); free(F_1d1); - printf("test 1d2 many vs repeated single: ------------------------------------\n"); FFTW_FORGET_WISDOM(); #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT m=0; m1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT1D2MANY(ntransf, M, x, c, isign, tol, N, F, &opts); + // cout<<"c:\n"; for (int j=0;j 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",ntransf,(long long)N,(long long)M,ti,ntransf*M/ti); - - BIGINT jt = M/2; // check arbitrary choice of one targ pt - CPX ct = CPX(0,0); - BIGINT m=0, k0 = N/2; // index shift in fk's = mag of most neg freq - //#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:ct) - for (BIGINT m1=-k0; m1<=(N-1)/2; ++m1) - ct += F[i*N + m++] * exp(IMA*((FLT)(isign*m1))*x[jt]); // crude direct - err = abs(ct-c[jt + i*M])/infnorm(M,c+i*M); - errmax = max(err,errmax); - printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n",(long long)jt,i,err); + printf("ntr=%d: %lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)N, (long long)M, ti, ntransf * M / ti); + + BIGINT jt = M / 2; // check arbitrary choice of one targ pt + CPX ct = CPX(0, 0); + BIGINT m = 0, k0 = N / 2; // index shift in fk's = mag of most neg freq + // #pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:ct) + for (BIGINT m1 = -k0; m1 <= (N - 1) / 2; ++m1) + ct += F[i * N + m++] * exp(IMA * ((FLT)(isign * m1)) * x[jt]); // crude direct + err = abs(ct - c[jt + i * M]) / infnorm(M, c + i * M); + errmax = max(err, errmax); + printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n", (long long)jt, i, err); // check against single calls to FINUFFT1D2... FFTW_FORGET_WISDOM(); - CPX * c_1d2 = (CPX *)malloc(sizeof(CPX)*M*ntransf); + CPX *c_1d2 = (CPX *)malloc(sizeof(CPX) * M * ntransf); timer.restart(); - for(BIGINT j = 0; j < ntransf; j++){ - Fstart = F + j*N; - cstart = c_1d2 + j*M; - FINUFFT1D2(M,x,cstart,isign,tol,N,Fstart,&simpleopts); + for (BIGINT j = 0; j < ntransf; j++) { + Fstart = F + j * N; + cstart = c_1d2 + j * M; + FINUFFT1D2(M, x, cstart, isign, tol, N, Fstart, &simpleopts); } t = timer.elapsedsec(); - if (ier>1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N,(long long)M,t,ntransf*M/t); - printf("\t\t\tspeedup \t T_FINUFFT1D2 / T_finufft1d2many = %.3g\n", t/ti); - - maxerror = 0.0; // worst error over the ntransf + printf("%d of: %lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)N, (long long)M, t, ntransf * M / t); + printf("\t\t\tspeedup \t T_FINUFFT1D2 / T_finufft1d2many = %.3g\n", t / ti); + + maxerror = 0.0; // worst error over the ntransf for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(M,c_1d2+k*M,c+k*M)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(M, c_1d2 + k * M, c + k * M)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) = %.3g\n", maxerror); free(c_1d2); printf("test 1d3 many vs repeated single: ------------------------------------\n"); @@ -169,68 +177,69 @@ int main(int argc, char* argv[]) #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT1D3MANY(ntransf, M, x, c, isign, tol, N, s, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",ntransf,(long long)M,(long long)N,ti,ntransf*(M+N)/ti); - - BIGINT kt = N/4; // check arbitrary choice of one targ pt - Ft = CPX(0,0); - //#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:Ft) - for (BIGINT j=0;j1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,t,ntransf*(M+N)/t); - printf("\t\t\tspeedup \t T_FINUFFT1D3 / T_finufft1d3many = %.3g\n", t/ti); + printf("%d of: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", ntransf, + (long long)M, (long long)N, t, ntransf * (M + N) / t); + printf("\t\t\tspeedup \t T_FINUFFT1D3 / T_finufft1d3many = %.3g\n", t / ti); - maxerror = 0.0; // worst error over the ntransf + maxerror = 0.0; // worst error over the ntransf for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(N,f_1d3+k*N,F+k*N)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(N, f_1d3 + k * N, F + k * N)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n", maxerror); free(f_1d3); free(x); free(s); free(c); free(F); - return (errmax>errfail); -} + return (errmax > errfail); +} diff --git a/test/finufft2d_test.cpp b/test/finufft2d_test.cpp index 04945b5f9..5c053dc8e 100644 --- a/test/finufft2d_test.cpp +++ b/test/finufft2d_test.cpp @@ -4,120 +4,129 @@ using namespace std; using namespace finufft::utils; -const char* help[]={ - "Tester for FINUFFT in 2d, all 3 types, either precision.", - "", - "Usage: finufft2d_test Nmodes1 Nmodes2 Nsrc [tol [debug [spread_sort [upsampfac [errfail]]]]]", - "\teg:\tfinufft2d_test 1000 1000 1000000 1e-12 1 2 2.0 1e-11", - "\tnotes:\tif errfail present, exit code 1 if any error > errfail", - NULL}; +const char *help[] = {"Tester for FINUFFT in 2d, all 3 types, either precision.", + "", + "Usage: finufft2d_test Nmodes1 Nmodes2 Nsrc [tol [debug " + "[spread_sort [upsampfac [errfail]]]]]", + "\teg:\tfinufft2d_test 1000 1000 1000000 1e-12 1 2 2.0 1e-11", + "\tnotes:\tif errfail present, exit code 1 if any error > errfail", + NULL}; // Barnett 2/1/17 onwards -int main(int argc, char* argv[]) -{ - BIGINT M, N1, N2; // M = # srcs, N1,N2 = # modes - double w, tol = 1e-6; // default +int main(int argc, char *argv[]) { + BIGINT M, N1, N2; // M = # srcs, N1,N2 = # modes + double w, tol = 1e-6; // default double err, errfail = INFINITY, errmax = 0; - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); // opts.fftw = FFTW_MEASURE; // change from usual FFTW_ESTIMATE - int isign = +1; // choose which exponential sign to test - if (argc<4 || argc>9) { - for (int i=0; help[i]; ++i) - fprintf(stderr,"%s\n",help[i]); + int isign = +1; // choose which exponential sign to test + if (argc < 4 || argc > 9) { + for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]); return 2; } - sscanf(argv[1],"%lf",&w); N1 = (BIGINT)w; - sscanf(argv[2],"%lf",&w); N2 = (BIGINT)w; - sscanf(argv[3],"%lf",&w); M = (BIGINT)w; - if (argc>4) sscanf(argv[4],"%lf",&tol); - if (argc>5) sscanf(argv[5],"%d",&opts.debug); - opts.spread_debug = (opts.debug>1) ? 1 : 0; // see output from spreader - if (argc>6) sscanf(argv[6],"%d",&opts.spread_sort); - if (argc>7) { sscanf(argv[7],"%lf",&w); opts.upsampfac=(FLT)w; } - if (argc>8) sscanf(argv[8],"%lf",&errfail); - + sscanf(argv[1], "%lf", &w); + N1 = (BIGINT)w; + sscanf(argv[2], "%lf", &w); + N2 = (BIGINT)w; + sscanf(argv[3], "%lf", &w); + M = (BIGINT)w; + if (argc > 4) sscanf(argv[4], "%lf", &tol); + if (argc > 5) sscanf(argv[5], "%d", &opts.debug); + opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader + if (argc > 6) sscanf(argv[6], "%d", &opts.spread_sort); + if (argc > 7) { + sscanf(argv[7], "%lf", &w); + opts.upsampfac = (FLT)w; + } + if (argc > 8) sscanf(argv[8], "%lf", &errfail); + cout << scientific << setprecision(15); - BIGINT N = N1*N2; + BIGINT N = N1 * N2; - FLT *x = (FLT *)malloc(sizeof(FLT)*M); // NU pts x coords - FLT *y = (FLT *)malloc(sizeof(FLT)*M); // NU pts y coords - CPX* c = (CPX*)malloc(sizeof(CPX)*M); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N); // mode ampls + FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts x coords + FLT *y = (FLT *)malloc(sizeof(FLT) * M); // NU pts y coords + CPX *c = (CPX *)malloc(sizeof(CPX) * M); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N); // mode ampls #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + CNTime timer; + timer.start(); + int ier = FINUFFT2D1(M, x, y, c, isign, tol, N1, N2, F, &opts); + double ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t%lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", - (long long)M,(long long)N1,(long long)N2,ti,M/ti); + printf("\t%lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", (long long)M, + (long long)N1, (long long)N2, ti, M / ti); - BIGINT nt1 = (BIGINT)(0.37*N1), nt2 = (BIGINT)(0.26*N2); // choose some mode index to check - FLT Ftr=0, Fti=0; // crude direct... -#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT2D2(M, x, y, c, isign, tol, N1, N2, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t(%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",(long long)N1,(long long)N2,(long long)M,ti,M/ti); + printf("\t(%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", + (long long)N1, (long long)N2, (long long)M, ti, M / ti); - BIGINT jt = M/2; // check arbitrary choice of one targ pt - CPX ct = CPX(0,0); - BIGINT m=0; - for (BIGINT m2=-(N2/2); m2<=(N2-1)/2; ++m2) // loop in correct order over F - for (BIGINT m1=-(N1/2); m1<=(N1-1)/2; ++m1) - ct += F[m++] * exp(IMA*(FLT)isign*(m1*x[jt] + m2*y[jt])); // crude direct - err = abs(ct-c[jt])/infnorm(M,c); - errmax = max(err,errmax); - printf("\tone targ: rel err in c[%lld] is %.3g\n",(long long)jt,err); - if ((int64_t)M*N<=TEST_BIGPROB) { // also full direct eval - CPX* ct = (CPX*)malloc(sizeof(CPX)*M); - dirft2d2(M,x,y,ct,isign,N1,N2,F); - err = relerrtwonorm(M,ct,c); - errmax = max(err,errmax); - printf("\tdirft2d: rel l2-err of result c is %.3g\n",err); - //cout<<"c,ct:\n"; for (int j=0;j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT2D3(M, x, y, c, isign, tol, N, s, t, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",(long long)M,(long long)N,ti,(M+N)/ti); + printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", (long long)M, + (long long)N, ti, (M + N) / ti); - BIGINT kt = N/2; // check arbitrary choice of one targ pt - Ftr=0, Fti=0; // crude direct... -#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti) - for (BIGINT j=0; jerrfail); + free(x); + free(y); + free(c); + free(F); + free(s); + free(t); + return (errmax > errfail); } diff --git a/test/finufft2dmany_test.cpp b/test/finufft2dmany_test.cpp index 31b65378e..8b0f040ee 100644 --- a/test/finufft2dmany_test.cpp +++ b/test/finufft2dmany_test.cpp @@ -4,246 +4,262 @@ using namespace std; using namespace finufft::utils; -const char* help[]={ - "Tester for FINUFFT in 2d, vectorized, all 3 types, either precision.", - "", - "Usage: finufft2dmany_test ntrans Nmodes1 Nmodes2 Nsrc [tol [debug [spread_thread [maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]", - "\teg:\tfinufft2dmany_test 100 1e2 1e2 1e5 1e-6 1 0 0 2 0.0 1e-5", - "\tnotes:\tif errfail present, exit code 1 if any error > errfail", - NULL}; +const char *help[] = { + "Tester for FINUFFT in 2d, vectorized, all 3 types, either precision.", + "", + "Usage: finufft2dmany_test ntrans Nmodes1 Nmodes2 Nsrc [tol [debug [spread_thread " + "[maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]", + "\teg:\tfinufft2dmany_test 100 1e2 1e2 1e5 1e-6 1 0 0 2 0.0 1e-5", + "\tnotes:\tif errfail present, exit code 1 if any error > errfail", + NULL}; // Melody Shih Jun 2018; Barnett removed many_seq 7/27/18. Extra args 5/21/20. -int main(int argc, char* argv[]) -{ - BIGINT M, N1, N2; // M = # srcs, N1,N2 = # modes - int ntransf; // # of vectors for "many" interface - double w, tol = 1e-6; // default +int main(int argc, char *argv[]) { + BIGINT M, N1, N2; // M = # srcs, N1,N2 = # modes + int ntransf; // # of vectors for "many" interface + double w, tol = 1e-6; // default double err, errfail = INFINITY, errmax = 0; - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); - //opts.fftw = FFTW_MEASURE; // change from default FFTW_ESTIMATE - int isign = +1; // choose which exponential sign to test - if (argc<5 || argc>12) { - for (int i=0; help[i]; ++i) - fprintf(stderr,"%s\n",help[i]); + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); + // opts.fftw = FFTW_MEASURE; // change from default FFTW_ESTIMATE + int isign = +1; // choose which exponential sign to test + if (argc < 5 || argc > 12) { + for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]); return 2; } - sscanf(argv[1],"%lf",&w); ntransf = (int)w; - sscanf(argv[2],"%lf",&w); N1 = (BIGINT)w; - sscanf(argv[3],"%lf",&w); N2 = (BIGINT)w; - sscanf(argv[4],"%lf",&w); M = (BIGINT)w; - if (argc>5) sscanf(argv[5],"%lf",&tol); - if (argc>6) sscanf(argv[6],"%d",&opts.debug); - opts.spread_debug = (opts.debug>1) ? 1 : 0; // see output from spreader - if (argc>7) sscanf(argv[7],"%d",&opts.spread_thread); - if (argc>8) sscanf(argv[8],"%d",&opts.maxbatchsize); - if (argc>9) sscanf(argv[9],"%d",&opts.spread_sort); - if (argc>10) { sscanf(argv[10],"%lf",&w); opts.upsampfac=(FLT)w; } - if (argc>11) sscanf(argv[11],"%lf",&errfail); - + sscanf(argv[1], "%lf", &w); + ntransf = (int)w; + sscanf(argv[2], "%lf", &w); + N1 = (BIGINT)w; + sscanf(argv[3], "%lf", &w); + N2 = (BIGINT)w; + sscanf(argv[4], "%lf", &w); + M = (BIGINT)w; + if (argc > 5) sscanf(argv[5], "%lf", &tol); + if (argc > 6) sscanf(argv[6], "%d", &opts.debug); + opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader + if (argc > 7) sscanf(argv[7], "%d", &opts.spread_thread); + if (argc > 8) sscanf(argv[8], "%d", &opts.maxbatchsize); + if (argc > 9) sscanf(argv[9], "%d", &opts.spread_sort); + if (argc > 10) { + sscanf(argv[10], "%lf", &w); + opts.upsampfac = (FLT)w; + } + if (argc > 11) sscanf(argv[11], "%lf", &errfail); + cout << scientific << setprecision(15); - BIGINT N = N1*N2; + BIGINT N = N1 * N2; - FLT* x = (FLT*)malloc(sizeof(FLT)*M); // NU pts x coords - FLT* y = (FLT*)malloc(sizeof(FLT)*M); // NU pts y coords - CPX* c = (CPX*)malloc(sizeof(CPX)*M*ntransf); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N*ntransf); // mode ampls + FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts x coords + FLT *y = (FLT *)malloc(sizeof(FLT) * M); // NU pts y coords + CPX *c = (CPX *)malloc(sizeof(CPX) * M * ntransf); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N * ntransf); // mode ampls #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + CNTime timer; + timer.start(); + int ier = FINUFFT2D1MANY(ntransf, M, x, y, c, isign, tol, N1, N2, F, &opts); + double ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N1,(long long)N2,ti,ntransf*M/ti); - - int i = ntransf-1; // choose a vector (transform number) to check - BIGINT nt1 = (BIGINT)(0.37*N1), nt2 = (BIGINT)(0.26*N2); // choose some mode index to check - CPX Ft = CPX(0,0), J = IMA*(FLT)isign; - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + double t = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N1,(long long)N2,t,ntransf*M/t); - printf("\t\t\tspeedup \t T_FINUFFT2D1 / T_finufft2d1many = %.3g\n", t/ti); + printf("%d of: %lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)M, (long long)N1, (long long)N2, t, ntransf * M / t); + printf("\t\t\tspeedup \t T_FINUFFT2D1 / T_finufft2d1many = %.3g\n", t / ti); // Check consistency (worst over the ntransf) double maxerror = 0.0; for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(N,F_2d1+k*N,F+k*N)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(N, F_2d1 + k * N, F + k * N)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n", maxerror); free(F_2d1); printf("test 2d2 many vs repeated single: ------------------------------------\n"); - + #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT m=0; m1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT2D2MANY(ntransf, M, x, y, c, isign, tol, N1, N2, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: (%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N1,(long long)N2,(long long)M,ti,ntransf*M/ti); + printf("ntr=%d: (%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", + ntransf, (long long)N1, (long long)N2, (long long)M, ti, ntransf * M / ti); FFTW_FORGET_WISDOM(); - i = ntransf-1; // choose a data to check - BIGINT jt = M/2; // check arbitrary choice of one targ pt - CPX ct = CPX(0,0); - BIGINT m=0; - for (BIGINT m2=-(N2/2); m2<=(N2-1)/2; ++m2) // loop in correct order over F - for (BIGINT m1=-(N1/2); m1<=(N1-1)/2; ++m1) - ct += F[i*N + m++] * exp(J*(m1*x[jt] + m2*y[jt])); // crude direct - err = abs(ct-c[jt+i*M])/infnorm(M,c+i*M); - errmax = max(err,errmax); - printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n",(long long)jt,i,err); - + i = ntransf - 1; // choose a data to check + BIGINT jt = M / 2; // check arbitrary choice of one targ pt + CPX ct = CPX(0, 0); + BIGINT m = 0; + for (BIGINT m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F + for (BIGINT m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += F[i * N + m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct + err = abs(ct - c[jt + i * M]) / infnorm(M, c + i * M); + errmax = max(err, errmax); + printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n", (long long)jt, i, err); + // compare the result with single calls to FINUFFT2D2... - CPX* c_2d2 = (CPX*)malloc(sizeof(CPX)*M*ntransf); + CPX *c_2d2 = (CPX *)malloc(sizeof(CPX) * M * ntransf); timer.restart(); - for (int k=0; k1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: (%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N1,(long long)N2,(long long)M,t,ntransf*M/t); - printf("\t\t\tspeedup \t T_FINUFFT2D2 / T_finufft2d2many = %.3g\n", t/ti); + printf("%d of: (%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)N1, (long long)N2, (long long)M, t, ntransf * M / t); + printf("\t\t\tspeedup \t T_FINUFFT2D2 / T_finufft2d2many = %.3g\n", t / ti); - maxerror = 0.0; // worst error over the ntransf + maxerror = 0.0; // worst error over the ntransf for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(M,c_2d2+k*M,c+k*M)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(M, c_2d2 + k * M, c + k * M)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) = %.3g\n", maxerror); free(c_2d2); printf("test 2d3 many vs repeated single: ------------------------------------\n"); FFTW_FORGET_WISDOM(); - + // reuse the strengths c, interpret N as number of targs: #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT2D3MANY(ntransf, M, x, y, c, isign, tol, N, s_freq, t_freq, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,ti,ntransf*(M+N)/ti); - - i = ntransf-1; // choose a transform to check - BIGINT kt = N/4; // check arbitrary choice of one targ pt - Ft = CPX(0,0); - for (BIGINT j=0;j1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,t,ntransf*(M+N)/t); - printf("\t\t\tspeedup \t T_FINUFFT2D3 / T_finufft2d3many = %.3g\n", t/ti); + printf("%d of: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", ntransf, + (long long)M, (long long)N, t, ntransf * (M + N) / t); + printf("\t\t\tspeedup \t T_FINUFFT2D3 / T_finufft2d3many = %.3g\n", t / ti); - //check against the old - maxerror = 0.0; // worst error over the ntransf + // check against the old + maxerror = 0.0; // worst error over the ntransf for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(N,f_2d3+k*N,F+k*N)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(N, f_2d3 + k * N, F + k * N)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n", maxerror); free(f_2d3); - - free(x); free(y); free(c); free(F); free(s_freq); free(t_freq); - return (errmax>errfail); + + free(x); + free(y); + free(c); + free(F); + free(s_freq); + free(t_freq); + return (errmax > errfail); } diff --git a/test/finufft3d_test.cpp b/test/finufft3d_test.cpp index 29dba95d0..39ee8ab6d 100644 --- a/test/finufft3d_test.cpp +++ b/test/finufft3d_test.cpp @@ -4,127 +4,136 @@ using namespace std; using namespace finufft::utils; -const char* help[]={ - "Tester for FINUFFT in 3d, all 3 types, either precision.", - "", - "Usage: finufft3d_test Nmodes1 Nmodes2 Nmodes3 Nsrc [tol [debug [spread_sort [upsampfac [errfail]]]]]", - "\teg:\tfinufft3d_test 100 200 50 1e6 1e-12 0 2 0.0 1e-11", - "\tnotes:\tif errfail present, exit code 1 if any error > errfail", - NULL}; +const char *help[] = {"Tester for FINUFFT in 3d, all 3 types, either precision.", + "", + "Usage: finufft3d_test Nmodes1 Nmodes2 Nmodes3 Nsrc [tol [debug " + "[spread_sort [upsampfac [errfail]]]]]", + "\teg:\tfinufft3d_test 100 200 50 1e6 1e-12 0 2 0.0 1e-11", + "\tnotes:\tif errfail present, exit code 1 if any error > errfail", + NULL}; // Barnett 2/2/17 onwards. -int main(int argc, char* argv[]) -{ +int main(int argc, char *argv[]) { BIGINT M, N1, N2, N3; // M = # srcs, N1,N2,N3 = # modes - double w, tol = 1e-6; // default + double w, tol = 1e-6; // default double err, errfail = INFINITY, errmax = 0; - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); - //opts.fftw = FFTW_MEASURE; // change from usual FFTW_ESTIMATE - //opts.spread_max_sp_size = 3e4; // override test - //opts.spread_nthr_atomic = 15; // " - int isign = +1; // choose which exponential sign to test - if (argc<5 || argc>10) { - for (int i=0; help[i]; ++i) - fprintf(stderr,"%s\n",help[i]); + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); + // opts.fftw = FFTW_MEASURE; // change from usual FFTW_ESTIMATE + // opts.spread_max_sp_size = 3e4; // override test + // opts.spread_nthr_atomic = 15; // " + int isign = +1; // choose which exponential sign to test + if (argc < 5 || argc > 10) { + for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]); return 2; } - sscanf(argv[1],"%lf",&w); N1 = (BIGINT)w; - sscanf(argv[2],"%lf",&w); N2 = (BIGINT)w; - sscanf(argv[3],"%lf",&w); N3 = (BIGINT)w; - sscanf(argv[4],"%lf",&w); M = (BIGINT)w; - if (argc>5) sscanf(argv[5],"%lf",&tol); - if (argc>6) sscanf(argv[6],"%d",&opts.debug); // can be 0,1 or 2 - opts.spread_debug = (opts.debug>1) ? 1 : 0; // see output from spreader - if (argc>7) sscanf(argv[7],"%d",&opts.spread_sort); - if (argc>8) { sscanf(argv[8],"%lf",&w); opts.upsampfac=(FLT)w; } - if (argc>9) sscanf(argv[9],"%lf",&errfail); - + sscanf(argv[1], "%lf", &w); + N1 = (BIGINT)w; + sscanf(argv[2], "%lf", &w); + N2 = (BIGINT)w; + sscanf(argv[3], "%lf", &w); + N3 = (BIGINT)w; + sscanf(argv[4], "%lf", &w); + M = (BIGINT)w; + if (argc > 5) sscanf(argv[5], "%lf", &tol); + if (argc > 6) sscanf(argv[6], "%d", &opts.debug); // can be 0,1 or 2 + opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader + if (argc > 7) sscanf(argv[7], "%d", &opts.spread_sort); + if (argc > 8) { + sscanf(argv[8], "%lf", &w); + opts.upsampfac = (FLT)w; + } + if (argc > 9) sscanf(argv[9], "%lf", &errfail); + cout << scientific << setprecision(15); - BIGINT N = N1*N2*N3; + BIGINT N = N1 * N2 * N3; - FLT *x = (FLT *)malloc(sizeof(FLT)*M); // NU pts x coords - FLT *y = (FLT *)malloc(sizeof(FLT)*M); // NU pts y coords - FLT *z = (FLT *)malloc(sizeof(FLT)*M); // NU pts z coords - CPX* c = (CPX*)malloc(sizeof(CPX)*M); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N); // mode ampls + FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts x coords + FLT *y = (FLT *)malloc(sizeof(FLT) * M); // NU pts y coords + FLT *z = (FLT *)malloc(sizeof(FLT) * M); // NU pts z coords + CPX *c = (CPX *)malloc(sizeof(CPX) * M); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N); // mode ampls #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + CNTime timer; + timer.start(); + int ier = FINUFFT3D1(M, x, y, z, c, isign, tol, N1, N2, N3, F, &opts); + double ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else printf(" %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", - (long long)M,(long long)N1,(long long)N2,(long long)N3,ti,M/ti); + (long long)M, (long long)N1, (long long)N2, (long long)N3, ti, M / ti); - BIGINT nt1 = (BIGINT)(0.37*N1), nt2 = (BIGINT)(0.26*N2), nt3 = (BIGINT)(-0.39*N3); // choose mode to check - FLT Ftr=0, Fti=0; // crude direct... -#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT3D2(M, x, y, z, c, isign, tol, N1, N2, N3, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else printf(" (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", - (long long)N1,(long long)N2,(long long)N3,(long long)M,ti,M/ti); + (long long)N1, (long long)N2, (long long)N3, (long long)M, ti, M / ti); - BIGINT jt = M/2; // check arbitrary choice of one targ pt - CPX ct = CPX(0,0); - BIGINT m=0; - for (BIGINT m3=-(N3/2); m3<=(N3-1)/2; ++m3) // loop in F order - for (BIGINT m2=-(N2/2); m2<=(N2-1)/2; ++m2) - for (BIGINT m1=-(N1/2); m1<=(N1-1)/2; ++m1) - ct += F[m++] * exp(IMA*(FLT)isign*(m1*x[jt] + m2*y[jt] + m3*z[jt])); - err = abs(ct-c[jt])/infnorm(M,c); - errmax = max(err,errmax); - printf("\tone targ: rel err in c[%lld] is %.3g\n",(long long)jt,err); - if ((int64_t)M*N<=TEST_BIGPROB) { // also full direct eval - CPX* ct = (CPX*)malloc(sizeof(CPX)*M); - dirft3d2(M,x,y,z,ct,isign,N1,N2,N3,F); - err = relerrtwonorm(M,ct,c); - errmax = max(err,errmax); - printf("\tdirft3d: rel l2-err of result c is %.3g\n",err); + BIGINT jt = M / 2; // check arbitrary choice of one targ pt + CPX ct = CPX(0, 0); + BIGINT m = 0; + for (BIGINT m3 = -(N3 / 2); m3 <= (N3 - 1) / 2; ++m3) // loop in F order + for (BIGINT m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) + for (BIGINT m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += F[m++] * exp(IMA * (FLT)isign * (m1 * x[jt] + m2 * y[jt] + m3 * z[jt])); + err = abs(ct - c[jt]) / infnorm(M, c); + errmax = max(err, errmax); + printf("\tone targ: rel err in c[%lld] is %.3g\n", (long long)jt, err); + if ((int64_t)M * N <= TEST_BIGPROB) { // also full direct eval + CPX *ct = (CPX *)malloc(sizeof(CPX) * M); + dirft3d2(M, x, y, z, ct, isign, N1, N2, N3, F); + err = relerrtwonorm(M, ct, c); + errmax = max(err, errmax); + printf("\tdirft3d: rel l2-err of result c is %.3g\n", err); free(ct); } @@ -132,60 +141,71 @@ int main(int argc, char* argv[]) // reuse the strengths c, interpret N as number of targs: #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT3D3(M, x, y, z, c, isign, tol, N, s, t, u, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",(long long)M,(long long)N,ti,(M+N)/ti); + printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", (long long)M, + (long long)N, ti, (M + N) / ti); - BIGINT kt = N/2; // check arbitrary choice of one targ pt - Ftr=0, Fti=0; // crude direct... -#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti) - for (BIGINT j=0; jerrfail); + free(x); + free(y); + free(z); + free(c); + free(F); + free(s); + free(t); + free(u); + return (errmax > errfail); } diff --git a/test/finufft3dmany_test.cpp b/test/finufft3dmany_test.cpp index d427555c3..48c1fe422 100644 --- a/test/finufft3dmany_test.cpp +++ b/test/finufft3dmany_test.cpp @@ -4,254 +4,283 @@ using namespace std; using namespace finufft::utils; -const char* help[]={ - "Tester for FINUFFT in 3d, vectorized, all 3 types, either precision.", - "", - "Usage: finufft3dmany_test ntrans Nmodes1 Nmodes2 Nmodes3 Nsrc [tol [debug [spread_thread [maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]", - "\teg:\tfinufft3dmany_test 100 50 50 50 1e5 1e-3 1 0 0 2 0.0 1e-2", - "\tnotes:\tif errfail present, exit code 1 if any error > errfail", - NULL}; +const char *help[] = { + "Tester for FINUFFT in 3d, vectorized, all 3 types, either precision.", + "", + "Usage: finufft3dmany_test ntrans Nmodes1 Nmodes2 Nmodes3 Nsrc [tol [debug " + "[spread_thread [maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]", + "\teg:\tfinufft3dmany_test 100 50 50 50 1e5 1e-3 1 0 0 2 0.0 1e-2", + "\tnotes:\tif errfail present, exit code 1 if any error > errfail", + NULL}; // Malleo 2019 based on Shih 2018. Tidied, extra args, Barnett 5/25/20. -int main(int argc, char* argv[]) -{ - BIGINT M, N1, N2, N3; // M = # srcs, N1,N2 = # modes - int ntransf; // # of vectors for "many" interface - double w, tol = 1e-6; // default +int main(int argc, char *argv[]) { + BIGINT M, N1, N2, N3; // M = # srcs, N1,N2 = # modes + int ntransf; // # of vectors for "many" interface + double w, tol = 1e-6; // default double err, errfail = INFINITY, errmax = 0; - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); // opts.fftw = FFTW_MEASURE; // change from usual FFTW_ESTIMATE - int isign = +1; // choose which exponential sign to test - if (argc<6 || argc>13) { - for (int i=0; help[i]; ++i) - fprintf(stderr,"%s\n",help[i]); + int isign = +1; // choose which exponential sign to test + if (argc < 6 || argc > 13) { + for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]); return 2; } - sscanf(argv[1],"%lf",&w); ntransf = (int)w; - sscanf(argv[2],"%lf",&w); N1 = (BIGINT)w; - sscanf(argv[3],"%lf",&w); N2 = (BIGINT)w; - sscanf(argv[4],"%lf",&w); N3 = (BIGINT)w; - sscanf(argv[5],"%lf",&w); M = (BIGINT)w; - if (argc>6) sscanf(argv[6],"%lf",&tol); - if (argc>7) sscanf(argv[7],"%d",&opts.debug); - opts.spread_debug = (opts.debug>1) ? 1 : 0; // see output from spreader - if (argc>8) sscanf(argv[8],"%d",&opts.spread_thread); - if (argc>9) sscanf(argv[9],"%d",&opts.maxbatchsize); - if (argc>10) sscanf(argv[10],"%d",&opts.spread_sort); - if (argc>11) { sscanf(argv[11],"%lf",&w); opts.upsampfac = (FLT)w; } - if (argc>12) sscanf(argv[12],"%lf",&errfail); + sscanf(argv[1], "%lf", &w); + ntransf = (int)w; + sscanf(argv[2], "%lf", &w); + N1 = (BIGINT)w; + sscanf(argv[3], "%lf", &w); + N2 = (BIGINT)w; + sscanf(argv[4], "%lf", &w); + N3 = (BIGINT)w; + sscanf(argv[5], "%lf", &w); + M = (BIGINT)w; + if (argc > 6) sscanf(argv[6], "%lf", &tol); + if (argc > 7) sscanf(argv[7], "%d", &opts.debug); + opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader + if (argc > 8) sscanf(argv[8], "%d", &opts.spread_thread); + if (argc > 9) sscanf(argv[9], "%d", &opts.maxbatchsize); + if (argc > 10) sscanf(argv[10], "%d", &opts.spread_sort); + if (argc > 11) { + sscanf(argv[11], "%lf", &w); + opts.upsampfac = (FLT)w; + } + if (argc > 12) sscanf(argv[12], "%lf", &errfail); cout << scientific << setprecision(15); - BIGINT N = N1*N2*N3; + BIGINT N = N1 * N2 * N3; - FLT* x = (FLT*)malloc(sizeof(FLT)*M); // NU pts x coords - FLT* y = (FLT*)malloc(sizeof(FLT)*M); // NU pts y coords - FLT* z = (FLT*)malloc(sizeof(FLT)*M); // NU pts z coords - CPX* c = (CPX*)malloc(sizeof(CPX)*M*ntransf); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N*ntransf); // mode ampls + FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts x coords + FLT *y = (FLT *)malloc(sizeof(FLT) * M); // NU pts y coords + FLT *z = (FLT *)malloc(sizeof(FLT) * M); // NU pts z coords + CPX *c = (CPX *)malloc(sizeof(CPX) * M * ntransf); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N * ntransf); // mode ampls #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + CNTime timer; + timer.start(); + int ier = FINUFFT3D1MANY(ntransf, M, x, y, z, c, isign, tol, N1, N2, N3, F, &opts); + double ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N1,(long long)N2, (long long)N3, ti,ntransf*M/ti); + printf("ntr=%d: %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", + ntransf, (long long)M, (long long)N1, (long long)N2, (long long)N3, ti, + ntransf * M / ti); - int i = ntransf-1; // choose a data to check - BIGINT nt1 = (BIGINT)(0.37*N1), nt2 = (BIGINT)(0.26*N2), nt3 = (BIGINT)(-0.39*N3); // choose some mode index to check - CPX Ft = CPX(0,0), J = IMA*(FLT)isign; - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + double t = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N1,(long long)N2,(long long)N3,t,ntransf*M/t); - printf("\t\t\tspeedup \t T_FINUFFT3D1 / T_finufft3d1many = %.3g\n", t/ti); + printf("%d of: %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", + ntransf, (long long)M, (long long)N1, (long long)N2, (long long)N3, t, + ntransf * M / t); + printf("\t\t\tspeedup \t T_FINUFFT3D1 / T_finufft3d1many = %.3g\n", t / ti); // Check accuracy (worst over the ntransf) double maxerror = 0.0; for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(N,F_3d1+k*N,F+k*N)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(N, F_3d1 + k * N, F + k * N)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n", maxerror); free(F_3d1); - printf("test 3d2 many vs repeated single: ------------------------------------\n"); #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT m=0; m1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT3D2MANY(ntransf, M, x, y, z, c, isign, tol, N1, N2, N3, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N1,(long long)N2, (long long)N3, (long long)M,ti,ntransf*M/ti); - - i = ntransf-1; // choose a data to check - BIGINT jt = M/2; // check arbitrary choice of one targ pt - CPX ct = CPX(0,0); - BIGINT m=0; - for(BIGINT m3=-(N3/2); m3<=(N3-1)/2; ++m3){ - for (BIGINT m2=-(N2/2); m2<=(N2-1)/2; ++m2){ // loop in correct order over F - for (BIGINT m1=-(N1/2); m1<=(N1-1)/2; ++m1){ - ct += F[i*N + m++] * exp(J*(m1*x[jt]+m2*y[jt]+m3*z[jt])); // crude direct + printf("ntr=%d: (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", + ntransf, (long long)N1, (long long)N2, (long long)N3, (long long)M, ti, + ntransf * M / ti); + + i = ntransf - 1; // choose a data to check + BIGINT jt = M / 2; // check arbitrary choice of one targ pt + CPX ct = CPX(0, 0); + BIGINT m = 0; + for (BIGINT m3 = -(N3 / 2); m3 <= (N3 - 1) / 2; ++m3) { + for (BIGINT m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) { // loop in correct order over + // F + for (BIGINT m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) { + ct += F[i * N + m++] * exp(J * (m1 * x[jt] + m2 * y[jt] + m3 * z[jt])); // crude + // direct } } } - err = abs(ct-c[jt+i*M])/infnorm(M,c+i*M); - errmax = max(err,errmax); - printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n",(long long)jt,i,err); + err = abs(ct - c[jt + i * M]) / infnorm(M, c + i * M); + errmax = max(err, errmax); + printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n", (long long)jt, i, err); FFTW_FORGET_WISDOM(); // compare the result with FINUFFT3D2... - CPX* c_3d2 = (CPX*)malloc(sizeof(CPX)*M*ntransf); + CPX *c_3d2 = (CPX *)malloc(sizeof(CPX) * M * ntransf); timer.restart(); - for (int k=0; k1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N1,(long long)N2,(long long)N3,(long long)M,t,ntransf*M/t); - printf("\t\t\tspeedup \t T_FINUFFT3D2 / T_finufft3d2many = %.3g\n", t/ti); + printf("%d of: (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", + ntransf, (long long)N1, (long long)N2, (long long)N3, (long long)M, t, + ntransf * M / t); + printf("\t\t\tspeedup \t T_FINUFFT3D2 / T_finufft3d2many = %.3g\n", t / ti); - maxerror = 0.0; // worst error over the ntransf + maxerror = 0.0; // worst error over the ntransf for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(M,c_3d2+k*M,c+k*M)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(M, c_3d2 + k * M, c + k * M)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) = %.3g\n", maxerror); free(c_3d2); - printf("test 3d3 many vs repeated single: ------------------------------------\n"); FFTW_FORGET_WISDOM(); // reuse the strengths c, interpret N as number of targs: #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT3D3MANY(ntransf, M, x, y, z, c, isign, tol, N, s_freq, t_freq, u_freq, F, + &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,ti,ntransf*(M+N)/ti); + printf("ntr=%d: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", ntransf, + (long long)M, (long long)N, ti, ntransf * (M + N) / ti); - i = ntransf-1; // choose a transform to check - BIGINT kt = N/4; // check arbitrary choice of one targ pt - Ft = CPX(0,0); - for (BIGINT j=0;j1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,t,ntransf*(M+N)/t); - printf("\t\t\tspeedup \t T_FINUFFT3D3 / T_finufft3d3many = %.3g\n", t/ti); - - maxerror = 0.0; // worst error over the ntransf + printf("%d of: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", ntransf, + (long long)M, (long long)N, t, ntransf * (M + N) / t); + printf("\t\t\tspeedup \t T_FINUFFT3D3 / T_finufft3d3many = %.3g\n", t / ti); + + maxerror = 0.0; // worst error over the ntransf for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(N,f_3d3+k*N,F+k*N)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(N, f_3d3 + k * N, F + k * N)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n", maxerror); free(f_3d3); - - free(x); free(y); free(z); free(c); free(F); free(s_freq); free(t_freq); free(u_freq); - return (errmax>errfail); + + free(x); + free(y); + free(z); + free(c); + free(F); + free(s_freq); + free(t_freq); + free(u_freq); + return (errmax > errfail); } diff --git a/test/testutils.cpp b/test/testutils.cpp index cd2cd7bef..64b5d7a0a 100644 --- a/test/testutils.cpp +++ b/test/testutils.cpp @@ -9,16 +9,16 @@ and platform-indep, than having to compare the text output) Suggested compile (double/float versions): - g++ -std=c++14 -fopenmp testutils.cpp -I../include ../src/utils.o ../src/utils_precindep.o -o testutils -lgomp - g++ -std=c++14 -fopenmp testutils.cpp -I../include ../src/utils_32.o ../src/utils_precindep.o -o testutilsf -lgomp -DSINGLE + g++ -std=c++14 -fopenmp testutils.cpp -I../include ../src/utils.o + ../src/utils_precindep.o -o testutils -lgomp g++ -std=c++14 -fopenmp testutils.cpp + -I../include ../src/utils_32.o ../src/utils_precindep.o -o testutilsf -lgomp -DSINGLE */ // This switches FLT macro from double to float if SINGLE is defined, etc... #include using namespace finufft::utils; -int main(int argc, char* argv[]) -{ +int main(int argc, char *argv[]) { #ifdef SINGLE printf("testutilsf started...\n"); #else @@ -28,35 +28,41 @@ int main(int argc, char* argv[]) // test next235even... // Barnett 2/9/17, made smaller range 3/28/17. pass-fail 6/16/23 // The true outputs from {0,1,..,99}: - const BIGINT next235even_true[100] = {2, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 16, 16, 16, 16, 18, 18, 20, 20, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 32, 32, 36, 36, 36, 36, 40, 40, 40, 40, 48, 48, 48, 48, 48, 48, 48, 48, 50, 50, 54, 54, 54, 54, 60, 60, 60, 60, 60, 60, 64, 64, 64, 64, 72, 72, 72, 72, 72, 72, 72, 72, 80, 80, 80, 80, 80, 80, 80, 80, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 96, 96, 96, 96, 96, 96, 100, 100, 100}; - for (BIGINT n=0;n<100;++n) { + const BIGINT next235even_true[100] = { + 2, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 16, 16, 16, 16, 18, 18, 20, + 20, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 32, 32, 36, 36, 36, 36, 40, 40, 40, + 40, 48, 48, 48, 48, 48, 48, 48, 48, 50, 50, 54, 54, 54, 54, 60, 60, 60, 60, 60, + 60, 64, 64, 64, 64, 72, 72, 72, 72, 72, 72, 72, 72, 80, 80, 80, 80, 80, 80, 80, + 80, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 96, 96, 96, 96, 96, 96, 100, 100, 100}; + for (BIGINT n = 0; n < 100; ++n) { BIGINT o = next235even(n); BIGINT t = next235even_true[n]; - if (o!=t) { - printf("next235even(%lld) =\t%lld, error should be %lld!\n",(long long)n, (long long)o, (long long)t); + if (o != t) { + printf("next235even(%lld) =\t%lld, error should be %lld!\n", (long long)n, + (long long)o, (long long)t); return 1; } } - + // various old devel expts and comments... - //printf("starting huge next235even...\n"); // 1e11 takes 1 sec - //BIGINT n=(BIGINT)120573851963; - //printf("next235even(%ld) =\t%ld\n",n,next235even(n)); - //double* a; printf("%g\n",a[0]); // do deliberate segfault for bash debug! + // printf("starting huge next235even...\n"); // 1e11 takes 1 sec + // BIGINT n=(BIGINT)120573851963; + // printf("next235even(%ld) =\t%ld\n",n,next235even(n)); + // double* a; printf("%g\n",a[0]); // do deliberate segfault for bash debug! // test vector norms and norm difference routines... now pass-fail 6/16/23 BIGINT M = 1e4; std::vector a(M), b(M); - for (BIGINT j=0; j relerr) return 1; - if (abs(twonorm(M,&a[0]) - sqrt((FLT)M)) > relerr*sqrt((FLT)M)) return 1; - b[0] = CPX(0.0,0.0); // perturb b from a - if (abs(errtwonorm(M,&a[0],&b[0]) - 1.0) > relerr) return 1; - if (abs(sqrt((FLT)M)* relerrtwonorm(M,&a[0],&b[0]) - 1.0) > relerr) return 1; + FLT relerr = 2.0 * EPSILON; // 1 ULP, fine since 1.0 rep exactly + if (abs(infnorm(M, &a[0]) - 1.0) > relerr) return 1; + if (abs(twonorm(M, &a[0]) - sqrt((FLT)M)) > relerr * sqrt((FLT)M)) return 1; + b[0] = CPX(0.0, 0.0); // perturb b from a + if (abs(errtwonorm(M, &a[0], &b[0]) - 1.0) > relerr) return 1; + if (abs(sqrt((FLT)M) * relerrtwonorm(M, &a[0], &b[0]) - 1.0) > relerr) return 1; #ifdef SINGLE printf("testutilsf passed.\n");