Skip to content

Commit

Permalink
basic benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
DiamonDinoia committed Jul 3, 2024
1 parent 5dde122 commit 45333fa
Show file tree
Hide file tree
Showing 12 changed files with 156 additions and 62 deletions.
19 changes: 9 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.19)
cmake_minimum_required(VERSION 3.23)

project(finufft VERSION 2.2.0 LANGUAGES C CXX)

Expand All @@ -23,7 +23,7 @@ if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS)
endif ()
set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)")
set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library")

set(FINUFFT_CUDA_ARCHITECTURES "all-major" CACHE STRING "CUDA architectures to build for (e.g. 60;70;75;)")
# All options go here
# sphinx tag (don't remove): @cmake_opts_start
option(FINUFFT_BUILD_EXAMPLES "Whether to build the FINUFFT examples" OFF)
Expand Down Expand Up @@ -219,30 +219,29 @@ if (FINUFFT_USE_CUDA)
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
message("FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to '60;70;75;'")
message("See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.")
set(CMAKE_CUDA_ARCHITECTURES "60;70;75" CACHE STRING "" FORCE)
endif ()
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
add_subdirectory(src/cuda)
if (BUILD_TESTING AND FINUFFT_BUILD_TESTS)
if (BUILD_TESTING OR FINUFFT_BUILD_TESTS)
add_subdirectory(perftest/cuda)
add_subdirectory(test/cuda)
endif ()

list(APPEND INSTALL_TARGETS cufinufft cufinufft_static)
endif ()

# Add tests defined in their own directory
if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CPU)
if (FINUFFT_USE_CPU AND (BUILD_TESTING OR FINUFFT_BUILD_TESTS))
add_subdirectory(test)
add_subdirectory(perftest)
endif ()

if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CUDA)
add_subdirectory(test/cuda)
if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_CPU)
add_subdirectory(examples)
endif ()

if (FINUFFT_BUILD_EXAMPLES)
add_subdirectory(examples)
if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_GPU)
add_subdirectory(examples/cuda)
endif ()

if (FINUFFT_BUILD_FORTRAN)
Expand Down
4 changes: 4 additions & 0 deletions include/cufinufft/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ template<typename T>
void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
T *fwkerhalf, finufft_spread_opts opts);

template<typename T>
std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
int bin_size_z);

} // namespace common
} // namespace cufinufft
#endif
66 changes: 42 additions & 24 deletions include/cufinufft/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ static void cufinufft_setup_binsize(int type, int dim, cufinufft_opts *opts) {
} break;
case 3: {
switch (opts->gpu_method) {
case 0:
case 1:
case 2: {
opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex;
Expand Down Expand Up @@ -109,17 +110,16 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
}

// Mult-GPU support: set the CUDA Device ID:
const int device_id = opts == NULL ? 0 : opts->gpu_device_id;
const int device_id = opts == nullptr ? 0 : opts->gpu_device_id;
cufinufft::utils::WithCudaDevice device_swapper(device_id);

/* allocate the plan structure, assign address to user pointer. */
cufinufft_plan_t<T> *d_plan = new cufinufft_plan_t<T>;
*d_plan_ptr = d_plan;
auto *d_plan = new cufinufft_plan_t<T>;
*d_plan_ptr = d_plan;
// Zero out your struct, (sets all pointers to NULL)
memset(d_plan, 0, sizeof(*d_plan));

/* If a user has not supplied their own options, assign defaults for them. */
if (opts == NULL) { // use default opts
if (opts == nullptr) { // use default opts
cufinufft_default_opts(&(d_plan->opts));
} else { // or read from what's passed in
d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect
Expand All @@ -138,26 +138,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
}

auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream;

/* Automatically set GPU method. */
if (d_plan->opts.gpu_method == 0) {
/* For type 1, we default to method 2 (SM) since this is generally faster.
* However, in the special case of _double precision_ in _three dimensions_
* with more than _three digits of precision_, there is note enough shared
* memory for this to work. As a result, we will default to method 1 (GM) in
* this special case.
*
* For type 2, we always default to method 1 (GM). */
if (type == 1 && (sizeof(T) == 4 || dim < 3 || tol >= 1e-3))
d_plan->opts.gpu_method = 2;
else if (type == 1 && tol < 1e-3)
d_plan->opts.gpu_method = 1;
else if (type == 2)
d_plan->opts.gpu_method = 1;
}

/* Setup Spreader */
using namespace cufinufft::common;
/* Setup Spreader */

// can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK
if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) {
delete *d_plan_ptr;
Expand All @@ -180,6 +163,41 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
if (dim > 2)
set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3,
d_plan->opts.gpu_obinsizez);

// dynamically request the maximum amount of shared memory available
// for the spreader

/* Automatically set GPU method. */
if (d_plan->opts.gpu_method == 0) {
/* For type 1, we default to method 2 (SM) since this is generally faster.
* However, in the special case of _double precision_ in _three dimensions_
* with more than _three digits of precision_, there is note enough shared
* memory for this to work. As a result, we will default to method 1 (GM) in
* this special case.
*
* For type 2, we always default to method 1 (GM). */

// query the device for the amount of shared memory available
int shared_mem_per_block{};
cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin,
device_id);
RETURN_IF_CUDA_ERROR
// compute the amount of shared memory required for the method
const auto shared_mem_required =
shared_memory_required<T>(dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
printf("Shared memory available: %d KB, required: %d KB\n", shared_mem_per_block,
shared_mem_required);
if ((shared_mem_required > shared_mem_per_block)) {
d_plan->opts.gpu_method = 1;
printf("choosing method 1\n");
} else {
d_plan->opts.gpu_method = 2;
printf("choosing method 2\n");
}
printf("using method %d\n", d_plan->opts.gpu_method);
}

int fftsign = (iflag >= 0) ? 1 : -1;

d_plan->nf1 = nf1;
Expand Down
1 change: 1 addition & 0 deletions perftest/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
add_executable(cuperftest cuperftest.cu)
target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
target_link_libraries(cuperftest PUBLIC cufinufft)
#file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/bench.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
13 changes: 13 additions & 0 deletions perftest/cuda/bench.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --M 2e6 --method 0 --tol 1e-4
./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e1 --N3 1e1 --M 2e6 --method 0 --tol 1e-4
./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e1 --M 2e6 --method 0 --tol 1e-4
./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4
./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4
#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e4 --N2 1e4 --N3 1e4 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e6 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e7 --N2 1e7 --M 2e6 --method 0 --tol 1e-10
#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e8 --M 2e6 --method 0 --tol 1e-10
41 changes: 23 additions & 18 deletions perftest/cuda/cuperftest.cu
Original file line number Diff line number Diff line change
Expand Up @@ -275,24 +275,29 @@ template<typename T> void run_test(test_options_t &test_opts) {
}

const int64_t nupts_tot = M * test_opts.n_runs * ntransf;

printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n");
printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(),
h2d_timer.mean(), h2d_timer.std());
printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), makeplan_timer.tot(),
makeplan_timer.mean(), makeplan_timer.std());
printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(),
setpts_timer.mean(), setpts_timer.std(), nupts_tot * 1000 / setpts_timer.tot(),
setpts_timer.tot() * 1E6 / nupts_tot);
printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(),
execute_timer.mean(), execute_timer.std(),
nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / nupts_tot);
printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(),
d2h_timer.mean(), d2h_timer.std());
printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(),
amortized_timer.mean(), amortized_timer.std(),
nupts_tot * 1000 / amortized_timer.tot(),
amortized_timer.tot() * 1E6 / nupts_tot);
//
// printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n");
// printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(),
// h2d_timer.mean(), h2d_timer.std());
// printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(),
// makeplan_timer.tot(),
// makeplan_timer.mean(), makeplan_timer.std());
// printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(),
// setpts_timer.mean(), setpts_timer.std(), nupts_tot * 1000 /
// setpts_timer.tot(), setpts_timer.tot() * 1E6 / nupts_tot);
// printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(),
// execute_timer.mean(), execute_timer.std(),
// nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 /
// nupts_tot);
// printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(),
// d2h_timer.mean(), d2h_timer.std());
// printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(),
// amortized_timer.mean(), amortized_timer.std(),
// nupts_tot * 1000 / amortized_timer.tot(),
// amortized_timer.tot() * 1E6 / nupts_tot);
// print numpts / s
printf("setpts pts/s: %g\n", float(nupts_tot) * 1000 / setpts_timer.tot());
printf("execute pts/s: %g\n", float(nupts_tot) * 1000 / execute_timer.tot());
}

int main(int argc, char *argv[]) {
Expand Down
3 changes: 1 addition & 2 deletions src/cuda/1d/cufinufft1d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@
#include <iostream>
#include <type_traits>

#include <assert.h>
#include <cassert>
#include <cufft.h>

#include <cufinufft/cudeconvolve.h>
#include <cufinufft/memtransfer.h>
#include <cufinufft/spreadinterp.h>
#include <cufinufft/types.h>

Expand Down
16 changes: 11 additions & 5 deletions src/cuda/3d/spread3d_wrapper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -536,21 +536,27 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_
size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) *
(bin_size_y + 2 * ceil(ns / 2.0)) *
(bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
if (sharedplanorysize > 49152) {
std::cerr << "[cuspread3d_subprob] error: not enough shared memory ("
<< sharedplanorysize << ")" << std::endl;
return FINUFFT_ERR_INSUFFICIENT_SHMEM;
}
// if (sharedplanorysize > 49152) {
// std::cerr << "[cuspread3d_subprob] error: not enough shared memory ("
// << sharedplanorysize << ")" << std::endl;
// return FINUFFT_ERR_INSUFFICIENT_SHMEM;
// }

for (int t = 0; t < blksize; t++) {
if (d_plan->opts.gpu_kerevalmeth) {
cudaFuncSetAttribute(spread_3d_subprob<T, 1>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
sharedplanorysize);
spread_3d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3,
sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y,
bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize,
numbins[0], numbins[1], numbins[2], d_idxnupts);
RETURN_IF_CUDA_ERROR
} else {
cudaFuncSetAttribute(spread_3d_subprob<T, 0>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
sharedplanorysize);
spread_3d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3,
sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y,
Expand Down
17 changes: 15 additions & 2 deletions src/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,34 +24,47 @@ set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE)

add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC})
target_include_directories(cufinufft_common_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
set_property(TARGET cufinufft_common_objects PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(
cufinufft_common_objects PROPERTIES
POSITION_INDEPENDENT_CODE ON
CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}
)

add_library(cufinufft_objects OBJECT ${PRECISION_DEPENDENT_SRC})
target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
set_property(TARGET cufinufft_objects PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(
cufinufft_objects PROPERTIES
POSITION_INDEPENDENT_CODE ON
CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}
)

add_library(cufinufft SHARED
$<TARGET_OBJECTS:cufinufft_common_objects>
$<TARGET_OBJECTS:cufinufft_objects>
)
target_include_directories(cufinufft PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
target_link_libraries(cufinufft CUDA::cudart CUDA::cufft CUDA::nvToolsExt)
set_target_properties(
cufinufft PROPERTIES
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}
)

add_library(cufinufft_static STATIC
$<TARGET_OBJECTS:cufinufft_common_objects>
$<TARGET_OBJECTS:cufinufft_objects>
)
target_include_directories(cufinufft_static PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
if(WIN32)
target_link_libraries(cufinufft_static PUBLIC CUDA::cudart CUDA::cufft CUDA::nvToolsExt)
else()
target_link_libraries(cufinufft_static PUBLIC CUDA::cudart_static CUDA::cufft_static CUDA::nvToolsExt)
endif()
set_target_properties(
cufinufft_static PROPERTIES
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}
ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
)

file(GLOB CUFINUFFT_PUBLIC_HEADERS "${CMAKE_SOURCE_DIR}/include/cufinufft*.h")
Expand Down
28 changes: 28 additions & 0 deletions src/cuda/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,28 @@ void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<doubl
}
}

template<typename T>
std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
int bin_size_z) {
printf("dim, ns, bin_size_x, bin_size_y, bin_size_z: %d %d %d %d %d\n", dim, ns,
bin_size_x, bin_size_y, bin_size_z);
int adjusted_ns = bin_size_x + ((ns + 1) / 2) * 2;

if (dim == 1) {
return adjusted_ns * sizeof(cuda_complex<T>);
}

adjusted_ns *= (bin_size_y + ((ns + 1) / 2) * 2);

if (dim == 2) {
return adjusted_ns * sizeof(cuda_complex<T>);
}

adjusted_ns *= (bin_size_z + ((ns + 1) / 2) * 2);

return adjusted_ns * sizeof(cuda_complex<T>);
}

template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f,
std::complex<double> *a, float *fwkerhalf,
finufft_spread_opts opts);
Expand Down Expand Up @@ -227,5 +249,11 @@ template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf,
finufft_spread_opts opts);
template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf,
finufft_spread_opts opts);

template std::size_t shared_memory_required<float>(int dim, int ns, int bin_size_x,
int bin_size_y, int bin_size_z);
template std::size_t shared_memory_required<double>(int dim, int ns, int bin_size_x,
int bin_size_y, int bin_size_z);

} // namespace common
} // namespace cufinufft
2 changes: 1 addition & 1 deletion src/cuda/spreadinterp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet
ier = FINUFFT_WARN_EPS_TOO_SMALL;
}
opts.nspread = ns;
opts.ES_halfwidth = (T)ns / 2; // constants to help ker eval (except Horner)
opts.ES_halfwidth = T(ns * .5); // constants to help ker eval (except Horner)
opts.ES_c = 4.0 / (T)(ns * ns);

T betaoverns = 2.30; // gives decent betas for default sigma=2.0
Expand Down
8 changes: 8 additions & 0 deletions test/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ foreach(srcfile ${test_src})
add_executable(${executable} ${srcfile})
target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS})
target_link_libraries(${executable} PUBLIC cufinufft m)
set_target_properties(${executable} PROPERTIES
LINKER_LANGUAGE CUDA
CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}
)
message(STATUS "Adding test ${executable}"
" with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}"
" and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}"
)
endforeach()

function(add_tests PREC REQ_TOL CHECK_TOL)
Expand Down

0 comments on commit 45333fa

Please sign in to comment.