utils.h
#ifndef FINUFFT_UTILS_H
#define FINUFFT_UTILS_H
// octave (mkoctfile) needs this otherwise it doesn't know what int64_t is!
#include <cstdint>

#include <cmath>
#include <complex>
#include <type_traits>

#include <cuComplex.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include <cufinufft/types.h>

#include <sys/time.h>

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
// native double-precision atomicAdd is available; nothing to define
#else
// Software fallback for double atomicAdd on devices of compute capability < 6.0,
// implemented as a compare-and-swap loop on the 64-bit integer representation.
__inline__ __device__ double atomicAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old             = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif

namespace cufinufft {
namespace utils {

// RAII guard: makes the given CUDA device current for the lifetime of the object,
// then restores the previously current device in the destructor.
class WithCudaDevice {
public:
  WithCudaDevice(int device) {
    cudaGetDevice(&orig_device_);
    cudaSetDevice(device);
  }
  ~WithCudaDevice() { cudaSetDevice(orig_device_); }

private:
  int orig_device_;
};
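// A minimal usage sketch (illustrative only, not part of this header); the device
// index 1 and the surrounding caller code are hypothetical:
//
//   {
//     cufinufft::utils::WithCudaDevice guard(1); // make device 1 current
//     // ... allocate buffers / launch kernels on device 1 ...
//   } // guard's destructor restores the previously current device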
// jfm timer class: simple wall-clock timer based on gettimeofday()
class CNTime {
public:
  void start();
  double restart();
  double elapsedsec();

private:
  struct timeval initial;
};
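// A minimal usage sketch (illustrative only; the timed work is a placeholder):
//
//   cufinufft::utils::CNTime timer;
//   timer.start();
//   // ... host-side work to be timed ...
//   double t = timer.elapsedsec(); // wall-clock seconds since start()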
// ahb math helpers
CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b);

// sup (infinity) norm of a length-n complex vector: max_m |a[m]|
template<typename T> T infnorm(int n, std::complex<T> *a) {
  T nrm = 0.0;
  for (int m = 0; m < n; ++m) {
    T aa = real(conj(a[m]) * a[m]); // |a[m]|^2
    if (aa > nrm) nrm = aa;
  }
  return sqrt(nrm);
}
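// A small worked example (illustrative only): for the hypothetical array below the
// largest magnitude is |{0,-2}| = 2, so infnorm returns 2.0.
//
//   std::complex<double> a[3] = {{1.0, 0.0}, {0.0, -2.0}, {0.5, 0.5}};
//   double nrm = cufinufft::utils::infnorm<double>(3, a); // == 2.0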
#ifdef __CUDA_ARCH__
// Index range [xstart, xend] covered by a window of width ns centered at x,
// i.e. xstart = ceil(x - ns/2) and xend = floor(x + ns/2), computed with
// round-up / round-down fused multiply-adds and float-to-int conversions.
__forceinline__ __device__ auto interval(const int ns, const float x) {
  const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x)); // ceil(x - ns/2)
  const auto xend   = __float2int_rd(__fmaf_rd(ns, .5f, x));  // floor(x + ns/2)
  return int2{xstart, xend};
}
// double-precision overload; same computation as above
__forceinline__ __device__ auto interval(const int ns, const double x) {
  const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x));
  const auto xend   = __double2int_rd(__fma_rd(ns, .5, x));
  return int2{xstart, xend};
}
#endif
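// Worked example (illustrative only): for a window of width ns = 4 centered at
// x = 10.3f, interval(4, 10.3f) returns {ceil(8.3), floor(12.3)} = {9, 12},
// i.e. the indices 9..12 covered by the window.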
// ALLOCA_SUPPORTED is 1 when the compiler is NVCC >= 11.3 (and, for NVCC 11.x,
// the target compute capability is >= 6.0), in which case alloca() may be used.
#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__)
#if (__CUDACC_VER_MAJOR__ > 11) || \
    (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600)
#define ALLOCA_SUPPORTED 1
// Windows compatibility: alloca is declared in <malloc.h> on MSVC
#if __has_include(<malloc.h>)
#include <malloc.h>
#endif
#else
#define ALLOCA_SUPPORTED 0
#endif
#else
#define ALLOCA_SUPPORTED 0
#endif
#if defined(__CUDA_ARCH__)
#if __CUDA_ARCH__ >= 900
#define COMPUTE_CAPABILITY_90_OR_HIGHER 1
#else
#define COMPUTE_CAPABILITY_90_OR_HIGHER 0
#endif
#else
#define COMPUTE_CAPABILITY_90_OR_HIGHER 0
#endif
/**
 * Complex atomic add on a shared-memory address.
 * CUDA does not support atomic operations on complex numbers in shared memory
 * directly, so the real and imaginary parts are added atomically one at a time.
 */
template<typename T>
static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex<T> *address,
                                                              cuda_complex<T> res) {
  const auto raw_address = reinterpret_cast<T *>(address);
  atomicAdd(raw_address, res.x);
  atomicAdd(raw_address + 1, res.y);
}
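// Usage sketch inside a hypothetical device kernel (the tile array and the value
// below are made up for illustration):
//
//   __shared__ cuda_complex<float> tile[128];
//   cuda_complex<float> contrib{1.0f, -0.5f};
//   atomicAddComplexShared<float>(&tile[threadIdx.x % 128], contrib);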
/**
 * Complex atomic add on a global-memory address.
 * Since compute capability 9.0, atomicAdd on a float2 in global memory is
 * supported natively, so we use it where available; otherwise we fall back to
 * adding the real and imaginary parts separately.
 */
template<typename T>
static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex<T> *address,
                                                              cuda_complex<T> res) {
  if constexpr (std::is_same_v<cuda_complex<T>, float2> &&
                COMPUTE_CAPABILITY_90_OR_HIGHER) {
    atomicAdd(address, res);
  } else {
    atomicAddComplexShared<T>(address, res);
  }
}
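// Usage sketch (hypothetical kernel, not part of this header): accumulating into an
// output grid in global memory. With single precision on compute capability >= 9.0
// this takes the vectorized float2 atomicAdd path; otherwise it falls back to two
// scalar atomicAdds via atomicAddComplexShared.
//
//   __global__ void accumulate(cuda_complex<float> *fw, int idx, cuda_complex<float> v) {
//     atomicAddComplexGlobal<float>(&fw[idx], v);
//   }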
} // namespace utils
} // namespace cufinufft
#endif // FINUFFT_UTILS_H