-
Notifications
You must be signed in to change notification settings - Fork 528
/
Copy pathgpu_cuda.h
156 lines (143 loc) · 4.86 KB
/
gpu_cuda.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#pragma once
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#define GPU_MAX_NBOR_SIZE 4096

// Check the result of a CUDA runtime API call; on failure, print a
// diagnostic (file:line plus the CUDA error string) and exit.
// Wrapped in do { } while(0) so the macro expands safely inside an
// unbraced if/else.
#define DPErrcheck(res) do {DPAssert((res), __FILE__, __LINE__);} while(0)

// Report a CUDA runtime error. For out-of-memory errors an extra
// troubleshooting guide is printed. If `abort` is true (default), the
// process exits with the CUDA error code.
inline void DPAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
  if (code != cudaSuccess) {
    fprintf(stderr, "cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (code == cudaErrorMemoryAllocation) {
      // out of memory
      // TODO: I have no idea how to throw errors back to Python interface
      fprintf(stderr, "Your memory is not enough, thus an error has been raised "
              "above. You need to take the following actions:\n"
              "1. Check if the network size of the model is too large.\n"
              "2. Check if the batch size of training or testing is too large. "
              "You can set the training batch size to `auto`.\n"
              "3. Check if the number of atoms is too large.\n"
              "4. Check if another program is using the same GPU by executing `nvidia-smi`. "
              "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` "
              "environment variable.\n");
    }
    if (abort) exit(code);
  }
}
// Check the result of a CUDA call made during neighbor-list construction;
// on failure, report an illegal neighbor-list sorting error and exit.
// Wrapped in do { } while(0) so the macro expands safely inside an
// unbraced if/else.
#define nborErrcheck(res) do {nborAssert((res), __FILE__, __LINE__);} while(0)

// Report a CUDA error raised while sorting the neighbor list. For
// out-of-memory errors an extra troubleshooting guide is printed. If
// `abort` is true (default), the process exits with the CUDA error code.
inline void nborAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
  if (code != cudaSuccess) {
    fprintf(stderr, "cuda assert: %s %s %d\n", "DeePMD-kit:\tillegal nbor list sorting", file, line);
    if (code == cudaErrorMemoryAllocation) {
      // out of memory
      // TODO: I have no idea how to throw errors back to Python interface
      fprintf(stderr, "Your memory is not enough, thus an error has been raised "
              "above. You need to take the following actions:\n"
              "1. Check if the network size of the model is too large.\n"
              "2. Check if the batch size of training or testing is too large. "
              "You can set the training batch size to `auto`.\n"
              "3. Check if the number of atoms is too large.\n"
              "4. Check if another program is using the same GPU by executing `nvidia-smi`. "
              "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` "
              "environment variable.\n");
    }
    if (abort) exit(code);
  }
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
// Software double-precision atomicAdd via an atomicCAS loop, for devices
// with compute capability < 6.0 which lack the native double overload.
// This is the reference implementation from the CUDA C++ Programming Guide.
static __inline__ __device__ double atomicAdd(
double* address,
double val)
{
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
namespace deepmd {

// Copy host.size() elements from a host vector to device memory.
// `device` must point to a device allocation of at least host.size() elements.
template <typename FPTYPE>
void memcpy_host_to_device(
    FPTYPE * device,
    const std::vector<FPTYPE> &host)
{
  // host.data() (unlike &host[0]) is well-defined even for an empty vector.
  DPErrcheck(cudaMemcpy(device, host.data(), sizeof(FPTYPE) * host.size(), cudaMemcpyHostToDevice));
}

// Copy `size` elements from a host buffer to device memory.
template <typename FPTYPE>
void memcpy_host_to_device(
    FPTYPE * device,
    const FPTYPE * host,
    const int size)
{
  DPErrcheck(cudaMemcpy(device, host, sizeof(FPTYPE) * size, cudaMemcpyHostToDevice));
}

// Copy host.size() elements from device memory into a host vector.
// The vector must already be resized to the number of elements expected.
template <typename FPTYPE>
void memcpy_device_to_host(
    const FPTYPE * device,
    std::vector<FPTYPE> &host)
{
  DPErrcheck(cudaMemcpy(host.data(), device, sizeof(FPTYPE) * host.size(), cudaMemcpyDeviceToHost));
}

// Copy `size` elements from device memory to a host buffer.
template <typename FPTYPE>
void memcpy_device_to_host(
    const FPTYPE * device,
    FPTYPE * host,
    const int size)
{
  DPErrcheck(cudaMemcpy(host, device, sizeof(FPTYPE) * size, cudaMemcpyDeviceToHost));
}

// Allocate device memory sized to hold host.size() elements (no copy).
template <typename FPTYPE>
void malloc_device_memory(
    FPTYPE * &device,
    const std::vector<FPTYPE> &host)
{
  DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size()));
}

// Allocate device memory for `size` elements.
template <typename FPTYPE>
void malloc_device_memory(
    FPTYPE * &device,
    const int size)
{
  DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size));
}

// Allocate device memory sized to the host vector and synchronously copy
// its contents to the device.
template <typename FPTYPE>
void malloc_device_memory_sync(
    FPTYPE * &device,
    const std::vector<FPTYPE> &host)
{
  malloc_device_memory(device, host);
  memcpy_host_to_device(device, host);
}

// Allocate device memory for `size` elements and synchronously copy them
// from the host buffer.
template <typename FPTYPE>
void malloc_device_memory_sync(
    FPTYPE * &device,
    const FPTYPE * host,
    const int size)
{
  malloc_device_memory(device, size);
  memcpy_host_to_device(device, host, size);
}

// Free device memory if allocated, then reset the pointer to NULL so a
// repeated delete (or accidental reuse) of the same handle is harmless.
template <typename FPTYPE>
void delete_device_memory(
    FPTYPE * &device)
{
  if (device != NULL) {
    DPErrcheck(cudaFree(device));
    device = NULL;
  }
}

// Byte-wise fill of `size` elements of device memory.
// NOTE: cudaMemset writes the low byte of (int)var into every byte, so for
// multi-byte FPTYPE only var == 0 yields a meaningful element value.
template <typename FPTYPE>
void memset_device_memory(
    FPTYPE * device,
    const FPTYPE var,
    const int size)
{
  DPErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size));
}

} // end of namespace deepmd