fix bug of gpu namespace
denghuilu committed Mar 19, 2021
1 parent 71dc7a8 commit 53de560
Showing 15 changed files with 58 additions and 61 deletions.
26 changes: 26 additions & 0 deletions source/lib/include/gpu_cuda.h
@@ -13,6 +13,23 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=
}
}

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static __inline__ __device__ double atomicAdd(
double* address,
double val)
{
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
return __longlong_as_double(old);
}
#endif

template <typename FPTYPE>
void memcpy_host_to_device(
FPTYPE * device,
@@ -70,4 +87,13 @@ void delete_device_memory(
if (device != NULL) {
cudaErrcheck(cudaFree(device));
}
}

template <typename FPTYPE>
void memset_device_memory(
FPTYPE * device,
const FPTYPE var,
const int size)
{
cudaErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size));
}
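
The new memset_device_memory helper forwards to cudaMemset, which fills bytes rather than FPTYPE elements, so var = 0 zeroes a buffer but any other value is truncated to a repeated byte pattern and will not produce var per element. A minimal standalone sketch of that behavior (illustrative only, not part of this commit):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const int size = 4;
  double *device = NULL;
  cudaMalloc(&device, sizeof(double) * size);
  // Byte-wise fill: 0 gives 0.0 per element; a value such as 1 would set
  // every byte to 0x01 and yield a tiny garbage double, not 1.0.
  cudaMemset(device, 0, sizeof(double) * size);
  double host[size];
  cudaMemcpy(host, device, sizeof(double) * size, cudaMemcpyDeviceToHost);
  printf("%g %g %g %g\n", host[0], host[1], host[2], host[3]);
  cudaFree(device);
  return 0;
}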
3 changes: 2 additions & 1 deletion source/lib/src/cuda/gelu.cu
@@ -1,6 +1,5 @@
#include "gelu.h"
#include "device.h"
#include "gpu_cuda.h"

template <typename FPTYPE>
__global__ void gelu(
@@ -49,6 +48,7 @@ __global__ void gelu_grad_grad(
out[idx] = dy[idx] * dy_2[idx] * (0.134145 * SQRT_2_PI * xx[idx] * xx[idx] * (1 - var1 * var1) - SQRT_2_PI * xx[idx] * var2 * (0.134145 * xx[idx] * xx[idx] + 1) * var1 + var2);
}

namespace deepmd {
template<typename FPTYPE>
void gelu_gpu_cuda(
FPTYPE * out,
@@ -94,3 +94,4 @@ template void gelu_grad_gpu_cuda<float>(float * out, const float * x, const floa
template void gelu_grad_gpu_cuda<double>(double * out, const double * x, const double * dy, const int size);
template void gelu_grad_grad_gpu_cuda<float>(float * out, const float * x, const float * dy, const float * dy_2, const int size);
template void gelu_grad_grad_gpu_cuda<double>(double * out, const double * x, const double * dy, const double * dy_2, const int size);
}
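
The pattern applied here recurs in every .cu file below: the __global__ kernels stay at file scope, while the host-side wrappers and their explicit instantiations move inside namespace deepmd, so the emitted symbols become deepmd::gelu_gpu_cuda<float> and friends. A minimal sketch of the same structure, with a hypothetical gelu_ref and an identity body standing in for the real kernel launch:

#include <cstdio>

namespace deepmd {
template <typename FPTYPE>
void gelu_ref(FPTYPE *out, const FPTYPE *x, const int size) {
  for (int ii = 0; ii < size; ++ii) out[ii] = x[ii];  // placeholder body
}
// Explicit instantiations inside the namespace emit namespaced symbols.
template void gelu_ref<float>(float *, const float *, const int);
template void gelu_ref<double>(double *, const double *, const int);
}  // namespace deepmd

int main() {
  double x[2] = {0.5, 1.5}, out[2];
  deepmd::gelu_ref<double>(out, x, 2);  // callers must now qualify the name
  printf("%g %g\n", out[0], out[1]);
  return 0;
}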
12 changes: 6 additions & 6 deletions source/lib/src/cuda/prod_env_mat.cu
@@ -4,8 +4,6 @@
#include <cub/block/block_store.cuh>
#include <cub/block/block_radix_sort.cuh>

using namespace deepmd;

// common part of prod_env_mat
template <
typename Key,
@@ -156,7 +154,7 @@ void format_nbor_list_1024 (
int_64 * key,
const FPTYPE* coord,
const int* type,
const InputNlist & gpu_inlist,
const deepmd::InputNlist & gpu_inlist,
const int& nloc,
const float& rcut,
int * i_idx)
@@ -182,7 +180,7 @@ void format_nbor_list_2048 (
int_64 * key,
const FPTYPE* coord,
const int* type,
const InputNlist & gpu_inlist,
const deepmd::InputNlist & gpu_inlist,
const int& nloc,
const float& rcut,
int * i_idx)
@@ -208,7 +206,7 @@ void format_nbor_list_4096 (
int_64 * key,
const FPTYPE* coord,
const int* type,
const InputNlist & gpu_inlist,
const deepmd::InputNlist & gpu_inlist,
const int& nloc,
const float& rcut,
int * i_idx)
@@ -234,7 +232,7 @@ void format_nbor_list(
int * nlist,
const FPTYPE * coord,
const int * type,
const InputNlist & gpu_inlist,
const deepmd::InputNlist & gpu_inlist,
int * array_int,
int_64 * array_longlong,
const int max_nbor_size,
@@ -433,6 +431,7 @@ __global__ void compute_env_mat_r(
}
}

namespace deepmd {
template <typename FPTYPE>
void prod_env_mat_a_gpu_cuda(
FPTYPE * em,
@@ -505,3 +504,4 @@ template void prod_env_mat_a_gpu_cuda<float>(float * em, float * em_deriv, float
template void prod_env_mat_a_gpu_cuda<double>(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
template void prod_env_mat_r_gpu_cuda<float>(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
template void prod_env_mat_r_gpu_cuda<double>(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
}
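
This file also drops its file-wide using-directive (using namespace deepmd;) and qualifies deepmd::InputNlist at each use instead, which keeps the rest of the deepmd namespace out of a translation unit full of file-scope kernels. A small sketch of the qualified-use style (InputNlist is reduced to a stand-in struct here):

namespace deepmd {
struct InputNlist { int inum; };  // simplified stand-in for the real type
}

// Only the names actually needed are mentioned, so nothing else from
// deepmd can collide with local helpers or kernels.
static int local_count(const deepmd::InputNlist &nlist) {
  return nlist.inum;
}

int main() {
  deepmd::InputNlist nl{4};
  return local_count(nl) == 4 ? 0 : 1;
}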
19 changes: 2 additions & 17 deletions source/lib/src/cuda/prod_force.cu
@@ -2,23 +2,6 @@
#include "gpu_cuda.h"
#include "prod_force.h"

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static __inline__ __device__ double atomicAdd(
double* address,
double val)
{
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
return __longlong_as_double(old);
}
#endif

template <
typename FPTYPE,
int THREADS_PER_BLOCK>
@@ -112,6 +95,7 @@ __global__ void force_deriv_wrt_neighbors_r(
net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]);
}

namespace deepmd {
template<typename FPTYPE>
void prod_force_a_gpu_cuda(
FPTYPE * force,
@@ -172,3 +156,4 @@ template void prod_force_a_gpu_cuda<float>(float * force, const float * net_deri
template void prod_force_a_gpu_cuda<double>(double * force, const double * net_deriv, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei);
template void prod_force_r_gpu_cuda<float>(float * force, const float * net_deriv, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei);
template void prod_force_r_gpu_cuda<double>(double * force, const double * net_deriv, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei);
}
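
Both prod_force.cu and prod_virial.cu previously carried their own copy of the pre-sm_60 double atomicAdd shim; the commit keeps a single definition in gpu_cuda.h, so any kernel accumulating into double buffers gets a working atomicAdd(double*, double) on every architecture. A standalone sketch of the accumulation pattern (sum_kernel and the sizes are illustrative; build with nvcc -arch=sm_60 or newer to get the CUDA built-in, on older targets the header's CAS fallback is required):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void sum_kernel(double *acc, const double *vals, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(acc, vals[i]);  // built-in on sm_60+, shim below that
}

int main() {
  const int n = 256;
  double h_vals[n], h_acc = 0.0, *d_vals, *d_acc;
  for (int i = 0; i < n; ++i) h_vals[i] = 1.0;
  cudaMalloc(&d_vals, sizeof(double) * n);
  cudaMalloc(&d_acc, sizeof(double));
  cudaMemcpy(d_vals, h_vals, sizeof(double) * n, cudaMemcpyHostToDevice);
  cudaMemset(d_acc, 0, sizeof(double));
  sum_kernel<<<(n + 127) / 128, 128>>>(d_acc, d_vals, n);
  cudaMemcpy(&h_acc, d_acc, sizeof(double), cudaMemcpyDeviceToHost);
  printf("sum = %g\n", h_acc);  // expect 256
  cudaFree(d_vals);
  cudaFree(d_acc);
  return 0;
}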
19 changes: 2 additions & 17 deletions source/lib/src/cuda/prod_virial.cu
@@ -1,23 +1,6 @@
#include "gpu_cuda.h"
#include "prod_virial.h"

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static __inline__ __device__ double atomicAdd(
double* address,
double val)
{
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
return __longlong_as_double(old);
}
#endif

template<typename FPTYPE>
__global__ void virial_deriv_wrt_neighbors_a(
FPTYPE * virial,
@@ -90,6 +73,7 @@ __global__ void virial_deriv_wrt_neighbors_r(
net_deriv[idx * ndescrpt + idy] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]);
}

namespace deepmd {
template<typename FPTYPE>
void prod_virial_a_gpu_cuda(
FPTYPE * virial,
@@ -152,3 +136,4 @@ template void prod_virial_a_gpu_cuda<float>(float * virial, float * atom_virial,
template void prod_virial_a_gpu_cuda<double>(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const int nnei);
template void prod_virial_r_gpu_cuda<float>(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei);
template void prod_virial_r_gpu_cuda<double>(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const int nnei);
}
4 changes: 2 additions & 2 deletions source/lib/src/cuda/tabulate.cu
@@ -1,5 +1,3 @@
#include <cub/cub.cuh>
#include <cub/device/device_radix_sort.cuh>
#include "tabulate.h"
#include "gpu_cuda.h"

@@ -193,6 +191,7 @@ __global__ void tabulate_fusion_grad_fifth_order_polynomial(
}
}

namespace deepmd {
template<typename FPTYPE>
void tabulate_fusion_gpu_cuda(
FPTYPE * out,
@@ -238,3 +237,4 @@ template void tabulate_fusion_gpu_cuda<float>(float * out, const float * table,
template void tabulate_fusion_gpu_cuda<double>(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size);
template void tabulate_fusion_grad_gpu_cuda<float> (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size);
template void tabulate_fusion_grad_gpu_cuda<double> (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size);
}
4 changes: 2 additions & 2 deletions source/lib/src/neighbor_list.cc
@@ -844,7 +844,7 @@ build_nlist_cpu<float>(
const float & rcut);

#if GOOGLE_CUDA
void convert_nlist_gpu_cuda(
void deepmd::convert_nlist_gpu_cuda(
InputNlist & gpu_nlist,
InputNlist & cpu_nlist,
int* & gpu_memory,
@@ -867,7 +867,7 @@ void convert_nlist_gpu_cuda(
free(_firstneigh);
}

void free_nlist_gpu_cuda(
void deepmd::free_nlist_gpu_cuda(
InputNlist & gpu_nlist)
{
delete_device_memory(gpu_nlist.ilist);
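
This hunk is the bug named in the commit title: convert_nlist_gpu_cuda and free_nlist_gpu_cuda are declared inside namespace deepmd in the header, but their definitions here were unqualified, which introduces unrelated functions at global scope and leaves the deepmd:: symbols undefined at link time. A minimal reproduction of the rule, with simplified names:

namespace deepmd {
void free_list(int *&ptr);  // header-side declaration, inside the namespace
}

// An unqualified 'void free_list(int *&ptr)' here would define a brand-new
// ::free_list; the qualified name below defines the declared one instead.
void deepmd::free_list(int *&ptr) {
  delete[] ptr;
  ptr = nullptr;
}

int main() {
  int *p = new int[8];
  deepmd::free_list(p);
  return 0;
}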
2 changes: 1 addition & 1 deletion source/lib/src/prod_env_mat.cc
@@ -257,7 +257,7 @@ prod_env_mat_r_cpu<float>(
const std::vector<int> sec);

#if GOOGLE_CUDA
void env_mat_nbor_update(
void deepmd::env_mat_nbor_update(
InputNlist &inlist,
InputNlist &gpu_inlist,
int &max_nbor_size,
8 changes: 4 additions & 4 deletions source/lib/tests/test_env_mat_a.cc
@@ -557,9 +557,9 @@ TEST_F(TestEnvMatA, prod_gpu_cuda)
malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
malloc_device_memory(memory_dev, nloc * max_nbor_size);
convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);

prod_env_mat_a_gpu_cuda(
deepmd::prod_env_mat_a_gpu_cuda(
em_dev,
em_deriv_dev,
rij_dev,
@@ -648,9 +648,9 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu)
malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
malloc_device_memory(memory_dev, nloc * max_nbor_size);
convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);

prod_env_mat_a_gpu_cuda(
deepmd::prod_env_mat_a_gpu_cuda(
em_dev,
em_deriv_dev,
rij_dev,
8 changes: 4 additions & 4 deletions source/lib/tests/test_env_mat_r.cc
@@ -377,7 +377,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda)
}
std::vector<int> ilist(nloc), numneigh(nloc);
std::vector<int*> firstneigh(nloc);
InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
convert_nlist(inlist, nlist_a_cpy);
std::vector<double > em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
std::vector<int> nlist(nloc * nnei, 0);
@@ -402,7 +402,7 @@
malloc_device_memory(memory_dev, nloc * max_nbor_size);
convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);

prod_env_mat_r_gpu_cuda(
deepmd::prod_env_mat_r_gpu_cuda(
em_dev,
em_deriv_dev,
rij_dev,
@@ -467,7 +467,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu)
}
std::vector<int> ilist(nloc), numneigh(nloc);
std::vector<int*> firstneigh(nloc);
InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
convert_nlist(inlist, nlist_a_cpy);
std::vector<double > em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
std::vector<int> nlist(nloc * nnei, 0);
@@ -492,7 +492,7 @@
malloc_device_memory(memory_dev, nloc * max_nbor_size);
convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);

prod_env_mat_r_gpu_cuda(
deepmd::prod_env_mat_r_gpu_cuda(
em_dev,
em_deriv_dev,
rij_dev,
6 changes: 3 additions & 3 deletions source/lib/tests/test_gelu.cc
@@ -153,7 +153,7 @@ TEST_F(TestGelu, gelu_gpu_cuda)
double * gelu_dev = NULL, * xx_dev = NULL;
malloc_device_memory_sync(gelu_dev, gelu);
malloc_device_memory_sync(xx_dev, xx);
gelu_gpu_cuda<double> (gelu_dev, xx_dev, nloc);
deepmd::gelu_gpu_cuda<double> (gelu_dev, xx_dev, nloc);
memcpy_device_to_host(gelu_dev, gelu);
delete_device_memory(gelu_dev);
delete_device_memory(xx_dev);
@@ -174,7 +174,7 @@
malloc_device_memory_sync(gelu_grad_dev, gelu_grad);
malloc_device_memory_sync(xx_dev, xx);
malloc_device_memory_sync(dy_dev, dy);
gelu_grad_gpu_cuda<double> (gelu_grad_dev, xx_dev, dy_dev, nloc);
deepmd::gelu_grad_gpu_cuda<double> (gelu_grad_dev, xx_dev, dy_dev, nloc);
memcpy_device_to_host(gelu_grad_dev, gelu_grad);
delete_device_memory(gelu_grad_dev);
delete_device_memory(xx_dev);
@@ -198,7 +198,7 @@
malloc_device_memory_sync(xx_dev, xx);
malloc_device_memory_sync(dy_dev, dy);
malloc_device_memory_sync(dy_2_dev, dy_2);
gelu_grad_grad_gpu_cuda<double> (gelu_grad_grad_dev, xx_dev, dy_dev, dy_2_dev, nloc);
deepmd::gelu_grad_grad_gpu_cuda<double> (gelu_grad_grad_dev, xx_dev, dy_dev, dy_2_dev, nloc);
memcpy_device_to_host(gelu_grad_grad_dev, gelu_grad_grad);
delete_device_memory(gelu_grad_grad_dev);
delete_device_memory(xx_dev);
2 changes: 1 addition & 1 deletion source/lib/tests/test_prod_force_a.cc
@@ -112,7 +112,7 @@ TEST_F(TestProdForceA, gpu_cuda)
malloc_device_memory_sync(net_deriv_dev, net_deriv);
malloc_device_memory_sync(env_deriv_dev, env_deriv);

prod_force_a_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);
deepmd::prod_force_a_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);

memcpy_device_to_host(force_dev, force);
delete_device_memory(nlist_dev);
2 changes: 1 addition & 1 deletion source/lib/tests/test_prod_force_r.cc
@@ -112,7 +112,7 @@ TEST_F(TestProdForceR, gpu_cuda)
malloc_device_memory_sync(net_deriv_dev, net_deriv);
malloc_device_memory_sync(env_deriv_dev, env_deriv);

prod_force_r_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);
deepmd::prod_force_r_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);

memcpy_device_to_host(force_dev, force);
delete_device_memory(nlist_dev);
2 changes: 1 addition & 1 deletion source/lib/tests/test_prod_virial_a.cc
@@ -130,7 +130,7 @@ TEST_F(TestProdVirialA, gpu_cuda)
malloc_device_memory_sync(env_deriv_dev, env_deriv);
malloc_device_memory_sync(rij_dev, rij);

prod_virial_a_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);
deepmd::prod_virial_a_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);

memcpy_device_to_host(virial_dev, virial);
memcpy_device_to_host(atom_virial_dev, atom_virial);
2 changes: 1 addition & 1 deletion source/lib/tests/test_prod_virial_r.cc
@@ -130,7 +130,7 @@ TEST_F(TestProdVirialR, gpu_cuda)
malloc_device_memory_sync(env_deriv_dev, env_deriv);
malloc_device_memory_sync(rij_dev, rij);

prod_virial_r_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);
deepmd::prod_virial_r_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);

memcpy_device_to_host(virial_dev, virial);
memcpy_device_to_host(atom_virial_dev, atom_virial);
