fix bug of gpu namespace
denghuilu committed Mar 19, 2021
1 parent 71dc7a8 commit 53de560
Showing 15 changed files with 58 additions and 61 deletions.
26 changes: 26 additions & 0 deletions source/lib/include/gpu_cuda.h
@@ -13,6 +13,23 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=
}
}

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static __inline__ __device__ double atomicAdd(
double* address,
double val)
{
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
return __longlong_as_double(old);
}
#endif

template <typename FPTYPE>
void memcpy_host_to_device(
FPTYPE * device,
@@ -70,4 +87,13 @@ void delete_device_memory(
if (device != NULL) {
cudaErrcheck(cudaFree(device));
}
}

template <typename FPTYPE>
void memset_device_memory(
FPTYPE * device,
const FPTYPE var,
const int size)
{
cudaErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size));
}
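
The new memset_device_memory helper forwards to cudaMemset, which fills bytes rather than FPTYPE elements, so var = 0 zeroes a buffer but any other value is truncated to a repeated byte pattern and will not produce var per element. A minimal standalone sketch of that behavior (illustrative only, not part of this commit):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const int size = 4;
  double *device = NULL;
  cudaMalloc(&device, sizeof(double) * size);
  // Byte-wise fill: 0 gives 0.0 per element; a value such as 1 would set
  // every byte to 0x01 and yield a tiny garbage double, not 1.0.
  cudaMemset(device, 0, sizeof(double) * size);
  double host[size];
  cudaMemcpy(host, device, sizeof(double) * size, cudaMemcpyDeviceToHost);
  printf("%g %g %g %g\n", host[0], host[1], host[2], host[3]);
  cudaFree(device);
  return 0;
}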
3 changes: 2 additions & 1 deletion source/lib/src/cuda/gelu.cu
@@ -1,6 +1,5 @@
#include "gelu.h"
#include "device.h"
#include "gpu_cuda.h"

template <typename FPTYPE>
__global__ void gelu(
@@ -49,6 +48,7 @@ __global__ void gelu_grad_grad(
out[idx] = dy[idx] * dy_2[idx] * (0.134145 * SQRT_2_PI * xx[idx] * xx[idx] * (1 - var1 * var1) - SQRT_2_PI * xx[idx] * var2 * (0.134145 * xx[idx] * xx[idx] + 1) * var1 + var2);
}

namespace deepmd {
template<typename FPTYPE>
void gelu_gpu_cuda(
FPTYPE * out,
@@ -94,3 +94,4 @@ template void gelu_grad_gpu_cuda<float>(float * out, const float * x, const floa
template void gelu_grad_gpu_cuda<double>(double * out, const double * x, const double * dy, const int size);
template void gelu_grad_grad_gpu_cuda<float>(float * out, const float * x, const float * dy, const float * dy_2, const int size);
template void gelu_grad_grad_gpu_cuda<double>(double * out, const double * x, const double * dy, const double * dy_2, const int size);
}
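
The pattern applied here recurs in every .cu file below: the __global__ kernels stay at file scope, while the host-side wrappers and their explicit instantiations move inside namespace deepmd, so the emitted symbols become deepmd::gelu_gpu_cuda<float> and friends. A minimal sketch of the same structure, with a hypothetical gelu_ref and an identity body standing in for the real kernel launch:

#include <cstdio>

namespace deepmd {
template <typename FPTYPE>
void gelu_ref(FPTYPE *out, const FPTYPE *x, const int size) {
  for (int ii = 0; ii < size; ++ii) out[ii] = x[ii];  // placeholder body
}
// Explicit instantiations inside the namespace emit namespaced symbols.
template void gelu_ref<float>(float *, const float *, const int);
template void gelu_ref<double>(double *, const double *, const int);
}  // namespace deepmd

int main() {
  double x[2] = {0.5, 1.5}, out[2];
  deepmd::gelu_ref<double>(out, x, 2);  // callers must now qualify the name
  printf("%g %g\n", out[0], out[1]);
  return 0;
}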
12 changes: 6 additions & 6 deletions source/lib/src/cuda/prod_env_mat.cu
@@ -4,8 +4,6 @@
#include <cub/block/block_store.cuh>
#include <cub/block/block_radix_sort.cuh>

using namespace deepmd;

// common part of prod_env_mat
template <
typename Key,
@@ -156,7 +154,7 @@ void format_nbor_list_1024 (
int_64 * key,
const FPTYPE* coord,
const int* type,
const InputNlist & gpu_inlist,
const deepmd::InputNlist & gpu_inlist,
const int& nloc,
const float& rcut,
int * i_idx)
@@ -182,7 +180,7 @@ void format_nbor_list_2048 (
int_64 * key,
const FPTYPE* coord,
const int* type,
const InputNlist & gpu_inlist,
const deepmd::InputNlist & gpu_inlist,
const int& nloc,
const float& rcut,
int * i_idx)
@@ -208,7 +206,7 @@ void format_nbor_list_4096 (
int_64 * key,
const FPTYPE* coord,
const int* type,
const InputNlist & gpu_inlist,
const deepmd::InputNlist & gpu_inlist,
const int& nloc,
const float& rcut,
int * i_idx)
@@ -234,7 +232,7 @@ void format_nbor_list(
int * nlist,
const FPTYPE * coord,
const int * type,
const InputNlist & gpu_inlist,
const deepmd::InputNlist & gpu_inlist,
int * array_int,
int_64 * array_longlong,
const int max_nbor_size,
@@ -433,6 +431,7 @@ __global__ void compute_env_mat_r(
}
}

namespace deepmd {
template <typename FPTYPE>
void prod_env_mat_a_gpu_cuda(
FPTYPE * em,
@@ -505,3 +504,4 @@ template void prod_env_mat_a_gpu_cuda<float>(float * em, float * em_deriv, float
template void prod_env_mat_a_gpu_cuda<double>(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
template void prod_env_mat_r_gpu_cuda<float>(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
template void prod_env_mat_r_gpu_cuda<double>(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
}
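
This file also drops its file-wide using-directive (using namespace deepmd;) and qualifies deepmd::InputNlist at each use instead, which keeps the rest of the deepmd namespace out of a translation unit full of file-scope kernels. A small sketch of the qualified-use style (InputNlist is reduced to a stand-in struct here):

namespace deepmd {
struct InputNlist { int inum; };  // simplified stand-in for the real type
}

// Only the names actually needed are mentioned, so nothing else from
// deepmd can collide with local helpers or kernels.
static int local_count(const deepmd::InputNlist &nlist) {
  return nlist.inum;
}

int main() {
  deepmd::InputNlist nl{4};
  return local_count(nl) == 4 ? 0 : 1;
}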
19 changes: 2 additions & 17 deletions source/lib/src/cuda/prod_force.cu
@@ -2,23 +2,6 @@
#include "gpu_cuda.h"
#include "prod_force.h"

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static __inline__ __device__ double atomicAdd(
double* address,
double val)
{
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
return __longlong_as_double(old);
}
#endif

template <
typename FPTYPE,
int THREADS_PER_BLOCK>
@@ -112,6 +95,7 @@ __global__ void force_deriv_wrt_neighbors_r(
net_deriv[idx * ndescrpt + idy] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz]);
}

namespace deepmd {
template<typename FPTYPE>
void prod_force_a_gpu_cuda(
FPTYPE * force,
@@ -172,3 +156,4 @@ template void prod_force_a_gpu_cuda<float>(float * force, const float * net_deri
template void prod_force_a_gpu_cuda<double>(double * force, const double * net_deriv, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei);
template void prod_force_r_gpu_cuda<float>(float * force, const float * net_deriv, const float * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei);
template void prod_force_r_gpu_cuda<double>(double * force, const double * net_deriv, const double * in_deriv, const int * nlist, const int nloc, const int nall, const int nnei);
}
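
Both prod_force.cu and prod_virial.cu previously carried their own copy of the pre-sm_60 double atomicAdd shim; the commit keeps a single definition in gpu_cuda.h, so any kernel accumulating into double buffers gets a working atomicAdd(double*, double) on every architecture. A standalone sketch of the accumulation pattern (sum_kernel and the sizes are illustrative; build with nvcc -arch=sm_60 or newer to get the CUDA built-in, on older targets the header's CAS fallback is required):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void sum_kernel(double *acc, const double *vals, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(acc, vals[i]);  // built-in on sm_60+, shim below that
}

int main() {
  const int n = 256;
  double h_vals[n], h_acc = 0.0, *d_vals, *d_acc;
  for (int i = 0; i < n; ++i) h_vals[i] = 1.0;
  cudaMalloc(&d_vals, sizeof(double) * n);
  cudaMalloc(&d_acc, sizeof(double));
  cudaMemcpy(d_vals, h_vals, sizeof(double) * n, cudaMemcpyHostToDevice);
  cudaMemset(d_acc, 0, sizeof(double));
  sum_kernel<<<(n + 127) / 128, 128>>>(d_acc, d_vals, n);
  cudaMemcpy(&h_acc, d_acc, sizeof(double), cudaMemcpyDeviceToHost);
  printf("sum = %g\n", h_acc);  // expect 256
  cudaFree(d_vals);
  cudaFree(d_acc);
  return 0;
}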
19 changes: 2 additions & 17 deletions source/lib/src/cuda/prod_virial.cu
@@ -1,23 +1,6 @@
#include "gpu_cuda.h"
#include "prod_virial.h"

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static __inline__ __device__ double atomicAdd(
double* address,
double val)
{
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
return __longlong_as_double(old);
}
#endif

template<typename FPTYPE>
__global__ void virial_deriv_wrt_neighbors_a(
FPTYPE * virial,
@@ -90,6 +73,7 @@ __global__ void virial_deriv_wrt_neighbors_r(
net_deriv[idx * ndescrpt + idy] * rij[idx * nnei * 3 + idy * 3 + idz % 3] * in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]);
}

namespace deepmd {
template<typename FPTYPE>
void prod_virial_a_gpu_cuda(
FPTYPE * virial,
@@ -152,3 +136,4 @@ template void prod_virial_a_gpu_cuda<float>(float * virial, float * atom_virial,
template void prod_virial_a_gpu_cuda<double>(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const int nnei);
template void prod_virial_r_gpu_cuda<float>(float * virial, float * atom_virial, const float * net_deriv, const float * in_deriv, const float * rij, const int * nlist, const int nloc, const int nall, const int nnei);
template void prod_virial_r_gpu_cuda<double>(double * virial, double * atom_virial, const double * net_deriv, const double * in_deriv, const double * rij, const int * nlist, const int nloc, const int nall, const int nnei);
}
4 changes: 2 additions & 2 deletions source/lib/src/cuda/tabulate.cu
@@ -1,5 +1,3 @@
#include <cub/cub.cuh>
#include <cub/device/device_radix_sort.cuh>
#include "tabulate.h"
#include "gpu_cuda.h"

@@ -193,6 +191,7 @@ __global__ void tabulate_fusion_grad_fifth_order_polynomial(
}
}

namespace deepmd {
template<typename FPTYPE>
void tabulate_fusion_gpu_cuda(
FPTYPE * out,
@@ -238,3 +237,4 @@ template void tabulate_fusion_gpu_cuda<float>(float * out, const float * table,
template void tabulate_fusion_gpu_cuda<double>(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size);
template void tabulate_fusion_grad_gpu_cuda<float> (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size);
template void tabulate_fusion_grad_gpu_cuda<double> (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size);
}
4 changes: 2 additions & 2 deletions source/lib/src/neighbor_list.cc
@@ -844,7 +844,7 @@ build_nlist_cpu<float>(
const float & rcut);

#if GOOGLE_CUDA
void convert_nlist_gpu_cuda(
void deepmd::convert_nlist_gpu_cuda(
InputNlist & gpu_nlist,
InputNlist & cpu_nlist,
int* & gpu_memory,
@@ -867,7 +867,7 @@ void convert_nlist_gpu_cuda(
free(_firstneigh);
}

void free_nlist_gpu_cuda(
void deepmd::free_nlist_gpu_cuda(
InputNlist & gpu_nlist)
{
delete_device_memory(gpu_nlist.ilist);
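
This hunk is the bug named in the commit title: convert_nlist_gpu_cuda and free_nlist_gpu_cuda are declared inside namespace deepmd in the header, but their definitions here were unqualified, which introduces unrelated functions at global scope and leaves the deepmd:: symbols undefined at link time. A minimal reproduction of the rule, with simplified names:

namespace deepmd {
void free_list(int *&ptr);  // header-side declaration, inside the namespace
}

// An unqualified 'void free_list(int *&ptr)' here would define a brand-new
// ::free_list; the qualified name below defines the declared one instead.
void deepmd::free_list(int *&ptr) {
  delete[] ptr;
  ptr = nullptr;
}

int main() {
  int *p = new int[8];
  deepmd::free_list(p);
  return 0;
}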
2 changes: 1 addition & 1 deletion source/lib/src/prod_env_mat.cc
@@ -257,7 +257,7 @@ prod_env_mat_r_cpu<float>(
const std::vector<int> sec);

#if GOOGLE_CUDA
void env_mat_nbor_update(
void deepmd::env_mat_nbor_update(
InputNlist &inlist,
InputNlist &gpu_inlist,
int &max_nbor_size,
8 changes: 4 additions & 4 deletions source/lib/tests/test_env_mat_a.cc
@@ -557,9 +557,9 @@ TEST_F(TestEnvMatA, prod_gpu_cuda)
malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
malloc_device_memory(memory_dev, nloc * max_nbor_size);
convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);

prod_env_mat_a_gpu_cuda(
deepmd::prod_env_mat_a_gpu_cuda(
em_dev,
em_deriv_dev,
rij_dev,
@@ -648,9 +648,9 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu)
malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
malloc_device_memory(memory_dev, nloc * max_nbor_size);
convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);
deepmd::convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);

prod_env_mat_a_gpu_cuda(
deepmd::prod_env_mat_a_gpu_cuda(
em_dev,
em_deriv_dev,
rij_dev,
8 changes: 4 additions & 4 deletions source/lib/tests/test_env_mat_r.cc
@@ -377,7 +377,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda)
}
std::vector<int> ilist(nloc), numneigh(nloc);
std::vector<int*> firstneigh(nloc);
InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
convert_nlist(inlist, nlist_a_cpy);
std::vector<double > em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
std::vector<int> nlist(nloc * nnei, 0);
@@ -402,7 +402,7 @@
malloc_device_memory(memory_dev, nloc * max_nbor_size);
convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);

prod_env_mat_r_gpu_cuda(
deepmd::prod_env_mat_r_gpu_cuda(
em_dev,
em_deriv_dev,
rij_dev,
@@ -467,7 +467,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu)
}
std::vector<int> ilist(nloc), numneigh(nloc);
std::vector<int*> firstneigh(nloc);
InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
convert_nlist(inlist, nlist_a_cpy);
std::vector<double > em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0);
std::vector<int> nlist(nloc * nnei, 0);
@@ -492,7 +492,7 @@
malloc_device_memory(memory_dev, nloc * max_nbor_size);
convert_nlist_gpu_cuda(gpu_inlist, inlist, memory_dev, max_nbor_size);

prod_env_mat_r_gpu_cuda(
deepmd::prod_env_mat_r_gpu_cuda(
em_dev,
em_deriv_dev,
rij_dev,
6 changes: 3 additions & 3 deletions source/lib/tests/test_gelu.cc
@@ -153,7 +153,7 @@ TEST_F(TestGelu, gelu_gpu_cuda)
double * gelu_dev = NULL, * xx_dev = NULL;
malloc_device_memory_sync(gelu_dev, gelu);
malloc_device_memory_sync(xx_dev, xx);
gelu_gpu_cuda<double> (gelu_dev, xx_dev, nloc);
deepmd::gelu_gpu_cuda<double> (gelu_dev, xx_dev, nloc);
memcpy_device_to_host(gelu_dev, gelu);
delete_device_memory(gelu_dev);
delete_device_memory(xx_dev);
@@ -174,7 +174,7 @@
malloc_device_memory_sync(gelu_grad_dev, gelu_grad);
malloc_device_memory_sync(xx_dev, xx);
malloc_device_memory_sync(dy_dev, dy);
gelu_grad_gpu_cuda<double> (gelu_grad_dev, xx_dev, dy_dev, nloc);
deepmd::gelu_grad_gpu_cuda<double> (gelu_grad_dev, xx_dev, dy_dev, nloc);
memcpy_device_to_host(gelu_grad_dev, gelu_grad);
delete_device_memory(gelu_grad_dev);
delete_device_memory(xx_dev);
@@ -198,7 +198,7 @@
malloc_device_memory_sync(xx_dev, xx);
malloc_device_memory_sync(dy_dev, dy);
malloc_device_memory_sync(dy_2_dev, dy_2);
gelu_grad_grad_gpu_cuda<double> (gelu_grad_grad_dev, xx_dev, dy_dev, dy_2_dev, nloc);
deepmd::gelu_grad_grad_gpu_cuda<double> (gelu_grad_grad_dev, xx_dev, dy_dev, dy_2_dev, nloc);
memcpy_device_to_host(gelu_grad_grad_dev, gelu_grad_grad);
delete_device_memory(gelu_grad_grad_dev);
delete_device_memory(xx_dev);
2 changes: 1 addition & 1 deletion source/lib/tests/test_prod_force_a.cc
@@ -112,7 +112,7 @@ TEST_F(TestProdForceA, gpu_cuda)
malloc_device_memory_sync(net_deriv_dev, net_deriv);
malloc_device_memory_sync(env_deriv_dev, env_deriv);

prod_force_a_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);
deepmd::prod_force_a_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);

memcpy_device_to_host(force_dev, force);
delete_device_memory(nlist_dev);
2 changes: 1 addition & 1 deletion source/lib/tests/test_prod_force_r.cc
@@ -112,7 +112,7 @@ TEST_F(TestProdForceR, gpu_cuda)
malloc_device_memory_sync(net_deriv_dev, net_deriv);
malloc_device_memory_sync(env_deriv_dev, env_deriv);

prod_force_r_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);
deepmd::prod_force_r_gpu_cuda<double> (force_dev, net_deriv_dev, env_deriv_dev, nlist_dev, nloc, nall, nnei);

memcpy_device_to_host(force_dev, force);
delete_device_memory(nlist_dev);
2 changes: 1 addition & 1 deletion source/lib/tests/test_prod_virial_a.cc
@@ -130,7 +130,7 @@ TEST_F(TestProdVirialA, gpu_cuda)
malloc_device_memory_sync(env_deriv_dev, env_deriv);
malloc_device_memory_sync(rij_dev, rij);

prod_virial_a_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);
deepmd::prod_virial_a_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);

memcpy_device_to_host(virial_dev, virial);
memcpy_device_to_host(atom_virial_dev, atom_virial);
2 changes: 1 addition & 1 deletion source/lib/tests/test_prod_virial_r.cc
@@ -130,7 +130,7 @@ TEST_F(TestProdVirialR, gpu_cuda)
malloc_device_memory_sync(env_deriv_dev, env_deriv);
malloc_device_memory_sync(rij_dev, rij);

prod_virial_r_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);
deepmd::prod_virial_r_gpu_cuda<double> (virial_dev, atom_virial_dev, net_deriv_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nall, nnei);

memcpy_device_to_host(virial_dev, virial);
memcpy_device_to_host(atom_virial_dev, atom_virial);
