synchronize in the beginning of all CUDA functions #2661

Merged
merged 1 commit on Jul 9, 2023
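For context, every hunk in this diff adds the same two calls at the top of a host-side GPU launcher: DPErrcheck(cudaGetLastError()) followed by DPErrcheck(cudaDeviceSynchronize()). Because kernel launches are asynchronous, an error raised by an earlier launch (possibly issued by a different operator or library) would otherwise only surface at the next synchronizing call and be misattributed to whatever kernel happens to run there; checking and synchronizing at function entry reports such errors at the boundary where they are detected. Below is a minimal, hypothetical sketch of the pattern: CHECK_CUDA stands in for DPErrcheck (whose real definition is not part of this diff), and scale_gpu/scale_kernel are illustrative names, not functions from the repository.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical stand-in for DPErrcheck: abort with a message on any
// non-success cudaError_t.
#define CHECK_CUDA(call)                                        \
  do {                                                          \
    cudaError_t err_ = (call);                                  \
    if (err_ != cudaSuccess) {                                  \
      fprintf(stderr, "CUDA error %s at %s:%d\n",               \
              cudaGetErrorString(err_), __FILE__, __LINE__);    \
      exit(EXIT_FAILURE);                                       \
    }                                                           \
  } while (0)

// Illustrative kernel; not part of the repository.
__global__ void scale_kernel(float *x, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

void scale_gpu(float *x, float a, int n) {
  // Added by this PR at the top of the launcher: surface any error left
  // pending by earlier asynchronous launches before new work is enqueued.
  CHECK_CUDA(cudaGetLastError());
  CHECK_CUDA(cudaDeviceSynchronize());

  const int TPB = 256;
  const int nblock = (n + TPB - 1) / TPB;
  scale_kernel<<<nblock, TPB>>>(x, a, n);

  // Pattern already present at the end of these functions: check the
  // launch and wait for completion before returning.
  CHECK_CUDA(cudaGetLastError());
  CHECK_CUDA(cudaDeviceSynchronize());
}

The entry-point cudaDeviceSynchronize() adds a host-device round trip per call; the PR trades that cost for errors that fail fast and point at the right function.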
4 changes: 4 additions & 0 deletions source/lib/src/cuda/coord.cu
@@ -335,6 +335,8 @@ template <typename FPTYPE>
void normalize_coord_gpu(FPTYPE *coord,
const int natom,
const Region<FPTYPE> &region) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const FPTYPE *boxt = region.boxt;
const FPTYPE *rec_boxt = region.rec_boxt;
const int nblock = (natom + TPB - 1) / TPB;
@@ -360,6 +362,8 @@ int copy_coord_gpu(FPTYPE *out_c,
const int &total_cellnum,
const int *cell_info,
const Region<FPTYPE> &region) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum,
total_cellnum);
int *int_data_cpu = new int
6 changes: 6 additions & 0 deletions source/lib/src/cuda/gelu.cu
@@ -67,6 +67,8 @@ void gelu_gpu_cuda(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
if (size <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int THREAD_ITEMS = 1024;
const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;

@@ -83,6 +85,8 @@ void gelu_grad_gpu_cuda(FPTYPE* out,
if (size <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int THREAD_ITEMS = 1024;
const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;

@@ -100,6 +104,8 @@ void gelu_grad_grad_gpu_cuda(FPTYPE* out,
if (size <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int THREAD_ITEMS = 1024;
const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;

8 changes: 8 additions & 0 deletions source/lib/src/cuda/neighbor_list.cu
@@ -187,6 +187,8 @@ int build_nlist_gpu(InputNlist &nlist,
if (mem_size < nall) {
return 1;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int nblock = (nall + TPB - 1) / TPB;
int *ilist = nlist.ilist;
int *numneigh = nlist.numneigh;
@@ -229,6 +231,8 @@ void use_nlist_map(int *nlist,
const int *nlist_map,
const int nloc,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
int nblock = (nnei + TPB - 1) / TPB;
dim3 block_grid(nloc, nblock);
dim3 thread_grid(1, TPB);
@@ -246,6 +250,8 @@ void use_nei_info_gpu(int *nlist,
const int nnei,
const int ntypes,
const bool b_nlist_map) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
int nblock = (nnei + TPB - 1) / TPB;
dim3 block_grid(nloc, nblock);
dim3 thread_grid(1, TPB);
@@ -291,6 +297,8 @@ __global__ void map_filter_ftype(int *ftype_out,
void filter_ftype_gpu_cuda(int *ftype_out,
const int *ftype_in,
const int nloc) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
int nblock = (nloc + TPB - 1) / TPB;
map_filter_ftype<<<nblock, TPB>>>(ftype_out, ftype_in, nloc);
DPErrcheck(cudaGetLastError());
6 changes: 6 additions & 0 deletions source/lib/src/cuda/prod_env_mat.cu
@@ -569,6 +569,8 @@ void format_nbor_list_gpu_cuda(int* nlist,
const int nall,
const float rcut,
const std::vector<int> sec) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int LEN = 256;
const int nnei = sec.back();
const int nblock = (nloc + LEN - 1) / LEN;
@@ -629,6 +631,8 @@ void prod_env_mat_a_gpu_cuda(FPTYPE* em,
const float rcut_smth,
const std::vector<int> sec,
const int* f_type) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
if (f_type == NULL) {
f_type = type;
}
@@ -669,6 +673,8 @@ void prod_env_mat_r_gpu_cuda(FPTYPE* em,
const float rcut,
const float rcut_smth,
const std::vector<int> sec) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int nnei = sec.back();
const int ndescrpt = nnei * 1;
DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_force.cu
@@ -110,6 +110,8 @@ void prod_force_a_gpu_cuda(FPTYPE* force,
const int nall,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 4;
DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));

@@ -137,6 +139,8 @@ void prod_force_r_gpu_cuda(FPTYPE* force,
const int nall,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 1;
DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));

4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_force_grad.cu
@@ -88,6 +88,8 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net,
const int nloc,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 4;
DPErrcheck(
cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
@@ -117,6 +119,8 @@ void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net,
const int nloc,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 1;
DPErrcheck(
cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_virial.cu
@@ -113,6 +113,8 @@ void prod_virial_a_gpu_cuda(FPTYPE* virial,
const int nloc,
const int nall,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));

@@ -141,6 +143,8 @@ void prod_virial_r_gpu_cuda(FPTYPE* virial,
const int nloc,
const int nall,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));

4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_virial_grad.cu
@@ -92,6 +92,8 @@ void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net,
const int* nlist,
const int nloc,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 4;
DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
const int LEN = 128;
@@ -112,6 +114,8 @@ void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net,
const int* nlist,
const int nloc,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei;
DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
const int LEN = 128;
6 changes: 6 additions & 0 deletions source/lib/src/cuda/region.cu
@@ -27,6 +27,8 @@ template <typename FPTYPE>
void convert_to_inter_gpu(FPTYPE *ri,
const Region<FPTYPE> &region,
const FPTYPE *rp) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
_phys2Inter<<<1, 1>>>(ri, rp, region.rec_boxt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
@@ -36,13 +38,17 @@ template <typename FPTYPE>
void convert_to_phys_gpu(FPTYPE *rp,
const Region<FPTYPE> &region,
const FPTYPE *ri) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
_inter2Phys<<<1, 1>>>(rp, ri, region.boxt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
}

template <typename FPTYPE>
void volume_gpu(FPTYPE *volume, const Region<FPTYPE> &region) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
_compute_volume<<<1, 1>>>(volume, region.boxt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
18 changes: 18 additions & 0 deletions source/lib/src/cuda/tabulate.cu
@@ -630,6 +630,8 @@ void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
tabulate_fusion_se_a_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size>>>(
out, table, em_x, em, two_embed, table_info[0], table_info[1],
@@ -653,6 +655,8 @@ void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei));
DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4));

@@ -679,6 +683,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
tabulate_fusion_se_a_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size>>>(
@@ -702,6 +708,8 @@ void tabulate_fusion_se_t_gpu_cuda(FPTYPE* out,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
tabulate_fusion_se_t_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size>>>(
out, table, em_x, em, table_info[0], table_info[1], table_info[2],
@@ -725,6 +733,8 @@ void tabulate_fusion_se_t_grad_gpu_cuda(FPTYPE* dy_dem_x,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));

@@ -752,6 +762,8 @@ void tabulate_fusion_se_t_grad_grad_gpu_cuda(FPTYPE* dz_dy,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size));

tabulate_fusion_se_t_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
@@ -774,6 +786,8 @@ void tabulate_fusion_se_r_gpu_cuda(FPTYPE* out,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
tabulate_fusion_se_r_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size>>>(out, table, em, table_info[0], table_info[1],
table_info[2], table_info[3], table_info[4],
@@ -794,6 +808,8 @@ void tabulate_fusion_se_r_grad_gpu_cuda(FPTYPE* dy_dem,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei));

tabulate_fusion_se_r_grad_fifth_order_polynomial<FPTYPE, MM, KK>
@@ -816,6 +832,8 @@ void tabulate_fusion_se_r_grad_grad_gpu_cuda(FPTYPE* dz_dy,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(
cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
tabulate_fusion_se_r_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>