From 92fc01cc3a57ac4bf0de34b35c5728e3926b9579 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 7 Jul 2023 21:24:04 -0400 Subject: [PATCH] synchronize in the beginning of all CUDA functions Fix #2660. TensorFlow made streams non-blocking in https://github.com/tensorflow/tensorflow/commit/9d1262082e761cd85d6726bcbdfdef331d6d72c6. Our own CUDA functions use the default stream, which is different from TensorFlow's streams, so we need to synchronize at the beginning of all functions. In the future, it might be worth using the same stream as TensorFlow's to improve the performance. Signed-off-by: Jinzhe Zeng --- source/lib/src/cuda/coord.cu | 4 ++++ source/lib/src/cuda/gelu.cu | 6 ++++++ source/lib/src/cuda/neighbor_list.cu | 8 ++++++++ source/lib/src/cuda/prod_env_mat.cu | 6 ++++++ source/lib/src/cuda/prod_force.cu | 4 ++++ source/lib/src/cuda/prod_force_grad.cu | 4 ++++ source/lib/src/cuda/prod_virial.cu | 4 ++++ source/lib/src/cuda/prod_virial_grad.cu | 4 ++++ source/lib/src/cuda/region.cu | 6 ++++++ source/lib/src/cuda/tabulate.cu | 18 ++++++++++++++++++ 10 files changed, 64 insertions(+) diff --git a/source/lib/src/cuda/coord.cu b/source/lib/src/cuda/coord.cu index a9b63d836b..d37e5de9cf 100644 --- a/source/lib/src/cuda/coord.cu +++ b/source/lib/src/cuda/coord.cu @@ -335,6 +335,8 @@ template void normalize_coord_gpu(FPTYPE *coord, const int natom, const Region &region) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const FPTYPE *boxt = region.boxt; const FPTYPE *rec_boxt = region.rec_boxt; const int nblock = (natom + TPB - 1) / TPB; @@ -360,6 +362,8 @@ int copy_coord_gpu(FPTYPE *out_c, const int &total_cellnum, const int *cell_info, const Region &region) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum, total_cellnum); int *int_data_cpu = new int diff --git a/source/lib/src/cuda/gelu.cu b/source/lib/src/cuda/gelu.cu index 64c147617a..af78043cca 100644 
--- a/source/lib/src/cuda/gelu.cu +++ b/source/lib/src/cuda/gelu.cu @@ -67,6 +67,8 @@ void gelu_gpu_cuda(FPTYPE* out, const FPTYPE* xx, const int_64 size) { if (size <= 0) { return; } + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int THREAD_ITEMS = 1024; const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; @@ -83,6 +85,8 @@ void gelu_grad_gpu_cuda(FPTYPE* out, if (size <= 0) { return; } + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int THREAD_ITEMS = 1024; const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; @@ -100,6 +104,8 @@ void gelu_grad_grad_gpu_cuda(FPTYPE* out, if (size <= 0) { return; } + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int THREAD_ITEMS = 1024; const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; diff --git a/source/lib/src/cuda/neighbor_list.cu b/source/lib/src/cuda/neighbor_list.cu index fe148c4978..4fae6f3874 100644 --- a/source/lib/src/cuda/neighbor_list.cu +++ b/source/lib/src/cuda/neighbor_list.cu @@ -187,6 +187,8 @@ int build_nlist_gpu(InputNlist &nlist, if (mem_size < nall) { return 1; } + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int nblock = (nall + TPB - 1) / TPB; int *ilist = nlist.ilist; int *numneigh = nlist.numneigh; @@ -229,6 +231,8 @@ void use_nlist_map(int *nlist, const int *nlist_map, const int nloc, const int nnei) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); int nblock = (nnei + TPB - 1) / TPB; dim3 block_grid(nloc, nblock); dim3 thread_grid(1, TPB); @@ -246,6 +250,8 @@ void use_nei_info_gpu(int *nlist, const int nnei, const int ntypes, const bool b_nlist_map) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); int nblock = (nnei + TPB - 1) / TPB; dim3 block_grid(nloc, nblock); dim3 thread_grid(1, TPB); @@ -291,6 +297,8 @@ __global__ void map_filter_ftype(int *ftype_out, void filter_ftype_gpu_cuda(int 
*ftype_out, const int *ftype_in, const int nloc) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); int nblock = (nloc + TPB - 1) / TPB; map_filter_ftype<<>>(ftype_out, ftype_in, nloc); DPErrcheck(cudaGetLastError()); diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu index 243b69c6b5..8a085a47b5 100644 --- a/source/lib/src/cuda/prod_env_mat.cu +++ b/source/lib/src/cuda/prod_env_mat.cu @@ -569,6 +569,8 @@ void format_nbor_list_gpu_cuda(int* nlist, const int nall, const float rcut, const std::vector sec) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int LEN = 256; const int nnei = sec.back(); const int nblock = (nloc + LEN - 1) / LEN; @@ -629,6 +631,8 @@ void prod_env_mat_a_gpu_cuda(FPTYPE* em, const float rcut_smth, const std::vector sec, const int* f_type) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); if (f_type == NULL) { f_type = type; } @@ -669,6 +673,8 @@ void prod_env_mat_r_gpu_cuda(FPTYPE* em, const float rcut, const float rcut_smth, const std::vector sec) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int nnei = sec.back(); const int ndescrpt = nnei * 1; DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/cuda/prod_force.cu index 80f4eaed50..04f5b84dcd 100644 --- a/source/lib/src/cuda/prod_force.cu +++ b/source/lib/src/cuda/prod_force.cu @@ -110,6 +110,8 @@ void prod_force_a_gpu_cuda(FPTYPE* force, const int nall, const int nnei, const int nframes) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int ndescrpt = nnei * 4; DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); @@ -137,6 +139,8 @@ void prod_force_r_gpu_cuda(FPTYPE* force, const int nall, const int nnei, const int nframes) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int 
ndescrpt = nnei * 1; DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); diff --git a/source/lib/src/cuda/prod_force_grad.cu b/source/lib/src/cuda/prod_force_grad.cu index 9589f3b498..e72ba2ea48 100644 --- a/source/lib/src/cuda/prod_force_grad.cu +++ b/source/lib/src/cuda/prod_force_grad.cu @@ -88,6 +88,8 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net, const int nloc, const int nnei, const int nframes) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int ndescrpt = nnei * 4; DPErrcheck( cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); @@ -117,6 +119,8 @@ void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net, const int nloc, const int nnei, const int nframes) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int ndescrpt = nnei * 1; DPErrcheck( cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/cuda/prod_virial.cu index e01170f01c..618f82625d 100644 --- a/source/lib/src/cuda/prod_virial.cu +++ b/source/lib/src/cuda/prod_virial.cu @@ -113,6 +113,8 @@ void prod_virial_a_gpu_cuda(FPTYPE* virial, const int nloc, const int nall, const int nnei) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); @@ -141,6 +143,8 @@ void prod_virial_r_gpu_cuda(FPTYPE* virial, const int nloc, const int nall, const int nnei) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); diff --git a/source/lib/src/cuda/prod_virial_grad.cu b/source/lib/src/cuda/prod_virial_grad.cu index 7e3e7c3b34..aae7676d3c 100644 --- a/source/lib/src/cuda/prod_virial_grad.cu +++ b/source/lib/src/cuda/prod_virial_grad.cu @@ -92,6 +92,8 @@ void 
prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net, const int* nlist, const int nloc, const int nnei) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int ndescrpt = nnei * 4; DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; @@ -112,6 +114,8 @@ void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net, const int* nlist, const int nloc, const int nnei) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); const int ndescrpt = nnei; DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; diff --git a/source/lib/src/cuda/region.cu b/source/lib/src/cuda/region.cu index 858739671c..eb8d191a8c 100644 --- a/source/lib/src/cuda/region.cu +++ b/source/lib/src/cuda/region.cu @@ -27,6 +27,8 @@ template void convert_to_inter_gpu(FPTYPE *ri, const Region ®ion, const FPTYPE *rp) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); _phys2Inter<<<1, 1>>>(ri, rp, region.rec_boxt); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); @@ -36,6 +38,8 @@ template void convert_to_phys_gpu(FPTYPE *rp, const Region ®ion, const FPTYPE *ri) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); _inter2Phys<<<1, 1>>>(rp, ri, region.boxt); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); @@ -43,6 +47,8 @@ void convert_to_phys_gpu(FPTYPE *rp, template void volume_gpu(FPTYPE *volume, const Region ®ion) { + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); _compute_volume<<<1, 1>>>(volume, region.boxt); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu index 2df2c5c46a..06d1d49057 100644 --- a/source/lib/src/cuda/tabulate.cu +++ b/source/lib/src/cuda/tabulate.cu @@ -630,6 +630,8 @@ void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out, if (nloc <= 0) { return; } + 
DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); tabulate_fusion_se_a_fifth_order_polynomial <<>>( out, table, em_x, em, two_embed, table_info[0], table_info[1], @@ -653,6 +655,8 @@ void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x, if (nloc <= 0) { return; } + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei)); DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4)); @@ -679,6 +683,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy, if (nloc <= 0) { return; } + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size)); tabulate_fusion_se_a_grad_grad_fifth_order_polynomial <<>>( @@ -702,6 +708,8 @@ void tabulate_fusion_se_t_gpu_cuda(FPTYPE* out, if (nloc <= 0) { return; } + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); tabulate_fusion_se_t_fifth_order_polynomial <<>>( out, table, em_x, em, table_info[0], table_info[1], table_info[2], @@ -725,6 +733,8 @@ void tabulate_fusion_se_t_grad_gpu_cuda(FPTYPE* dy_dem_x, if (nloc <= 0) { return; } + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j)); DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j)); @@ -752,6 +762,8 @@ void tabulate_fusion_se_t_grad_grad_gpu_cuda(FPTYPE* dz_dy, if (nloc <= 0) { return; } + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size)); tabulate_fusion_se_t_grad_grad_fifth_order_polynomial @@ -774,6 +786,8 @@ void tabulate_fusion_se_r_gpu_cuda(FPTYPE* out, if (nloc <= 0) { return; } + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); tabulate_fusion_se_r_fifth_order_polynomial <<>>(out, table, em, table_info[0], 
table_info[1], table_info[2], table_info[3], table_info[4], @@ -794,6 +808,8 @@ void tabulate_fusion_se_r_grad_gpu_cuda(FPTYPE* dy_dem, if (nloc <= 0) { return; } + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei)); tabulate_fusion_se_r_grad_fifth_order_polynomial @@ -816,6 +832,8 @@ void tabulate_fusion_se_r_grad_grad_gpu_cuda(FPTYPE* dz_dy, if (nloc <= 0) { return; } + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); DPErrcheck( cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size)); tabulate_fusion_se_r_grad_grad_fifth_order_polynomial