synchronize in the beginning of all CUDA functions #2661

Merged
merged 1 commit on Jul 9, 2023
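For context, every hunk in this diff adds the same two calls at the top of a host-side GPU launcher: DPErrcheck(cudaGetLastError()) followed by DPErrcheck(cudaDeviceSynchronize()). Because kernel launches are asynchronous, an error raised by an earlier launch (possibly issued by a different operator or library) would otherwise only surface at the next synchronizing call and be misattributed to whatever kernel happens to run there; checking and synchronizing at function entry reports such errors at the boundary where they are detected. Below is a minimal, hypothetical sketch of the pattern: CHECK_CUDA stands in for DPErrcheck (whose real definition is not part of this diff), and scale_gpu/scale_kernel are illustrative names, not functions from the repository.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical stand-in for DPErrcheck: abort with a message on any
// non-success cudaError_t.
#define CHECK_CUDA(call)                                        \
  do {                                                          \
    cudaError_t err_ = (call);                                  \
    if (err_ != cudaSuccess) {                                  \
      fprintf(stderr, "CUDA error %s at %s:%d\n",               \
              cudaGetErrorString(err_), __FILE__, __LINE__);    \
      exit(EXIT_FAILURE);                                       \
    }                                                           \
  } while (0)

// Illustrative kernel; not part of the repository.
__global__ void scale_kernel(float *x, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

void scale_gpu(float *x, float a, int n) {
  // Added by this PR at the top of the launcher: surface any error left
  // pending by earlier asynchronous launches before new work is enqueued.
  CHECK_CUDA(cudaGetLastError());
  CHECK_CUDA(cudaDeviceSynchronize());

  const int TPB = 256;
  const int nblock = (n + TPB - 1) / TPB;
  scale_kernel<<<nblock, TPB>>>(x, a, n);

  // Pattern already present at the end of these functions: check the
  // launch and wait for completion before returning.
  CHECK_CUDA(cudaGetLastError());
  CHECK_CUDA(cudaDeviceSynchronize());
}

The entry-point cudaDeviceSynchronize() adds a host-device round trip per call; the PR trades that cost for errors that fail fast and point at the right function.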
4 changes: 4 additions & 0 deletions source/lib/src/cuda/coord.cu
@@ -335,6 +335,8 @@ template <typename FPTYPE>
void normalize_coord_gpu(FPTYPE *coord,
const int natom,
const Region<FPTYPE> &region) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const FPTYPE *boxt = region.boxt;
const FPTYPE *rec_boxt = region.rec_boxt;
const int nblock = (natom + TPB - 1) / TPB;
@@ -360,6 +362,8 @@ int copy_coord_gpu(FPTYPE *out_c,
const int &total_cellnum,
const int *cell_info,
const Region<FPTYPE> &region) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum,
total_cellnum);
int *int_data_cpu = new int
6 changes: 6 additions & 0 deletions source/lib/src/cuda/gelu.cu
@@ -67,6 +67,8 @@ void gelu_gpu_cuda(FPTYPE* out, const FPTYPE* xx, const int_64 size) {
if (size <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int THREAD_ITEMS = 1024;
const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;

@@ -83,6 +85,8 @@ void gelu_grad_gpu_cuda(FPTYPE* out,
if (size <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int THREAD_ITEMS = 1024;
const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;

@@ -100,6 +104,8 @@ void gelu_grad_grad_gpu_cuda(FPTYPE* out,
if (size <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int THREAD_ITEMS = 1024;
const int BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS;

8 changes: 8 additions & 0 deletions source/lib/src/cuda/neighbor_list.cu
@@ -187,6 +187,8 @@ int build_nlist_gpu(InputNlist &nlist,
if (mem_size < nall) {
return 1;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int nblock = (nall + TPB - 1) / TPB;
int *ilist = nlist.ilist;
int *numneigh = nlist.numneigh;
@@ -229,6 +231,8 @@ void use_nlist_map(int *nlist,
const int *nlist_map,
const int nloc,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
int nblock = (nnei + TPB - 1) / TPB;
dim3 block_grid(nloc, nblock);
dim3 thread_grid(1, TPB);
@@ -246,6 +250,8 @@ void use_nei_info_gpu(int *nlist,
const int nnei,
const int ntypes,
const bool b_nlist_map) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
int nblock = (nnei + TPB - 1) / TPB;
dim3 block_grid(nloc, nblock);
dim3 thread_grid(1, TPB);
@@ -291,6 +297,8 @@ __global__ void map_filter_ftype(int *ftype_out,
void filter_ftype_gpu_cuda(int *ftype_out,
const int *ftype_in,
const int nloc) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
int nblock = (nloc + TPB - 1) / TPB;
map_filter_ftype<<<nblock, TPB>>>(ftype_out, ftype_in, nloc);
DPErrcheck(cudaGetLastError());
6 changes: 6 additions & 0 deletions source/lib/src/cuda/prod_env_mat.cu
@@ -569,6 +569,8 @@ void format_nbor_list_gpu_cuda(int* nlist,
const int nall,
const float rcut,
const std::vector<int> sec) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int LEN = 256;
const int nnei = sec.back();
const int nblock = (nloc + LEN - 1) / LEN;
@@ -629,6 +631,8 @@ void prod_env_mat_a_gpu_cuda(FPTYPE* em,
const float rcut_smth,
const std::vector<int> sec,
const int* f_type) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
if (f_type == NULL) {
f_type = type;
}
@@ -669,6 +673,8 @@ void prod_env_mat_r_gpu_cuda(FPTYPE* em,
const float rcut,
const float rcut_smth,
const std::vector<int> sec) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int nnei = sec.back();
const int ndescrpt = nnei * 1;
DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt));
4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_force.cu
@@ -110,6 +110,8 @@ void prod_force_a_gpu_cuda(FPTYPE* force,
const int nall,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 4;
DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));

@@ -137,6 +139,8 @@ void prod_force_r_gpu_cuda(FPTYPE* force,
const int nall,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 1;
DPErrcheck(cudaMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3));

4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_force_grad.cu
@@ -88,6 +88,8 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net,
const int nloc,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 4;
DPErrcheck(
cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
@@ -117,6 +119,8 @@ void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net,
const int nloc,
const int nnei,
const int nframes) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 1;
DPErrcheck(
cudaMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt));
4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_virial.cu
@@ -113,6 +113,8 @@ void prod_virial_a_gpu_cuda(FPTYPE* virial,
const int nloc,
const int nall,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));

@@ -141,6 +143,8 @@ void prod_virial_r_gpu_cuda(FPTYPE* virial,
const int nloc,
const int nall,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9));
DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall));

4 changes: 4 additions & 0 deletions source/lib/src/cuda/prod_virial_grad.cu
@@ -92,6 +92,8 @@ void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net,
const int* nlist,
const int nloc,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei * 4;
DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
const int LEN = 128;
@@ -112,6 +114,8 @@ void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net,
const int* nlist,
const int nloc,
const int nnei) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
const int ndescrpt = nnei;
DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt));
const int LEN = 128;
6 changes: 6 additions & 0 deletions source/lib/src/cuda/region.cu
@@ -27,6 +27,8 @@ template <typename FPTYPE>
void convert_to_inter_gpu(FPTYPE *ri,
const Region<FPTYPE> &region,
const FPTYPE *rp) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
_phys2Inter<<<1, 1>>>(ri, rp, region.rec_boxt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
@@ -36,13 +38,17 @@ template <typename FPTYPE>
void convert_to_phys_gpu(FPTYPE *rp,
const Region<FPTYPE> &region,
const FPTYPE *ri) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
_inter2Phys<<<1, 1>>>(rp, ri, region.boxt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
}

template <typename FPTYPE>
void volume_gpu(FPTYPE *volume, const Region<FPTYPE> &region) {
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
_compute_volume<<<1, 1>>>(volume, region.boxt);
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
18 changes: 18 additions & 0 deletions source/lib/src/cuda/tabulate.cu
@@ -630,6 +630,8 @@ void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
tabulate_fusion_se_a_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size>>>(
out, table, em_x, em, two_embed, table_info[0], table_info[1],
@@ -653,6 +655,8 @@ void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei));
DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei * 4));

@@ -679,6 +683,8 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size));
tabulate_fusion_se_a_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size>>>(
@@ -702,6 +708,8 @@ void tabulate_fusion_se_t_gpu_cuda(FPTYPE* out,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
tabulate_fusion_se_t_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size>>>(
out, table, em_x, em, table_info[0], table_info[1], table_info[2],
@@ -725,6 +733,8 @@ void tabulate_fusion_se_t_grad_gpu_cuda(FPTYPE* dy_dem_x,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dy_dem_x, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));
DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j));

@@ -752,6 +762,8 @@ void tabulate_fusion_se_t_grad_grad_gpu_cuda(FPTYPE* dz_dy,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * last_layer_size));

tabulate_fusion_se_t_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>
@@ -774,6 +786,8 @@ void tabulate_fusion_se_r_gpu_cuda(FPTYPE* out,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
tabulate_fusion_se_r_fifth_order_polynomial<FPTYPE, MM, KK>
<<<nloc, last_layer_size>>>(out, table, em, table_info[0], table_info[1],
table_info[2], table_info[3], table_info[4],
@@ -794,6 +808,8 @@ void tabulate_fusion_se_r_grad_gpu_cuda(FPTYPE* dy_dem,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(cudaMemset(dy_dem, 0, sizeof(FPTYPE) * nloc * nnei));

tabulate_fusion_se_r_grad_fifth_order_polynomial<FPTYPE, MM, KK>
@@ -816,6 +832,8 @@ void tabulate_fusion_se_r_grad_grad_gpu_cuda(FPTYPE* dz_dy,
if (nloc <= 0) {
return;
}
DPErrcheck(cudaGetLastError());
DPErrcheck(cudaDeviceSynchronize());
DPErrcheck(
cudaMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * nnei * last_layer_size));
tabulate_fusion_se_r_grad_grad_fifth_order_polynomial<FPTYPE, MM, KK>