Skip to content

Commit 16714c6

Browse files
authored
Perf: optimize the stream strategy in module_gint (#5845)
* optimize stream strategy
* limit max threads
1 parent 1c30f99 commit 16714c6

File tree

3 files changed

+63
-22
lines changed

3 files changed

+63
-22
lines changed

source/module_hamilt_lcao/module_gint/gint_force_gpu.cu

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,11 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
5151
const int num_streams = gridt.nstreams;
5252

5353
std::vector<cudaStream_t> streams(num_streams);
54+
std::vector<cudaEvent_t> events(num_streams);
5455
for (int i = 0; i < num_streams; i++)
5556
{
5657
checkCuda(cudaStreamCreate(&streams[i]));
58+
checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
5759
}
5860

5961
Cuda_Mem_Wrapper<double> dr_part(3 * max_atom_per_z, num_streams, true);
@@ -89,21 +91,32 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
8991
dm->get_wrapper(),
9092
dm->get_nnr() * sizeof(double),
9193
cudaMemcpyHostToDevice));
94+
9295
#ifdef _OPENMP
93-
#pragma omp parallel for num_threads(num_streams) collapse(2)
96+
const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
97+
#endif
98+
#pragma omp parallel num_threads(max_thread_num)
99+
{
100+
#ifdef _OPENMP
101+
const int tid = omp_get_thread_num();
102+
const int num_threads = omp_get_num_threads();
103+
const int sid_start = tid * num_streams / num_threads;
104+
const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
105+
#else
106+
const int sid_start = 0;
107+
const int thread_num_streams = num_streams;
94108
#endif
109+
#pragma omp for collapse(2) schedule(dynamic)
95110
for (int i = 0; i < gridt.nbx; i++)
96111
{
97112
for (int j = 0; j < gridt.nby; j++)
98113
{
99114
// 20240620 Note that it must be set again here because
100115
// cuda's device is not safe in a multi-threaded environment.
101116
checkCuda(cudaSetDevice(gridt.dev_id));
102-
#ifdef _OPENMP
103-
const int sid = omp_get_thread_num();
104-
#else
105-
const int sid = 0;
106-
#endif
117+
118+
const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
119+
checkCuda(cudaEventSynchronize(events[sid]));
107120

108121
int max_m = 0;
109122
int max_n = 0;
@@ -161,6 +174,7 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
161174
gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
162175
gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
163176
gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
177+
checkCuda(cudaEventRecord(events[sid], streams[sid]));
164178

165179
psi.memset_device_async(streams[sid], sid, 0);
166180
psi_dm.memset_device_async(streams[sid], sid, 0);
@@ -241,9 +255,9 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
241255
stress.get_device_pointer(sid));
242256
checkCudaLastError();
243257
}
244-
checkCuda(cudaStreamSynchronize(streams[sid]));
245258
}
246259
}
260+
}
247261

248262
for(int i = 0; i < num_streams; i++)
249263
{
@@ -254,6 +268,7 @@ void gint_fvl_gpu(const hamilt::HContainer<double>* dm,
254268
for (int i = 0; i < num_streams; i++)
255269
{
256270
checkCuda(cudaStreamSynchronize(streams[i]));
271+
checkCuda(cudaEventDestroy(events[i]));
257272
}
258273

259274
if (isstress){

source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,11 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
3434
const int max_atompair_per_z = max_atom * max_atom * nbzp;
3535

3636
std::vector<cudaStream_t> streams(num_streams);
37+
std::vector<cudaEvent_t> events(num_streams);
3738
for (int i = 0; i < num_streams; i++)
3839
{
3940
checkCuda(cudaStreamCreate(&streams[i]));
41+
checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
4042
}
4143

4244
Cuda_Mem_Wrapper<double> dr_part(max_atom_per_z * 3, num_streams, true);
@@ -71,8 +73,20 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
7173

7274
// calculate the rho for every nbzp bigcells
7375
#ifdef _OPENMP
74-
#pragma omp parallel for num_threads(num_streams) collapse(2)
76+
const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
7577
#endif
78+
#pragma omp parallel num_threads(max_thread_num)
79+
{
80+
#ifdef _OPENMP
81+
const int tid = omp_get_thread_num();
82+
const int num_threads = omp_get_num_threads();
83+
const int sid_start = tid * num_streams / num_threads;
84+
const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
85+
#else
86+
const int sid_start = 0;
87+
const int thread_num_streams = num_streams;
88+
#endif
89+
#pragma omp for collapse(2) schedule(dynamic)
7690
for (int i = 0; i < gridt.nbx; i++)
7791
{
7892
for (int j = 0; j < gridt.nby; j++)
@@ -81,12 +95,9 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
8195
// cuda's device is not safe in a multi-threaded environment.
8296

8397
checkCuda(cudaSetDevice(gridt.dev_id));
84-
// get stream id
85-
#ifdef _OPENMP
86-
const int sid = omp_get_thread_num();
87-
#else
88-
const int sid = 0;
89-
#endif
98+
99+
const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
100+
checkCuda(cudaEventSynchronize(events[sid]));
90101

91102
int max_m = 0;
92103
int max_n = 0;
@@ -147,6 +158,7 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
147158
gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
148159
gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
149160
dot_product.copy_host_to_device_async(streams[sid], sid);
161+
checkCuda(cudaEventRecord(events[sid], streams[sid]));
150162

151163
psi.memset_device_async(streams[sid], sid, 0);
152164
psi_dm.memset_device_async(streams[sid], sid, 0);
@@ -203,9 +215,9 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
203215
psi_dm.get_device_pointer(sid),
204216
dot_product.get_device_pointer(sid));
205217
checkCudaLastError();
206-
checkCuda(cudaStreamSynchronize(streams[sid]));
207218
}
208219
}
220+
}
209221

210222
// Copy rho from device to host
211223
checkCuda(cudaMemcpy(rho,
@@ -216,6 +228,7 @@ void gint_rho_gpu(const hamilt::HContainer<double>* dm,
216228
for (int i = 0; i < num_streams; i++)
217229
{
218230
checkCuda(cudaStreamDestroy(streams[i]));
231+
checkCuda(cudaEventDestroy(events[i]));
219232
}
220233
}
221234
} // namespace GintKernel

source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,12 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
4141
const double vfactor = ucell.omega / gridt.ncxyz;
4242
const int nczp = nbzp * gridt.bz;
4343
std::vector<cudaStream_t> streams(num_streams);
44+
std::vector<cudaEvent_t> events(num_streams);
4445

4546
for (int i = 0; i < num_streams; i++)
4647
{
4748
checkCuda(cudaStreamCreate(&streams[i]));
49+
checkCuda(cudaEventCreateWithFlags(&events[i], cudaEventDisableTiming));
4850
}
4951

5052
const int nnrg = hRGint->get_nnr();
@@ -73,21 +75,30 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
7375
Cuda_Mem_Wrapper<double*> gemm_C(max_atompair_per_z, num_streams, true);
7476

7577
#ifdef _OPENMP
76-
#pragma omp parallel for num_threads(num_streams) collapse(2)
78+
const int max_thread_num = std::min(omp_get_max_threads(), num_streams);
7779
#endif
80+
#pragma omp parallel num_threads(max_thread_num)
81+
{
82+
#ifdef _OPENMP
83+
const int tid = omp_get_thread_num();
84+
const int num_threads = omp_get_num_threads();
85+
const int sid_start = tid * num_streams / num_threads;
86+
const int thread_num_streams = tid == num_threads - 1 ? num_streams - sid_start : num_streams / num_threads;
87+
#else
88+
const int sid_start = 0;
89+
const int thread_num_streams = num_streams;
90+
#endif
91+
#pragma omp for collapse(2) schedule(dynamic)
7892
for (int i = 0; i < gridt.nbx; i++)
7993
{
8094
for (int j = 0; j < gridt.nby; j++)
8195
{
8296
// 20240620 Note that it must be set again here because
8397
// cuda's device is not safe in a multi-threaded environment.
8498
checkCuda(cudaSetDevice(gridt.dev_id));
85-
#ifdef _OPENMP
86-
const int sid = omp_get_thread_num();
87-
#else
88-
const int sid = 0;
89-
#endif
9099

100+
const int sid = (i * gridt.nby + j) % thread_num_streams + sid_start;
101+
checkCuda(cudaEventSynchronize(events[sid]));
91102
int max_m = 0;
92103
int max_n = 0;
93104
int atom_pair_num = 0;
@@ -141,6 +152,7 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
141152
gemm_A.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
142153
gemm_B.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
143154
gemm_C.copy_host_to_device_async(streams[sid], sid, atom_pair_num);
155+
checkCuda(cudaEventRecord(events[sid], streams[sid]));
144156

145157
psi.memset_device_async(streams[sid], sid, 0);
146158
psi_vldr3.memset_device_async(streams[sid], sid, 0);
@@ -187,9 +199,9 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
187199
streams[sid],
188200
nullptr);
189201
checkCudaLastError();
190-
checkCuda(cudaStreamSynchronize(streams[sid]));
191202
}
192203
}
204+
}
193205

194206
checkCuda(cudaMemcpy(
195207
hRGint->get_wrapper(),
@@ -200,6 +212,7 @@ void gint_vl_gpu(hamilt::HContainer<double>* hRGint,
200212
for (int i = 0; i < num_streams; i++)
201213
{
202214
checkCuda(cudaStreamDestroy(streams[i]));
215+
checkCuda(cudaEventDestroy(events[i]));
203216
}
204217
}
205218

0 commit comments

Comments
 (0)