We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 0a6beca, commit dfd9b5d. Copy full SHA for dfd9b5d
cuda/fastermoe/smart_schedule.h
@@ -338,7 +338,7 @@ void fmoe_cuda_fused_backward_impl(
338
collect_fn(si, i / num_expert, 0);
339
if (i / num_expert == rank) {
340
cudaEventCreate(evt_reduce + i % num_expert);
341
- cudaEventRecord(evt_reduce[i % num_expert], smgr->stream(num_expert));
+ cudaEventRecord(evt_reduce[i % num_expert], smgr->stream(0));
342
}
343
++si;
344
@@ -367,7 +367,6 @@ void fmoe_cuda_fused_backward_impl(
367
for (long i = 0, si = 0; i < world_size * num_expert; ++i) {
368
if (stored_models[i]) {
369
370
- FMOE_SWE(smgr->stream(0), evt_reduce[i % num_expert]);
371
FMOE_SWE(smgr->torchStream(), evt_reduce[i % num_expert]);
372
set_grad_fn(si, i % num_expert);
373
0 commit comments