Skip to content

Commit

Permalink
feat(train): make timeout of wenet_join configurable (#2123)
Browse files Browse the repository at this point in the history
  • Loading branch information
xingchensong authored Nov 6, 2023
1 parent dcdcc87 commit fd3803b
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 1 deletion.
2 changes: 1 addition & 1 deletion wenet/bin/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def main():

# NOTE(xcsong): Why do we need a new group? See `train_utils.py::wenet_join`.
group_join = dist.new_group(backend="gloo",
timeout=datetime.timedelta(seconds=30))
timeout=datetime.timedelta(seconds=args.timeout))

dist.barrier() # NOTE(xcsong): Ensure all ranks start Train at the same time.
executor.train(model, optimizer, scheduler, train_data_loader,
Expand Down
3 changes: 3 additions & 0 deletions wenet/utils/train_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ def add_ddp_args(parser):


def add_deepspeed_args(parser):
parser.add_argument('--timeout', default=30, type=int,
help='timeout (in seconds) of wenet_join. ' +
'30s for aishell & 300s for wenetspeech')
parser.add_argument('--local_rank', type=int, default=-1,
help='local rank passed from distributed launcher')
parser.add_argument('--deepspeed.save_states',
Expand Down

0 comments on commit fd3803b

Please sign in to comment.