diff --git a/parlai/scripts/distributed_eval.py b/parlai/scripts/distributed_eval.py
new file mode 100644
index 00000000000..c43d9d9a54d
--- /dev/null
+++ b/parlai/scripts/distributed_eval.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Distributed evaluation script. NOT MEANT TO BE CALLED DIRECTLY BY USER.
+
+This script is meant to be used in conjunction with
+SLURM, which provides environment variables
+describing the environment.
+
+An example sbatch script is below, for a 2-host, 8-GPU-per-host setup (16 GPUs total):
+
+.. code-block:: bash
+
+  #!/bin/sh
+  #SBATCH --job-name=distributed_example
+  #SBATCH --output=/path/to/savepoint/stdout.%j
+  #SBATCH --error=/path/to/savepoint/stderr.%j
+  #SBATCH --partition=priority
+  #SBATCH --nodes=2
+  #SBATCH --time=0:10:00
+  #SBATCH --signal=SIGINT
+  #SBATCH --gres=gpu:8
+  #SBATCH --ntasks-per-node=8
+  #SBATCH --mem=64G
+  #SBATCH --cpus-per-task=10
+  srun python -u -m parlai.scripts.distributed_eval \
+    -m seq2seq -t convai2 --dict-file /path/to/dict-file
+"""
+
+import os
+
+import parlai.scripts.eval_model as eval_model
+import parlai.utils.distributed as distributed_utils
+
+
+def main():
+    parser = eval_model.setup_args()
+    parser.add_distributed_training_args()
+    parser.add_argument('--port', type=int, default=61337, help='TCP port number')
+    opt = parser.parse_args(print_args=(os.environ['SLURM_PROCID'] == '0'))
+
+    with distributed_utils.slurm_distributed_context(opt) as opt:
+        return eval_model.eval_model(opt)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/parlai/scripts/eval_model.py b/parlai/scripts/eval_model.py
index 8c895844728..0bdbd1902cc 100644
--- a/parlai/scripts/eval_model.py
+++ b/parlai/scripts/eval_model.py
@@ -135,10 +135,10 @@ def _eval_single_world(opt, agent, task):
     # max number of examples to evaluate
     max_cnt = opt['num_examples'] if opt['num_examples'] > 0 else float('inf')
     cnt = 0
-    total_cnt = world.num_examples()
+    total_cnt = sum(all_gather_list(world.num_examples()))
 
     if is_distributed():
-        logging.warn('Progress bar is approximate in distributed mode.')
+        logging.warning('Progress bar is approximate in distributed mode.')
 
     while not world.epoch_done() and cnt < max_cnt:
         cnt += opt.get('batchsize', 1)
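For intuition on the eval_model.py change above: every distributed worker reports its local example count and the counts are summed, so the progress denominator covers the whole job rather than a single shard. A minimal sketch of that pattern, written against plain torch.distributed rather than ParlAI's all_gather_list helper (the function name and the backend check below are illustrative assumptions, not the library's API):

.. code-block:: python

    import torch.distributed as dist

    def total_example_count(local_count: int) -> int:
        # Single-process run: nothing to gather.
        if not (dist.is_available() and dist.is_initialized()):
            return local_count
        # Collect one Python int from every rank, then sum them.
        counts = [None] * dist.get_world_size()
        dist.all_gather_object(counts, local_count)
        return sum(counts)
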
diff --git a/parlai/scripts/multiprocessing_eval.py b/parlai/scripts/multiprocessing_eval.py
new file mode 100644
index 00000000000..5edd3ac4e44
--- /dev/null
+++ b/parlai/scripts/multiprocessing_eval.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+"""
+Main launch script for single-host, multi-GPU evaluation.
+
+This is a drop-in replacement for eval_model.py. This script will launch N
+subprocesses, each of which runs the full evaluation loop independently.
+
+Uses torch.nn.parallel.DistributedDataParallel for parallelism. Agents must
+specifically implement the wrapper of DistributedDataParallel, but all
+TorchRankerAgents and TorchGeneratorAgents support this.
+"""
+
+import torch
+import random
+import os
+import signal
+import parlai.utils.distributed as distributed_utils
+import parlai.scripts.eval_model as eval_model
+
+
+def multiprocess_eval(
+    rank, opt, port=61337, rank_offset=0, gpu=None, hostname='localhost'
+):
+    with distributed_utils.distributed_context(
+        rank, opt, port, rank_offset, gpu, hostname
+    ) as opt:
+        return eval_model.eval_model(opt)
+
+
+def launch_and_eval(opt, port):
+    """
+    Fork the requested number of subprocesses and run evaluation in each.
+    """
+    # Launch multiple subprocesses
+    spawncontext = torch.multiprocessing.spawn(
+        multiprocess_eval,
+        # pass a rank offset of 1 because the main process takes rank 0, and
+        # spawn() does not let us control which ranks it assigns
+        (opt, port, 1),
+        nprocs=opt['distributed_world_size'] - 1,  # main proc will also run loop
+        join=False,
+    )
+
+    try:
+        retval = multiprocess_eval(0, opt, port)
+        spawncontext.join()
+        return retval
+    except KeyboardInterrupt:
+        # tell the subprocesses to stop too
+        for p in spawncontext.processes:
+            if p.is_alive():
+                os.kill(p.pid, signal.SIGINT)
+        raise
+
+
+def setup_args():
+    parser = eval_model.setup_args()
+    parser.add_distributed_training_args()
+    parser.set_defaults(distributed_world_size=torch.cuda.device_count())
+    return parser
+
+
+def main():
+    opt = setup_args().parse_args()
+    port = random.randint(32000, 48000)
+    return launch_and_eval(opt, port)
+
+
+if __name__ == '__main__':
+    main()
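The rank bookkeeping in launch_and_eval() is the subtle part: torch.multiprocessing.spawn() always numbers its children from 0, so the script passes a rank offset of 1 and keeps rank 0 for the parent process, which runs a worker loop itself. A self-contained sketch of the same pattern, with a placeholder worker body, port, and backend (assumptions for illustration, not ParlAI's implementation):

.. code-block:: python

    import torch
    import torch.distributed as dist

    def worker(spawn_rank, world_size, port, rank_offset=1):
        # Children receive spawn_rank 0..nprocs-1; the offset shifts them to
        # ranks 1..world_size-1 so that rank 0 stays with the parent process.
        rank = spawn_rank + rank_offset
        dist.init_process_group(
            backend='gloo',  # placeholder; one-GPU-per-process jobs would use 'nccl'
            init_method=f'tcp://localhost:{port}',
            world_size=world_size,
            rank=rank,
        )
        # ... run this rank's share of the evaluation here ...
        dist.destroy_process_group()

    if __name__ == '__main__':
        world_size, port = 4, 61337
        ctx = torch.multiprocessing.spawn(
            worker, (world_size, port), nprocs=world_size - 1, join=False
        )
        worker(0, world_size, port, rank_offset=0)  # the parent acts as rank 0
        ctx.join()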