We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent e48aaeb commit d23693a · Copy full SHA for d23693a
distributed/ddp-tutorial-series/slurm/sbatch_run.sh
@@ -14,10 +14,11 @@ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
14
echo Node IP: $head_node_ip
15
export LOGLEVEL=INFO
16
17
+job_id=2024
18
srun torchrun \
19
--nnodes 4 \
20
--nproc_per_node 1 \
---rdzv_id $RANDOM \
21
+--rdzv_id ${job_id} \
22
--rdzv_backend c10d \
23
--rdzv_endpoint $head_node_ip:29500 \
-/shared/examples/multinode_torchrun.py 50 10
24
+/shared/examples/multinode_torchrun.py 50 10
0 commit comments