From 4b93507699a289f56f939cdad99db775170c2d7f Mon Sep 17 00:00:00 2001
From: Joseph Viviano
Date: Tue, 18 Jun 2024 16:49:19 -0400
Subject: [PATCH] example mila slurm script

---
 tutorials/examples/ddp_gfn.small.4.mila.slurm | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 tutorials/examples/ddp_gfn.small.4.mila.slurm

diff --git a/tutorials/examples/ddp_gfn.small.4.mila.slurm b/tutorials/examples/ddp_gfn.small.4.mila.slurm
new file mode 100644
index 00000000..16945d8f
--- /dev/null
+++ b/tutorials/examples/ddp_gfn.small.4.mila.slurm
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+#SBATCH -o /network/scratch/v/vivianoj/torchgfn/logs/intel/slurm-%j.out
+#SBATCH -e /network/scratch/v/vivianoj/torchgfn/logs/intel/slurm-%j.err
+#SBATCH -J ddp
+#SBATCH --get-user-env
+#SBATCH --partition=long
+#SBATCH --ntasks=2
+#SBATCH --cpus-per-task=4
+#SBATCH --time=00:60:00
+
+#source /swtools/intel/2024.0/oneapi-vars.sh
+export I_MPI_HYDRA_BOOTSTRAP=slurm
+
+eval "$(conda shell.bash hook)"
+conda activate torchgfn_multinode
+
+export KMP_AFFINITY=compact,verbose
+export OMP_NUM_THREADS=56
+export MASTER_ADDR=$(hostname | head -n 1)
+echo $MASTER_ADDR
+echo $SLURM_JOB_NUM_NODES
+echo $SLURM_NODELIST
+
+./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256000 &> scaling.out.4.4.512000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 128000 --batch_size 256000 &> scaling.out.4.4.128000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 64000 --batch_size 256000 &> scaling.out.4.4.64000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 32000 --batch_size 256000 &> scaling.out.4.4.32000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 16000 --batch_size 256000 &> scaling.out.4.4.16000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 8000 --batch_size 256000 &> scaling.out.4.4.8000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 4000 --batch_size 256000 &> scaling.out.4.4.4000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 2000 --batch_size 256000 &> scaling.out.4.4.2000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 1000 --batch_size 256000 &> scaling.out.4.4.1000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 512 --batch_size 256000 &> scaling.out.4.4.512.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 256 --batch_size 256000 &> scaling.out.4.4.256.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 128 --batch_size 256000 &> scaling.out.4.4.128.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 64 --batch_size 256000 &> scaling.out.4.4.64.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 32 --batch_size 256000 &> scaling.out.4.4.32.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 16 --batch_size 256000 &> scaling.out.4.4.16.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 8 --batch_size 256000 &> scaling.out.4.4.8.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 4 --batch_size 256000 &> scaling.out.4.4.4.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 2 --batch_size 256000 &> scaling.out.4.4.2.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 1 --batch_size 256000 &> scaling.out.4.4.1.256000