batchShifter.slr (executable, 85 lines / 65 loc, 2.67 KB)
#!/bin/bash
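# Slurm batch script: distributed multi-GPU training of srgan_cosmo2 (train_dist.py),
# followed by a single-rank prediction step (predict.py), run inside a Shifter container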
#SBATCH --ntasks-per-node=4 --gpus-per-task=1 --cpus-per-task=32 --exclusive
#SBATCH -N16 --time=12:00:00 -J srBig -q regular
#-SBATCH -N8 --time=30:00 -q debug
#-SBATCH -C gpu -A nstaff_g
#-SBATCH -C gpu -A m3363_g # Peter
#SBATCH -C gpu -A m4194_g # Zarija
#SBATCH --image=nersc/pytorch:ngc-21.08-v2
#-SBATCH -x cgpu08 # block sick nodes
#SBATCH --array 1-1
# - - -  END OF SLURM COMMANDS  - - -
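# note: '#-SBATCH' lines above are disabled directives kept as alternatives (debug queue, other accounts, blocking sick nodes)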
# Licenses=scratch ??
nprocspn=${SLURM_NTASKS_PER_NODE}
#nprocspn=1 # special case for partial use of full node
design=benchmk_flux4k
dataName=flux_L80_s12-1LR8HR-Nyx4k-r2c32 ; epochs=150  # ?? with -N16 and ~11 h this could fit ~330 epochs
N=${SLURM_NNODES}
G=$(( N * nprocspn ))
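# G = total number of tasks across all nodes (one GPU per task); used below as 'srun -n $G'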
export MASTER_ADDR=$(hostname)
arrIdx=${SLURM_ARRAY_TASK_ID}
jobId=${SLURM_ARRAY_JOB_ID}_${arrIdx}  # must not include '/'
echo S: JID=${jobId} MASTER_ADDR=$MASTER_ADDR G=$G N=$N nprocspn=$nprocspn
nodeList=$(scontrol show hostname $SLURM_NODELIST)
echo S:node-list $nodeList
# grab some variables from the environment, if defined
[[ -z "${SRCOS2D_OTHER_PAR}" ]] && otherParStr=" " || otherParStr=" ${SRCOS2D_OTHER_PAR} "
[[ -z "${SRCOS2D_WRK_SUFIX}" ]] && wrkSufix=$jobId || wrkSufix="${SRCOS2D_WRK_SUFIX}"
env | grep SRCOS2D
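# example submission with overrides (hypothetical values; sbatch exports the caller's environment by default):
#   export SRCOS2D_WRK_SUFIX=testRun1  SRCOS2D_OTHER_PAR=" --numGlobSamp 256 "
#   sbatch batchShifter.slr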
if [[ $NERSC_HOST == perlmutter ]] ; then
echo "S:on Perlmutter"
facility=perlmutter
module unload pytorch
ENGINE=" shifter "
    export OMP_NUM_THREADS=1  # limit OpenMP threads per rank; node load observed to drop from ~200 to ~4
fi
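# on the Perlmutter path, ENGINE=shifter wraps each srun task in the container requested by '--image' above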
#baseDir=/global/homes/b/balewski/prje/tmp_NyxHydro_outFluxB
baseDir=$SCRATCH/tmp_Nyx2022a-flux/jobs
wrkDir=${baseDir}/$wrkSufix
export TORCH_HOME=$baseDir # for storing VGG model
echo "S: jobId=$SLURM_JOBID wrkSufix=$wrkSufix wrkDir=$wrkDir"
date
export CMD=" python3 -u train_dist.py --facility $facility --design $design --epochs $epochs --basePath $wrkDir --expName $jobId --dataName $dataName $otherParStr "
# spare --numGlobSamp 256
echo S: CMD=$CMD ENGINE=$ENGINE
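# CMD is exported so toolbox/driveOneTrain.sh (launched once per rank by srun below) can presumably pick it up from the environment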
codeList=" train_dist.py predict.py toolbox/ batchShifter.slr *.hpar.yaml "
mkdir -p $wrkDir
cp -rp $codeList $wrkDir
cd $wrkDir
echo S:PWD=$(pwd)
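# from here on the job runs from its own copy of the code in $wrkDir, so later edits to the source tree do not affect this run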
if [ ${doHPO:-0} -gt 0 ]; then   # doHPO and designLib are expected from the calling environment
    cp -rp ${designLib}/${design}.hpar.yaml .
fi
echo "S:starting srgan_cosmo2 " `date` " wrkDir= $wrkDir"
time srun -n $G $ENGINE toolbox/driveOneTrain.sh >& log.train
sleep 3
echo S:done train
time srun -n1 $ENGINE ./predict.py --basePath $baseDir --expName $jobId --genSol last >& log.predict
# spare --doFOM
# manual: cd sand-box ; srun -n1 shifter ./predict.py --basePath . --expName . --genSol epoch100
echo S:done predict
date
chmod a+rx $wrkDir
chmod a+r ${wrkDir}/*
# Cancel all my jobs:
# squeue -u $USER -h | awk '{print $1}' | xargs scancel
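# Monitor a running training (wrkDir path is printed above):
# tail -f $wrkDir/log.train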