Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
jayashreemohan29 committed Jan 22, 2021
1 parent ea32735 commit 209fbf1
Show file tree
Hide file tree
Showing 11 changed files with 2,397 additions and 0 deletions.
880 changes: 880 additions & 0 deletions models/image_classification/pytorch-imagenet-cf.py

Large diffs are not rendered by default.

85 changes: 85 additions & 0 deletions scripts/crash-test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import torch
import os
import sys
import argparse
import time

# Command-line interface of the torch.save crash test.
parser = argparse.ArgumentParser("Test torch.save")

# Boolean switches, all off unless given on the command line:
#   --persist    fsync each checkpoint file after it is saved
#   --one        write only the first checkpoint, skip the second save
#   --check      compare checkpoints on disk against reference copies
#   --overwrite  make the second save target the same file as the first
#   --sleep      sleep for 30 seconds after saving
for _flag in ('--persist', '--one', '--check', '--overwrite', '--sleep'):
    parser.add_argument(_flag, action='store_true', default=False)

# Number of tensors per bank and the directory the checkpoints go to.
parser.add_argument('--num_tensors', type=int, default=3)
parser.add_argument('--dir', type=str, default='./chk/')

# Fixed seed so the randomly generated tensors are the same on every run.
torch.manual_seed(2)

# Parsed at import time; main() and check_results() read this global.
args = parser.parse_args()

def main():
    """Write one or two banks of random tensors with torch.save and time it.

    Behaviour is driven by the module-level ``args``:

    * ``--check``: only run ``check_results()`` and return.
    * ``--persist``: fsync every checkpoint right after it is saved.
    * ``--one``: skip the second save.
    * ``--overwrite``: the second save goes to ``model.chk`` (replacing the
      first checkpoint) instead of ``model_new.chk``.
    * ``--sleep``: sleep for 30 seconds once the saves have returned.
    """
    if args.check:
        check_results()
        return

    # Both banks are always drawn, in interleaved order, so the seeded RNG
    # yields the same tensors no matter which flags are set.
    tensor_bank = {}
    tensor_bank_ow = {}
    for i in range(args.num_tensors):
        tensor_bank[i] = torch.randn(256, 3, 224, 224)
        tensor_bank_ow[i] = torch.randn(256, 3, 224, 224)

    # exist_ok avoids the race between checking for the directory and
    # creating it.
    os.makedirs(args.dir, exist_ok=True)

    filepath = os.path.join(args.dir, 'model.chk')
    if args.overwrite:
        filepath_ow = os.path.join(args.dir, 'model.chk')
    else:
        filepath_ow = os.path.join(args.dir, 'model_new.chk')

    print("Starting save to {}..".format(filepath))
    # Monotonic clock: the measured duration is immune to wall-clock jumps.
    start = time.perf_counter()

    torch.save(tensor_bank, filepath)
    if args.persist:
        persist(filepath)

    if not args.one:
        torch.save(tensor_bank_ow, filepath_ow)
        if args.persist:
            persist(filepath_ow)

    dur = time.perf_counter() - start
    print("Returned from save in {:.2f} s".format(dur))

    if args.sleep:
        time.sleep(30)

def persist(filepath):
    """Force *filepath* and its directory entry onto stable storage.

    Args:
        filepath: Path of an existing file whose contents must survive a
            crash.

    Raises:
        OSError: If the file or its parent directory cannot be opened or
            synced.
    """
    # Only the descriptor is needed, so open in binary mode and read nothing.
    with open(filepath, 'rb') as f:
        os.fsync(f.fileno())

    # Syncing the file alone does not guarantee that the directory entry of a
    # newly created file has reached the disk, so sync the parent directory
    # too. Directories can only be opened this way on POSIX systems.
    if os.name == 'posix':
        # A bare filename has an empty dirname; it lives in the current dir.
        parent = os.path.dirname(filepath) or '.'
        dir_fd = os.open(parent, os.O_RDONLY)
        try:
            os.fsync(dir_fd)
        finally:
            os.close(dir_fd)


def check_results():
    """Compare the checkpoints under ``args.dir`` with reference copies.

    The reference checkpoints are read from ``chk-compare/``. For every
    tensor index one line is printed per comparison, showing whether the
    tensor on disk equals the reference tensor element-wise.

    With ``--overwrite`` only ``model.chk`` exists in the checkpoint
    directory, so its tensors are compared against both reference files.
    Otherwise ``model.chk`` and ``model_new.chk`` are each compared with the
    reference file of the same name.
    """
    # Read from the directory main() writes to instead of a fixed 'chk/', so
    # --dir is honoured; the default './chk/' resolves to the same files.
    new_ten_1 = torch.load(os.path.join(args.dir, 'model.chk'))
    old_ten_2 = torch.load('chk-compare/model_new.chk')
    old_ten_1 = torch.load('chk-compare/model.chk')

    if args.overwrite:
        for idx, val in new_ten_1.items():
            print("Ten 1 : {}".format(torch.all(torch.eq(val, old_ten_1[idx]))))
            print("Ten 2 : {}".format(torch.all(torch.eq(val, old_ten_2[idx]))))
        return

    new_ten_2 = torch.load(os.path.join(args.dir, 'model_new.chk'))

    for idx, val in new_ten_1.items():
        print("Ten 1 : {}".format(torch.all(torch.eq(val, old_ten_1[idx]))))
    for idx, val in new_ten_2.items():
        print("Ten 2 : {}".format(torch.all(torch.eq(val, old_ten_2[idx]))))

# Run the crash test only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
5 changes: 5 additions & 0 deletions scripts/free.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Append a memory-usage snapshot (`free -g`) to free.out once per second.
# Loops until the process is killed or `sleep` fails.

while sleep 1; do
    free -g >> free.out
done
1 change: 1 addition & 0 deletions scripts/gpulog.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/bin/bash
# Log GPU power draw and utilization as CSV, sampled every 10 seconds (-l 10),
# appending to gpu_util.csv. Runs until killed.
nvidia-smi --query-gpu=power.draw,utilization.gpu -l 10 --format=csv >> gpu_util.csv
129 changes: 129 additions & 0 deletions scripts/run_all_256.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/bin/bash
#
# Benchmark driver for pytorch-imagenet-cf.py.
#
# For every architecture / worker-count / batch-size combination it performs
# three training runs (CheckFreq, epoch-boundary checkpointing, synchronous
# checkpointing at a cached frequency). Each run is wrapped in system
# monitors, and all *.out, *.log and *.csv files produced in the current
# directory are moved into a per-run result directory under <out-dir>.
#
# Usage: run_all_256.sh <data-dir> <out-dir> <worker>

if [ "$#" -ne 3 ]; then
    echo "Usage : $0 <data-dir> <out-dir> <worker>"
    exit 1
fi

# jq is needed to read the checkpoint frequency for RUN 3. Install it only
# when it is missing, and non-interactively so the script never stops at a
# confirmation prompt.
if ! command -v jq > /dev/null 2>&1; then
    apt-get install -y jq
fi

DATA_DIR=$1
OUT_DIR=$2
WORKER=$3
SRC="models/image_classification/"
SCRIPTS="scripts/"

mkdir -p "$OUT_DIR"

num_gpu=8

echo " Data dir is $DATA_DIR"
echo " Out dir is $OUT_DIR"

resnext="resnext101"
densenet="densenet121"

# Set result_dir to "<out-dir>/<arch>_b.._w.._g.._<suffix>", create it and
# announce the run. $1 = suffix naming the run variant.
prepare_run() {
    result_dir="${OUT_DIR}/${arch}_b${batch}_w${workers}_g${num_gpu}_$1"
    echo "result dir is $result_dir"
    mkdir -p "$result_dir"
    echo "Now running $arch for $workers workers and $batch batch"
}

# Start the background monitors (CPU, memory, dstat) for one run.
start_monitors() {
    mpstat -P ALL 1 > cpu_util.out 2>&1 &
    "./$SCRIPTS/free.sh" &
    #./$SCRIPTS/gpulog.sh &
    dstat -cdnmgyr --output all-utils.csv 2>&1 &
}

# Launch distributed training, logging to stdout.out.
# $1 = number of epochs; any further arguments are extra checkpointing
# options, inserted right after --barrier.
launch_training() {
    local epochs=$1
    shift
    python -m torch.distributed.launch --nproc_per_node="$num_gpu" \
        "$SRC/pytorch-imagenet-cf.py" --dali -a "$arch" -b "$batch" \
        --workers "$workers" --epochs "$epochs" --deterministic --noeval \
        --barrier "$@" --checkfreq --chk-prefix ./chk/ --cf_iterator \
        --data "$DATA_DIR" > stdout.out 2>&1
}

# Flush disk buffers, stop monitors and leftover training processes, then
# move this run's output files into result_dir.
finish_run() {
    sync
    echo "RAN $arch for $workers workers, $batch batch with DDP" >> stdout.out
    pkill -f mpstat
    pkill -f dstat
    pkill -f free
    pkill -f gpulog
    pkill -f nvidia-smi
    pkill -f pytorch-imagenet
    sleep 2
    mv *.out "$result_dir"/
    mv *.log "$result_dir"/
    mv *.csv "$result_dir"/
}

# Alternative architecture lists; swap one in below to benchmark other models.
#for arch in 'resnet18' ; do
#for arch in 'resnet50' 'resnet18' 'inception_v3' 'resnext101' 'densenet121' 'vgg16'; do
for arch in 'vgg16' ; do
# $WORKER is deliberately unquoted so that it may hold several
# whitespace-separated worker counts.
for workers in $WORKER; do
for batch in 256; do

    # resnext101 and densenet121 are run with a batch size of 128 instead.
    if [ "$arch" = "$resnext" ] || [ "$arch" = "$densenet" ]; then
        batch=128
    fi

    # RUN 1 : CheckFreq
    prepare_run "dali_fp32_cf"
    start_monitors
    launch_training 2
    finish_run

    # RUN 2 : Epoch boundary
    prepare_run "dali_fp32_epoch_chk"
    start_monitors
    launch_training 1 --chk-freq 0 --chk_mode_baseline
    finish_run

    # RUN 3 : Synchronous at chosen frequency
    prepare_run "dali_fp32_iter_chk_baseline_persist"

    # The frequency is read from a per-arch/batch cache file; presumably it
    # is written by an earlier CheckFreq run (TODO confirm).
    cache_file=".cache_${arch}_${batch}"
    CHK=$(jq '.chk_freq' "$cache_file")
    if [ -z "$CHK" ] || [ "$CHK" = "null" ]; then
        # Without a value, "--chk-freq" would swallow the next option and
        # produce a malformed command line, so skip this run instead.
        echo "Could not read chk_freq from $cache_file; skipping RUN 3" >&2
        continue
    fi
    echo "Setting CHK freq = $CHK"
    start_monitors
    launch_training 1 --chk-freq "$CHK" --chk_mode_baseline --persist
    finish_run

done
done
done
6 changes: 6 additions & 0 deletions scripts/start_monitor.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Start the resource monitors as background jobs. Their output is written to
# files in the current directory.
SCRIPTS="scripts/"

# Per-CPU utilization, sampled every second.
mpstat -P ALL 1 > cpu_util.out 2>&1 &

# Memory usage logger.
"${SCRIPTS}/free.sh" &

# dstat system statistics, exported as CSV.
dstat -cdnmgyr --output all-utils.csv 2>&1 &

# GPU logger.
"${SCRIPTS}/gpulog.sh" &
14 changes: 14 additions & 0 deletions scripts/stop_monitor.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# Stop the background monitors (mpstat, free, gpulog / nvidia-smi, dstat) and
# move the *.log, *.csv and *.out files from the current directory into
# <out-dir>.
#
# Usage: stop_monitor.sh <out-dir>

if [ "$#" -ne 1 ]; then
    # Explain the failure instead of exiting silently.
    echo "Usage : $0 <out-dir>" >&2
    exit 1
fi
OUT_DIR=$1

pkill -f mpstat
pkill -f free
pkill -f gpulog
pkill -f nvidia-smi
pkill -f dstat

# Create the destination first so the moves do not fail on a missing
# directory; quoting keeps paths containing spaces intact.
mkdir -p "$OUT_DIR"
mv *.log "$OUT_DIR"/
mv *.csv "$OUT_DIR"/
mv *.out "$OUT_DIR"/
Loading

0 comments on commit 209fbf1

Please sign in to comment.