forked from ultralytics/yolov5
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Amazon AWS EC2 startup and re-startup scripts (ultralytics#2185)
* Amazon AWS EC2 startup and re-startup scripts * Create resume.py * cleanup
- Loading branch information
1 parent
a2a82a4
commit f47e649
Showing
4 changed files
with
86 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# AWS EC2 instance startup 'MIME' script https://aws.amazon.com/premiumsupport/knowledge-center/execute-user-data-ec2/
# This script will run on every instance restart, not only on first start
# The last line of the message ('--//--') is the closing multipart boundary and must be kept when pasting
# --- DO NOT COPY ABOVE COMMENTS WHEN PASTING INTO USERDATA ---

Content-Type: multipart/mixed; boundary="//"
MIME-Version: 1.0

--//
Content-Type: text/cloud-config; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="cloud-config.txt"

#cloud-config
cloud_final_modules:
- [scripts-user, always]

--//
Content-Type: text/x-shellscript; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="userdata.txt"

#!/bin/bash
# --- paste contents of userdata.sh here ---
--//--
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Resume all interrupted trainings in yolov5/ dir including DDP trainings | ||
# Usage: $ python utils/aws/resume.py | ||
|
||
import os | ||
from pathlib import Path | ||
|
||
import torch | ||
import yaml | ||
|
||
def count_devices(device):
    """Return how many devices an opt.yaml ``device`` value explicitly names.

    Examples: ``''`` -> 0, ``'0'`` -> 1, ``'cpu'`` -> 1, ``'0,1'`` -> 2.
    Empty fields are ignored, so an empty string or a trailing comma never
    counts as a device.
    """
    text = '' if device is None else str(device)
    return sum(1 for field in text.split(',') if field.strip())


def build_resume_command(last, nd, port):
    """Build the background shell command that resumes training from ``last``.

    Args:
        last: Path (or str) of the ``last.pt`` checkpoint to resume from.
        nd: Number of devices the run explicitly requested (see ``count_devices``).
        port: Value passed as ``--master_port``; only used when ``nd > 1``.

    Returns:
        The command string, with output redirected to /dev/null and a trailing
        ``&`` so the shell starts it as a background process.
    """
    import shlex  # local import: only needed here to quote the path for the shell

    target = shlex.quote(str(last))  # unchanged for plain paths, quoted if it holds spaces/metacharacters
    if nd > 1:  # multi-GPU
        cmd = f'python -m torch.distributed.launch --nproc_per_node {nd} --master_port {port} train.py --resume {target}'
    else:  # single process (train.py selects the device itself)
        cmd = f'python train.py --resume {target}'
    return cmd + ' > /dev/null 2>&1 &'  # redirect output to dev/null and run in the background


def main():
    """Find every ``last.pt`` below the current directory and relaunch its training run."""
    # NOTE(review): ports are numbered 1, 2, ... as before; ports below 1024 need root to bind.
    port = 0  # --master_port
    path = Path('').resolve()
    for last in path.rglob('*/**/last.pt'):
        # A checkpoint truncated by the interruption must not stop the other runs from resuming.
        try:
            ckpt = torch.load(last, map_location='cpu')  # CPU is enough to inspect the optimizer entry
        except Exception as e:  # boundary of a best-effort loop: report and move on
            print(f'Skipping {last}: cannot load checkpoint ({e})')
            continue
        # No optimizer state -> nothing to resume (presumably a finished or stripped run).
        if not isinstance(ckpt, dict) or ckpt.get('optimizer') is None:
            continue

        # Load opt.yaml
        opt_file = last.parent.parent / 'opt.yaml'
        try:
            with open(opt_file) as f:
                opt = yaml.load(f, Loader=yaml.SafeLoader) or {}
        except OSError as e:
            print(f'Skipping {last}: cannot read {opt_file} ({e})')
            continue

        # Get device count. The original expression also tested `nd == 0`, but
        # ''.split(',') is [''] so that clause could never be true; an empty
        # device therefore keeps resuming as a single process.
        nd = count_devices(opt.get('device', ''))  # number of devices
        if nd > 1:  # distributed data parallel: each launch gets its own master port
            port += 1

        cmd = build_resume_command(last, nd, port)
        print(cmd)
        os.system(cmd)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#!/bin/bash
# AWS EC2 instance startup script https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html
# This script will run only once on first instance start (for a re-start script see mime.sh)
# /home/ubuntu (ubuntu) or /home/ec2-user (amazon-linux) is working dir
# Use >300 GB SSD

# Absolute path so the script does not depend on being started from '/'.
cd /home/ubuntu || echo "WARNING: cannot cd to /home/ubuntu, continuing in $(pwd)"
if [ ! -d yolov5 ]; then
  echo "Running first-time script." # install dependencies, download COCO, pull Docker
  git clone https://github.com/ultralytics/yolov5 && sudo chmod -R 777 yolov5
  if cd yolov5; then
    bash data/scripts/get_coco.sh && echo "Data done." &
  else
    echo "WARNING: yolov5 directory missing after clone, skipping COCO download."
  fi
  sudo docker pull ultralytics/yolov5:latest && echo "Docker done." &
  # python -m pip install --upgrade pip && pip install -r requirements.txt && python detect.py && echo "Requirements done." &
else
  echo "Running re-start script." # resume interrupted runs
  i=0
  list=$(docker ps -qa) # container list i.e. $'one\ntwo\nthree\nfour'
  while IFS= read -r id; do
    # The here-string always feeds at least one (possibly empty) line, so skip
    # the empty id produced when there are no containers.
    [ -n "$id" ] || continue
    ((i++))
    echo "restarting container $i: $id"
    docker start "$id"
    # docker exec -it $id python train.py --resume # single-GPU
    docker exec -d "$id" python utils/aws/resume.py
  done <<<"$list"
fi