Skip to content

Commit

Permalink
Fix BestCheckpointNameFixer for QI training (kubeflow#29)
Browse files Browse the repository at this point in the history
* Fix BestCheckpointNameFixer

* Update image version

* Update image version
  • Loading branch information
Jerome-Kaleido authored Sep 22, 2021
1 parent 86694e4 commit de8710f
Show file tree
Hide file tree
Showing 6 changed files with 13 additions and 14 deletions.
9 changes: 4 additions & 5 deletions core/bin/removebg-train-trimap-cloud-qi.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,18 +57,17 @@ def __init__(self, dir_path, base_name="best.ckpt", name_pattern="best-v*.ckpt")
self.name_pattern = name_pattern

     def on_validation_end(self, trainer, pl_module):
-        if not os.path.exists(self.base_name):
+        base_path = os.path.join(self.dir_path, self.base_name)
+        if not os.path.exists(base_path):
             return
         # Get mtime of best.ckpt
-        base_path = os.path.join(self.dir_path, self.base_name)
         base_mtime = os.path.getmtime(base_path)

         # Get list of files matching the name pattern
-        for name in glob.glob(os.path.join(self.dir_path, self.name_pattern)):
-            new_path = os.path.join(self.dir_path, name)
+        for new_path in glob.glob(os.path.join(self.dir_path, self.name_pattern)):
             new_mtime = os.path.getmtime(new_path)
             if new_mtime > base_mtime:
-                logging.info(f"Best checkpoint {name} is newer than {self.base_name}. Replacing {self.base_name} by {name}")
+                logging.info(f"Best checkpoint {new_path} is newer than {self.base_name}. Replacing {self.base_name} by {new_path}")
                 os.rename(new_path, base_path)
                 base_mtime = new_mtime

Expand Down
6 changes: 3 additions & 3 deletions core/data/qi_auto/job_initialization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ spec:
spec:
containers:
- name: qi-auto-initialization
image: eu.gcr.io/kaleido-train/trimap-qi-auto-initialization:v2
image: eu.gcr.io/kaleido-train/trimap-qi-auto-initialization:v3
imagePullPolicy: Always
args:
- "--danni_user=[...]"
- "--danni_token=[...]"
- "--danni_user=jerome.nicolaou"
- "--danni_token=[redacted]"
restartPolicy: OnFailure
2 changes: 1 addition & 1 deletion core/data/qi_auto/job_postprocessing.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ spec:
medium: Memory
containers:
- name: qi-auto-initialization
image: eu.gcr.io/kaleido-train/trimap-qi-auto-postprocessing:v2
image: eu.gcr.io/kaleido-train/trimap-qi-auto-postprocessing:v3
imagePullPolicy: Always
args:
- "--danni_user=[...]"
Expand Down
2 changes: 1 addition & 1 deletion core/data/qi_auto/job_termination.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ spec:
spec:
containers:
- name: qi-auto-termination
image: eu.gcr.io/kaleido-train/trimap-qi-auto-initialization:v2
image: eu.gcr.io/kaleido-train/trimap-qi-auto-initialization:v3
imagePullPolicy: Always
command: ["python", "-m", "qi_auto.termination"]
restartPolicy: OnFailure
6 changes: 3 additions & 3 deletions core/data/qi_auto/startup_cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ set -e
CLUSTER_NAME=trimap-qi-auto

# Create images
python -m kaleido_utils.gce.create_and_upload_docker -i ${CLUSTER_NAME}-initialization -r /home/jerome/Workspace/removebg/kaleido-removebg/core/ -f ./data/qi_auto/Dockerfile.initialization -t v2
python -m kaleido_utils.gce.create_and_upload_docker -i ${CLUSTER_NAME}-trainer -r /home/jerome/Workspace/removebg/kaleido-removebg/core/ -f ./data/qi_auto/Dockerfile.trainer -t v2
python -m kaleido_utils.gce.create_and_upload_docker -i ${CLUSTER_NAME}-postprocessing -r /home/jerome/Workspace/removebg/kaleido-removebg/core/ -f ./data/qi_auto/Dockerfile.postprocessing -t v2
python -m kaleido_utils.gce.create_and_upload_docker -i ${CLUSTER_NAME}-initialization -r /home/jerome/Workspace/removebg/kaleido-removebg/core/ -f ./data/qi_auto/Dockerfile.initialization -t v3
python -m kaleido_utils.gce.create_and_upload_docker -i ${CLUSTER_NAME}-trainer -r /home/jerome/Workspace/removebg/kaleido-removebg/core/ -f ./data/qi_auto/Dockerfile.trainer -t v3
python -m kaleido_utils.gce.create_and_upload_docker -i ${CLUSTER_NAME}-postprocessing -r /home/jerome/Workspace/removebg/kaleido-removebg/core/ -f ./data/qi_auto/Dockerfile.postprocessing -t v3

# Create cluster
gcloud container clusters create ${CLUSTER_NAME} --machine-type=n1-standard-4 --zone=europe-west4-b --num-nodes=1 --scopes=storage-rw --release-channel=rapid --cluster-version=1.21
Expand Down
2 changes: 1 addition & 1 deletion core/data/qi_auto/template_job_trimap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ spec:
medium: Memory
containers:
- name: removebg-trimap513
image: eu.gcr.io/kaleido-train/trimap-qi-auto-trainer:v2
image: eu.gcr.io/kaleido-train/trimap-qi-auto-trainer:v3
imagePullPolicy: Always
command: ["python", "/workspace/kaleido/removebg-train-trimap-cloud-qi.py"]
args:
Expand Down

0 comments on commit de8710f

Please sign in to comment.