Improve the automatic QI training (kubeflow#39)
* Work in progress

* Add script to download latest models from kaleido-models

* Remove arguments

* Update README.md

* Fix listing qi trainings
Jerome-Kaleido committed Oct 5, 2021
1 parent 002b884 commit 344b688
Showing 17 changed files with 331 additions and 223 deletions.
23 changes: 14 additions & 9 deletions core/bin/removebg-train-trimap-cloud-qi.py
@@ -23,6 +23,7 @@

# Local imports
from removebg.training.trimap import PlTrimap
from qi_auto.utilities import Bucket, run_bash


logging.basicConfig(
@@ -81,16 +82,14 @@ def main():
parser.add_argument('--lr', type=float, default=0.00001, help='batch size.')
parser.add_argument('--workers', type=int, help='number of workers.', default=multiprocessing.cpu_count())
parser.add_argument('--danni_metadata_path', required=True, type=str, help='JSON file with prefetched danni paths.')
parser.add_argument('--wandb_api_key', type=str, help='wandb api key.')
parser.add_argument('--checkpoint_url', type=str, help='checkpoint url to start.')
parser.add_argument('--initialize_with_prod_weights', action='store_true',
help='Initialized model with pretrained weights from production')
parser.add_argument('--danni_max_pages', type=int, help='limit pages.')
parser.add_argument('--fresh', action='store_true', help='removes previous directory with results')
args = parser.parse_args()

if not args.wandb_api_key:
os.environ['WANDB_MODE'] = 'dryrun'
else:
os.environ['WANDB_API_KEY'] = args.wandb_api_key
assert "WANDB_API_KEY" in os.environ, "Could not find environment variable WANDB_API_KEY"
assert "GITHUB_AUTH_TOKEN" in os.environ, "Could not find environment variable GITHUB_AUTH_TOKEN"

# Download Danni dataset metadata
danni_metadata_local_path = os.path.basename(args.danni_metadata_path)
@@ -184,9 +183,15 @@ def main():
# Init model
trimap_model = PlTrimap(lr=args.lr)

if not os.path.exists(last_checkpoint_path) and args.checkpoint_url:
checkpoint_local_path = os.path.basename(args.checkpoint_url)
download_from_bucket(args.checkpoint_url, checkpoint_local_path)
if not os.path.exists(last_checkpoint_path) and args.initialize_with_prod_weights:

# Download latest checkpoint from github
checkpoint_name = "trimap513-deeplab-res2net.pth.tar"
checkpoint_local_dir = "."
run_bash(f'./scripts/fetch-models.sh "{checkpoint_name}" "{checkpoint_local_dir}"', realtime_output=False)

# Load checkpoint
checkpoint_local_path = os.path.join(checkpoint_local_dir, checkpoint_name)
checkpoint = torch.load(checkpoint_local_path)
trimap_model.model.load_state_dict(checkpoint['state_dict'])
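The body of `scripts/fetch-models.sh` is not included in this diff; a minimal sketch of what it might contain, assuming it wraps the gruntwork-io `fetch` binary installed in the Dockerfiles below (the repository URL, the `--tag="latest"` constraint, and the flag names are assumptions, not confirmed by the source):

```shell
#!/usr/bin/env bash
# Hypothetical sketch of scripts/fetch-models.sh: download one release
# asset from the kaleido-models repository via gruntwork-io/fetch.
fetch_model() {
  local asset_name="$1"    # e.g. trimap513-deeplab-res2net.pth.tar
  local dest_dir="$2"      # e.g. "."
  fetch --repo="https://github.com/remove-bg/kaleido-models" \
        --tag="latest" \
        --release-asset="${asset_name}" \
        --github-oauth-token="${GITHUB_AUTH_TOKEN}" \
        "${dest_dir}"
}

# Only run when called with both arguments, as in the training script above.
if [ "$#" -eq 2 ]; then
  fetch_model "$@"
fi
```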

7 changes: 7 additions & 0 deletions core/data/qi_auto/Dockerfile.postprocessing
@@ -7,6 +7,11 @@ ARG DEBCONF_NONINTERACTIVE_SEEN=true
RUN apt-get update
RUN apt-get install ffmpeg libsm6 libxext6 -y

# Install github fetch
ARG GITHUB_FETCH_VERSION="v0.4.2"
RUN curl -Lo /usr/bin/fetch https://github.com/gruntwork-io/fetch/releases/download/${GITHUB_FETCH_VERSION}/fetch_linux_$(dpkg --print-architecture) \
&& chmod +x /usr/bin/fetch

# Install libGL
RUN apt-get update \
&& apt-get install -y --no-install-recommends --fix-missing \
@@ -69,6 +74,8 @@ COPY ./eval/plot_results.py ./eval/plot_results.py
RUN mkdir -p qi_auto
COPY ./qi_auto/utilities.py ./qi_auto/utilities.py
COPY ./qi_auto/postprocessing.py ./qi_auto/postprocessing.py
RUN mkdir -p scripts
COPY ./scripts/fetch-models.sh ./scripts/fetch-models.sh

# Set PYTHONPATH to the root of the framework
ENV PYTHONPATH=/workspace/kaleido/
9 changes: 9 additions & 0 deletions core/data/qi_auto/Dockerfile.trainer
@@ -10,6 +10,11 @@ ARG DEBCONF_NONINTERACTIVE_SEEN=true
RUN apt-get update
RUN apt-get install ffmpeg libsm6 libxext6 -y

# Install github fetch
ARG GITHUB_FETCH_VERSION="v0.4.2"
RUN curl -Lo /usr/bin/fetch https://github.com/gruntwork-io/fetch/releases/download/${GITHUB_FETCH_VERSION}/fetch_linux_$(dpkg --print-architecture) \
&& chmod +x /usr/bin/fetch

RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && apt-get update -y && apt-get install google-cloud-sdk -y

WORKDIR /workspace
@@ -34,6 +39,10 @@ RUN mkdir -p removebg/data
COPY data/trimap.json removebg/data/trimap.json
COPY bin/removebg-train-trimap-cloud-qi.py removebg-train-trimap-cloud-qi.py
COPY liveness.training.sh liveness.training.sh
RUN mkdir -p qi_auto
COPY ./qi_auto/utilities.py ./qi_auto/utilities.py
RUN mkdir -p scripts
COPY ./scripts/fetch-models.sh ./scripts/fetch-models.sh

# Set PYTHONPATH to directory where the framework is
ENV PYTHONPATH=/workspace/kaleido/removebg
20 changes: 14 additions & 6 deletions core/data/qi_auto/job_initialization.yaml
@@ -3,17 +3,25 @@ kind: CronJob
metadata:
name: qi-auto-initialization
spec:
# Run once a month at 3am of the first day of the month
schedule: "0 3 1 * *"
# Run twice a month (1st and 15th) at 3am
schedule: "0 3 1,15 * *"
jobTemplate:
spec:
template:
spec:
containers:
- name: qi-auto-initialization
image: eu.gcr.io/kaleido-train/trimap-qi-auto-initialization:v3
image: eu.gcr.io/kaleido-train/trimap-qi-auto-initialization:v4
imagePullPolicy: Always
args:
- "--danni_user=jerome.nicolaou"
- "--danni_token=GK3A2RFyVCDnkpNXHHPfHpGI"
env:
- name: DANNI_USER
valueFrom:
secretKeyRef:
name: auth-secrets
key: danni-user
- name: DANNI_TOKEN
valueFrom:
secretKeyRef:
name: auth-secrets
key: danni-token
restartPolicy: OnFailure
18 changes: 11 additions & 7 deletions core/data/qi_auto/job_postprocessing.yaml
@@ -3,9 +3,9 @@ kind: CronJob
metadata:
name: qi-auto-postprocessing
spec:
# Run once a month at 3:30am of the 21st day of the month
schedule: "30 3 21 * *"
# schedule: "11 7 20 * *" # TEST
# Run twice a month (14th and 28th) at 3am
schedule: "0 3 14,28 * *"
# schedule: "56 17 30 * *" # TEST
jobTemplate:
spec:
template:
@@ -18,11 +18,15 @@ spec:
medium: Memory
containers:
- name: qi-auto-initialization
image: eu.gcr.io/kaleido-train/trimap-qi-auto-postprocessing:v3
image: eu.gcr.io/kaleido-train/trimap-qi-auto-postprocessing:v4
imagePullPolicy: Always
args:
- "--danni_user=[...]"
- "--danni_token=[...]"
command: ["python", "-m", "qi_auto.postprocessing"]
env:
- name: GITHUB_AUTH_TOKEN
valueFrom:
secretKeyRef:
name: auth-secrets
key: github-token
# Ensure that the node has a GPU
resources:
limits:
8 changes: 4 additions & 4 deletions core/data/qi_auto/job_termination.yaml
@@ -3,16 +3,16 @@ kind: CronJob
metadata:
name: qi-auto-termination
spec:
# Run once a month at 3am of the 21st day of the month
schedule: "0 3 21 * *"
# schedule: "59 10 18 * *" # TEST
# Run every week on Monday
schedule: "0 3 * * 1"
# schedule: "27 16 30 * *" # TEST
jobTemplate:
spec:
template:
spec:
containers:
- name: qi-auto-termination
image: eu.gcr.io/kaleido-train/trimap-qi-auto-initialization:v3
image: eu.gcr.io/kaleido-train/trimap-qi-auto-initialization:v4
imagePullPolicy: Always
command: ["python", "-m", "qi_auto.termination"]
restartPolicy: OnFailure
9 changes: 9 additions & 0 deletions core/data/qi_auto/secret_auth.yaml
@@ -0,0 +1,9 @@
apiVersion: v1
kind: Secret
metadata:
name: auth-secrets
data:
danni-user:
danni-token:
github-token:
wandb-token:
11 changes: 7 additions & 4 deletions core/data/qi_auto/startup_cluster.sh
@@ -5,15 +5,15 @@ set -e
CLUSTER_NAME=trimap-qi-auto

# Create images
python -m kaleido_utils.gce.create_and_upload_docker -i ${CLUSTER_NAME}-initialization -r /home/jerome/Workspace/removebg/kaleido-removebg/core/ -f ./data/qi_auto/Dockerfile.initialization -t v3
python -m kaleido_utils.gce.create_and_upload_docker -i ${CLUSTER_NAME}-trainer -r /home/jerome/Workspace/removebg/kaleido-removebg/core/ -f ./data/qi_auto/Dockerfile.trainer -t v3
python -m kaleido_utils.gce.create_and_upload_docker -i ${CLUSTER_NAME}-postprocessing -r /home/jerome/Workspace/removebg/kaleido-removebg/core/ -f ./data/qi_auto/Dockerfile.postprocessing -t v3
python -m kaleido_utils.gce.create_and_upload_docker -i ${CLUSTER_NAME}-initialization -r /home/jerome/Workspace/removebg/kaleido-removebg/core/ -f ./data/qi_auto/Dockerfile.initialization -t v4
python -m kaleido_utils.gce.create_and_upload_docker -i ${CLUSTER_NAME}-trainer -r /home/jerome/Workspace/removebg/kaleido-removebg/core/ -f ./data/qi_auto/Dockerfile.trainer -t v4
python -m kaleido_utils.gce.create_and_upload_docker -i ${CLUSTER_NAME}-postprocessing -r /home/jerome/Workspace/removebg/kaleido-removebg/core/ -f ./data/qi_auto/Dockerfile.postprocessing -t v4

# Create cluster
gcloud container clusters create ${CLUSTER_NAME} --machine-type=n1-standard-4 --zone=europe-west4-b --num-nodes=1 --scopes=storage-rw --release-channel=rapid --cluster-version=1.21

# Create gpu pool
gcloud container node-pools create gpu-pool --cluster ${CLUSTER_NAME} --accelerator type=nvidia-tesla-v100,count=1 --machine-type=custom-12-79872 --zone=europe-west4-b --num-nodes=0 --enable-autoscaling --min-nodes 0 --max-nodes 1 --scopes=storage-rw
gcloud container node-pools create gpu-pool --cluster ${CLUSTER_NAME} --accelerator type=nvidia-tesla-v100,count=1 --machine-type=custom-12-79872 --zone=europe-west4-b --num-nodes=0 --enable-autoscaling --min-nodes 0 --max-nodes 5 --scopes=storage-rw

# Create gpu pool for tests purpose
#gcloud container node-pools create gpu-pool-training --cluster ${CLUSTER_NAME} --preemptible --accelerator type=nvidia-tesla-t4,count=1 --machine-type=n1-standard-4 --zone=europe-west4-b --num-nodes=1 --scopes=storage-rw --node-labels=goal=training
@@ -28,6 +28,9 @@ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container
kubectl create role pod-manager --verb=get --verb=list --verb=watch --verb=update --verb=delete --verb=create --verb=patch --resource=pods,services,deployments,jobs
kubectl create rolebinding default-pod-manager --role=pod-manager --serviceaccount=default:default --namespace=default

# Apply secrets
kubectl apply -f secret_auth.yaml

# Launch jobs
#kubectl apply -f job_initialization.yaml
#kubectl apply -f job_termination.yaml
22 changes: 16 additions & 6 deletions core/data/qi_auto/template_job_trimap.yaml
@@ -1,16 +1,16 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: removebg-trimap513-qi-auto
name: removebg-trimap513-qi-auto-[{DATE}]
spec:
replicas: 1
selector:
matchLabels:
app: removebg-trimap513-qi-auto
app: removebg-trimap513-qi-auto-[{DATE}]
template:
metadata:
labels:
app: removebg-trimap513-qi-auto
app: removebg-trimap513-qi-auto-[{DATE}]
spec:
# Necessary to have enough shared memory
volumes:
@@ -19,17 +19,27 @@ spec:
medium: Memory
containers:
- name: removebg-trimap513
image: eu.gcr.io/kaleido-train/trimap-qi-auto-trainer:v3
image: eu.gcr.io/kaleido-train/trimap-qi-auto-trainer:v4
imagePullPolicy: Always
command: ["python", "/workspace/kaleido/removebg-train-trimap-cloud-qi.py"]
args:
- "--config=/workspace/kaleido/removebg/data/trimap.json"
- "--wandb_api_key=2b738a8a39d37ad301baa70560881931b3bee07e"
- "--danni_metadata_path=[{DANNI_METADATA_PATH}]"
- "--name=[{NAME}]"
- "--lr=0.00001"
- "--checkpoint_url=[{CHECKPOINT_URL}]"
- "--initialize_with_prod_weights"
- "--fresh"
env:
- name: WANDB_API_KEY
valueFrom:
secretKeyRef:
name: auth-secrets
key: wandb-token
- name: GITHUB_AUTH_TOKEN
valueFrom:
secretKeyRef:
name: auth-secrets
key: github-token
# Ensure that the node has a GPU
resources:
limits:
70 changes: 42 additions & 28 deletions core/qi_auto/README.md
@@ -1,37 +1,56 @@
# Automated QI training
These scripts aim at automating most of the QI training.

The only manual step left is to look at the result of the evaluation between the 22nd and the end of each month.
The metrics can be found each month in a bucket:
```bazaar
Every two weeks, a new training is started for approximately 8 weeks.

The only manual step left is to look at the result of the evaluation on the 14th and 28th of each month.
A summary of the metrics can be found in:
```
gs://kaleido-train-checkpoints/removebg/trimap-qi-auto/YYYY-MM-DD_eval_summary
# YYYY-MM-DD is the date when the evaluation was performed.
```
Detailed metrics for each test batch can be found in each QI's directory
```
gs://kaleido-train-checkpoints/removebg/trimap-qi-auto/YYYY-MM-DD_qi/YYYY_MM_DD_eval.zip
# YYYY-MM-DD_qi has the date when the training started. (beginning of the month)
# YYYY_MM_DD_eval.zip has the date when the evaluation was performed. (end of the month)
# YYYY-MM-DD_qi is the date when the training started.
# YYYY_MM_DD_eval.zip is the date when the evaluation was performed.
```

From the evaluation metrics, a human should decide whether the new checkpoint should be considered a new best.<br/>
If so, it has to be copied with the following command:
If so, the ready-to-be-shipped checkpoint can be copied from the best QI training with the following command:
```
gsutil cp gs://kaleido-train-checkpoints/removebg/trimap-qi-auto/candidate_best/trimap513-deeplab-res2net.pth.tar gs://kaleido-train-checkpoints/removebg/trimap-qi-auto/latest_best/trimap513-deeplab-res2net.pth.tar
gsutil cp gs://kaleido-train-checkpoints/removebg/trimap-qi-auto/YYYY-MM-DD_qi/trimap513-deeplab-res2net.pth.tar ./trimap513-deeplab-res2net.pth.tar
```

## Setup

Create kubernetes cluster with [startup_cluster.sh](./data/qi_auto/startup_cluster.sh).

This cluster is composed 2 nodes by:
- Controller: non-preemptible VM `n1-standard-4` without GPU
- Worker: non-preemptible VM `custom-12-79872` with 1x `nvidia-tesla-v100`
This cluster is composed of up to 6 nodes:
- Controller: 1 non-preemptible `n1-standard-4` VM without GPU
- Workers: up to 5 non-preemptible `custom-12-79872` VMs, each with 1x `nvidia-tesla-v100`
- Autoscaling is enabled

## Authentication
Authentication tokens are passed as secrets to the kubernetes cluster with [secret_auth.yaml](./data/qi_auto/secret_auth.yaml).

Each token must be encoded in base64 with the following command:
```
echo -n YOUR_TOKEN | base64
```
The secret config file must then be applied to the cluster:
```
kubectl apply -f secret_auth.yaml
```
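For example, encoding a made-up placeholder token (not a real credential) before pasting it into the `data:` section of `secret_auth.yaml`:

```shell
# Kubernetes Secret `data` values must be base64-encoded.
# "mytoken" is a made-up placeholder, not a real credential.
encoded=$(echo -n "mytoken" | base64)
echo "${encoded}"   # → bXl0b2tlbg==
```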

## Initialization
The initialization is performed by the script [initialization.py](./qi_auto/initialization.py).

This script is scheduled once a month at **3am of the first day of the month** by [job_initialization.yaml](./data/qi_auto/job_initialization.yaml). It will run on the controller node.
This script is scheduled twice a month at **3am on the 1st and 15th** by [job_initialization.yaml](./data/qi_auto/job_initialization.yaml). It will run on the controller node.

It does the following tasks:
- Grab latest best checkpoint
- from `gs://kaleido-train-checkpoints/removebg/trimap-qi-auto/latest_best/trimap513-deeplab-res2net.pth.tar`
- Grab latest checkpoint in production from github
- from `https://github.com/remove-bg/kaleido-models/releases/latest/download/trimap513-deeplab-res2net.pth.tar`
- Fetch metadata for train+valid dataset
- Uploaded to `gs://kaleido-train-checkpoints/removebg/trimap-qi-auto/YYYY-MM-DD_qi/trimap_dataset_train.json`
- Fetch metadata for test dataset
@@ -41,10 +60,6 @@ It does the following tasks:
- Uploaded to `gs://kaleido-train-checkpoints/removebg/trimap-qi-auto/YYYY-MM-DD_qi/job.yaml`
- Start kubernetes job with `job_trimap.yaml`

The script has to be configured with valid authentication credentials:
- "--danni_user=[...]"
- "--danni_token=[...]"

## Training
The training is performed by the script [removebg-train-trimap-cloud-qi.py](./bin/removebg-train-trimap-cloud-qi.py).

@@ -55,35 +70,34 @@ Its result will be uploaded to a bucket `gs://kaleido-train-checkpoints/removebg
## Termination
The termination is performed by the script [termination.py](./qi_auto/termination.py).

This script is scheduled once a month at **3am of the 21st day of the month** by [job_termination.yaml](./data/qi_auto/job_termination.yaml). It will run on the controller node.
This script is scheduled every Monday at **3am** by [job_termination.yaml](./data/qi_auto/job_termination.yaml). It will run on the controller node.

It does the following tasks:
- Get the name of the last QI training
- Get the names of the latest QI trainings
- Fetch the job configuration `job.yaml` on the bucket
- Stop the job
- Stop the job if it is older than 8 weeks
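The 8-week cutoff can be sketched with GNU `date` arithmetic, assuming the start date is recovered from the `YYYY-MM-DD_qi` name (the parsing itself is omitted, and the date below is an example value):

```shell
# Hypothetical sketch: decide whether a QI training is older than 8 weeks.
start_date="2021-08-01"                   # example value parsed from YYYY-MM-DD_qi
start_epoch=$(date -d "${start_date}" +%s)
cutoff_epoch=$(date -d "8 weeks ago" +%s)
if [ "${start_epoch}" -lt "${cutoff_epoch}" ]; then
  echo "stop job"                         # started more than 8 weeks ago
else
  echo "keep running"
fi
```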


## Post Processing
The post-processing is performed by the script [postprocessing.py](./qi_auto/postprocessing.py).

This script is scheduled once a month at **3:30am of the 21st day of the month** by [job_postprocessing.yaml](./data/qi_auto/job_postprocessing.yaml). It will run on the worker node.
This script is scheduled twice a month at **3:30am on the 14th and 28th** by [job_postprocessing.yaml](./data/qi_auto/job_postprocessing.yaml). It will run on the worker node.

It does the following tasks:
- Get the name of the last QI training
- Loop through the latest QI trainings (less than 8 weeks old)
- Grab the best checkpoint from this training
- From `gs://kaleido-train-checkpoints/removebg/trimap-qi-auto/YYYY_MM_DD_qi/best.ckpt`
- Strip the training artifacts from the checkpoint and rename it as `trimap513-deeplab-res2net.pth.tar`
- Uploaded to `gs://kaleido-train-checkpoints/removebg/trimap-qi-auto/candidate_best/trimap513-deeplab-res2net.pth.tar`
- Grab the test dataset metadata and download the dataset
- from `gs://kaleido-train-checkpoints/removebg/trimap-qi-auto/YYYY-MM-DD_qi/trimap_dataset_test.json`
- Download `kaleido-models`
- from `gs://kaleido-train-checkpoints/removebg/trimap-qi-auto/latest_best/`
- from `https://github.com/remove-bg/kaleido-models/releases/latest/download/trimap513-deeplab-res2net.pth.tar`
- Run evaluation on test dataset with [removebg-demo](./bin/removebg-demo.py)
- With both old and new checkpoint
- With the checkpoint from production
- With all the checkpoints from all the current QI trainings
- Compute evaluation metrics
- Upload metrics on bucket
- Between the production checkpoint and all the QI trainings
- Upload metrics to the bucket of each QI
- To `gs://kaleido-train-checkpoints/removebg/trimap-qi-auto/YYYY-MM-DD_qi/YYYY_MM_DD_eval.zip`

The script has to be configured with valid authentication credentials:
- "--danni_user=[...]"
- "--danni_token=[...]"
