
Implement optuna hyperparameter optimization #278

Closed
wants to merge 11 commits
4 changes: 3 additions & 1 deletion .github/workflows/github-actions-ci.yml
@@ -26,4 +26,6 @@ jobs:
python GDL.py mode=sampling
python GDL.py mode=train
python GDL.py mode=inference
python GDL.py mode=evaluate
python GDL.py mode=evaluate
# hyperparameter optimization with optuna
python GDL.py mode=train general.max_epochs=1 --multirun
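
With the sweeper overrides in place (see the config changes below), --multirun launches an Optuna study instead of a single training job. Sweeper settings can also be overridden from the command line; a hypothetical example, assuming the hydra-optuna-sweeper plugin is installed:

    # run a 10-trial study instead of the configured 3 (illustrative override)
    python GDL.py mode=train general.max_epochs=1 hydra.sweeper.n_trials=10 --multirun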
3 changes: 2 additions & 1 deletion GDL.py
@@ -58,7 +58,7 @@ def run_gdl(cfg: DictConfig) -> None:
start_time = time.time()
# Read the task and execute it
task = get_method(f"{cfg.mode}_{cfg.general.task}.main")
task(cfg)
metric = task(cfg)

# Add git hash from current commit to parameters.
with open_dict(cfg):
@@ -75,6 +75,7 @@ def run_gdl(cfg: DictConfig) -> None:
"Elapsed time: {:.2f}s".format(time.time() - start_time) +
"\n" + "-" * len(msg) + "\n"
)
return metric
# ------------------------------------


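The returned metric is what makes the sweep work: Hydra's Optuna sweeper uses the value returned by the @hydra.main-decorated function as the objective to optimize. A minimal sketch of the pattern, with the training task stubbed out:

    import hydra
    from omegaconf import DictConfig

    @hydra.main(config_path="config", config_name="gdl_config_template")
    def run_gdl(cfg: DictConfig) -> float:
        # stand-in for get_method(f"{cfg.mode}_{cfg.general.task}.main")(cfg)
        metric = 0.5  # hypothetical objective value, e.g. the test-set IoU
        return metric  # the Optuna sweeper maximizes this return value

    if __name__ == "__main__":
        run_gdl()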
2 changes: 2 additions & 0 deletions config/gdl_config_template.yaml
@@ -13,6 +13,8 @@ defaults:
- hydra: default
- override hydra/hydra_logging: colorlog # enable color logging to make it pretty
- override hydra/job_logging: colorlog # enable color logging to make it pretty
- override hydra/sweeper: optuna
- override hydra/sweeper/sampler: tpe
- _self_

general:
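Note that these two overrides assume the hydra-optuna-sweeper plugin is installed alongside Hydra; without it, Hydra cannot resolve the optuna sweeper group:

    pip install hydra-optuna-sweeper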
26 changes: 26 additions & 0 deletions config/hydra/default.yaml
@@ -5,6 +5,32 @@ sweep:
dir: logs/multiruns/${now:%Y-%m-%d_%H-%M-%S}
subdir: ${hydra.job.num}

# for hyperparameter optimization with Optuna
sweeper:
sampler:
seed: 123
direction: maximize
study_name: run_gdl
storage: null
n_trials: 3
n_jobs: 1

search_space:
training.lr:
type: float
log: True
low: 1e-7
high: 0.01
loss:
type: categorical
choices: ["binary/softbce", "binary/lovasz", "binary/dice"]
model:
type: categorical
choices: ['deeplabv3_resnet101']
optimizer:
type: categorical
choices: ['adam', 'adabound']

# you can set here environment variables that are universal for all users
# for system specific variables (like data paths) it's better to use .env file!
job:
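For readers less familiar with the sweeper plugin, the search space above corresponds roughly to the following raw Optuna code (an illustrative sketch; train_and_eval is a hypothetical stand-in for a full training run):

    import optuna

    def train_and_eval(lr, loss, model, optimizer) -> float:
        return 0.5  # hypothetical stand-in for a full training run

    def objective(trial: optuna.Trial) -> float:
        lr = trial.suggest_float("training.lr", 1e-7, 0.01, log=True)
        loss = trial.suggest_categorical("loss", ["binary/softbce", "binary/lovasz", "binary/dice"])
        model = trial.suggest_categorical("model", ["deeplabv3_resnet101"])
        optimizer = trial.suggest_categorical("optimizer", ["adam", "adabound"])
        return train_and_eval(lr, loss, model, optimizer)

    study = optuna.create_study(direction="maximize",
                                sampler=optuna.samplers.TPESampler(seed=123))
    study.optimize(objective, n_trials=3, n_jobs=1)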
6 changes: 6 additions & 0 deletions config/optimizer/adabound.yaml
@@ -0,0 +1,6 @@
# @package _global_
optimizer:
optimizer_name: 'adabound'
class_name: utils.adabound.AdaBound
params:
lr: ${training.lr}
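
The class_name field suggests the optimizer is resolved dynamically from its dotted path. A hedged sketch of how such an entry is typically instantiated (the helper below is illustrative, not the repo's actual code):

    import importlib
    import torch

    def build_optimizer(class_name: str, params: dict, model: torch.nn.Module):
        # resolve e.g. "utils.adabound.AdaBound" to a class, then instantiate
        module_path, cls_name = class_name.rsplit(".", 1)
        cls = getattr(importlib.import_module(module_path), cls_name)
        return cls(model.parameters(), **params)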
4 changes: 2 additions & 2 deletions config/training/default_training.yaml
@@ -1,9 +1,9 @@
# @package _global_
training:
num_gpus: 1
batch_size: 1
batch_size: 2 # models with batchnorm require batch size > 1
eval_batch_size:
batch_metrics:
batch_metrics: 1
lr: 0.0001
max_epochs: ${general.max_epochs}
min_epochs: ${general.min_epochs}
153 changes: 0 additions & 153 deletions gdl_hyperopt_template.py

This file was deleted.

5 changes: 5 additions & 0 deletions inference_segmentation.py
@@ -512,6 +512,10 @@ def main(params: dict) -> None:
debug=debug)

pred = pred[np.newaxis, :, :].astype(np.uint8)

if debug and not np.any(pred):
logging.error(f"Only background values were predicted. There may be a problem with the model")

inf_meta.update({"driver": "GTiff",
"height": pred.shape[1],
"width": pred.shape[2],
@@ -526,6 +530,7 @@
temp_file.unlink()
except OSError as e:
logging.warning(f'File Error: {temp_file, e.strerror}')

if raster_to_vec:
start_vec = time.time()
inference_vec = working_folder.joinpath(local_img.parent.name,
2 changes: 1 addition & 1 deletion models/model_choice.py
@@ -244,7 +244,7 @@ def net(model_name: str,
else:
checkpoint = None
# list of GPU devices that are available and unused. If no GPUs, returns empty list
gpu_devices_dict = get_device_ids(num_devices)
gpu_devices_dict = get_device_ids(num_devices, max_used_perc=100, max_used_ram_perc=100) # FIXME: set back to default after issue #246
Collaborator (author) comment: The GPUs don't empty between runs, so the GPU-usage threshold must be 100%; otherwise every training run after the first one falls back to CPU (i.e., the GPUs get excluded).

Collaborator comment: Or get_device_ids should be called once, before all runs.
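
A sketch of that suggestion (assumed, not part of this PR): resolve the devices once at module level and reuse the result across trials, so later runs see the same devices as the first:

    _GPU_DEVICES_DICT = None

    def get_device_ids_once(num_devices):
        global _GPU_DEVICES_DICT
        if _GPU_DEVICES_DICT is None:
            _GPU_DEVICES_DICT = get_device_ids(num_devices)  # existing repo helper
        return _GPU_DEVICES_DICT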

num_devices = len(gpu_devices_dict.keys())
logging.info(f"Number of cuda devices requested: {num_devices}. "
f"Cuda devices available: {list(gpu_devices_dict.keys())}\n")
33 changes: 22 additions & 11 deletions train_segmentation.py
@@ -6,6 +6,7 @@
import functools
import numpy as np
from PIL import Image
from hydra.utils import to_absolute_path
from tqdm import tqdm
from pathlib import Path
from shutil import copy
@@ -377,6 +378,8 @@ def evaluation(eval_loader,
:param debug: if True, debug functions will be performed
:return: (dict) eval_metrics
"""
dontcare = criterion.ignore_index if hasattr(criterion, 'ignore_index') else -1

eval_metrics = create_metrics_dict(num_classes)
model.eval()

@@ -432,12 +435,13 @@
a, segmentation = torch.max(outputs_flatten, dim=1)
eval_metrics = iou(segmentation, labels_flatten, batch_size, num_classes, eval_metrics)
eval_metrics = report_classification(segmentation, labels_flatten, batch_size, eval_metrics,
ignore_index=eval_loader.dataset.dontcare)
elif (dataset == 'tst') and (batch_metrics is not None):
ignore_index=dontcare)
Collaborator (author) comment: Fixes a missing-attribute bug: the dontcare value used to be stored on the dataloader object, but isn't anymore (since when?).

elif dataset == 'tst':
batch_metrics = True
Collaborator (author) comment: Forces metrics at test time. I don't see any reason why we shouldn't systematically output metrics at test time.

a, segmentation = torch.max(outputs_flatten, dim=1)
eval_metrics = iou(segmentation, labels_flatten, batch_size, num_classes, eval_metrics)
eval_metrics = report_classification(segmentation, labels_flatten, batch_size, eval_metrics,
ignore_index=eval_loader.dataset.dontcare)
ignore_index=dontcare)

logging.debug(OrderedDict(dataset=dataset, loss=f'{eval_metrics["loss"].avg:.4f}'))

@@ -450,10 +454,11 @@

logging.info(f"\n{dataset} Loss: {eval_metrics['loss'].avg:.4f}")
if batch_metrics is not None:
logging.info(f"\n{dataset} precision: {eval_metrics['precision'].avg}")
logging.info(f"\n{dataset} recall: {eval_metrics['recall'].avg}")
logging.info(f"\n{dataset} fscore: {eval_metrics['fscore'].avg}")
logging.info(f"\n{dataset} iou: {eval_metrics['iou'].avg}")
logging.info(f"\n{dataset} precision: {eval_metrics['precision'].avg}"
f"\n{dataset} recall: {eval_metrics['recall'].avg}"
f"\n{dataset} fscore: {eval_metrics['fscore'].avg}"
f"\n{dataset} iou: {eval_metrics['iou'].avg}"
f"\n{dataset} iou (non background): {eval_metrics['iou_nonbg'].avg}")

return eval_metrics

@@ -608,9 +613,9 @@ def train(cfg: DictConfig) -> None:
config_path = list_path['path']
config_name = str(cfg.general.config_name)
model_id = config_name
output_path = Path(f'model/{model_id}')
output_path.mkdir(parents=True, exist_ok=False)
logging.info(f'\nModel and log files will be saved to: {os.getcwd()}/{output_path}')
output_path = Path(to_absolute_path(f'model/{model_id}'))
output_path.mkdir(parents=True, exist_ok=True) # FIXME: restore exist_ok=False when PR#274 is merged
logging.info(f'\nModel and log files will be saved to: {output_path}')
if debug:
logging.warning(f'\nDebug mode activated. Some debug features may mobilize extra disk space and '
f'cause delays in execution.')
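
to_absolute_path matters here because Hydra changes the working directory for every (multi)run job, so a relative model/ path would otherwise land inside logs/multiruns/<timestamp>/<job_num>. A minimal illustration (my_model is a placeholder name):

    from pathlib import Path
    from hydra.utils import to_absolute_path

    # inside a Hydra job, os.getcwd() is the per-run output dir, not the
    # directory the app was launched from; to_absolute_path() re-anchors it
    output_path = Path(to_absolute_path("model/my_model"))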
@@ -808,6 +813,7 @@ def train(cfg: DictConfig) -> None:
checkpoint = load_checkpoint(filename)
model, _ = load_from_checkpoint(checkpoint, model)

return_metric = None
if tst_dataloader:
tst_report = evaluation(eval_loader=tst_dataloader,
model=model,
@@ -829,9 +835,13 @@ def train(cfg: DictConfig) -> None:
bucket.upload_file("output.txt", bucket_output_path.joinpath(f"Logs/{now}_output.txt"))
bucket.upload_file(filename, bucket_filename)

return_metric = tst_report['iou'].avg

# log_artifact(logfile)
# log_artifact(logfile_debug)

return return_metric


def main(cfg: DictConfig) -> None:
"""
@@ -856,4 +866,5 @@ def main(cfg: DictConfig) -> None:
# HERE the code to do for the preprocessing for the segmentation

# execute the name mode (need to be in this file for now)
train(cfg)
tst_iou = train(cfg)
return tst_iou
13 changes: 10 additions & 3 deletions utils/metrics.py
@@ -1,6 +1,10 @@
import numpy as np
from sklearn.metrics import classification_report

from utils.utils import get_logger

logging = get_logger(__name__) # import logging

min_val = 1e-6
def create_metrics_dict(num_classes):
num_classes = num_classes if num_classes == 1 else num_classes + 1
@@ -66,9 +70,12 @@ def report_classification(pred, label, batch_size, metrics_dict, ignore_index=-1
if key not in ['micro avg', 'macro avg', 'weighted avg', 'accuracy'] and key != str(ignore_index):
class_score[key] = value

metrics_dict['precision_' + key].update(class_score[key]['precision'], batch_size)
metrics_dict['recall_' + key].update(class_score[key]['recall'], batch_size)
metrics_dict['fscore_' + key].update(class_score[key]['f1-score'], batch_size)
try:
metrics_dict['precision_' + key].update(class_score[key]['precision'], batch_size)
metrics_dict['recall_' + key].update(class_score[key]['recall'], batch_size)
metrics_dict['fscore_' + key].update(class_score[key]['f1-score'], batch_size)
except KeyError as e:
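# classes excluded from class_score (e.g. the ignore_index class) have no entry here, so their per-class update is skipped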
logging.error(e)

metrics_dict['precision'].update(class_report['weighted avg']['precision'], batch_size)
metrics_dict['recall'].update(class_report['weighted avg']['recall'], batch_size)