Implement optuna hyperparameter optimization #278

Closed
wants to merge 11 commits
3 changes: 3 additions & 0 deletions .github/workflows/github-actions-ci.yml
@@ -23,5 +23,8 @@ jobs:
run: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate geo_deep_env
# hyperparameter optimization with optuna
python GDL.py mode=train general.max_epochs=1 --multirun
coverage run -m pytest --log-cli-level=INFO --capture=tee-sys
coverage report -m --sort=Cover

3 changes: 2 additions & 1 deletion GDL.py
@@ -58,7 +58,7 @@ def run_gdl(cfg: DictConfig) -> None:
start_time = time.time()
# Read the task and execute it
task = get_method(f"{cfg.mode}_{cfg.general.task}.main")
task(cfg)
metric = task(cfg)

# Add git hash from current commit to parameters.
with open_dict(cfg):
@@ -75,6 +75,7 @@ def run_gdl(cfg: DictConfig) -> None:
"Elapsed time: {:.2f}s".format(time.time() - start_time) +
"\n" + "-" * len(msg) + "\n"
)
return metric
# ------------------------------------


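With this change, run_gdl() forwards whatever the task returns (the test IoU, see train_segmentation.py below) to Hydra, which is what lets the Optuna sweeper optimize it. A minimal, self-contained sketch of the pattern, using illustrative names rather than GDL's actual config:

# Sketch only: a @hydra.main entrypoint that returns a float. When launched with
# --multirun and the Optuna sweeper enabled, that return value is the objective
# the sweeper optimizes (per the 'direction' setting in the sweeper config).
import hydra
from omegaconf import DictConfig

@hydra.main(config_path=None, config_name=None)
def objective(cfg: DictConfig) -> float:
    # In GDL this is where the train task runs and returns its test IoU.
    lr = cfg.get("lr", 1e-4)        # dummy hyperparameter for illustration
    return 1.0 - abs(lr - 1e-4)     # dummy metric standing in for test IoU

if __name__ == "__main__":
    objective()
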
2 changes: 2 additions & 0 deletions config/gdl_config_template.yaml
@@ -13,6 +13,8 @@ defaults:
- hydra: default
- override hydra/hydra_logging: colorlog # enable color logging to make it pretty
- override hydra/job_logging: colorlog # enable color logging to make it pretty
- override hydra/sweeper: optuna
- override hydra/sweeper/sampler: tpe
- _self_

general:
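Note: these two overrides assume Hydra's Optuna sweeper plugin (the hydra-optuna-sweeper package) and optuna itself are installed in the environment; the sweeper settings themselves live in config/hydra/default.yaml below.
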
30 changes: 30 additions & 0 deletions config/hydra/default.yaml
@@ -5,6 +5,36 @@ sweep:
dir: logs/multiruns/${now:%Y-%m-%d_%H-%M-%S}
subdir: ${hydra.job.num}

# for hyperparameter optimization with Optuna: https://hydra.cc/docs/next/plugins/optuna_sweeper/
sweeper:
sampler:
seed: 123
direction: maximize
study_name: run_gdl
storage: null
n_trials: 3
n_jobs: 1

search_space:
training.lr:
type: float
log: True
low: 1e-7
high: 0.01
loss:
type: categorical
choices: ["binary/softbce", "binary/lovasz", "binary/dice"]
model:
type: categorical
choices: ['deeplabv3_pretrained']
optimizer:
type: categorical
choices: ['adam', 'adabound']
# GPU memory is not freed between runs, so the max used RAM threshold must be 100% or the GPU will be excluded
training.max_used_ram:
type: categorical
choices: [100]

# you can set here environment variables that are universal for all users
# for system specific variables (like data paths) it's better to use .env file!
job:
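For reference, a rough plain-optuna equivalent of the study this config describes (a sketch for illustration only; GDL goes through Hydra's multirun rather than calling optuna directly):

# TPE sampler with seed 123, 3 trials, maximizing the returned metric over the
# same search space as above.
import optuna

def objective(trial: optuna.Trial) -> float:
    lr = trial.suggest_float("training.lr", 1e-7, 0.01, log=True)
    loss = trial.suggest_categorical("loss", ["binary/softbce", "binary/lovasz", "binary/dice"])
    optimizer = trial.suggest_categorical("optimizer", ["adam", "adabound"])
    # ...run a training with these overrides and return its test IoU...
    return 0.0  # placeholder objective value

study = optuna.create_study(direction="maximize",
                            study_name="run_gdl",
                            sampler=optuna.samplers.TPESampler(seed=123))
study.optimize(objective, n_trials=3)
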
6 changes: 6 additions & 0 deletions config/optimizer/adabound.yaml
@@ -0,0 +1,6 @@
# @package _global_
optimizer:
optimizer_name: 'adabound'
class_name: utils.adabound.AdaBound
params:
lr: ${training.lr}
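class_name here is a dotted import path; a minimal sketch of how such an entry could be resolved into an optimizer instance (a generic dynamic import shown as an assumption, not necessarily GDL's actual loader):

# Resolve a dotted class path such as 'utils.adabound.AdaBound' and bind it to
# a model's parameters with the params block (here just lr).
import importlib

def build_optimizer(class_name: str, model_parameters, **params):
    module_path, cls_name = class_name.rsplit(".", 1)
    cls = getattr(importlib.import_module(module_path), cls_name)
    return cls(model_parameters, **params)

# e.g. build_optimizer('utils.adabound.AdaBound', model.parameters(), lr=cfg.training.lr)
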
4 changes: 2 additions & 2 deletions config/training/default_training.yaml
@@ -1,9 +1,9 @@
# @package _global_
training:
num_gpus: 1
batch_size: 1
batch_size: 2 # models with batchnorm require batch size > 1
eval_batch_size:
batch_metrics:
batch_metrics: 1
lr: 0.0001
max_epochs: ${general.max_epochs}
min_epochs: ${general.min_epochs}
166 changes: 0 additions & 166 deletions gdl_hyperopt_template.py
@@ -1,166 +0,0 @@
"""Hyperparamater optimization for GDL using hyperopt

This is a template for using hyperopt with GDL. The my_space variable currently needs to
be modified here, as well as GDL config modification logic within the objective_with_args
function.

"""

import argparse
from pathlib import Path
import pickle
from functools import partial
import pprint
import numpy as np

from ruamel_yaml import YAML
import mlflow
import torch
# ToDo: Add hyperopt to GDL requirements
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

from train_segmentation import main as train_main

# This is the hyperparameter space to explore
my_space = {'model_name': hp.choice('model_name', ['unet_pretrained', 'deeplabv3_resnet101']),
'loss_fn': hp.choice('loss_fn', ['CrossEntropy', 'Lovasz', 'Duo']),
'optimizer': hp.choice('optimizer', ['adam', 'adabound']),
'learning_rate': hp.loguniform('learning_rate', np.log(1e-7), np.log(0.1))}


def read_parameters(param_file):
"""Read and return parameters in .yaml file
Args:
param_file: Full file path of the parameters file
Returns:
YAML (Ruamel) CommentedMap dict-like object
"""
yaml = YAML()
with open(param_file) as yamlfile:
params = yaml.load(yamlfile)
return params


def get_latest_mlrun(params):
"""Get latest mlflow run

:param params: gdl parameters dictionary
:return: mlflow run object
"""

tracking_uri = params['global']['mlflow_uri']
mlflow.set_tracking_uri(tracking_uri)
mlexp = mlflow.get_experiment_by_name(params['global']['mlflow_experiment_name'])
exp_id = mlexp.experiment_id
try:
run_ids = ([x.run_id for x in mlflow.list_run_infos(
exp_id, max_results=1, order_by=["tag.release DESC"])])
except AttributeError:
mlflow_client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri)
run_ids = [x.run_id for x in mlflow_client.list_run_infos(exp_id, run_view_type=3)[0:1]]
mlrun = mlflow.get_run(run_ids[0])
return mlrun


def objective_with_args(hparams, params, config_path):
"""Objective function for hyperopt

This function edits the GDL parameters and runs a training.

:param hparams: arguments provided by hyperopt selection from hyperparameter space
:param params: gdl parameters dictionary
:param config_path: path to gdl configuration file
:return: loss dictionary for hyperopt
"""

# ToDo: This is dependent on the specific structure of the GDL config file
params['global']['model_name'] = hparams['model_name']
# params['training']['target_size'] = hparams['target_size']
params['training']['loss_fn '] = hparams['loss_fn']
params['training']['optimizer'] = hparams['optimizer']
params['training']['learning_rate'] = hparams['learning_rate']

try:
mlrun = get_latest_mlrun(params)
run_name_split = mlrun.data.tags['mlflow.runName'].split('_')
params['global']['mlflow_run_name'] = run_name_split[0] + f'_{int(run_name_split[1]) + 1}'
except:
pass
train_main(params, config_path)
torch.cuda.empty_cache()

mlflow.end_run()
mlrun = get_latest_mlrun(params)

# ToDo: Probably need some cleanup to avoid accumulating results on disk

# ToDo: This loss should be configurable
return {'loss': -mlrun.data.metrics['tst_iou'], 'status': STATUS_OK}


def trials_to_csv(trials, csv_pth):
"""hyperopt trials to CSV

:param trials: hyperopt trials object
"""

params = sorted(list(trials.vals.keys()))
csv_str = ''
for param in params:
csv_str += f'{param}, '
csv_str = csv_str + 'loss' + '\n'

for i in range(len(trials.trials)):
for param in params:
if my_space[param].name == 'switch':
csv_str += f'{my_space[param].pos_args[trials.vals[param][i] + 1].obj}, '
else:
csv_str += f'{trials.vals[param][i]}, '
csv_str = csv_str + f'{trials.results[i]["loss"]}' + '\n'

# ToDo: Customize where the csv output is
with open(csv_pth, 'w') as csv_obj:
csv_obj.write(csv_str)


def main(params, config_path):
# ToDo: Customize where the trials file is
# ToDo: Customize where the trials file is
root_path = Path(params['global']['assets_path'])
pkl_file = root_path.joinpath('hyperopt_trials.pkl')
csv_file = root_path.joinpath('hyperopt_results.csv')
if pkl_file.is_file():
trials = pickle.load(open(pkl_file, "rb"))
else:
trials = Trials()

objective = partial(objective_with_args, params=params, config_path=config_path)

n = 0
while n < params['global']['hyperopt_runs']:
best = fmin(objective,
space=my_space,
algo=tpe.suggest,
trials=trials,
max_evals=n + params['global']['hyperopt_delta'])
n += params['global']['hyperopt_delta']
pickle.dump(trials, open(pkl_file, "wb"))

# ToDo: Cleanup the output
pprint.pprint(trials.vals)
pprint.pprint(trials.results)
for key, val in best.items():
if my_space[key].name == 'switch':
best[key] = my_space[key].pos_args[val + 1].obj
pprint.pprint(best)
print(trials.best_trial['result'])
trials_to_csv(trials, csv_file)


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Geo Deep Learning hyperopt')
parser.add_argument('param_file', type=str, help='Path of gdl config file')
args = parser.parse_args()
gdl_params = read_parameters(args.param_file)
gdl_params['self'] = {'config_file': args.param_file}
main(gdl_params, Path(args.param_file))
5 changes: 5 additions & 0 deletions inference_segmentation.py
@@ -389,6 +389,10 @@ def main(params: dict) -> None:
debug=debug)

pred = pred[np.newaxis, :, :].astype(np.uint8)

if debug and not np.any(pred):
logging.error(f"Only background values were predicted. There may be a problem with the model")

inf_meta.update({"driver": "GTiff",
"height": pred.shape[1],
"width": pred.shape[2],
@@ -403,6 +407,7 @@ def main(params: dict) -> None:
temp_file.unlink()
except OSError as e:
logging.warning(f'File Error: {temp_file, e.strerror}')

if raster_to_vec:
start_vec = time.time()
inference_vec = working_folder.joinpath(local_img.parent.name,
21 changes: 21 additions & 0 deletions models/model_choice.py
@@ -234,3 +234,24 @@ def net(model_name: str,
criterion = criterion.to(device)

return model, model_name, criterion, optimizer, lr_scheduler, device, gpu_devices_dict


if __name__ == '__main__':
# TODO convert to unit test
rand_img = torch.rand((2, 4, 64, 64))
for layer in ['conv1', 'maxpool', 'layer2', 'layer3', 'layer4']:
logging.info(layer)
model, model_name, criterion, optimizer, lr_scheduler, device, gpu_devices_dict = net(
model_name='deeplabv3_resnet101_dualhead',
num_bands=4,
num_channels=4,
num_devices=0,
net_params={'training': None, 'optimizer': {'params': None},
'scheduler': {'params': None}},
inference_state_dict=None,
conc_point=layer,
loss_fn={'_target_': 'torch.nn.CrossEntropyLoss'},
optimizer='sgd',
)
output = model(rand_img)
logging.info(output.shape)
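A hedged sketch of how the TODO above could become a pytest case (the call mirrors the __main__ block; the shape assertion is an assumption about the model output):

# Same smoke test as the __main__ block, expressed as a parametrized pytest case.
import pytest
import torch
from models.model_choice import net

@pytest.mark.parametrize("layer", ['conv1', 'maxpool', 'layer2', 'layer3', 'layer4'])
def test_dualhead_concatenation_points(layer):
    rand_img = torch.rand((2, 4, 64, 64))
    model, *_ = net(
        model_name='deeplabv3_resnet101_dualhead',
        num_bands=4,
        num_channels=4,
        num_devices=0,
        net_params={'training': None, 'optimizer': {'params': None},
                    'scheduler': {'params': None}},
        inference_state_dict=None,
        conc_point=layer,
        loss_fn={'_target_': 'torch.nn.CrossEntropyLoss'},
        optimizer='sgd',
    )
    output = model(rand_img)
    assert output.shape[0] == rand_img.shape[0]  # batch dimension preserved
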
26 changes: 18 additions & 8 deletions train_segmentation.py
@@ -360,6 +360,8 @@ def evaluation(eval_loader,
:param debug: if True, debug functions will be performed
:return: (dict) eval_metrics
"""
dontcare = criterion.ignore_index if hasattr(criterion, 'ignore_index') else -1

eval_metrics = create_metrics_dict(num_classes)
model.eval()

@@ -409,12 +411,13 @@
a, segmentation = torch.max(outputs_flatten, dim=1)
eval_metrics = iou(segmentation, labels_flatten, batch_size, num_classes, eval_metrics)
eval_metrics = report_classification(segmentation, labels_flatten, batch_size, eval_metrics,
ignore_index=eval_loader.dataset.dontcare)
elif (dataset == 'tst') and (batch_metrics is not None):
ignore_index=dontcare)
Review comment (Collaborator Author): Fixes a missing-attribute bug. The dontcare value was previously stored in the dataloader object, but isn't anymore (since when?).

elif dataset == 'tst':
batch_metrics = True
Review comment (Collaborator Author): Force metrics at test time. I don't see any reason why we shouldn't systematically output metrics at test time.

a, segmentation = torch.max(outputs_flatten, dim=1)
eval_metrics = iou(segmentation, labels_flatten, batch_size, num_classes, eval_metrics)
eval_metrics = report_classification(segmentation, labels_flatten, batch_size, eval_metrics,
ignore_index=eval_loader.dataset.dontcare)
ignore_index=dontcare)

logging.debug(OrderedDict(dataset=dataset, loss=f'{eval_metrics["loss"].avg:.4f}'))

@@ -428,10 +431,11 @@
if eval_metrics['loss'].avg:
logging.info(f"\n{dataset} Loss: {eval_metrics['loss'].avg:.4f}")
if batch_metrics is not None:
logging.info(f"\n{dataset} precision: {eval_metrics['precision'].avg}")
logging.info(f"\n{dataset} recall: {eval_metrics['recall'].avg}")
logging.info(f"\n{dataset} fscore: {eval_metrics['fscore'].avg}")
logging.info(f"\n{dataset} iou: {eval_metrics['iou'].avg}")
logging.info(f"\n{dataset} precision: {eval_metrics['precision'].avg}"
f"\n{dataset} recall: {eval_metrics['recall'].avg}"
f"\n{dataset} fscore: {eval_metrics['fscore'].avg}"
f"\n{dataset} iou: {eval_metrics['iou'].avg}"
f"\n{dataset} iou (non background): {eval_metrics['iou_nonbg'].avg}")

return eval_metrics

@@ -742,6 +746,7 @@ def train(cfg: DictConfig) -> None:
checkpoint = load_checkpoint(filename)
model, _ = load_from_checkpoint(checkpoint, model)

return_metric = None
if tst_dataloader:
tst_report = evaluation(eval_loader=tst_dataloader,
model=model,
@@ -763,9 +768,13 @@
bucket.upload_file("output.txt", bucket_output_path.joinpath(f"Logs/{now}_output.txt"))
bucket.upload_file(filename, bucket_filename)

return_metric = tst_report['iou'].avg

# log_artifact(logfile)
# log_artifact(logfile_debug)

return return_metric


def main(cfg: DictConfig) -> None:
"""
@@ -790,4 +799,5 @@ def main(cfg: DictConfig) -> None:
# HERE the code to do for the preprocessing for the segmentation

# execute the name mode (need to be in this file for now)
train(cfg)
tst_iou = train(cfg)
return tst_iou