
Implement optuna hyperparameter optimization #278

Closed
wants to merge 11 commits
4 changes: 3 additions & 1 deletion .github/workflows/github-actions-ci.yml
@@ -26,4 +26,6 @@ jobs:
python GDL.py mode=sampling
python GDL.py mode=train
python GDL.py mode=inference
python GDL.py mode=evaluate
python GDL.py mode=evaluate
# hyperparameter optimization with optuna
python GDL.py mode=train general.max_epochs=1 --multirun
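
With the sweeper overrides in place (see the config changes below), --multirun launches an Optuna study instead of a single training job. Sweeper settings can also be overridden from the command line; a hypothetical example, assuming the hydra-optuna-sweeper plugin is installed:

    # run a 10-trial study instead of the configured 3 (illustrative override)
    python GDL.py mode=train general.max_epochs=1 hydra.sweeper.n_trials=10 --multirun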
3 changes: 2 additions & 1 deletion GDL.py
@@ -58,7 +58,7 @@ def run_gdl(cfg: DictConfig) -> None:
start_time = time.time()
# Read the task and execute it
task = get_method(f"{cfg.mode}_{cfg.general.task}.main")
task(cfg)
metric = task(cfg)

# Add git hash from current commit to parameters.
with open_dict(cfg):
@@ -75,6 +75,7 @@ def run_gdl(cfg: DictConfig) -> None:
"Elapsed time: {:.2f}s".format(time.time() - start_time) +
"\n" + "-" * len(msg) + "\n"
)
return metric
# ------------------------------------


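The returned metric is what makes the sweep work: Hydra's Optuna sweeper uses the value returned by the @hydra.main-decorated function as the objective to optimize. A minimal sketch of the pattern, with the training task stubbed out:

    import hydra
    from omegaconf import DictConfig

    @hydra.main(config_path="config", config_name="gdl_config_template")
    def run_gdl(cfg: DictConfig) -> float:
        # stand-in for get_method(f"{cfg.mode}_{cfg.general.task}.main")(cfg)
        metric = 0.5  # hypothetical objective value, e.g. the test-set IoU
        return metric  # the Optuna sweeper maximizes this return value

    if __name__ == "__main__":
        run_gdl()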
2 changes: 2 additions & 0 deletions config/gdl_config_template.yaml
@@ -13,6 +13,8 @@ defaults:
- hydra: default
- override hydra/hydra_logging: colorlog # enable color logging to make it pretty
- override hydra/job_logging: colorlog # enable color logging to make it pretty
- override hydra/sweeper: optuna
- override hydra/sweeper/sampler: tpe
- _self_

general:
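Note that these two overrides assume the hydra-optuna-sweeper plugin is installed alongside Hydra; without it, Hydra cannot resolve the optuna sweeper group:

    pip install hydra-optuna-sweeper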
26 changes: 26 additions & 0 deletions config/hydra/default.yaml
@@ -5,6 +5,32 @@ sweep:
dir: logs/multiruns/${now:%Y-%m-%d_%H-%M-%S}
subdir: ${hydra.job.num}

# for hyperparameter optimization with Optuna
sweeper:
sampler:
seed: 123
direction: maximize
study_name: run_gdl
storage: null
n_trials: 3
n_jobs: 1

search_space:
training.lr:
type: float
log: True
low: 1e-7
high: 0.01
loss:
type: categorical
choices: ["binary/softbce", "binary/lovasz", "binary/dice"]
model:
type: categorical
choices: ['deeplabv3_resnet101']
optimizer:
type: categorical
choices: ['adam', 'adabound']

# you can set here environment variables that are universal for all users
# for system specific variables (like data paths) it's better to use .env file!
job:
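For readers less familiar with the sweeper plugin, the search space above corresponds roughly to the following raw Optuna code (an illustrative sketch; train_and_eval is a hypothetical stand-in for a full training run):

    import optuna

    def train_and_eval(lr, loss, model, optimizer) -> float:
        return 0.5  # hypothetical stand-in for a full training run

    def objective(trial: optuna.Trial) -> float:
        lr = trial.suggest_float("training.lr", 1e-7, 0.01, log=True)
        loss = trial.suggest_categorical("loss", ["binary/softbce", "binary/lovasz", "binary/dice"])
        model = trial.suggest_categorical("model", ["deeplabv3_resnet101"])
        optimizer = trial.suggest_categorical("optimizer", ["adam", "adabound"])
        return train_and_eval(lr, loss, model, optimizer)

    study = optuna.create_study(direction="maximize",
                                sampler=optuna.samplers.TPESampler(seed=123))
    study.optimize(objective, n_trials=3, n_jobs=1)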
6 changes: 6 additions & 0 deletions config/optimizer/adabound.yaml
@@ -0,0 +1,6 @@
# @package _global_
optimizer:
optimizer_name: 'adabound'
class_name: utils.adabound.AdaBound
params:
lr: ${training.lr}
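
The class_name field suggests the optimizer is resolved dynamically from its dotted path. A hedged sketch of how such an entry is typically instantiated (the helper below is illustrative, not the repo's actual code):

    import importlib
    import torch

    def build_optimizer(class_name: str, params: dict, model: torch.nn.Module):
        # resolve e.g. "utils.adabound.AdaBound" to a class, then instantiate
        module_path, cls_name = class_name.rsplit(".", 1)
        cls = getattr(importlib.import_module(module_path), cls_name)
        return cls(model.parameters(), **params)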
4 changes: 2 additions & 2 deletions config/training/default_training.yaml
@@ -1,9 +1,9 @@
# @package _global_
training:
num_gpus: 1
batch_size: 1
batch_size: 2 # models with batchnorm require batch size > 1
eval_batch_size:
batch_metrics:
batch_metrics: 1
lr: 0.0001
max_epochs: ${general.max_epochs}
min_epochs: ${general.min_epochs}
153 changes: 0 additions & 153 deletions gdl_hyperopt_template.py

This file was deleted.

5 changes: 5 additions & 0 deletions inference_segmentation.py
@@ -512,6 +512,10 @@ def main(params: dict) -> None:
debug=debug)

pred = pred[np.newaxis, :, :].astype(np.uint8)

if debug and not np.any(pred):
logging.error(f"Only background values were predicted. There may be a problem with the model")

inf_meta.update({"driver": "GTiff",
"height": pred.shape[1],
"width": pred.shape[2],
@@ -526,6 +530,7 @@
temp_file.unlink()
except OSError as e:
logging.warning(f'File Error: {temp_file, e.strerror}')

if raster_to_vec:
start_vec = time.time()
inference_vec = working_folder.joinpath(local_img.parent.name,
2 changes: 1 addition & 1 deletion models/model_choice.py
@@ -244,7 +244,7 @@ def net(model_name: str,
else:
checkpoint = None
# list of GPU devices that are available and unused. If no GPUs, returns empty list
gpu_devices_dict = get_device_ids(num_devices)
gpu_devices_dict = get_device_ids(num_devices, max_used_perc=100, max_used_ram_perc=100) # FIXME: set back to default after issue #246
Collaborator (author) comment: The GPUs don't empty between runs, so the GPU-usage threshold must be 100%; otherwise every training run after the first one falls back to CPU (i.e., the GPUs get excluded).

Collaborator comment: Or get_device_ids should be called once, before all runs.
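
A sketch of that suggestion (assumed, not part of this PR): resolve the devices once at module level and reuse the result across trials, so later runs see the same devices as the first:

    _GPU_DEVICES_DICT = None

    def get_device_ids_once(num_devices):
        global _GPU_DEVICES_DICT
        if _GPU_DEVICES_DICT is None:
            _GPU_DEVICES_DICT = get_device_ids(num_devices)  # existing repo helper
        return _GPU_DEVICES_DICT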

num_devices = len(gpu_devices_dict.keys())
logging.info(f"Number of cuda devices requested: {num_devices}. "
f"Cuda devices available: {list(gpu_devices_dict.keys())}\n")
33 changes: 22 additions & 11 deletions train_segmentation.py
@@ -6,6 +6,7 @@
import functools
import numpy as np
from PIL import Image
from hydra.utils import to_absolute_path
from tqdm import tqdm
from pathlib import Path
from shutil import copy
@@ -377,6 +378,8 @@ def evaluation(eval_loader,
:param debug: if True, debug functions will be performed
:return: (dict) eval_metrics
"""
dontcare = criterion.ignore_index if hasattr(criterion, 'ignore_index') else -1

eval_metrics = create_metrics_dict(num_classes)
model.eval()

@@ -432,12 +435,13 @@
a, segmentation = torch.max(outputs_flatten, dim=1)
eval_metrics = iou(segmentation, labels_flatten, batch_size, num_classes, eval_metrics)
eval_metrics = report_classification(segmentation, labels_flatten, batch_size, eval_metrics,
ignore_index=eval_loader.dataset.dontcare)
elif (dataset == 'tst') and (batch_metrics is not None):
ignore_index=dontcare)
Collaborator (author) comment: Fixes a missing-attribute bug: the dontcare value used to be stored on the dataloader object, but isn't anymore (since when?).

elif dataset == 'tst':
batch_metrics = True
Collaborator (author) comment: Forces metrics at test time. I don't see any reason why we shouldn't systematically output metrics at test time.

a, segmentation = torch.max(outputs_flatten, dim=1)
eval_metrics = iou(segmentation, labels_flatten, batch_size, num_classes, eval_metrics)
eval_metrics = report_classification(segmentation, labels_flatten, batch_size, eval_metrics,
ignore_index=eval_loader.dataset.dontcare)
ignore_index=dontcare)

logging.debug(OrderedDict(dataset=dataset, loss=f'{eval_metrics["loss"].avg:.4f}'))

@@ -450,10 +454,11 @@

logging.info(f"\n{dataset} Loss: {eval_metrics['loss'].avg:.4f}")
if batch_metrics is not None:
logging.info(f"\n{dataset} precision: {eval_metrics['precision'].avg}")
logging.info(f"\n{dataset} recall: {eval_metrics['recall'].avg}")
logging.info(f"\n{dataset} fscore: {eval_metrics['fscore'].avg}")
logging.info(f"\n{dataset} iou: {eval_metrics['iou'].avg}")
logging.info(f"\n{dataset} precision: {eval_metrics['precision'].avg}"
f"\n{dataset} recall: {eval_metrics['recall'].avg}"
f"\n{dataset} fscore: {eval_metrics['fscore'].avg}"
f"\n{dataset} iou: {eval_metrics['iou'].avg}"
f"\n{dataset} iou (non background): {eval_metrics['iou_nonbg'].avg}")

return eval_metrics

@@ -608,9 +613,9 @@ def train(cfg: DictConfig) -> None:
config_path = list_path['path']
config_name = str(cfg.general.config_name)
model_id = config_name
output_path = Path(f'model/{model_id}')
output_path.mkdir(parents=True, exist_ok=False)
logging.info(f'\nModel and log files will be saved to: {os.getcwd()}/{output_path}')
output_path = Path(to_absolute_path(f'model/{model_id}'))
output_path.mkdir(parents=True, exist_ok=True) # FIXME: restore exist_ok=False when PR#274 is merged
logging.info(f'\nModel and log files will be saved to: {output_path}')
if debug:
logging.warning(f'\nDebug mode activated. Some debug features may mobilize extra disk space and '
f'cause delays in execution.')
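
to_absolute_path matters here because Hydra changes the working directory for every (multi)run job, so a relative model/ path would otherwise land inside logs/multiruns/<timestamp>/<job_num>. A minimal illustration (my_model is a placeholder name):

    from pathlib import Path
    from hydra.utils import to_absolute_path

    # inside a Hydra job, os.getcwd() is the per-run output dir, not the
    # directory the app was launched from; to_absolute_path() re-anchors it
    output_path = Path(to_absolute_path("model/my_model"))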
@@ -808,6 +813,7 @@ def train(cfg: DictConfig) -> None:
checkpoint = load_checkpoint(filename)
model, _ = load_from_checkpoint(checkpoint, model)

return_metric = None
if tst_dataloader:
tst_report = evaluation(eval_loader=tst_dataloader,
model=model,
@@ -829,9 +835,13 @@ def train(cfg: DictConfig) -> None:
bucket.upload_file("output.txt", bucket_output_path.joinpath(f"Logs/{now}_output.txt"))
bucket.upload_file(filename, bucket_filename)

return_metric = tst_report['iou'].avg

# log_artifact(logfile)
# log_artifact(logfile_debug)

return return_metric


def main(cfg: DictConfig) -> None:
"""
@@ -856,4 +866,5 @@ def main(cfg: DictConfig) -> None:
# HERE the code to do for the preprocessing for the segmentation

# execute the name mode (need to be in this file for now)
train(cfg)
tst_iou = train(cfg)
return tst_iou
13 changes: 10 additions & 3 deletions utils/metrics.py
@@ -1,6 +1,10 @@
import numpy as np
from sklearn.metrics import classification_report

from utils.utils import get_logger

logging = get_logger(__name__) # import logging

min_val = 1e-6
def create_metrics_dict(num_classes):
num_classes = num_classes if num_classes == 1 else num_classes + 1
@@ -66,9 +70,12 @@ def report_classification(pred, label, batch_size, metrics_dict, ignore_index=-1
if key not in ['micro avg', 'macro avg', 'weighted avg', 'accuracy'] and key != str(ignore_index):
class_score[key] = value

metrics_dict['precision_' + key].update(class_score[key]['precision'], batch_size)
metrics_dict['recall_' + key].update(class_score[key]['recall'], batch_size)
metrics_dict['fscore_' + key].update(class_score[key]['f1-score'], batch_size)
try:
metrics_dict['precision_' + key].update(class_score[key]['precision'], batch_size)
metrics_dict['recall_' + key].update(class_score[key]['recall'], batch_size)
metrics_dict['fscore_' + key].update(class_score[key]['f1-score'], batch_size)
except KeyError as e:
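# classes excluded from class_score (e.g. the ignore_index class) have no entry here, so their per-class update is skipped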
logging.error(e)

metrics_dict['precision'].update(class_report['weighted avg']['precision'], batch_size)
metrics_dict['recall'].update(class_report['weighted avg']['recall'], batch_size)