Implement optuna hyperparameter optimization #278

Closed
wants to merge 11 commits
3 changes: 3 additions & 0 deletions .github/workflows/github-actions-ci.yml
@@ -23,5 +23,8 @@ jobs:
run: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate geo_deep_env
# hyperparameter optimization with optuna
python GDL.py mode=train general.max_epochs=1 --multirun
coverage run -m pytest --log-cli-level=INFO --capture=tee-sys
coverage report -m --sort=Cover

3 changes: 2 additions & 1 deletion GDL.py
@@ -58,7 +58,7 @@ def run_gdl(cfg: DictConfig) -> None:
start_time = time.time()
# Read the task and execute it
task = get_method(f"{cfg.mode}_{cfg.general.task}.main")
task(cfg)
metric = task(cfg)

# Add git hash from current commit to parameters.
with open_dict(cfg):
@@ -75,6 +75,7 @@ def run_gdl(cfg: DictConfig) -> None:
"Elapsed time: {:.2f}s".format(time.time() - start_time) +
"\n" + "-" * len(msg) + "\n"
)
return metric
# ------------------------------------


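With this change, run_gdl() forwards whatever the task returns (the test IoU, see train_segmentation.py below) to Hydra, which is what lets the Optuna sweeper optimize it. A minimal, self-contained sketch of the pattern, using illustrative names rather than GDL's actual config:

# Sketch only: a @hydra.main entrypoint that returns a float. When launched with
# --multirun and the Optuna sweeper enabled, that return value is the objective
# the sweeper optimizes (per the 'direction' setting in the sweeper config).
import hydra
from omegaconf import DictConfig

@hydra.main(config_path=None, config_name=None)
def objective(cfg: DictConfig) -> float:
    # In GDL this is where the train task runs and returns its test IoU.
    lr = cfg.get("lr", 1e-4)        # dummy hyperparameter for illustration
    return 1.0 - abs(lr - 1e-4)     # dummy metric standing in for test IoU

if __name__ == "__main__":
    objective()
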
2 changes: 2 additions & 0 deletions config/gdl_config_template.yaml
@@ -13,6 +13,8 @@ defaults:
- hydra: default
- override hydra/hydra_logging: colorlog # enable color logging to make it pretty
- override hydra/job_logging: colorlog # enable color logging to make it pretty
- override hydra/sweeper: optuna
- override hydra/sweeper/sampler: tpe
- _self_

general:
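Note: these two overrides assume Hydra's Optuna sweeper plugin (the hydra-optuna-sweeper package) and optuna itself are installed in the environment; the sweeper settings themselves live in config/hydra/default.yaml below.
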
30 changes: 30 additions & 0 deletions config/hydra/default.yaml
@@ -5,6 +5,36 @@ sweep:
dir: logs/multiruns/${now:%Y-%m-%d_%H-%M-%S}
subdir: ${hydra.job.num}

# for hyperparameter optimization with Optuna: https://hydra.cc/docs/next/plugins/optuna_sweeper/
sweeper:
sampler:
seed: 123
direction: maximize
study_name: run_gdl
storage: null
n_trials: 3
n_jobs: 1

search_space:
training.lr:
type: float
log: True
low: 1e-7
high: 0.01
loss:
type: categorical
choices: ["binary/softbce", "binary/lovasz", "binary/dice"]
model:
type: categorical
choices: ['deeplabv3_pretrained']
optimizer:
type: categorical
choices: ['adam', 'adabound']
# GPU memory is not freed between runs, so the max used RAM threshold must be 100% or the GPU will be excluded
training.max_used_ram:
type: categorical
choices: [100]

# you can set here environment variables that are universal for all users
# for system specific variables (like data paths) it's better to use .env file!
job:
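For reference, a rough plain-optuna equivalent of the study this config describes (a sketch for illustration only; GDL goes through Hydra's multirun rather than calling optuna directly):

# TPE sampler with seed 123, 3 trials, maximizing the returned metric over the
# same search space as above.
import optuna

def objective(trial: optuna.Trial) -> float:
    lr = trial.suggest_float("training.lr", 1e-7, 0.01, log=True)
    loss = trial.suggest_categorical("loss", ["binary/softbce", "binary/lovasz", "binary/dice"])
    optimizer = trial.suggest_categorical("optimizer", ["adam", "adabound"])
    # ...run a training with these overrides and return its test IoU...
    return 0.0  # placeholder objective value

study = optuna.create_study(direction="maximize",
                            study_name="run_gdl",
                            sampler=optuna.samplers.TPESampler(seed=123))
study.optimize(objective, n_trials=3)
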
6 changes: 6 additions & 0 deletions config/optimizer/adabound.yaml
@@ -0,0 +1,6 @@
# @package _global_
optimizer:
optimizer_name: 'adabound'
class_name: utils.adabound.AdaBound
params:
lr: ${training.lr}
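class_name here is a dotted import path; a minimal sketch of how such an entry could be resolved into an optimizer instance (a generic dynamic import shown as an assumption, not necessarily GDL's actual loader):

# Resolve a dotted class path such as 'utils.adabound.AdaBound' and bind it to
# a model's parameters with the params block (here just lr).
import importlib

def build_optimizer(class_name: str, model_parameters, **params):
    module_path, cls_name = class_name.rsplit(".", 1)
    cls = getattr(importlib.import_module(module_path), cls_name)
    return cls(model_parameters, **params)

# e.g. build_optimizer('utils.adabound.AdaBound', model.parameters(), lr=cfg.training.lr)
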
4 changes: 2 additions & 2 deletions config/training/default_training.yaml
@@ -1,9 +1,9 @@
# @package _global_
training:
num_gpus: 1
batch_size: 1
batch_size: 2 # models with batchnorm require batch size > 1
eval_batch_size:
batch_metrics:
batch_metrics: 1
lr: 0.0001
max_epochs: ${general.max_epochs}
min_epochs: ${general.min_epochs}
166 changes: 0 additions & 166 deletions gdl_hyperopt_template.py
@@ -1,166 +0,0 @@
"""Hyperparamater optimization for GDL using hyperopt

This is a template for using hyperopt with GDL. The my_space variable currently needs to
be modified here, as well as GDL config modification logic within the objective_with_args
function.

"""

import argparse
from pathlib import Path
import pickle
from functools import partial
import pprint
import numpy as np

from ruamel_yaml import YAML
import mlflow
import torch
# ToDo: Add hyperopt to GDL requirements
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

from train_segmentation import main as train_main

# This is the hyperparameter space to explore
my_space = {'model_name': hp.choice('model_name', ['unet_pretrained', 'deeplabv3_resnet101']),
'loss_fn': hp.choice('loss_fn', ['CrossEntropy', 'Lovasz', 'Duo']),
'optimizer': hp.choice('optimizer', ['adam', 'adabound']),
'learning_rate': hp.loguniform('learning_rate', np.log(1e-7), np.log(0.1))}


def read_parameters(param_file):
"""Read and return parameters in .yaml file
Args:
param_file: Full file path of the parameters file
Returns:
YAML (Ruamel) CommentedMap dict-like object
"""
yaml = YAML()
with open(param_file) as yamlfile:
params = yaml.load(yamlfile)
return params


def get_latest_mlrun(params):
"""Get latest mlflow run

:param params: gdl parameters dictionary
:return: mlflow run object
"""

tracking_uri = params['global']['mlflow_uri']
mlflow.set_tracking_uri(tracking_uri)
mlexp = mlflow.get_experiment_by_name(params['global']['mlflow_experiment_name'])
exp_id = mlexp.experiment_id
try:
run_ids = ([x.run_id for x in mlflow.list_run_infos(
exp_id, max_results=1, order_by=["tag.release DESC"])])
except AttributeError:
mlflow_client = mlflow.tracking.MlflowClient(tracking_uri=tracking_uri)
run_ids = [x.run_id for x in mlflow_client.list_run_infos(exp_id, run_view_type=3)[0:1]]
mlrun = mlflow.get_run(run_ids[0])
return mlrun


def objective_with_args(hparams, params, config_path):
"""Objective function for hyperopt

This function edits the GDL parameters and runs a training.

:param hparams: arguments provided by hyperopt selection from hyperparameter space
:param params: gdl parameters dictionary
:param config_path: path to gdl configuration file
:return: loss dictionary for hyperopt
"""

# ToDo: This is dependent on the specific structure of the GDL config file
params['global']['model_name'] = hparams['model_name']
# params['training']['target_size'] = hparams['target_size']
params['training']['loss_fn '] = hparams['loss_fn']
params['training']['optimizer'] = hparams['optimizer']
params['training']['learning_rate'] = hparams['learning_rate']

try:
mlrun = get_latest_mlrun(params)
run_name_split = mlrun.data.tags['mlflow.runName'].split('_')
params['global']['mlflow_run_name'] = run_name_split[0] + f'_{int(run_name_split[1]) + 1}'
except:
pass
train_main(params, config_path)
torch.cuda.empty_cache()

mlflow.end_run()
mlrun = get_latest_mlrun(params)

# ToDo: Probably need some cleanup to avoid accumulating results on disk

# ToDo: This loss should be configurable
return {'loss': -mlrun.data.metrics['tst_iou'], 'status': STATUS_OK}


def trials_to_csv(trials, csv_pth):
"""hyperopt trials to CSV

:param trials: hyperopt trials object
"""

params = sorted(list(trials.vals.keys()))
csv_str = ''
for param in params:
csv_str += f'{param}, '
csv_str = csv_str + 'loss' + '\n'

for i in range(len(trials.trials)):
for param in params:
if my_space[param].name == 'switch':
csv_str += f'{my_space[param].pos_args[trials.vals[param][i] + 1].obj}, '
else:
csv_str += f'{trials.vals[param][i]}, '
csv_str = csv_str + f'{trials.results[i]["loss"]}' + '\n'

# ToDo: Customize where the csv output is
with open(csv_pth, 'w') as csv_obj:
csv_obj.write(csv_str)


def main(params, config_path):
# ToDo: Customize where the trials file is
# ToDo: Customize where the trials file is
root_path = Path(params['global']['assets_path'])
pkl_file = root_path.joinpath('hyperopt_trials.pkl')
csv_file = root_path.joinpath('hyperopt_results.csv')
if pkl_file.is_file():
trials = pickle.load(open(pkl_file, "rb"))
else:
trials = Trials()

objective = partial(objective_with_args, params=params, config_path=config_path)

n = 0
while n < params['global']['hyperopt_runs']:
best = fmin(objective,
space=my_space,
algo=tpe.suggest,
trials=trials,
max_evals=n + params['global']['hyperopt_delta'])
n += params['global']['hyperopt_delta']
pickle.dump(trials, open(pkl_file, "wb"))

# ToDo: Cleanup the output
pprint.pprint(trials.vals)
pprint.pprint(trials.results)
for key, val in best.items():
if my_space[key].name == 'switch':
best[key] = my_space[key].pos_args[val + 1].obj
pprint.pprint(best)
print(trials.best_trial['result'])
trials_to_csv(trials, csv_file)


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Geo Deep Learning hyperopt')
parser.add_argument('param_file', type=str, help='Path of gdl config file')
args = parser.parse_args()
gdl_params = read_parameters(args.param_file)
gdl_params['self'] = {'config_file': args.param_file}
main(gdl_params, Path(args.param_file))
5 changes: 5 additions & 0 deletions inference_segmentation.py
@@ -389,6 +389,10 @@ def main(params: dict) -> None:
debug=debug)

pred = pred[np.newaxis, :, :].astype(np.uint8)

if debug and not np.any(pred):
logging.error(f"Only background values were predicted. There may be a problem with the model")

inf_meta.update({"driver": "GTiff",
"height": pred.shape[1],
"width": pred.shape[2],
@@ -403,6 +407,7 @@ def main(params: dict) -> None:
temp_file.unlink()
except OSError as e:
logging.warning(f'File Error: {temp_file, e.strerror}')

if raster_to_vec:
start_vec = time.time()
inference_vec = working_folder.joinpath(local_img.parent.name,
21 changes: 21 additions & 0 deletions models/model_choice.py
@@ -234,3 +234,24 @@ def net(model_name: str,
criterion = criterion.to(device)

return model, model_name, criterion, optimizer, lr_scheduler, device, gpu_devices_dict


if __name__ == '__main__':
# TODO convert to unit test
rand_img = torch.rand((2, 4, 64, 64))
for layer in ['conv1', 'maxpool', 'layer2', 'layer3', 'layer4']:
logging.info(layer)
model, model_name, criterion, optimizer, lr_scheduler, device, gpu_devices_dict = net(
model_name='deeplabv3_resnet101_dualhead',
num_bands=4,
num_channels=4,
num_devices=0,
net_params={'training': None, 'optimizer': {'params': None},
'scheduler': {'params': None}},
inference_state_dict=None,
conc_point=layer,
loss_fn={'_target_': 'torch.nn.CrossEntropyLoss'},
optimizer='sgd',
)
output = model(rand_img)
logging.info(output.shape)
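A hedged sketch of how the TODO above could become a pytest case (the call mirrors the __main__ block; the shape assertion is an assumption about the model output):

# Same smoke test as the __main__ block, expressed as a parametrized pytest case.
import pytest
import torch
from models.model_choice import net

@pytest.mark.parametrize("layer", ['conv1', 'maxpool', 'layer2', 'layer3', 'layer4'])
def test_dualhead_concatenation_points(layer):
    rand_img = torch.rand((2, 4, 64, 64))
    model, *_ = net(
        model_name='deeplabv3_resnet101_dualhead',
        num_bands=4,
        num_channels=4,
        num_devices=0,
        net_params={'training': None, 'optimizer': {'params': None},
                    'scheduler': {'params': None}},
        inference_state_dict=None,
        conc_point=layer,
        loss_fn={'_target_': 'torch.nn.CrossEntropyLoss'},
        optimizer='sgd',
    )
    output = model(rand_img)
    assert output.shape[0] == rand_img.shape[0]  # batch dimension preserved
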
26 changes: 18 additions & 8 deletions train_segmentation.py
@@ -360,6 +360,8 @@ def evaluation(eval_loader,
:param debug: if True, debug functions will be performed
:return: (dict) eval_metrics
"""
dontcare = criterion.ignore_index if hasattr(criterion, 'ignore_index') else -1

eval_metrics = create_metrics_dict(num_classes)
model.eval()

@@ -409,12 +411,13 @@
a, segmentation = torch.max(outputs_flatten, dim=1)
eval_metrics = iou(segmentation, labels_flatten, batch_size, num_classes, eval_metrics)
eval_metrics = report_classification(segmentation, labels_flatten, batch_size, eval_metrics,
ignore_index=eval_loader.dataset.dontcare)
elif (dataset == 'tst') and (batch_metrics is not None):
ignore_index=dontcare)
Review comment (Collaborator Author): Fixes a missing-attribute bug. The dontcare value was previously stored in the dataloader object, but isn't anymore (since when?).

elif dataset == 'tst':
batch_metrics = True
Review comment (Collaborator Author): Force metrics at test time. I don't see any reason why we shouldn't systematically output metrics at test time.

a, segmentation = torch.max(outputs_flatten, dim=1)
eval_metrics = iou(segmentation, labels_flatten, batch_size, num_classes, eval_metrics)
eval_metrics = report_classification(segmentation, labels_flatten, batch_size, eval_metrics,
ignore_index=eval_loader.dataset.dontcare)
ignore_index=dontcare)

logging.debug(OrderedDict(dataset=dataset, loss=f'{eval_metrics["loss"].avg:.4f}'))

@@ -428,10 +431,11 @@
if eval_metrics['loss'].avg:
logging.info(f"\n{dataset} Loss: {eval_metrics['loss'].avg:.4f}")
if batch_metrics is not None:
logging.info(f"\n{dataset} precision: {eval_metrics['precision'].avg}")
logging.info(f"\n{dataset} recall: {eval_metrics['recall'].avg}")
logging.info(f"\n{dataset} fscore: {eval_metrics['fscore'].avg}")
logging.info(f"\n{dataset} iou: {eval_metrics['iou'].avg}")
logging.info(f"\n{dataset} precision: {eval_metrics['precision'].avg}"
f"\n{dataset} recall: {eval_metrics['recall'].avg}"
f"\n{dataset} fscore: {eval_metrics['fscore'].avg}"
f"\n{dataset} iou: {eval_metrics['iou'].avg}"
f"\n{dataset} iou (non background): {eval_metrics['iou_nonbg'].avg}")

return eval_metrics

@@ -742,6 +746,7 @@ def train(cfg: DictConfig) -> None:
checkpoint = load_checkpoint(filename)
model, _ = load_from_checkpoint(checkpoint, model)

return_metric = None
if tst_dataloader:
tst_report = evaluation(eval_loader=tst_dataloader,
model=model,
@@ -763,9 +768,13 @@
bucket.upload_file("output.txt", bucket_output_path.joinpath(f"Logs/{now}_output.txt"))
bucket.upload_file(filename, bucket_filename)

return_metric = tst_report['iou'].avg

# log_artifact(logfile)
# log_artifact(logfile_debug)

return return_metric


def main(cfg: DictConfig) -> None:
"""
@@ -790,4 +799,5 @@ def main(cfg: DictConfig) -> None:
# HERE the code to do for the preprocessing for the segmentation

# execute the name mode (need to be in this file for now)
train(cfg)
tst_iou = train(cfg)
return tst_iou