Tuning updates: Merge dev into main (#171)
* use lightray branch (#167)

* update workflow to build dev branch (#168)

* Update `lightray` (#169)

* add asds to save augmented batch

* move save augmented batch callback to config

* overwrite

* update lightray

* Finalize `lightray` refactor (#170)

* update to lightray 0.2.0

* update docs

* add logging to cli

* clean up tuning changes

* remove print

* Bump `lightray` version (#172)

* update lightray

* remove lightning config from tune.yaml

* parse outdir from env var

* remove amplfi-tune executable

* update mldatafind

* import multimodal psd in embeddings module

* don't log validation step
EthanMarx authored Nov 4, 2024
1 parent 02ff2a5 commit 382f751
Showing 14 changed files with 949 additions and 987 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build.yaml
@@ -11,6 +11,7 @@ on:
push:
branches:
- main
- dev

env:
REGISTRY: ghcr.io
26 changes: 11 additions & 15 deletions amplfi/amplfi_init.py
@@ -1,19 +1,18 @@
#!/usr/bin/env python3

import logging
import os
import shutil
from pathlib import Path
from textwrap import dedent
from typing import Literal, Optional

import yaml
from jsonargparse import ArgumentParser

root = Path(__file__).resolve().parent.parent
data_config = (root / "amplfi" / "data" / "datagen.cfg",)
TUNE_CONFIGS = [
root / "amplfi" / "tune" / "tune.yaml",
root / "amplfi" / "tune" / "search_space.py",
root / "amplfi" / "train" / "configs" / "tune.yaml",
]


@@ -45,15 +44,7 @@ def copy_configs(
path.mkdir(parents=True, exist_ok=True)
for config in configs:
dest = path / config.name
if config.name == "tune.yaml":
with open(config, "r") as f:
dict = yaml.safe_load(f)
dict["train_config"] = str(path / "cbc.yaml")

with open(dest, "w") as f:
yaml.dump(dict, f)
else:
shutil.copy(config, dest)
shutil.copy(config, dest)


def write_content(content: str, path: Path):
@@ -77,13 +68,13 @@ def create_runfile(
# store training data and training info there
base = path if s3_bucket is None else s3_bucket

config = path / "datagen.cfg"
config = path / name / "datagen.cfg"
# make the below one string
data_cmd = f"LAW_CONFIG_FILE={config} "
data_cmd += "law run amplfi.data.DataGeneration --workers 5"

if pipeline == "tune":
train_cmd = "amplfi-tune --config tune.yaml"
train_cmd = "lightray --config tune.yaml -- --config cbc.yaml"
else:
train_cmd = f"amplfi-{mode}-cli fit --config cbc.yaml"

@@ -147,7 +138,8 @@ def main():
)

parser.add_argument("--s3-bucket")

log_format = "%(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=log_format)
args = parser.parse_args()
directory = (
args.directory.resolve()
@@ -172,6 +164,10 @@ def main():
create_runfile(
directory, args.name, args.mode, args.pipeline, args.s3_bucket
)
logging.info(
f"Initialized a {args.mode} {args.pipeline} "
f"pipeline at {directory / args.name}"
)


if __name__ == "__main__":
2 changes: 1 addition & 1 deletion amplfi/train/architectures/embeddings/__init__.py
@@ -1,4 +1,4 @@
from .dense import CoherentDenseEmbedding, NChannelDenseEmbedding
from .flattener import Flattener
from .multimodal import MultiModal
from .multimodal import FrequencyPsd, MultiModal, MultiModalPsd
from .resnet import ResNet
6 changes: 4 additions & 2 deletions amplfi/train/callbacks.py
@@ -36,7 +36,9 @@ def on_train_start(self, trainer, pl_module):
X = X.to(device)

cross, plus, parameters = datamodule.waveform_sampler.sample(X)
strain, parameters = datamodule.inject(X, cross, plus, parameters)
strain, asds, parameters = datamodule.inject(
X, cross, plus, parameters
)

# save an example validation batch
# and parameters to disk
@@ -57,7 +59,7 @@ def on_train_start(self, trainer, pl_module):
val_parameters = {
k: val_parameters[:, i] for i, k in enumerate(keys)
}
val_strain, val_parameters = datamodule.inject(
val_strain, val_asds, val_parameters = datamodule.inject(
background, val_cross, val_plus, val_parameters
)

2 changes: 2 additions & 0 deletions amplfi/train/cli/__init__.py
@@ -0,0 +1,2 @@
from .flow import AmplfiFlowCLI
from .similarity import AmplfiSimilarityCLI
1 change: 1 addition & 0 deletions amplfi/train/cli/base.py
@@ -9,6 +9,7 @@ def __init__(self, *args, **kwargs):
# parser_mode to omegaconf for all subclasses
kwargs["parser_kwargs"] = {"parser_mode": "omegaconf"}
kwargs["save_config_callback"] = SaveConfigCallback
kwargs["save_config_kwargs"] = {"overwrite": True}
super().__init__(*args, **kwargs)

def add_arguments_to_parser(self, parser):
58 changes: 58 additions & 0 deletions amplfi/train/configs/tune.yaml
@@ -0,0 +1,58 @@
# configuration file to be used with `lightray`
# for running hyper-parameter tuning with ray
# see https://github.com/EthanMarx/lightray/

# ray.tune.TuneConfig
tune_config:
mode: "min"
metric: "valid_loss_epoch"
scheduler:
class_path: ray.tune.schedulers.ASHAScheduler
init_args:
max_t: 200
grace_period: 21
reduction_factor: 2
num_samples: 1
reuse_actors: true

# ray.train.RunConfig
run_config:
name: "my-first-run"
storage_path: ${oc.env:AMPLFI_OUTDIR}
failure_config:
class_path: ray.train.FailureConfig
init_args:
max_failures: 1
checkpoint_config:
class_path: ray.train.CheckpointConfig
init_args:
num_to_keep: 5
checkpoint_score_attribute: "valid_loss_epoch"
checkpoint_score_order: "min"
verbose: null

# ray.train.SyncConfig
sync_config:
sync_period: 1000

# ray.init
ray_init:
address: null

# tune.Tune.param_space
param_space:
model.learning_rate: tune.loguniform(1e-3, 4)

# ray.tune.TuneCallback
tune_callback:
class_path: lightray.callbacks.LightRayReportCheckpointCallback
init_args:
'on': "validation_end"
checkpoint_every: 10

# resources per trial
cpus_per_trial: 2
gpus_per_trial: 1

# lightning cli
lightning_cli_cls: amplfi.train.cli.AmplfiFlowCLI
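
For context, the `tune.yaml` added above is consumed by the `lightray` CLI rather than the removed `amplfi-tune` executable. A usage sketch, matching the command written into the generated run file and the updated docs below (arguments after `--` are presumably forwarded to the configured `lightning_cli_cls`, here `amplfi.train.cli.AmplfiFlowCLI`):

```console
# run from the directory created by amplfi-init; assumes AMPLFI_OUTDIR is set
lightray --config tune.yaml -- --config cbc.yaml
```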
8 changes: 1 addition & 7 deletions amplfi/train/models/flow.py
@@ -5,7 +5,6 @@
import torch

from ..architectures.flows import FlowArchitecture
from ..callbacks import SaveAugmentedBatch
from ..testing import Result
from .base import AmplfiModel

@@ -77,19 +76,14 @@ def validation_step(self, batch, _):
self.log(
"valid_loss",
loss,
on_step=True,
on_step=False,
on_epoch=True,
prog_bar=True,
sync_dist=True,
logger=True,
)
return loss

def configure_callbacks(self):
callbacks = super().configure_callbacks()
callbacks.append(SaveAugmentedBatch())
return callbacks

def cast_as_bilby_result(
self,
samples: np.ndarray,
5 changes: 0 additions & 5 deletions amplfi/tune/search_space.py

This file was deleted.

50 changes: 0 additions & 50 deletions amplfi/tune/tune.py

This file was deleted.

19 changes: 0 additions & 19 deletions amplfi/tune/tune.yaml

This file was deleted.

38 changes: 14 additions & 24 deletions docs/tuning.md
@@ -15,39 +15,29 @@ amplfi-init --mode flow --pipeline tune --directory ~/amplfi/my-first-tune/
This will create a directory at `~/amplfi/my-first-tune/`, and populate it with
configuration files for the run. The `train.yaml` contains the main configuration for the training.
`datagen.cfg` controls the configuration for querying training and testing strain data.
`tune.yaml` configure parameters that control how the hyperparameter tuning is performed. Finally,
`search_space.py` constructs the space of parameters that will searched over during tuning.
`tune.yaml` configures parameters that control how `Ray` will perform the hyperparameter tuning.


## Configuring an Experiment
The search space of parameters to tune over can be set in the `search_space.py` file.
For example, the below parameter space will search over the models learning rate
and the kernel length of the data.
A key ingredient in the tuning job is the parameter space that is searched over. This can be configured
via the `param_space` parameter in the `tune.yaml` configuration file.

```
# search_space.py
from ray import tune
space = {
"model.learning_rate": tune.loguniform(1e-4, 1e-1),
"data.kernel_length": tune.choice([1, 2])
}
```yaml
# tune.yaml
param_space:
model.learning_rate: tune.loguniform(1e-3, 4)
data.kernel_length: tune.choice([1, 2])
```
the parameter names should be python "dot path" to attributes in the `train.yaml`. Any
the parameter names should be python "dot paths" to attributes in the `train.yaml`. Any
parameters set in the search space will be sampled from the distribution
when each trial is launched, and override the value set in `train.yaml`.
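
As a rough illustration of that override behaviour (a minimal sketch, not `lightray`'s actual implementation; the `apply_override` helper below is hypothetical), each sampled dot-path entry can be written back into the nested training config:

```python
# Minimal sketch (not lightray's actual code): apply a sampled
# `param_space` entry, keyed by a "dot path", onto a nested config
# dict that mirrors train.yaml.
def apply_override(config: dict, dotpath: str, value) -> None:
    *parents, leaf = dotpath.split(".")
    node = config
    for key in parents:
        node = node[key]
    node[leaf] = value  # overrides the value originally set in train.yaml


config = {"model": {"learning_rate": 1e-3}, "data": {"kernel_length": 1}}

# these values stand in for draws from tune.loguniform / tune.choice
apply_override(config, "model.learning_rate", 3.2e-4)
apply_override(config, "data.kernel_length", 2)
```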

The `tune.yaml` file configures parameters of the tuning. You can see a full list of configuration by running
Most of the parameters from the [`ray.tune.Tuner`](https://docs.ray.io/en/latest/tune/api/doc/ray.tune.Tuner.html) are also configurable, including the tuning scheduler and search algorithm. Please see the ray tune [documentation](https://docs.ray.io/en/latest/tune/index.html) for more information.

You can see a full list of configuration by running
```
amplfi-tune --help
```

```{eval-rst}
.. note::
Currently, the `lightray` library automatically uses the `Asynchronous Hyper Band algorithm <https://docs.ray.io/en/latest/tune/api/doc/ray.tune.schedulers.AsyncHyperBandScheduler.html#ray.tune.schedulers.AsyncHyperBandScheduler>`_, which will kill under performing trials after a certain amount of epochs this is
controlled by the :code:`min_epochs` parameter.
lightray --help
```
## Launching a Run
Expand All @@ -68,7 +58,7 @@ CUDA_VISIBLE_DEVICES=0
LAW_CONFIG_FILE=/home/albert.einstein/amplfi/my-first-tune/datagen.cfg law run amplfi.law.DataGeneration --workers 5
# launch training or tuning pipeline
amplfi-tune --config tune.yaml
lightray --config tune.yaml -- --config cbc.yaml
```

If you've run the [training pipeline](first_pipeline.md) this should look familiar: environment variables control the location where
@@ -165,7 +155,7 @@ address = ray://11.22.10.27:10001
Now, launch the run!

```console
amplfi-tune --tune.yaml
lightray --config tune.yaml -- --config cbc.yaml
```

```{eval-rst}