Tuning updates: Merge dev into main (#171)
* use lightray branch (#167)

* update workflow to build dev branch (#168)

* Update `lightray` (#169)

* add asds to save augmented batch

* move save augmented batch callback to config

* overwrite

* update lightray

* Finalize `lightray` refactor (#170)

* update to lightray 0.2.0

* update docs

* add logging to cli

* clean up tuning changes

* remove print

* Bump `lightray` version (#172)

* update lightray

* remove lightning config from tune.yaml

* parse outdir from env var

* remove amplfi-tune executable

* update mldatafind

* import multimodal psd in embeddings module

* don't log validation step
EthanMarx authored Nov 4, 2024
1 parent 02ff2a5 commit 382f751
Showing 14 changed files with 949 additions and 987 deletions.
1 change: 1 addition & 0 deletions .github/workflows/build.yaml
@@ -11,6 +11,7 @@ on:
push:
branches:
- main
- dev

env:
REGISTRY: ghcr.io
26 changes: 11 additions & 15 deletions amplfi/amplfi_init.py
@@ -1,19 +1,18 @@
#!/usr/bin/env python3

import logging
import os
import shutil
from pathlib import Path
from textwrap import dedent
from typing import Literal, Optional

import yaml
from jsonargparse import ArgumentParser

root = Path(__file__).resolve().parent.parent
data_config = (root / "amplfi" / "data" / "datagen.cfg",)
TUNE_CONFIGS = [
root / "amplfi" / "tune" / "tune.yaml",
root / "amplfi" / "tune" / "search_space.py",
root / "amplfi" / "train" / "configs" / "tune.yaml",
]


@@ -45,15 +44,7 @@ def copy_configs(
path.mkdir(parents=True, exist_ok=True)
for config in configs:
dest = path / config.name
if config.name == "tune.yaml":
with open(config, "r") as f:
dict = yaml.safe_load(f)
dict["train_config"] = str(path / "cbc.yaml")

with open(dest, "w") as f:
yaml.dump(dict, f)
else:
shutil.copy(config, dest)
shutil.copy(config, dest)


def write_content(content: str, path: Path):
@@ -77,13 +68,13 @@ def create_runfile(
# store training data and training info there
base = path if s3_bucket is None else s3_bucket

config = path / "datagen.cfg"
config = path / name / "datagen.cfg"
# make the below one string
data_cmd = f"LAW_CONFIG_FILE={config} "
data_cmd += "law run amplfi.data.DataGeneration --workers 5"

if pipeline == "tune":
train_cmd = "amplfi-tune --config tune.yaml"
train_cmd = "lightray --config tune.yaml -- --config cbc.yaml"
else:
train_cmd = f"amplfi-{mode}-cli fit --config cbc.yaml"

@@ -147,7 +138,8 @@ def main():
)

parser.add_argument("--s3-bucket")

log_format = "%(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=log_format)
args = parser.parse_args()
directory = (
args.directory.resolve()
@@ -172,6 +164,10 @@ def main():
create_runfile(
directory, args.name, args.mode, args.pipeline, args.s3_bucket
)
logging.info(
f"Initialized a {args.mode} {args.pipeline} "
f"pipeline at {directory / args.name}"
)


if __name__ == "__main__":
2 changes: 1 addition & 1 deletion amplfi/train/architectures/embeddings/__init__.py
@@ -1,4 +1,4 @@
from .dense import CoherentDenseEmbedding, NChannelDenseEmbedding
from .flattener import Flattener
from .multimodal import MultiModal
from .multimodal import FrequencyPsd, MultiModal, MultiModalPsd
from .resnet import ResNet
6 changes: 4 additions & 2 deletions amplfi/train/callbacks.py
@@ -36,7 +36,9 @@ def on_train_start(self, trainer, pl_module):
X = X.to(device)

cross, plus, parameters = datamodule.waveform_sampler.sample(X)
strain, parameters = datamodule.inject(X, cross, plus, parameters)
strain, asds, parameters = datamodule.inject(
X, cross, plus, parameters
)

# save an example validation batch
# and parameters to disk
@@ -57,7 +59,7 @@ def on_train_start(self, trainer, pl_module):
val_parameters = {
k: val_parameters[:, i] for i, k in enumerate(keys)
}
val_strain, val_parameters = datamodule.inject(
val_strain, val_asds, val_parameters = datamodule.inject(
background, val_cross, val_plus, val_parameters
)

2 changes: 2 additions & 0 deletions amplfi/train/cli/__init__.py
@@ -0,0 +1,2 @@
from .flow import AmplfiFlowCLI
from .similarity import AmplfiSimilarityCLI
1 change: 1 addition & 0 deletions amplfi/train/cli/base.py
@@ -9,6 +9,7 @@ def __init__(self, *args, **kwargs):
# parser_mode to omegaconf for all subclasses
kwargs["parser_kwargs"] = {"parser_mode": "omegaconf"}
kwargs["save_config_callback"] = SaveConfigCallback
kwargs["save_config_kwargs"] = {"overwrite": True}
super().__init__(*args, **kwargs)

def add_arguments_to_parser(self, parser):
58 changes: 58 additions & 0 deletions amplfi/train/configs/tune.yaml
@@ -0,0 +1,58 @@
# configuration file to be used with `lightray`
# for running hyper-parameter tuning with ray
# see https://github.com/EthanMarx/lightray/

# ray.tune.TuneConfig
tune_config:
mode: "min"
metric: "valid_loss_epoch"
scheduler:
class_path: ray.tune.schedulers.ASHAScheduler
init_args:
max_t: 200
grace_period: 21
reduction_factor: 2
num_samples: 1
reuse_actors: true

# ray.train.RunConfig
run_config:
name: "my-first-run"
storage_path: ${oc.env:AMPLFI_OUTDIR}
failure_config:
class_path: ray.train.FailureConfig
init_args:
max_failures: 1
checkpoint_config:
class_path: ray.train.CheckpointConfig
init_args:
num_to_keep: 5
checkpoint_score_attribute: "valid_loss_epoch"
checkpoint_score_order: "min"
verbose: null

# ray.train.SyncConfig
sync_config:
sync_period: 1000

# ray.init
ray_init:
address: null

# tune.Tune.param_space
param_space:
model.learning_rate: tune.loguniform(1e-3, 4)

# ray.tune.TuneCallback
tune_callback:
class_path: lightray.callbacks.LightRayReportCheckpointCallback
init_args:
'on': "validation_end"
checkpoint_every: 10

# resources per trial
cpus_per_trial: 2
gpus_per_trial: 1

# lightning cli
lightning_cli_cls: amplfi.train.cli.AmplfiFlowCLI
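
For context, the `tune.yaml` added above is consumed by the `lightray` CLI rather than the removed `amplfi-tune` executable. A usage sketch, matching the command written into the generated run file and the updated docs below (arguments after `--` are presumably forwarded to the configured `lightning_cli_cls`, here `amplfi.train.cli.AmplfiFlowCLI`):

```console
# run from the directory created by amplfi-init; assumes AMPLFI_OUTDIR is set
lightray --config tune.yaml -- --config cbc.yaml
```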
8 changes: 1 addition & 7 deletions amplfi/train/models/flow.py
@@ -5,7 +5,6 @@
import torch

from ..architectures.flows import FlowArchitecture
from ..callbacks import SaveAugmentedBatch
from ..testing import Result
from .base import AmplfiModel

@@ -77,19 +76,14 @@ def validation_step(self, batch, _):
self.log(
"valid_loss",
loss,
on_step=True,
on_step=False,
on_epoch=True,
prog_bar=True,
sync_dist=True,
logger=True,
)
return loss

def configure_callbacks(self):
callbacks = super().configure_callbacks()
callbacks.append(SaveAugmentedBatch())
return callbacks

def cast_as_bilby_result(
self,
samples: np.ndarray,
5 changes: 0 additions & 5 deletions amplfi/tune/search_space.py

This file was deleted.

50 changes: 0 additions & 50 deletions amplfi/tune/tune.py

This file was deleted.

19 changes: 0 additions & 19 deletions amplfi/tune/tune.yaml

This file was deleted.

38 changes: 14 additions & 24 deletions docs/tuning.md
@@ -15,39 +15,29 @@ amplfi-init --mode flow --pipeline tune --directory ~/amplfi/my-first-tune/
This will create a directory at `~/amplfi/my-first-tune/`, and populate it with
configuration files for the run. The `train.yaml` contains the main configuration for the training.
`datagen.cfg` controls the configuration for querying training and testing strain data.
`tune.yaml` configure parameters that control how the hyperparameter tuning is performed. Finally,
`search_space.py` constructs the space of parameters that will searched over during tuning.
`tune.yaml` configures parameters that control how `Ray` will perform the hyperparameter tuning.


## Configuring an Experiment
The search space of parameters to tune over can be set in the `search_space.py` file.
For example, the below parameter space will search over the models learning rate
and the kernel length of the data.
A key ingredient in the tuning job is the parameter space that is searched over. This can be configured
via the `param_space` parameter in the `tune.yaml` configuration file.

```
# search_space.py
from ray import tune
space = {
"model.learning_rate": tune.loguniform(1e-4, 1e-1),
"data.kernel_length": tune.choice([1, 2])
}
```yaml
# tune.yaml
param_space:
model.learning_rate: tune.loguniform(1e-3, 4)
data.kernel_length: tune.choice([1, 2])
```
the parameter names should be python "dot path" to attributes in the `train.yaml`. Any
the parameter names should be python "dot paths" to attributes in the `train.yaml`. Any
parameters set in the search space will be sampled from the distribution
when each trial is launched, and override the value set in `train.yaml`.
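
As a rough illustration of that override behaviour (a minimal sketch, not `lightray`'s actual implementation; the `apply_override` helper below is hypothetical), each sampled dot-path entry can be written back into the nested training config:

```python
# Minimal sketch (not lightray's actual code): apply a sampled
# `param_space` entry, keyed by a "dot path", onto a nested config
# dict that mirrors train.yaml.
def apply_override(config: dict, dotpath: str, value) -> None:
    *parents, leaf = dotpath.split(".")
    node = config
    for key in parents:
        node = node[key]
    node[leaf] = value  # overrides the value originally set in train.yaml


config = {"model": {"learning_rate": 1e-3}, "data": {"kernel_length": 1}}

# these values stand in for draws from tune.loguniform / tune.choice
apply_override(config, "model.learning_rate", 3.2e-4)
apply_override(config, "data.kernel_length", 2)
```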

The `tune.yaml` file configures parameters of the tuning. You can see a full list of configuration by running
Most of the parameters from the [`ray.tune.Tuner`](https://docs.ray.io/en/latest/tune/api/doc/ray.tune.Tuner.html) are also configurable, including the tuning scheduler and search algorithm. Please see the ray tune [documentation](https://docs.ray.io/en/latest/tune/index.html) for more information.

You can see a full list of configuration by running
```
amplfi-tune --help
```

```{eval-rst}
.. note::
Currently, the `lightray` library automatically uses the `Asynchronous Hyper Band algorithm <https://docs.ray.io/en/latest/tune/api/doc/ray.tune.schedulers.AsyncHyperBandScheduler.html#ray.tune.schedulers.AsyncHyperBandScheduler>`_, which will kill under performing trials after a certain amount of epochs this is
controlled by the :code:`min_epochs` parameter.
lightray --help
```
## Launching a Run
Expand All @@ -68,7 +58,7 @@ CUDA_VISIBLE_DEVICES=0
LAW_CONFIG_FILE=/home/albert.einstein/amplfi/my-first-tune/datagen.cfg law run amplfi.law.DataGeneration --workers 5
# launch training or tuning pipeline
amplfi-tune --config tune.yaml
lightray --config tune.yaml -- --config cbc.yaml
```

If you've run the [training pipeline](first_pipeline.md) this should look familiar: environment variables control the location where
@@ -165,7 +155,7 @@ address = ray://11.22.10.27:10001
Now, launch the run!

```console
amplfi-tune --tune.yaml
lightray --config tune.yaml -- --config cbc.yaml
```

```{eval-rst}