Commit
[wip] sharing for updates only
teyaberg committed Aug 20, 2024
1 parent 2feee79 commit f3c985a
Showing 15 changed files with 316 additions and 46 deletions.
23 changes: 23 additions & 0 deletions src/MEDS_tabular_automl/base_model.py
@@ -0,0 +1,23 @@
from abc import ABC, abstractmethod
from pathlib import Path

from mixins import TimeableMixin


class BaseModel(ABC, TimeableMixin):
    @abstractmethod
    def __init__(self):
        pass

    @abstractmethod
    def train(self):
        pass

    @abstractmethod
    def evaluate(self) -> float:
        pass

    @abstractmethod
    def save_model(self, output_fp: Path):
        pass
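
As a rough illustration of this contract (not part of the commit; the subclass name and its internals are hypothetical), a concrete backend would look something like:

import json
from pathlib import Path

import numpy as np


class ConstantModel(BaseModel):
    """Hypothetical backend: always predicts the training-label mean."""

    def __init__(self, cfg):
        self.cfg = cfg
        self.mean_ = None

    def train(self):
        # `train_labels` is an assumed config field used only for this sketch.
        self.mean_ = float(np.mean(self.cfg["train_labels"]))

    def evaluate(self) -> float:
        # A constant prediction cannot rank examples, so ROC AUC is 0.5 by definition.
        return 0.5

    def save_model(self, output_fp: Path):
        output_fp.write_text(json.dumps({"mean": self.mean_}))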
22 changes: 22 additions & 0 deletions src/MEDS_tabular_automl/configs/launch_model.yaml
@@ -0,0 +1,22 @@
defaults:
  - default
  - tabularization: default
  - model: xgboost # This can be changed to sgd_classifier or any other model
  - override hydra/sweeper: optuna
  - override hydra/sweeper/sampler: tpe
  - override hydra/launcher: joblib
  - _self_

task_name: task

# Task cached data dir
input_dir: ${output_cohort_dir}/${task_name}/task_cache
# Directory with task labels
input_label_dir: ${output_cohort_dir}/${task_name}/labels/
# Where to output the model and cached data
model_dir: ${output_cohort_dir}/model/model_${now:%Y-%m-%d_%H-%M-%S}
output_filepath: ${model_dir}/model_metadata.json

log_dir: ${model_dir}/.logs/

name: launch_model
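
For reference, one plausible way to inspect how these interpolations resolve is Hydra's compose API (a sketch; the config module path and override values are assumptions, not part of the commit):

from hydra import compose, initialize_config_module

with initialize_config_module(config_module="MEDS_tabular_automl.configs", version_base=None):
    cfg = compose(
        config_name="launch_model",
        overrides=["model=sgd_classifier", "output_cohort_dir=/tmp/cohort"],
    )
print(cfg.input_dir)  # -> /tmp/cohort/task/task_cache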
19 changes: 19 additions & 0 deletions src/MEDS_tabular_automl/configs/models/sgd_classifier.yaml
@@ -0,0 +1,19 @@
model_params:
  epochs: 20
  early_stopping_rounds: 5
  model:
    type: sklearn
    _target_: sklearn.linear_model.SGDClassifier
    loss: log_loss
  iterator:
    keep_data_in_memory: True
    binarize_task: True

hydra:
  sweeper:
    params:
      +model_params.model.alpha: tag(log, interval(1e-6, 1))
      +model_params.model.l1_ratio: interval(0, 1)
      +model_params.model.penalty: choice(['l1', 'l2', 'elasticnet'])
      model_params.epochs: range(10, 100)
      model_params.early_stopping_rounds: range(1, 10)
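
The `+`-prefixed keys append new entries to the model config, and `tag(log, interval(...))` / `choice(...)` / `range(...)` is the Hydra Optuna sweeper's search-space grammar. Roughly, it corresponds to the following plain-Optuna suggestions (an illustrative sketch with toy data, not the plugin's internals):

import numpy as np
import optuna
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


def objective(trial: optuna.Trial) -> float:
    params = {
        "alpha": trial.suggest_float("alpha", 1e-6, 1.0, log=True),  # tag(log, interval(1e-6, 1))
        "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),       # interval(0, 1)
        "penalty": trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"]),  # choice([...])
    }
    # Toy stand-in data; the real pipeline streams tabularized MEDS shards.
    rng = np.random.default_rng(0)
    X, y = rng.random((256, 8)), rng.integers(0, 2, 256)
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, random_state=0)
    clf = SGDClassifier(loss="log_loss", **params).fit(X_tr, y_tr)
    return roc_auc_score(y_val, clf.decision_function(X_val))


study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=10)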
27 changes: 27 additions & 0 deletions src/MEDS_tabular_automl/configs/models/xgboost.yaml
@@ -0,0 +1,27 @@
model_params:
  num_boost_round: 1000
  early_stopping_rounds: 5
  model:
    type: xgboost
    # _target_: xgboost.XGBClassifier
    booster: gbtree
    device: cpu
    nthread: 1
    tree_method: hist
    objective: binary:logistic
  iterator:
    keep_data_in_memory: True
    binarize_task: True

hydra:
  sweeper:
    params:
      +model_params.model.eta: tag(log, interval(0.001, 1))
      +model_params.model.lambda: tag(log, interval(0.001, 1))
      +model_params.model.alpha: tag(log, interval(0.001, 1))
      +model_params.model.subsample: interval(0.5, 1)
      +model_params.model.min_child_weight: interval(1e-2, 100)
      model_params.num_boost_round: range(100, 1000)
      model_params.early_stopping_rounds: range(1, 10)
      +model_params.model.max_depth: range(2, 16)
      tabularization.min_code_inclusion_frequency: tag(log, range(10, 1000000))
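
These model_params plausibly feed xgboost's native training API, where num_boost_round and early_stopping_rounds are arguments to xgb.train rather than booster params (a sketch with toy data; the actual wiring lives in XGBoostModel, which is not part of this diff):

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
dtrain = xgb.DMatrix(rng.random((256, 8)), label=rng.integers(0, 2, 256))
dtuning = xgb.DMatrix(rng.random((64, 8)), label=rng.integers(0, 2, 64))

params = {
    "booster": "gbtree",
    "device": "cpu",
    "nthread": 1,
    "tree_method": "hist",
    "objective": "binary:logistic",
}
booster = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtuning, "tuning")],
    early_stopping_rounds=5,  # stop when the tuning metric stalls for 5 rounds
)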
4 changes: 2 additions & 2 deletions src/MEDS_tabular_automl/configs/tabularization/default.yaml
@@ -2,7 +2,7 @@
filtered_code_metadata_fp: ${output_cohort_dir}/tabularized_code_metadata.parquet
allowed_codes: null
min_code_inclusion_count: 10
min_code_inclusion_frequency: 0.01
min_code_inclusion_frequency: null
max_included_codes: null
window_sizes:
- "1d"
@@ -21,4 +21,4 @@ aggs:
- "value/max"

# Resolved inputs
_resolved_codes: ${filter_to_codes:${tabularization.filtered_code_metadata_fp},${tabularization.allowed_codes},${tabularization.min_code_inclusion_count},${tabularization.min_code_inclusion_frequency},${tabularization.max_included_codes}}}
_resolved_codes: ${filter_to_codes:${tabularization.filtered_code_metadata_fp},${tabularization.allowed_codes},${tabularization.min_code_inclusion_count},${tabularization.min_code_inclusion_frequency},${tabularization.max_included_codes}}
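
The filter_to_codes resolver itself is not shown in this diff. Under the assumption that it filters the code-metadata table by an allow-list, a minimum count, a minimum relative frequency, and a cap on the number of codes, its behavior is roughly (column names "code" and "count" are assumptions about the metadata schema):

import polars as pl


def filter_to_codes_sketch(metadata_fp, allowed_codes, min_count, min_frequency, max_codes):
    # Illustrative reading of the five resolver arguments above.
    df = pl.read_parquet(metadata_fp)
    if allowed_codes is not None:
        df = df.filter(pl.col("code").is_in(allowed_codes))
    if min_count is not None:
        df = df.filter(pl.col("count") >= min_count)
    if min_frequency is not None:
        df = df.filter(pl.col("count") / df["count"].sum() >= min_frequency)
    if max_codes is not None:
        df = df.sort("count", descending=True).head(max_codes)
    return sorted(df["code"].to_list())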
54 changes: 54 additions & 0 deletions src/MEDS_tabular_automl/dense_iterator.py
@@ -0,0 +1,54 @@
import numpy as np
import scipy.sparse as sp
from mixins import TimeableMixin
from omegaconf import DictConfig

from .tabular_dataset import TabularDataset

class DenseIterator(TabularDataset, TimeableMixin):
    def __init__(self, cfg: DictConfig, split: str):
        """Initializes the DenseIterator with the provided configuration and data split.

        Args:
            cfg: The configuration dictionary.
            split: The data split to use.
        """
        TabularDataset.__init__(self, cfg=cfg, split=split)
        TimeableMixin.__init__(self)
        self.valid_event_ids, self.labels = self._load_ids_and_labels()
        # check if the labels are empty
        if len(self.labels) == 0:
            raise ValueError("No labels found.")

    def densify(self) -> tuple[sp.csr_matrix, np.ndarray, list[str]]:
        """Stacks all data shards into one sparse matrix restricted to the selected feature columns."""
        # get the column indices to include
        cols = self.get_feature_indices()

        # map those to the feature names in the data
        feature_names = self.get_all_column_names()
        selected_features = [feature_names[col] for col in cols]

        # build the matrix by iterating through the data shards
        data = []
        labels = []
        for shard_idx in range(len(self._data_shards)):
            shard_data, shard_labels = self.get_data_shards(shard_idx)
            shard_data = shard_data[:, cols]
            data.append(shard_data)
            labels.append(shard_labels)
        data = sp.vstack(data)
        labels = np.concatenate(labels, axis=0)
        return data, labels, selected_features

60 changes: 60 additions & 0 deletions src/MEDS_tabular_automl/scripts/launch_autogluon.py
@@ -0,0 +1,60 @@
from importlib.resources import files

import hydra
import pandas as pd
from loguru import logger
from omegaconf import DictConfig

from MEDS_tabular_automl.dense_iterator import DenseIterator

from ..utils import hydra_loguru_init

config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_xgboost.yaml")
if not config_yaml.is_file():
    raise FileNotFoundError("Core configuration not successfully installed!")


@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem)
def main(cfg: DictConfig) -> None:
    """Launches AutoGluon after collecting data based on the provided configuration.

    Args:
        cfg: The configuration dictionary specifying model and training parameters.
    """
    if not cfg.loguru_init:
        hydra_loguru_init()

    # check that autogluon is installed
    try:
        from autogluon.tabular import TabularPredictor
    except ImportError:
        logger.error("AutoGluon is not installed. Please install AutoGluon.")
        raise

    # collect data based on the configuration
    itrain = DenseIterator(cfg, "train")
    ituning = DenseIterator(cfg, "tuning")
    iheld_out = DenseIterator(cfg, "held_out")

    # densify the sparse shards for AutoGluon
    train_data, train_labels, cols = itrain.densify()
    tuning_data, tuning_labels, _ = ituning.densify()
    held_out_data, held_out_labels, _ = iheld_out.densify()

    # construct dfs for AutoGluon
    train_df = pd.DataFrame(train_data.todense(), columns=cols)
    train_df[cfg.task_name] = train_labels
    tuning_df = pd.DataFrame(tuning_data.todense(), columns=cols)
    tuning_df[cfg.task_name] = tuning_labels
    held_out_df = pd.DataFrame(held_out_data.todense(), columns=cols)
    held_out_df[cfg.task_name] = held_out_labels

    # launch AutoGluon and score the held-out split
    predictor = TabularPredictor(label=cfg.task_name).fit(train_data=train_df, tuning_data=tuning_df)
    logger.info(f"Held-out metrics: {predictor.evaluate(held_out_df)}")


if __name__ == "__main__":
    main()
63 changes: 63 additions & 0 deletions src/MEDS_tabular_automl/scripts/launch_model.py
@@ -0,0 +1,63 @@
from importlib.resources import files
from pathlib import Path

import hydra
from loguru import logger
from omegaconf import DictConfig

from MEDS_tabular_automl.base_model import BaseModel
from MEDS_tabular_automl.sklearn_model import SklearnModel
from MEDS_tabular_automl.xgboost_model import XGBoostModel

from ..utils import hydra_loguru_init

MODEL_CLASSES: dict[str, type[BaseModel]] = {
    "xgboost": XGBoostModel,
    "sklearn": SklearnModel,
}

config_yaml = files("MEDS_tabular_automl").joinpath("configs/launch_model.yaml")
if not config_yaml.is_file():
    raise FileNotFoundError("Core configuration not successfully installed!")

@hydra.main(version_base=None, config_path=str(config_yaml.parent.resolve()), config_name=config_yaml.stem)
def main(cfg: DictConfig) -> float:
    """Trains and evaluates the configured model.

    Args:
        cfg: The configuration dictionary specifying model and training parameters.

    Returns:
        The evaluation result as the ROC AUC score on the held-out test set (0.0 on failure).
    """
    if not cfg.loguru_init:
        hydra_loguru_init()
    try:
        model_type = cfg.model_params.model.type
        ModelClass = MODEL_CLASSES.get(model_type)
        if ModelClass is None:
            raise ValueError(f"Model type {model_type} not supported.")

        model = ModelClass(cfg)
        model.train()
        auc = model.evaluate()
        logger.info(f"AUC: {auc}")

        # save model
        output_fp = Path(cfg.output_filepath)
        output_fp.parent.mkdir(parents=True, exist_ok=True)
        model.save_model(output_fp)
    except Exception as e:
        logger.error(f"Error occurred: {e}")
        auc = 0.0
    return auc


if __name__ == "__main__":
    main()
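
Under this registry pattern, supporting another backend should only require a BaseModel subclass, a MODEL_CLASSES entry, and a matching configs/models/*.yaml whose model.type names the new key. A hypothetical sketch, not part of the commit:

from MEDS_tabular_automl.base_model import BaseModel


class MyCustomModel(BaseModel):
    def __init__(self, cfg): ...
    def train(self): ...
    def evaluate(self) -> float: ...
    def save_model(self, output_fp): ...


# Selected at launch time via model_params.model.type: my_custom
MODEL_CLASSES["my_custom"] = MyCustomModel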
48 changes: 24 additions & 24 deletions src/MEDS_tabular_automl/scripts/launch_xgboost.py
@@ -27,30 +27,30 @@ def main(cfg: DictConfig) -> float:
    # print(OmegaConf.to_yaml(cfg))
    if not cfg.loguru_init:
        hydra_loguru_init()
    try:
        model = XGBoostModel(cfg)
        model.train()
        auc = model.evaluate()
        logger.info(f"AUC: {auc}")

        # print(
        #     "Time Profiling for window sizes ",
        #     f"{cfg.tabularization.window_sizes} and min ",
        #     f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
        # )
        # print("Train Time: \n", model._profile_durations())
        # print("Train Iterator Time: \n", model.itrain._profile_durations())
        # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
        # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())

        # save model
        output_fp = Path(cfg.output_filepath)
        output_fp.parent.mkdir(parents=True, exist_ok=True)

        model.save_model(output_fp)
    except Exception as e:
        logger.error(f"Error occurred: {e}")
        auc = 0.0
    # try:
    model = XGBoostModel(cfg)
    model.train()
    auc = model.evaluate()
    logger.info(f"AUC: {auc}")

    # print(
    #     "Time Profiling for window sizes ",
    #     f"{cfg.tabularization.window_sizes} and min ",
    #     f"code frequency of {cfg.tabularization.min_code_inclusion_frequency}:",
    # )
    # print("Train Time: \n", model._profile_durations())
    # print("Train Iterator Time: \n", model.itrain._profile_durations())
    # print("Tuning Iterator Time: \n", model.ituning._profile_durations())
    # print("Held Out Iterator Time: \n", model.iheld_out._profile_durations())

    # save model
    output_fp = Path(cfg.output_filepath)
    output_fp.parent.mkdir(parents=True, exist_ok=True)

    model.save_model(output_fp)
    # except Exception as e:
    #     logger.error(f"Error occurred: {e}")
    #     auc = 0.0
    return auc


1 change: 0 additions & 1 deletion src/MEDS_tabular_automl/scripts/tabularize_static.py
@@ -86,7 +86,6 @@ def main(
        hydra_loguru_init()

    # Step 1: Cache the filtered features that will be used in the tabularization process and modeling
    # import pdb; pdb.set_trace()
    def read_fn(_):
        return _
6 changes: 4 additions & 2 deletions src/MEDS_tabular_automl/sklearn_model.py
@@ -9,6 +9,7 @@
from sklearn.metrics import roc_auc_score

from .tabular_dataset import TabularDataset
from .base_model import BaseModel


class SklearnIterator(TabularDataset, TimeableMixin):
@@ -77,7 +78,7 @@ def get_label(self):
        return self.labels


class SklearnModel(TimeableMixin):
class SklearnModel(BaseModel, TimeableMixin):
"""Class for configuring, training, and evaluating an SciKit-Learn model.
This class utilizes the configuration settings provided to manage the training and evaluation
Expand All @@ -86,7 +87,7 @@ class SklearnModel(TimeableMixin):
disk using iterators.
Args:
cfg: The configuration settings for the model, including data paths, model parameters,
cfg: The configuration settings for the model, including data paths, model parameters,ß
and flags for data handling.
Attributes:
Expand All @@ -107,6 +108,7 @@ def __init__(self, cfg: DictConfig):
        Args:
            cfg: The configuration dictionary.
        """
        super().__init__()
        self.cfg = cfg
        self.keep_data_in_memory = cfg.model_params.iterator.keep_data_in_memory
1 change: 0 additions & 1 deletion src/MEDS_tabular_automl/utils.py
@@ -108,7 +108,6 @@ def load_tqdm(use_tqdm: bool):

        return tqdm
    else:

        def noop(x, **kwargs):
            return x