ENH: config setup and RF search space #31

Merged
4 commits merged on Sep 4, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -162,3 +162,4 @@ x_*
# all but one json config file
*.json
!run_config.json
!run_config_whparams.json
1 change: 1 addition & 0 deletions README.md
@@ -40,6 +40,7 @@ To define a suitable model configuration, please find the description of each va
| test_mode | Boolean flag to indicate if running in test mode. |
| tracking_uri | Which platform to use for experiment tracking, either "wandb" for WandB or "mlruns" for MLflow. See #model-tracking for set-up instructions. |
| train_size | Fraction of data to use for training (e.g., 0.8 for 80% train, 20% test split). |
| model_hyperparameters | Optional: for each model type, the hyperparameter ranges to search can be defined here. If this key is not provided, the default ranges defined in `model_space/static_searchspace.py` are used. An example configuration file with all hyperparameters set to their defaults can be found in `run_config_whparams.json`. |

### Local training
To locally train models with a defined configuration in `q2_ritme/run_config.json` run:
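For reference, a minimal sketch of how the new `model_hyperparameters` key could be written to a config file, based on the keys read by `static_searchspace.py` in the diff below. The values and file name are illustrative, not the shipped defaults from `run_config_whparams.json`: numeric parameters use `min`/`max` (plus `step` or `log` where the search space reads them), while categorical parameters use plain lists.

```python
import json

# Illustrative values only - the authoritative defaults live in
# model_space/static_searchspace.py and run_config_whparams.json.
run_config = {
    # ... other run_config keys (train_size, tracking_uri, ...) go here
    "model_hyperparameters": {
        "alpha": {"min": 0.0, "max": 1.0},                     # linreg
        "n_estimators": {"min": 40, "max": 200, "step": 20},   # rf
        "max_depth": [4, 8, 16, 32, None],                     # rf, categorical
        "min_samples_split": {"min": 0.001, "max": 0.1, "log": True},
        "max_features": [None, "sqrt", "log2", 0.1],           # rf, categorical
    },
}

# Hypothetical file name; the README points local runs at q2_ritme/run_config.json.
with open("my_run_config.json", "w") as f:
    json.dump(run_config, f, indent=2)
```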
192 changes: 159 additions & 33 deletions q2_ritme/model_space/static_searchspace.py
@@ -52,91 +52,213 @@ def get_data_eng_space(trial, tax, test_mode: bool = False) -> None:
return None


def get_linreg_space(trial, tax, test_mode: bool = False) -> Dict[str, str]:
def get_linreg_space(
trial, tax, test_mode: bool = False, model_hyperparameters: dict = {}
) -> Dict[str, str]:
get_data_eng_space(trial, tax, test_mode)

# alpha controls overall regularization strength; alpha = 0 is equivalent to
# ordinary least squares. Large alpha -> more regularization
trial.suggest_float("alpha", 0, 1)
alpha = model_hyperparameters.get("alpha", {"min": 0, "max": 1})
trial.suggest_float("alpha", alpha["min"], alpha["max"])

# balance between L1 and L2 reg., when =0 -> L2, when =1 -> L1
trial.suggest_float("l1_ratio", 0, 1)
l1_ratio = model_hyperparameters.get("l1_ratio", {"min": 0, "max": 1})
trial.suggest_float("l1_ratio", l1_ratio["min"], l1_ratio["max"])

return {"model": "linreg"}


def get_rf_space(trial, tax, test_mode: bool = False) -> Dict[str, str]:
def get_rf_space(
trial, tax, test_mode: bool = False, model_hyperparameters: dict = {}
) -> Dict[str, str]:
get_data_eng_space(trial, tax, test_mode)

# number of trees in the forest: more trees mean higher computational cost
trial.suggest_int("n_estimators", 50, 300)
n_estimators = model_hyperparameters.get(
"n_estimators", {"min": 40, "max": 200, "step": 20}
)
trial.suggest_int(
"n_estimators",
n_estimators["min"],
n_estimators["max"],
step=n_estimators["step"],
)

# max depth of the tree: the higher, the higher the probability of overfitting
trial.suggest_int("max_depth", 5, 50)
# ! 4, 8, 16, None
max_depth = model_hyperparameters.get("max_depth", [4, 8, 16, 32, None])
trial.suggest_categorical("max_depth", max_depth)

# min number of samples required to split an internal node: small
# values increase the probability of overfitting
trial.suggest_float("min_samples_split", 0.01, 0.1, step=0.01)
# ! 0.001, 0.01, 0.1
min_samples_split = model_hyperparameters.get(
"min_samples_split", {"min": 0.001, "max": 0.1, "log": True}
)
trial.suggest_float(
"min_samples_split",
min_samples_split["min"],
min_samples_split["max"],
log=min_samples_split["log"],
)

# a leaf should have at least this fraction of samples within - larger
# values can help reduce overfitting
# ! 0.0001, 0.001, 0.01
min_weight_fraction_leaf = model_hyperparameters.get(
"min_weight_fraction_leaf", {"min": 0.0001, "max": 0.01, "log": True}
)
trial.suggest_float(
"min_weight_fraction_leaf",
min_weight_fraction_leaf["min"],
min_weight_fraction_leaf["max"],
log=min_weight_fraction_leaf["log"],
)

# min number of samples required at a leaf node: small values increase
# the probability of overfitting
trial.suggest_categorical("min_samples_leaf", [0.005, 0.01, 0.05, 0.1])
min_samples_leaf = model_hyperparameters.get(
"min_samples_leaf", {"min": 0.001, "max": 0.1, "log": True}
)
trial.suggest_float(
"min_samples_leaf",
min_samples_leaf["min"],
min_samples_leaf["max"],
log=min_samples_leaf["log"],
)

# max number of features to consider when looking for the best split: small
# values can reduce overfitting
trial.suggest_categorical("max_features", [None, "sqrt", "log2", 0.1, 0.2, 0.5])
# ! None, "sqrt", "log2", 0.1
max_features = model_hyperparameters.get(
"max_features", [None, "sqrt", "log2", 0.1, 0.2, 0.5]
)
trial.suggest_categorical("max_features", max_features)

# a node is split only if the split decreases impurity by at least this value:
# large values help prevent overfitting
trial.suggest_float("min_impurity_decrease", 0.01, 0.1, step=0.01)
min_impurity_decrease = model_hyperparameters.get(
"min_impurity_decrease", {"min": 0.001, "max": 0.5, "log": True}
)
trial.suggest_float(
"min_impurity_decrease",
min_impurity_decrease["min"],
min_impurity_decrease["max"],
log=min_impurity_decrease["log"],
)

trial.suggest_categorical("bootstrap", [True, False])
# bootstrap: whether bootstrap samples are used when building trees
# ! True, False
bootstrap = model_hyperparameters.get("bootstrap", [True, False])
trial.suggest_categorical("bootstrap", bootstrap)

return {"model": "rf"}


def get_nn_space(
trial, tax, model_name: str, test_mode: bool = False
trial,
tax,
model_name: str,
test_mode: bool = False,
model_hyperparameters: dict = {},
) -> Dict[str, str]:
get_data_eng_space(trial, tax, test_mode)
# todo: make max_layers configurable parameter!
max_layers = 30

# max_layers: maximum number of hidden layers in the network
max_layers = model_hyperparameters.get(
"max_layers", {"min": 5, "max": 30, "step": 5}
)
max_layers_selected = trial.suggest_int(
"max_layers", max_layers["min"], max_layers["max"], step=max_layers["step"]
)

# Sample n_hidden_layers uniformly within the configured range, rounded to the configured step
trial.suggest_int("n_hidden_layers", 1, max_layers, step=5)
trial.suggest_categorical(
n_hidden_layers = model_hyperparameters.get(
"n_hidden_layers", {"min": 1, "max": max_layers, "step": 5}
)
trial.suggest_int(
"n_hidden_layers",
n_hidden_layers["min"],
n_hidden_layers["max"],
step=n_hidden_layers["step"],
)

learning_rate = model_hyperparameters.get(
"learning_rate", [0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
)
trial.suggest_categorical("batch_size", [32, 64, 128, 256])
trial.suggest_categorical("epochs", [10, 50, 100, 200])
trial.suggest_categorical("learning_rate", learning_rate)

batch_size = model_hyperparameters.get("batch_size", [32, 64, 128, 256])
trial.suggest_categorical("batch_size", batch_size)

epochs = model_hyperparameters.get("epochs", [10, 50, 100, 200])
trial.suggest_categorical("epochs", epochs)

# first and last layers are fixed by the shape of the features and the target
for i in range(max_layers):
trial.suggest_categorical(f"n_units_hl{i}", [32, 64, 128, 256, 512])
n_units_hl = model_hyperparameters.get("n_units_hl", [32, 64, 128, 256, 512])
for i in range(max_layers_selected):
trial.suggest_categorical(f"n_units_hl{i}", n_units_hl)

return {"model": model_name}


def get_xgb_space(trial, tax, test_mode: bool = False) -> Dict[str, Any]:
def get_xgb_space(
trial, tax, test_mode: bool = False, model_hyperparameters: dict = {}
) -> Dict[str, Any]:
get_data_eng_space(trial, tax, test_mode)

# value between 2 and 6 is often a good starting point
trial.suggest_int("max_depth", 2, 10)
# max_depth: a value between 2 and 6 is often a good starting point
max_depth = model_hyperparameters.get("max_depth", {"min": 2, "max": 10})
trial.suggest_int("max_depth", max_depth["min"], max_depth["max"])

# depends on sample size: 0 - 2% # todo make depends on sample number
trial.suggest_int("min_child_weight", 0, 4)
trial.suggest_categorical("subsample", [0.7, 0.8, 0.9, 1.0])
trial.suggest_categorical("eta", [0.01, 0.05, 0.1, 0.2, 0.3])
trial.suggest_int("num_parallel_tree", 1, 3, step=1)
# min_child_weight: depends on sample size: 0 - 2%
# todo: make this dependent on the sample number
min_child_weight = model_hyperparameters.get(
"min_child_weight", {"min": 0, "max": 4}
)
trial.suggest_int(
"min_child_weight", min_child_weight["min"], min_child_weight["max"]
)

# subsample: fraction of the training samples used to grow each tree
subsample = model_hyperparameters.get("subsample", {"min": 0.7, "max": 1.0})
trial.suggest_float("subsample", subsample["min"], subsample["max"])

# eta (learning rate)
eta = model_hyperparameters.get("eta", {"min": 0.01, "max": 0.3})
trial.suggest_float("eta", eta["min"], eta["max"], log=True)

# num_parallel_tree
num_parallel_tree = model_hyperparameters.get(
"num_parallel_tree", {"min": 1, "max": 3, "step": 1}
)
trial.suggest_int(
"num_parallel_tree",
num_parallel_tree["min"],
num_parallel_tree["max"],
step=num_parallel_tree["step"],
)

return {"model": "xgb"}


def get_trac_space(trial, tax, test_mode: bool = False) -> Dict[str, Any]:
def get_trac_space(
trial, tax, test_mode: bool = False, model_hyperparameters: dict = {}
) -> Dict[str, Any]:
# no feature_transformation to be used for trac
# data_aggregate=taxonomy is not an option because the tax tree does not
# match the feature IDs here

# with loguniform: sampled values are more densely concentrated
# towards the lower end of the range
trial.suggest_float("lambda", 1e-3, 1.0, log=True)
lambda_param = model_hyperparameters.get(
"lambda", {"min": 1e-3, "max": 1.0, "log": True}
)
trial.suggest_float(
"lambda", lambda_param["min"], lambda_param["max"], log=lambda_param["log"]
)

data_eng_space_trac = {
"data_aggregation": None,
"data_selection": None,
@@ -149,7 +271,11 @@ def get_trac_space(trial, tax, test_mode: bool = False) -> Dict[str, Any]:


def get_search_space(
trial, model_type: str, tax, test_mode: bool = False
trial,
model_type: str,
tax,
test_mode: bool = False,
model_hyperparameters: Dict[str, Any] = {},
) -> Optional[Dict[str, Any]]:
"""Creates the search space"""
if model_type in ["xgb", "linreg", "rf", "trac"]:
@@ -159,8 +285,8 @@ def get_search_space(
"rf": get_rf_space,
"trac": get_trac_space,
}
return space_functions[model_type](trial, tax, test_mode)
return space_functions[model_type](trial, tax, test_mode, model_hyperparameters)
elif model_type in ["nn_reg", "nn_class", "nn_corn"]:
return get_nn_space(trial, tax, model_type, test_mode)
return get_nn_space(trial, tax, model_type, test_mode, model_hyperparameters)
else:
raise ValueError(f"Model type {model_type} not supported.")
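To make the new `model_hyperparameters` pass-through concrete, here is a minimal usage sketch of `get_search_space`. It assumes the trial object follows the Optuna `suggest_*` API that the functions above call; the placeholder arguments (`tax=None`, the override dict, the wrapper function name) are illustrative and not how the package's own tuner wires this up.

```python
import optuna

from q2_ritme.model_space.static_searchspace import get_search_space

# Override only the RF tree-count range; every parameter not listed here
# keeps the default hard-coded in get_rf_space.
custom_ranges = {"n_estimators": {"min": 100, "max": 500, "step": 50}}


def define_rf_space(trial: optuna.Trial) -> dict:
    # Registers all suggested parameters on the trial and returns {"model": "rf"}.
    return get_search_space(
        trial,
        model_type="rf",
        tax=None,  # placeholder: a real run passes the taxonomy object
        test_mode=True,
        model_hyperparameters=custom_ranges,
    )
```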
2 changes: 2 additions & 0 deletions q2_ritme/model_space/static_trainables.py
@@ -302,6 +302,8 @@ def train_rf(
# ray tune uses joblib for parallelization - so this makes sure all
# available resources are used
n_jobs=None,
# set randomness
random_state=seed_model,
)
rf.fit(X_train, y_train)

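The `static_trainables.py` change is small but worth spelling out: pinning `random_state` makes repeated RF fits reproducible for a given seed. A hedged sketch of the pattern, assuming a scikit-learn `RandomForestRegressor` (the estimator class is not visible in this hunk) and keeping `seed_model` as the seed passed in by the caller:

```python
from sklearn.ensemble import RandomForestRegressor


def build_rf(train_params: dict, seed_model: int) -> RandomForestRegressor:
    return RandomForestRegressor(
        **train_params,
        # n_jobs=None lets Ray Tune's joblib backend control parallelism
        n_jobs=None,
        # fixing random_state pins bootstrap and feature sampling,
        # so the same seed reproduces the same forest
        random_state=seed_model,
    )
```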