ENH: config setup and RF search space #31

Merged
4 commits merged on Sep 4, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -162,3 +162,4 @@ x_*
# all but one json config file
*.json
!run_config.json
!run_config_whparams.json
1 change: 1 addition & 0 deletions README.md
@@ -40,6 +40,7 @@ To define a suitable model configuration, please find the description of each va
| test_mode | Boolean flag to indicate if running in test mode. |
| tracking_uri | Which platform to use for experiment tracking, either "wandb" for WandB or "mlruns" for MLflow. See #model-tracking for set-up instructions. |
| train_size | Fraction of data to use for training (e.g., 0.8 for 80% train, 20% test split). |
| model_hyperparameters | Optional: for each model type, the hyperparameter ranges to search can be defined here. If this key is not provided, the default ranges defined in `model_space/static_searchspace.py` are used. An example configuration file with all hyperparameters set to their defaults can be found in `run_config_whparams.json`. |

### Local training
To locally train models with a defined configuration in `q2_ritme/run_config.json` run:
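For reference, a minimal sketch of how the new `model_hyperparameters` key could be written to a config file, based on the keys read by `static_searchspace.py` in the diff below. The values and file name are illustrative, not the shipped defaults from `run_config_whparams.json`: numeric parameters use `min`/`max` (plus `step` or `log` where the search space reads them), while categorical parameters use plain lists.

```python
import json

# Illustrative values only - the authoritative defaults live in
# model_space/static_searchspace.py and run_config_whparams.json.
run_config = {
    # ... other run_config keys (train_size, tracking_uri, ...) go here
    "model_hyperparameters": {
        "alpha": {"min": 0.0, "max": 1.0},                     # linreg
        "n_estimators": {"min": 40, "max": 200, "step": 20},   # rf
        "max_depth": [4, 8, 16, 32, None],                     # rf, categorical
        "min_samples_split": {"min": 0.001, "max": 0.1, "log": True},
        "max_features": [None, "sqrt", "log2", 0.1],           # rf, categorical
    },
}

# Hypothetical file name; the README points local runs at q2_ritme/run_config.json.
with open("my_run_config.json", "w") as f:
    json.dump(run_config, f, indent=2)
```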
192 changes: 159 additions & 33 deletions q2_ritme/model_space/static_searchspace.py
@@ -52,91 +52,213 @@ def get_data_eng_space(trial, tax, test_mode: bool = False) -> None:
return None


def get_linreg_space(trial, tax, test_mode: bool = False) -> Dict[str, str]:
def get_linreg_space(
trial, tax, test_mode: bool = False, model_hyperparameters: dict = {}
) -> Dict[str, str]:
get_data_eng_space(trial, tax, test_mode)

# alpha controls overall regularization strength; alpha = 0 is equivalent to
# ordinary least squares. Large alpha -> more regularization
trial.suggest_float("alpha", 0, 1)
alpha = model_hyperparameters.get("alpha", {"min": 0, "max": 1})
trial.suggest_float("alpha", alpha["min"], alpha["max"])

# balance between L1 and L2 reg., when =0 -> L2, when =1 -> L1
trial.suggest_float("l1_ratio", 0, 1)
l1_ratio = model_hyperparameters.get("l1_ratio", {"min": 0, "max": 1})
trial.suggest_float("l1_ratio", l1_ratio["min"], l1_ratio["max"])

return {"model": "linreg"}


def get_rf_space(trial, tax, test_mode: bool = False) -> Dict[str, str]:
def get_rf_space(
trial, tax, test_mode: bool = False, model_hyperparameters: dict = {}
) -> Dict[str, str]:
get_data_eng_space(trial, tax, test_mode)

# number of trees in the forest: more trees mean higher computational cost
trial.suggest_int("n_estimators", 50, 300)
n_estimators = model_hyperparameters.get(
"n_estimators", {"min": 40, "max": 200, "step": 20}
)
trial.suggest_int(
"n_estimators",
n_estimators["min"],
n_estimators["max"],
step=n_estimators["step"],
)

# max depth of the tree: the higher, the higher the probability of overfitting
trial.suggest_int("max_depth", 5, 50)
# ! 4, 8, 16, None
max_depth = model_hyperparameters.get("max_depth", [4, 8, 16, 32, None])
trial.suggest_categorical("max_depth", max_depth)

# min number of samples required to split an internal node: small
# values increase the probability of overfitting
trial.suggest_float("min_samples_split", 0.01, 0.1, step=0.01)
# ! 0.001, 0.01, 0.1
min_samples_split = model_hyperparameters.get(
"min_samples_split", {"min": 0.001, "max": 0.1, "log": True}
)
trial.suggest_float(
"min_samples_split",
min_samples_split["min"],
min_samples_split["max"],
log=min_samples_split["log"],
)

# a leaf should have at least this fraction of samples within - larger
# values can help reduce overfitting
# ! 0.0001, 0.001, 0.01
min_weight_fraction_leaf = model_hyperparameters.get(
"min_weight_fraction_leaf", {"min": 0.0001, "max": 0.01, "log": True}
)
trial.suggest_float(
"min_weight_fraction_leaf",
min_weight_fraction_leaf["min"],
min_weight_fraction_leaf["max"],
log=min_weight_fraction_leaf["log"],
)

# min number of samples required at a leaf node: small values increase
# the probability of overfitting
trial.suggest_categorical("min_samples_leaf", [0.005, 0.01, 0.05, 0.1])
min_samples_leaf = model_hyperparameters.get(
"min_samples_leaf", {"min": 0.001, "max": 0.1, "log": True}
)
trial.suggest_float(
"min_samples_leaf",
min_samples_leaf["min"],
min_samples_leaf["max"],
log=min_samples_leaf["log"],
)

# max number of features to consider when looking for the best split: small
# values can reduce overfitting
trial.suggest_categorical("max_features", [None, "sqrt", "log2", 0.1, 0.2, 0.5])
# ! None, "sqrt", "log2", 0.1
max_features = model_hyperparameters.get(
"max_features", [None, "sqrt", "log2", 0.1, 0.2, 0.5]
)
trial.suggest_categorical("max_features", max_features)

# a node is split only if the split decreases impurity by at least this value:
# large values help prevent overfitting
trial.suggest_float("min_impurity_decrease", 0.01, 0.1, step=0.01)
min_impurity_decrease = model_hyperparameters.get(
"min_impurity_decrease", {"min": 0.001, "max": 0.5, "log": True}
)
trial.suggest_float(
"min_impurity_decrease",
min_impurity_decrease["min"],
min_impurity_decrease["max"],
log=min_impurity_decrease["log"],
)

trial.suggest_categorical("bootstrap", [True, False])
# bootstrap: whether bootstrap samples are used when building trees
# ! True, False
bootstrap = model_hyperparameters.get("bootstrap", [True, False])
trial.suggest_categorical("bootstrap", bootstrap)

return {"model": "rf"}


def get_nn_space(
trial, tax, model_name: str, test_mode: bool = False
trial,
tax,
model_name: str,
test_mode: bool = False,
model_hyperparameters: dict = {},
) -> Dict[str, str]:
get_data_eng_space(trial, tax, test_mode)
# todo: make max_layers configurable parameter!
max_layers = 30

# max_layers: maximum number of hidden layers in the network
max_layers = model_hyperparameters.get(
"max_layers", {"min": 5, "max": 30, "step": 5}
)
max_layers_selected = trial.suggest_int(
"max_layers", max_layers["min"], max_layers["max"], step=max_layers["step"]
)

# Sample n_hidden_layers uniformly within the configured range, rounded to the configured step
trial.suggest_int("n_hidden_layers", 1, max_layers, step=5)
trial.suggest_categorical(
n_hidden_layers = model_hyperparameters.get(
"n_hidden_layers", {"min": 1, "max": max_layers, "step": 5}
)
trial.suggest_int(
"n_hidden_layers",
n_hidden_layers["min"],
n_hidden_layers["max"],
step=n_hidden_layers["step"],
)

learning_rate = model_hyperparameters.get(
"learning_rate", [0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
)
trial.suggest_categorical("batch_size", [32, 64, 128, 256])
trial.suggest_categorical("epochs", [10, 50, 100, 200])
trial.suggest_categorical("learning_rate", learning_rate)

batch_size = model_hyperparameters.get("batch_size", [32, 64, 128, 256])
trial.suggest_categorical("batch_size", batch_size)

epochs = model_hyperparameters.get("epochs", [10, 50, 100, 200])
trial.suggest_categorical("epochs", epochs)

# first and last layers are fixed by the shape of the features and the target
for i in range(max_layers):
trial.suggest_categorical(f"n_units_hl{i}", [32, 64, 128, 256, 512])
n_units_hl = model_hyperparameters.get("n_units_hl", [32, 64, 128, 256, 512])
for i in range(max_layers_selected):
trial.suggest_categorical(f"n_units_hl{i}", n_units_hl)

return {"model": model_name}


def get_xgb_space(trial, tax, test_mode: bool = False) -> Dict[str, Any]:
def get_xgb_space(
trial, tax, test_mode: bool = False, model_hyperparameters: dict = {}
) -> Dict[str, Any]:
get_data_eng_space(trial, tax, test_mode)

# value between 2 and 6 is often a good starting point
trial.suggest_int("max_depth", 2, 10)
# max_depth: a value between 2 and 6 is often a good starting point
max_depth = model_hyperparameters.get("max_depth", {"min": 2, "max": 10})
trial.suggest_int("max_depth", max_depth["min"], max_depth["max"])

# depends on sample size: 0 - 2% # todo make depends on sample number
trial.suggest_int("min_child_weight", 0, 4)
trial.suggest_categorical("subsample", [0.7, 0.8, 0.9, 1.0])
trial.suggest_categorical("eta", [0.01, 0.05, 0.1, 0.2, 0.3])
trial.suggest_int("num_parallel_tree", 1, 3, step=1)
# min_child_weight: depends on sample size: 0 - 2%
# todo: make this dependent on the sample number
min_child_weight = model_hyperparameters.get(
"min_child_weight", {"min": 0, "max": 4}
)
trial.suggest_int(
"min_child_weight", min_child_weight["min"], min_child_weight["max"]
)

# subsample: fraction of the training samples used to grow each tree
subsample = model_hyperparameters.get("subsample", {"min": 0.7, "max": 1.0})
trial.suggest_float("subsample", subsample["min"], subsample["max"])

# eta (learning rate)
eta = model_hyperparameters.get("eta", {"min": 0.01, "max": 0.3})
trial.suggest_float("eta", eta["min"], eta["max"], log=True)

# num_parallel_tree
num_parallel_tree = model_hyperparameters.get(
"num_parallel_tree", {"min": 1, "max": 3, "step": 1}
)
trial.suggest_int(
"num_parallel_tree",
num_parallel_tree["min"],
num_parallel_tree["max"],
step=num_parallel_tree["step"],
)

return {"model": "xgb"}


def get_trac_space(trial, tax, test_mode: bool = False) -> Dict[str, Any]:
def get_trac_space(
trial, tax, test_mode: bool = False, model_hyperparameters: dict = {}
) -> Dict[str, Any]:
# no feature_transformation to be used for trac
# data_aggregate=taxonomy is not an option because the tax tree does not
# match the feature IDs here

# with loguniform: sampled values are more densely concentrated
# towards the lower end of the range
trial.suggest_float("lambda", 1e-3, 1.0, log=True)
lambda_param = model_hyperparameters.get(
"lambda", {"min": 1e-3, "max": 1.0, "log": True}
)
trial.suggest_float(
"lambda", lambda_param["min"], lambda_param["max"], log=lambda_param["log"]
)

data_eng_space_trac = {
"data_aggregation": None,
"data_selection": None,
@@ -149,7 +271,11 @@ def get_trac_space(trial, tax, test_mode: bool = False) -> Dict[str, Any]:


def get_search_space(
trial, model_type: str, tax, test_mode: bool = False
trial,
model_type: str,
tax,
test_mode: bool = False,
model_hyperparameters: Dict[str, Any] = {},
) -> Optional[Dict[str, Any]]:
"""Creates the search space"""
if model_type in ["xgb", "linreg", "rf", "trac"]:
@@ -159,8 +285,8 @@ def get_search_space(
"rf": get_rf_space,
"trac": get_trac_space,
}
return space_functions[model_type](trial, tax, test_mode)
return space_functions[model_type](trial, tax, test_mode, model_hyperparameters)
elif model_type in ["nn_reg", "nn_class", "nn_corn"]:
return get_nn_space(trial, tax, model_type, test_mode)
return get_nn_space(trial, tax, model_type, test_mode, model_hyperparameters)
else:
raise ValueError(f"Model type {model_type} not supported.")
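To make the new `model_hyperparameters` pass-through concrete, here is a minimal usage sketch of `get_search_space`. It assumes the trial object follows the Optuna `suggest_*` API that the functions above call; the placeholder arguments (`tax=None`, the override dict, the wrapper function name) are illustrative and not how the package's own tuner wires this up.

```python
import optuna

from q2_ritme.model_space.static_searchspace import get_search_space

# Override only the RF tree-count range; every parameter not listed here
# keeps the default hard-coded in get_rf_space.
custom_ranges = {"n_estimators": {"min": 100, "max": 500, "step": 50}}


def define_rf_space(trial: optuna.Trial) -> dict:
    # Registers all suggested parameters on the trial and returns {"model": "rf"}.
    return get_search_space(
        trial,
        model_type="rf",
        tax=None,  # placeholder: a real run passes the taxonomy object
        test_mode=True,
        model_hyperparameters=custom_ranges,
    )
```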
2 changes: 2 additions & 0 deletions q2_ritme/model_space/static_trainables.py
@@ -302,6 +302,8 @@ def train_rf(
# ray tune uses joblib for parallelization - so this makes sure all
# available resources are used
n_jobs=None,
# set randomness
random_state=seed_model,
)
rf.fit(X_train, y_train)

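The `static_trainables.py` change is small but worth spelling out: pinning `random_state` makes repeated RF fits reproducible for a given seed. A hedged sketch of the pattern, assuming a scikit-learn `RandomForestRegressor` (the estimator class is not visible in this hunk) and keeping `seed_model` as the seed passed in by the caller:

```python
from sklearn.ensemble import RandomForestRegressor


def build_rf(train_params: dict, seed_model: int) -> RandomForestRegressor:
    return RandomForestRegressor(
        **train_params,
        # n_jobs=None lets Ray Tune's joblib backend control parallelism
        n_jobs=None,
        # fixing random_state pins bootstrap and feature sampling,
        # so the same seed reproduces the same forest
        random_state=seed_model,
    )
```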