From 4be086161db3a1838affd8d7729d9920e1431f0b Mon Sep 17 00:00:00 2001 From: chasem Date: Tue, 19 Nov 2024 13:02:47 -0600 Subject: [PATCH 1/7] entry point added. source code significantly adjusted. tutorial unified --- .../interactor_modeling_workflow.ipynb | 782 +++++------------- yeastdnnexplorer/__main__.py | 256 ++++++ yeastdnnexplorer/ml_models/lasso_modeling.py | 362 +++++--- .../tests/ml_models/test_lasso_modeling.py | 111 ++- 4 files changed, 791 insertions(+), 720 deletions(-) diff --git a/docs/tutorials/interactor_modeling_workflow.ipynb b/docs/tutorials/interactor_modeling_workflow.ipynb index ff2792c..0386d3f 100644 --- a/docs/tutorials/interactor_modeling_workflow.ipynb +++ b/docs/tutorials/interactor_modeling_workflow.ipynb @@ -2,35 +2,25 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# configure the logger to print to console\n", - "from typing import Any\n", "import logging\n", - "from matplotlib import pyplot as plt\n", - "import re\n", "import random\n", "\n", "import pandas as pd\n", "import numpy as np\n", - "from sklearn.linear_model import LassoCV\n", - "\n", - "\n", "\n", "from yeastdnnexplorer.ml_models.lasso_modeling import (\n", " generate_modeling_data,\n", - " stratification_classification,\n", - " stratified_cv_modeling,\n", - " bootstrap_stratified_cv_modeling,\n", - " examine_bootstrap_coefficients,\n", + " get_significant_predictors,\n", " stratified_cv_r2,\n", " get_interactor_importance,\n", - " backwards_OLS_feature_selection,\n", + " OLSFeatureSelector\n", " )\n", "\n", - "\n", "logging.basicConfig(level=logging.ERROR)\n", "\n", "random.seed(42)\n", @@ -69,32 +59,38 @@ "\n", "## Interactor sparse modeling\n", "\n", - "1. First, we apply bootstrapping to a 4-fold cross-validated Lasso model. The folds\n", - "are stratified based on the binding data domain of the perturbed TF, ensuring that\n", - "each fold better represents the domain structure.\n", - "\n", - " - We produce two variations of this model:\n", - " \n", - " 1. A model trained using all available data.\n", - " \n", - " 2. A model trained using only the top 10% of data based on the binding\n", - " score of the perturbed TF.\n", - "\n", - "1. For model `1.1`, we select coefficients whose 99.8% confidence interval does not\n", - "include zero. For model `1.2`, we select coefficients whose 90.0% confidence interval\n", - "does not include zero. We assume that, due to the non-linear relationship between\n", - "perturbation response and binding, interaction effects are more pronounced in the\n", - "top 10% of the data. By intersecting the coefficients from both models, we highlight\n", - "those that are predictive across the full dataset.\n", - "\n", - "1. Now, with our reduced set of features, we perform the exact same process as Step 3 in\n", - "Workflow 1 above. That is, With this set of predictors, next create an OLS model using the same 4-fold\n", - "stratified cross validation from which we calculated an average $R^2$. Next, for each\n", - "interactor in the model, we produce one other cross validated OLS model by\n", - "replacing the interactor with its corresponding main effect. We note if this\n", - "variant yields a better average $R^2$. We remove all features from our set in which the main effect outperforms the interactor term.\n", - "\n", - "1. Finally, we report, as significant interactors, the interaction terms which have survived the steps above. 
We use the average R-squared achieved by this model and compare it to the average R-squared achieved by the univariate counterpart (the response TF predicted solely by its main effect). We would like to create a boxplot of this comparison across all TFs to see how this pipeline affects model performance in explaining variance in contrast to the simple univariate model.\n" + "1. We currently have two methods to produce a set of what we deem to be significant\n", + "initial parameters. In the first, we use the bootstrap on a 4-fold cross-validated\n", + "Lasso model. In this case, we consider a coefficient significant if the confidence\n", + "interval generated by the bootstrap is far enough away from 0. The second method is\n", + "to perform 4-fold cross validated Lasso once, and then with the non-zero coefficients, \n", + "iteratively use OLS modeling with a p-value threshold to drop insignificant predictors. \n", + "With either method, we produce two variations of this model:\n", + "\n", + " 1. A model trained using all available data. For the bootstrapped LassoCV, we\n", + " accept a coefficient if its 99.8% confidence interval does not include 0.0. For\n", + " the OLS reduction method, we accept a predictor when its p-value is <= 0.001\n", + " \n", + " 2. A model trained using only the top 10% of data based on the binding\n", + " score of the perturbed TF. For the bootstrapped LassoCV, we\n", + " accept a coefficient if its 90.0% confidence interval does not include 0.0. For\n", + " the OLS reduction method, we accept a predictor when its p-value is <= 0.01\n", + "\n", + "1. We intersect predictors that result from steps 1.1 and 1.2\n", + "\n", + "1. With this set of predictors, we then test whether the interactor is a significantly\n", + "better predictor than the corresponding main effect. We do this by individually\n", + "swapping out the interactor term for its main effect and comparing the average\n", + "r-squared of OLS models across 4 folds. When this process is complete, we adjust our\n", + "predictors by replacing interactors with main effects where the main effect was deemed\n", + "better.\n", + "\n", + "1. Finally, we report, as significant interactors, the interaction terms which have\n", + "survived the steps above. We use the average R-squared achieved by this model and\n", + "compare it to the average R-squared achieved by the univariate counterpart (the\n", + "response TF predicted solely by its main effect). We would like to create a boxplot\n", + "of this comparison across all TFs to see how this pipeline affects model performance\n", + "in explaining variance in contrast to the simple univariate model.\n" ] }, { @@ -119,12 +115,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Step 1: Find significant predictors using all of the data\n", - "\n", - "The function `get_significant_predictors()` is a wrapper of the lassoCV bootstrap \n", - "protocol described in the LassoCV notebook. It allows using the same code to produce\n", - "both the 'all data' (step 1.1) and 'top 10%' models (step 1.2), and returns the \n", - "significant coefficients as described in the protocol above." 
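Editor's illustration: the bootstrap selection rule in method 1 ("the confidence interval generated by the bootstrap is far enough away from 0") amounts to keeping a coefficient only when its two-sided percentile confidence interval lies entirely on one side of zero. The sketch below illustrates that criterion only; it is not the package's implementation (`examine_bootstrap_coefficients` performs the real check, which also supports an extra threshold), and the helper name `ci_excludes_zero` is hypothetical.

```python
import numpy as np


def ci_excludes_zero(coef_samples: np.ndarray, ci_percentile: float) -> bool:
    """Return True when the two-sided percentile CI does not contain 0."""
    tail = (100.0 - ci_percentile) / 2.0
    lower, upper = np.percentile(coef_samples, [tail, 100.0 - tail])
    # e.g. ci_percentile=99.8 keeps only coefficients whose 99.8% CI excludes 0
    return lower > 0.0 or upper < 0.0
```

For instance, with `ci_percentile=90.0` on the top-10% model, a coefficient whose bootstrap samples straddle zero is discarded even if its mean is large.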
+ "We will demonstrate both methods of identifying a significant set of interactor terms\n", + "using the TF Cbf1" ] }, { @@ -133,104 +125,24 @@ "metadata": {}, "outputs": [], "source": [ + "model_tf = 'CBF1'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1: Find significant predictors using all of the data\n", "\n", - "# TODO: the top10% models likely should not have teh same number of classes as the\n", - "# all data, possibly not stratified at all\n", - "\n", - "def get_significant_predictors(\n", - " perturbed_tf: str,\n", - " response_df: pd.DataFrame,\n", - " predictors_df: pd.DataFrame,\n", - " add_max_lrb: bool,\n", - " **kwargs: Any,\n", - ") -> tuple[dict[str, tuple[float, float]], pd.DataFrame, pd.Series]:\n", - " \"\"\"\n", - " This function is used to get the significant predictors for a given TF. It is\n", - " capable of conducting steps 1.1 and 1.2 described above.\n", - "\n", - " :param perturbed_tf: the TF for which the significant predictors are to be\n", - " identified\n", - " :param response_df: The DataFrame containing the response values\n", - " :param predictors_df: The DataFrame containing the predictor values\n", - " :param add_max_lrb: A boolean to add/not add in the max_LRB term for a response TF\n", - " into the formula that we perform bootstrapping on\n", - " :param kwargs: Additional arguments to be passed to the function. Expected arguments\n", - " are 'quantile_threshold' from generate_modeling_data() and 'ci_percentile' from\n", - " examine_bootstrap_coefficients()\n", - " :return sig_coef_dict: A dictionary containing the significant predictors and their\n", - " corresponding coefficients\n", - "\n", - " \"\"\"\n", - "\n", - " y, X = generate_modeling_data(\n", - " perturbed_tf,\n", - " response_df,\n", - " predictors_df,\n", - " quantile_threshold=kwargs.get(\"quantile_threshold\", None),\n", - " drop_intercept=True,\n", - " )\n", - "\n", - " # NOTE: fit_intercept is set to `true`\n", - " lassoCV_estimator = LassoCV(\n", - " fit_intercept=True,\n", - " max_iter=10000,\n", - " selection=\"random\",\n", - " random_state=42,\n", - " n_jobs=4,\n", - " )\n", - "\n", - " predictor_variable = re.sub(r\"_rep\\d+\", \"\", perturbed_tf)\n", - "\n", - " stratification_classes = stratification_classification(\n", - " X[predictor_variable].squeeze(), y.squeeze()\n", - " )\n", - "\n", - " # Fit the model to the data in order to extract the alphas_ which are generated\n", - " # during the fitting process\n", - " lasso_model = stratified_cv_modeling(\n", - " y, X, stratification_classes, lassoCV_estimator\n", - " )\n", - "\n", - " if add_max_lrb:\n", - " # add a column to X which is the rowMax excluding column `predictor_variable`\n", - " # called max_lrb\n", - " X[\"max_lrb\"] = X.drop(columns=predictor_variable).max(axis=1)\n", - "\n", - " # set the alphas_ attribute of the lassoCV_estimator to the alphas_ attribute of the\n", - " # lasso_model fit on the whole data. 
This will allow the\n",
    "    # bootstrap_stratified_cv_modeling function to use the same set of lambdas\n",
    "    lassoCV_estimator.alphas_ = lasso_model.alphas_\n",
    "\n",
    "    # for test purposes, set n_bootstraps to 10\n",
    "    # NOTE: fit_intercept=True is passed to the internal Lasso model for bootstrap\n",
    "    # iterations, along with some other settings\n",
    "\n",
    "    logging.info(\"running bootstraps\")\n",
    "    bootstrap_lasso_output = bootstrap_stratified_cv_modeling(\n",
    "        y=y,\n",
    "        X=X,\n",
    "        estimator=lassoCV_estimator,\n",
    "        ci_percentile=kwargs.get(\"ci_percentile\", 95.0),\n",
    "        n_bootstraps=kwargs.get(\"n_bootstraps\", 10),\n",
    "        max_iter=10000,\n",
    "        fit_intercept=True,\n",
    "        selection=\"random\",\n",
    "        random_state=42,\n",
    "    )\n",
    "\n",
    "    sig_coef_plt, sig_coef_dict = examine_bootstrap_coefficients(\n",
    "        bootstrap_lasso_output, ci_level=kwargs.get(\"ci_percentile\", 95.0)\n",
    "    )\n",
    "\n",
    "    plt.close(sig_coef_plt)\n",
    "\n",
    "    return sig_coef_dict, y, stratification_classes\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The function `get_significant_predictors()` is capable of choosing significant\n",
    "predictors using two methods: either by the bootstrap with LassoCV, or by a single\n",
    "LassoCV model and subsequently choosing only significant coefficients through\n",
    "iterative OLS modeling."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Significant coefficients for 99.8, where intervals are entirely above or below ±0.0:\n",
      "CBF1:SWI6: (-0.14416455697760805, -0.009884405236493745)\n",
      "CBF1:RGM1: (0.014576695345595829, 0.1606224517888866)\n",
      "CBF1:ARG81: (-0.2148742948328769, -0.03345111234665489)\n",
      "CBF1:MET28: (0.07821562304993965, 0.20784574601671207)\n",
      "CBF1:AZF1: (-0.1541657715595909, -0.026421884027885357)\n",
      "CBF1:GAL4: (0.09086284981352442, 0.31851294627923615)\n",
      "CBF1:MSN2: (0.10065808506838021, 0.27766726330427544)\n",
      "max_lrb: (0.0018092836372706894, 0.0983814591861912)\n",
      "Significant coefficients for 90.0, where intervals are entirely above or below ±0.0:\n",
      "CBF1:MET28: (0.06998122210410228, 0.2088974100580522)\n",
      "Significant coefficients for 99.8, where intervals are entirely above or below ±0.0:\n",
      "CBF1:SWI6: (-0.14416455697760802, -0.009884405236493667)\n",
      "CBF1:RGM1: (0.014576695345595624, 0.16062245178888665)\n",
      "CBF1:ARG81: (-0.21487429483287668, -0.03345111234665463)\n",
      "CBF1:MET28: (0.07821562304993974, 0.20784574601671219)\n",
      "CBF1:AZF1: (-0.15416577155959094, -0.026421884027885402)\n",
      "CBF1:GAL4: (0.09086284981352438, 0.3185129462792361)\n",
      "CBF1:MSN2: (0.10065808506838035, 0.2776672633042757)\n",
      "max_lrb: (0.0018092836372706933, 0.0983814591861913)\n",
      "Significant coefficients for 90.0, where intervals are entirely above or below ±0.0:\n",
      "CBF1:MET28: (0.06998122210410229, 0.2088974100580522)\n"
     ]
    }
   ],
   "source": [
    "all_data_sig_coef, all_y, all_stratification_classes = get_significant_predictors(\n",
    "    \"CBF1\",\n",
    "    response_df,\n",
    "    predictors_df,\n",
    "    ci_percentile=99.8,\n",
    "    n_bootstraps=100,\n",
    "    add_max_lrb=True)\n",
    "\n",
    "top10_data_sig_coef, top10_y, top10_stratification_classes = get_significant_predictors(\n",
    "    \"CBF1\",\n",
    "    response_df,\n",
    "    predictors_df,\n",
    "    quantile_threshold=0.1,\n",
    "    ci_percentile=90.0,\n",
    "    n_bootstraps=100,\n",
    "    add_max_lrb=True)\n",
    "\n",
    "# step 1.1 with the bootstrapped lassoCV\n",
    "bootstrap_lasso_all_data = get_significant_predictors(\n",
    "    \"bootstrap_lassocv\",\n",
    "    model_tf,\n",
    "    response_df,\n",
    "    predictors_df,\n",
    "    ci_percentile=99.8,\n",
    "    n_bootstraps=100,\n",
    "    add_max_lrb=True)\n",
    "\n",
    "# step 1.2 with the bootstrapped lassoCV\n",
    "bootstrap_lasso_top10 = get_significant_predictors(\n",
    "    \"bootstrap_lassocv\",\n",
    "    model_tf,\n",
    "    response_df,\n",
    "    predictors_df,\n",
    "    quantile_threshold=0.1,\n",
    "
ci_percentile=90.0,\n", " n_bootstraps=100,\n", " add_max_lrb=True)\n", - "\n" + "\n", + "# step 1.1 with with the lassoCV method (the ols reduction comes later)\n", + "lassocv_ols_all_data = get_significant_predictors(\n", + " \"lassocv_ols\",\n", + " model_tf,\n", + " response_df,\n", + " predictors_df,\n", + " add_max_lrb=True)\n", + "\n", + "# step 1.2 with with the lassoCV method (the ols reduction comes later)\n", + "lassocv_ols_top10 = get_significant_predictors(\n", + " \"lassocv_ols\",\n", + " model_tf,\n", + " response_df,\n", + " predictors_df,\n", + " add_max_lrb=True,\n", + " quantile_threshold=0.1)" ] }, { @@ -285,551 +217,228 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The surviving coefficients are: {'CBF1:MET28'}\n" - ] - } - ], - "source": [ - "intersect_coefficients = set(all_data_sig_coef.keys()).intersection(set(top10_data_sig_coef.keys()))\n", - "print(f\"The surviving coefficients are: {intersect_coefficients}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 3\n", - "\n", - "We next implement the method which searches alternative models, which include the\n", - "surviving interactor terms, with variations on substituing in the main effect. In this case, \n", - "we have a single term. However, if we had more than one term, we would do the following for each surviving interactor term. The goal of this process, remember, is to generate a set of\n", - "high confidence interactor terms for this TF. If the predictive power of the main effect\n", - "is equivalent or better than a model with the interactor, we consider that a low\n", - "confidence interactor effect. \n", - "\n", - "After identifying all interactor terms for which substituting in the main effect improves\n", - "the average R-squared from CV, we drop these terms from our set of features. We then log the final average R-squared achieved by this model. In this case, no terms are dropped from testing the substitution of main effects." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The full model avg r-squared is 0.01051720848723997\n", - "The interactor results are: []\n", - "The final model avg r-squared is 0.01051720848723997\n", - "Final set of terms: {'CBF1:MET28'}\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/ericjia/yeastdnnexplorer/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.\n", - " warnings.warn(\n", - "/Users/ericjia/yeastdnnexplorer/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.\n", - " warnings.warn(\n" + "Bootstrap Lasso Intersect Coefs: {'CBF1:MET28'}\n", + "LassoCV Intersect Coefs: {'CBF1:MET28', 'CBF1:SKN7', 'CBF1:SWI6', 'CBF1:AZF1'}\n" ] } ], "source": [ + "bootstrap_lassocv_final_features = (set(bootstrap_lasso_all_data['sig_coefs'].keys())\n", + " .intersection(set(bootstrap_lasso_top10['sig_coefs'].keys())))\n", "\n", - "# get the additional main effects which will be tested from the intersect_coefficients\n", - "main_effects = []\n", - "for term in intersect_coefficients:\n", - " if \":\" in term:\n", - " main_effects.append(term.split(\":\")[1])\n", - " else:\n", - " main_effects.append(term)\n", - "\n", - "# combine these main effects with the intersect_coefficients\n", - "interactor_terms_and_main_effects = list(intersect_coefficients) + main_effects\n", - "\n", - "# generate a model matrix with the intersect terms and the main effects. This full\n", - "# model will not be used for modeling -- subsets of the columns will be, however.\n", - "_, full_X = generate_modeling_data(\n", - " 'CBF1',\n", - " response_df,\n", - " predictors_df,\n", - " formula = f\"~ {' + '.join(interactor_terms_and_main_effects)}\",\n", - " drop_intercept=False,\n", - ")\n", - "\n", - "full_X[\"max_lrb\"] = predictors_df.drop(columns=\"CBF1\").max(axis=1)\n", - "\n", - "# Currently, this function tests each interactor term in the intersect_coefficients\n", - "# with two variants by replacing the interaction term with the main effect only, and\n", - "# with the main effect + interactor. If either of the variants has a higher avg\n", - "# r-squared than the intersect_model, then that variant is returned. 
In this case,\n", - "# the original intersect_coefficients are the best model.\n", - "full_avg_rsquared, x = get_interactor_importance(\n", - " all_y,\n", - " full_X,\n", - " all_stratification_classes,\n", - " intersect_coefficients\n", - ")\n", - "\n", - "print(f\"The full model avg r-squared is {full_avg_rsquared}\")\n", - "print(f\"The interactor results are: {x}\")\n", - "\n", - "# Now that we have identifed the interactors whose main effects improve average R-squared, we drop them from our model\n", - "\n", - "if x:\n", - " interactors_to_remove = set()\n", - " for dictionary in x:\n", - " interactor = dictionary.get(\"interactor\")\n", - " interactors_to_remove.add(interactor) \n", - " print(\"Removing term: \"+str(interactor))\n", - "\n", - " final_feature_set = intersect_coefficients.difference(interactors_to_remove)\n", - " # get the final avg r-squared for this set\n", - " final_model_avg_r_squared, _ = get_interactor_importance(\n", - " all_y,\n", - " full_X,\n", - " all_stratification_classes,\n", - " final_feature_set\n", - " )\n", + "print(f\"Bootstrap Lasso Intersect Coefs: {bootstrap_lassocv_final_features}\")\n", "\n", - "else:\n", - " final_feature_set = intersect_coefficients\n", - " final_model_avg_r_squared = full_avg_rsquared\n", + "# these are not the final features from this method!\n", + "lassocv_ols_intersect_coefs = (set(lassocv_ols_all_data['sig_coefs'].keys())\n", + " .intersection(set(lassocv_ols_top10['sig_coefs'].keys())))\n", "\n", - "print(f\"The final model avg r-squared is {final_model_avg_r_squared}\")\n", - "print(f\"Final set of terms: {final_feature_set}\")" + "print(f\"LassoCV Intersect Coefs: {lassocv_ols_intersect_coefs}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Step 4: Comparing our final model to a univariate model\n", + "### OLS reduction only\n", "\n", - "In our last step, we take our reamining set of features from the end of Step 3, and now compare its performance to that of a univariate model where the response TF is predicted solely by its main effect. We will use the average R-squared achieved by 4-Fold CV on both models." + "For the LassoCV result, we go through another round of reduction. Using the intersect\n", + "of the LassoCV result, we consider iteratively use this set of predictors with an OLS\n", + "model. At eac iteration, if a predictor's pvalue falls below a given threshold, it is\n", + "removed from the model." 
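To make that iteration explicit, here is a minimal sketch of the reduction loop just described. It is illustrative only — the package's `OLSFeatureSelector` (used in the next cell) is the actual implementation — and the helper name `prune_by_pvalue` is hypothetical. Note the direction of the test: a predictor is dropped when its p-value *exceeds* the threshold.

```python
import pandas as pd
from statsmodels.api import OLS, add_constant


def prune_by_pvalue(X: pd.DataFrame, y: pd.Series, threshold: float) -> list[str]:
    """Backward elimination: refit OLS until every predictor is significant."""
    features = list(X.columns)
    while features:
        fit = OLS(y, add_constant(X[features], has_constant="skip")).fit()
        pvalues = fit.pvalues.drop("const", errors="ignore")
        worst = pvalues.idxmax()
        if pvalues[worst] <= threshold:
            break  # every remaining predictor is significant; stop iterating
        features.remove(worst)  # drop the least significant predictor and refit
    return features
```

Under this sketch, the "all data" pass would use `threshold=0.001` and the top-10% pass `threshold=0.01`, mirroring the thresholds used in the workflow above.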
] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The univariate average R-squared is: 0.004970412632932103\n", - "The final model average R-squared is 0.01051720848723997\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/ericjia/yeastdnnexplorer/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.\n", - " warnings.warn(\n" + "Significant features in all data: ['CBF1:MET28', 'CBF1:SWI6']\n", + "All data model summary:\n", + " coef std_err t pvalue\n", + "const 0.421557 0.006203 67.964194 0.000000e+00\n", + "CBF1:MET28 0.138165 0.012179 11.344703 1.547294e-29\n", + "CBF1:SWI6 -0.104014 0.015297 -6.799570 1.148398e-11\n", + "Significant features in top 10: ['CBF1:MET28']\n", + "Top 10 model summary:\n", + " coef std_err t pvalue\n", + "const 0.441569 0.029296 15.072468 6.710796e-44\n", + "CBF1:MET28 0.083300 0.016322 5.103594 4.451965e-07\n", + "LassoCV OLS Final Features: {'CBF1:MET28'}\n" ] } ], "source": [ - "y, X = generate_modeling_data(\"CBF1\", response_df, predictors_df, drop_intercept=True, formula=\"CBF1_LRR ~ CBF1\")\n", - "classes = stratification_classification(X[\"CBF1\"].squeeze(), y.squeeze())\n", - "avg_r2_univariate = stratified_cv_r2(\n", - " y,\n", - " X, \n", - " classes,\n", - " )\n", - "\n", - "print(f\"The univariate average R-squared is: {avg_r2_univariate}\")\n", - "print(f\"The final model average R-squared is {final_model_avg_r_squared}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As we can see, the final model we achieved through Workflow 1 demonstrates a higher average R-squared achieved by 4-fold CV. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Interactor Modeling Workflow 2\n", - "\n", - "An alternative workflow to identifying a meaningful set of transcription factors takes a slightly different approach. There are some commonalities between this workflow and Workflow 1, and we will point them out in the steps below which outline how this alternative workflow operates.\n", - "\n", - "\n", - "1. First, we apply a 4-fold cross-validated Lasso model (without bootstrapping - this is the \n", - "difference between this step and Step 1 from Workflow 1). The folds\n", - "are stratified based on the binding data domain of the perturbed TF, ensuring that\n", - "each fold better represents the domain structure.\n", - "\n", - " - We produce two variations of this model:\n", - " \n", - " 1. A model trained using all available data.\n", - " \n", - " 2. A model trained using only the top 10% of data based on the binding\n", - " score of the perturbed TF.\n", - "\n", - "2. Each Lasso model from Step 1 will return a set of non-zero coefficients. We then intersect the coefficients from both models, and retain this list of features\n", - "in the exact same fashion as Step 2 of Workflow 1. \n", - "\n", - "3. With this set of predictors, we then perform a \"backwards OLS feature selection,\" in \n", - "which we create OLS models using this set of predictors on the entire dataset, and remove\n", - "features which have a p-value above a threshold for significance (0.001). We continously \n", - "remove features and re-create models on the reduced set of features until all of the \n", - "features in a model are significant. 
Then, taking this filtered set of predictors, we \n", - "perform the same process, but this time on the top 10% of the data based on the binding\n", - "score of the perturbed TF. Since we are now using a smaller dataset, our threshold for \n", - "significance is increased (0.01), and we perform the same process until we arrive at a \n", - "final model in which all of the terms are significant. \n", - "\n", - "From here onwards, we follow the same steps as Step 3 and Step 4 in Workflow 1 above. I have copied their descriptions from above, and have renamed them Steps 4 and 5 to match the numbering for this workflow.\n", - "\n", - "4. Now, with our reduced set of features, we perform the exact same process as Step 3 in\n", - "Workflow 1 above. That is, With this set of predictors, next create an OLS model using the same 4-fold\n", - "stratified cross validation from which we calculated an average $R^2$. Next, for each\n", - "interactor in the model, we produce one other cross validated OLS model by\n", - "replacing the interactor with its corresponding main effect. We note if this\n", - "variant yields a better average $R^2$. We remove all features from our set in which the main effect outperforms the interactor term.\n", - "\n", - "5. Finally, we report, as significant interactors, the interaction terms which have survived the steps above. We use the average R-squared achieved by this model and compare it to the average R-squared achieved by the univariate counterpart (the response TF predicted solely by its main effect). We would like to create a boxplot of this comparison across all TFs to see how this pipeline affects model performance in explaining variance in contrast to the simple univariate model." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's choose a particular TF to run though this workflow with." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "tf_of_interest = \"CBF1\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 1: get the non-zero coefficients from the Lasso models\n", - "\n", - "The function `get_non_zero_predictors()` is a wrapper of the stratified CV modeling \n", - "protocol described in the LassoCV notebook. It allows using the same code to produce\n", - "the Lasso models trained on the 'all data' (step 1.1) and 'top 10%' models (step 1.2), and returns the \n", - "features with non-zero coefficients as described in the protocol above." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ + "# Initialize the selector\n", + "selector_all = OLSFeatureSelector(p_value_threshold=0.001)\n", "\n", - "def get_non_zero_predictors(\n", - " perturbed_tf: str,\n", - " response_df: pd.DataFrame,\n", - " predictors_df: pd.DataFrame,\n", - " **kwargs: Any,\n", - ") -> list[str]:\n", - " \"\"\"\n", - " This function is used to get the features with non-zero coefficients from LassoCV\n", - " for a given TF. It is capable of conducting steps 1a and 1b above.\n", - "\n", - " :param perturbed_tf: str, the TF for which the significant predictors are to be\n", - " identified\n", - " :param response_df: pd.DataFrame, the response dataframe containing the response\n", - " values\n", - " :param predictors_df: pd.DataFrame, the predictors dataframe containing the\n", - " predictor values\n", - " :param kwargs: dict, additional arguments to be passed to the function. 
Expected\n", - " arguments is 'quantile_threshold' fom generate_modeling_data() to signify the\n", - " top 10%\n", - " :return non_zero_features: List, a list containing the features with non-zero coefs\n", - "\n", - " \"\"\"\n", - " # Validate input\n", - " if perturbed_tf not in response_df.columns:\n", - " raise ValueError(\n", - " f\"The response TF {perturbed_tf} does not exist in the response DataFrame.\"\n", - " )\n", - " if perturbed_tf not in predictors_df.columns:\n", - " raise ValueError(\n", - " f\"The response TF {perturbed_tf} does not exist in the binding DataFrame.\"\n", - " )\n", - " if response_df.shape[0] != predictors_df.shape[0]:\n", - " raise ValueError(\n", - " \"The binding and response DataFrames contain different counts of genes\"\n", - " )\n", - "\n", - " tf_y, tf_X = generate_modeling_data(\n", - " perturbed_tf,\n", - " response_df,\n", - " predictors_df,\n", - " quantile_threshold=kwargs.get(\"quantile_threshold\", None),\n", - " drop_intercept=True,\n", - " )\n", - "\n", - " # add the max_lrb term to the formula\n", - " tf_X[\"max_lrb\"] = predictors_df.drop(columns=perturbed_tf).max(axis=1)\n", - "\n", - " # NOTE: fit_intercept is set to `true`\n", - " lassoCV_estimator = LassoCV(\n", - " fit_intercept=True,\n", - " max_iter=10000,\n", - " selection=\"random\",\n", - " random_state=42,\n", - " n_jobs=4,\n", - " )\n", + "# Transform the data to select only significant features\n", + "selector_all.refine_features(\n", + " lassocv_ols_all_data['predictors'][list(lassocv_ols_intersect_coefs)],\n", + " lassocv_ols_all_data['response'],)\n", "\n", - " # create the classifications\n", - " classes = stratification_classification(tf_X[perturbed_tf], tf_y[perturbed_tf])\n", + "selector_top10 = OLSFeatureSelector(p_value_threshold=0.01)\n", "\n", - " # Fit the model to the data in order to extract the non-zero coefficients\n", - " # surviving after the fitting process\n", - " lasso_model = stratified_cv_modeling(tf_y, tf_X, classes, lassoCV_estimator)\n", + "_ = selector_top10.refine_features(\n", + " lassocv_ols_top10['predictors'].loc[lassocv_ols_top10['response'].index, \n", + " list(lassocv_ols_intersect_coefs)],\n", + " lassocv_ols_top10['response'],)\n", "\n", - " # return a list of the non-zero features that survived the fitting\n", - " non_zero_indices = lasso_model.coef_ != 0\n", - " non_zero_features = tf_X.columns[non_zero_indices]\n", "\n", - " return non_zero_features" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 1.1: get the non-zero features on all data\n", - "tf_surviving_terms = get_non_zero_predictors(tf_of_interest, response_df, predictors_df)\n", - "# Step 1.2: get the non-zero features on only the top10% of genes\n", - "tf_top10_surviving_terms = get_non_zero_predictors(tf_of_interest, response_df, predictors_df, quantile_threshold=0.1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 2: Intersect the features from both Lasso models\n", - "\n", - "Here, we simply intersect the sets of non-zero features found by both of the Lasso models in Step 1." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The surviving coefficients are: {'CBF1:SWI6', 'CBF1:SKN7', 'CBF1:AZF1', 'CBF1:MET28'}\n" - ] - } - ], - "source": [ - "intersect_coefficients = set(tf_surviving_terms).intersection(set(tf_top10_surviving_terms))\n", - "print(f\"The surviving coefficients are: {intersect_coefficients}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 3: Perform backwards OLS feature selection on the intersected features\n", + "# Get significant features and the model summary\n", + "print(\"Significant features in all data:\", selector_all.get_significant_features(drop_intercept=True))\n", + "print(\"All data model summary:\\n\", selector_all.get_summary())\n", "\n", - "Now, taking our set of intersecting features from the step above, we perform the process of backwards OLS feature selection as described above. The function `backwards_OLS_feature_selection()` is a wrapper function that repeatedly calls `select_significant_features()` to perform the iterative process of removing insignificant features based on their p-value with respect to the given quantile threshold. Currently, we are performing this both on the entire dataset with a p-value threshold of 0.001, then subsequently on the top 10% by perturbed TF binding data with a p-value threshold of 0.01." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The surviving features from backwards OLS feature selection is {'CBF1:MET28', 'CBF1:SWI6'}\n" - ] - } - ], - "source": [ - "quantile_thresholds = [None, 0.1] # None for full dataset, 0.1 for top 10%\n", - "p_value_thresholds = [0.001, 0.01] # Corresponding p-value thresholds for both datasets\n", + "print(\"Significant features in top 10:\", selector_top10.get_significant_features(drop_intercept=True))\n", + "print(\"Top 10 model summary:\\n\", selector_top10.get_summary())\n", "\n", - "backward_OLS_feature_result = backwards_OLS_feature_selection(\n", - " tf_of_interest,\n", - " intersect_coefficients,\n", - " response_df,\n", - " predictors_df,\n", - " quantile_thresholds,\n", - " p_value_thresholds,\n", + "lassocv_ols_final_features = set(selector_all.get_significant_features(drop_intercept=True)).intersection(\n", + " selector_top10.get_significant_features(drop_intercept=True)\n", ")\n", - "\n", - "print(f\"The surviving features from backwards OLS feature selection is {backward_OLS_feature_result}\")" + "print(f\"LassoCV OLS Final Features: {lassocv_ols_final_features}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Step 4\n", + "## Step 3\n", + "\n", + "We next implement the method which searches alternative models, which include the\n", + "surviving interactor terms, with variations on substituing in the main effect. In this case, \n", + "we have a single term. However, if we had more than one term, we would do the following for each surviving interactor term. The goal of this process, remember, is to generate a set of\n", + "high confidence interactor terms for this TF. If the predictive power of the main effect\n", + "is equivalent or better than a model with the interactor, we consider that a low\n", + "confidence interactor effect. \n", "\n", - "This is now the exact same workflow as Step 3 from Workflow 1. 
To recap, we now implement the method which searches alternative models, which include the surviving interactor terms, with variations on including the main effect. The goal of this process is to generate a set of high confidence interactor terms for this TF. If the predictive power of the main effect is equivalent or better than a model with the interactor, we consider that a low confidence interactor effect." + "After identifying all interactor terms for which substituting in the main effect improves\n", + "the average R-squared from CV, we drop these terms from our set of features. We then log the final average R-squared achieved by this model. In this case, no terms are dropped from testing the substitution of main effects." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/Users/ericjia/yeastdnnexplorer/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.\n", + "/home/chase/code/yeastdnnexplorer/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.\n", " warnings.warn(\n", - "/Users/ericjia/yeastdnnexplorer/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.\n", - " warnings.warn(\n", - "/Users/ericjia/yeastdnnexplorer/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.\n", + "/home/chase/code/yeastdnnexplorer/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.\n", " warnings.warn(\n" ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The full model avg r-squared is 0.016951766474153418\n", - "The interactor results are: []\n", - "The final model avg r-squared is 0.016951766474153418\n", - "Final set of terms: {'CBF1:MET28', 'CBF1:SWI6'}\n" - ] } ], "source": [ "\n", - "# get the additional main effects which will be tested from the backward_OLS_feature_result\n", + "# in this case, the lassocv_ols_final_features are the bootstrap_lassocv_final_features\n", + "# are the same. I'm setting the lassocv_ols_final_features to the final_features for\n", + "# the following steps.\n", + "final_features = lassocv_ols_final_features\n", + "\n", + "# get the additional main effects which will be tested from the final_features\n", "main_effects = []\n", - "for term in backward_OLS_feature_result:\n", + "for term in final_features:\n", " if \":\" in term:\n", " main_effects.append(term.split(\":\")[1])\n", " else:\n", " main_effects.append(term)\n", "\n", - "# combine these main effects with the backward_OLS_feature_result\n", - "interactor_terms_and_main_effects = list(backward_OLS_feature_result) + main_effects\n", + "# combine these main effects with the final_features\n", + "interactor_terms_and_main_effects = list(final_features) + main_effects\n", "\n", "# generate a model matrix with the intersect terms and the main effects. 
This full\n",
    "# model will not be used for modeling -- subsets of the columns will be, however.\n",
    "_, full_X = generate_modeling_data(\n",
    "    model_tf,\n",
    "    response_df,\n",
    "    predictors_df,\n",
    "    formula = f\"~ {' + '.join(interactor_terms_and_main_effects)}\",\n",
    "    drop_intercept=False,\n",
    ")\n",
    "\n",
    "# Add the max_lrb column, just in case it is present in the final_predictors. In this\n",
    "# case, it is not.\n",
    "max_lrb = predictors_df.drop(columns=model_tf).max(axis=1)\n",
    "full_X['max_lrb'] = max_lrb\n",
    "\n",
    "# Currently, this function tests each interactor term in the final_features\n",
    "# with two variants by replacing the interaction term with the main effect only, and\n",
    "# with the main effect + interactor. If either of the variants has a higher avg\n",
    "# r-squared than the intersect_model, then that variant is returned. In this case,\n",
    "# the original final_features are the best model.\n",
    "full_avg_rsquared, interactor_results = get_interactor_importance(\n",
    "    lassocv_ols_all_data['response'],\n",
    "    full_X,\n",
    "    lassocv_ols_all_data['classes'],\n",
    "    final_features\n",
    ")\n",
    "\n",
    "# use the interactor_results to update the final_features\n",
    "for interactor_variant in interactor_results:\n",
    "    k = interactor_variant['interactor']\n",
    "    v = interactor_variant['variant']\n",
    "    final_features.remove(k)\n",
    "    final_features.add(v)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 4: Comparing our final model to a univariate model\n",
    "\n",
    "In our last step, we take our remaining set of features from the end of Step 3 and compare its performance to that of a univariate model where the response TF is predicted solely by its main effect. We will use the average R-squared achieved by 4-fold CV on both models. A condensed sketch of what this cross-validated average computes appears below."
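As referenced above, the following sketch condenses what `stratified_cv_r2` computes: the mean held-out R-squared over folds stratified by the binding-domain classes. It is a simplified stand-in, not the library function itself — the real `stratified_cv_r2` also inserts a constant column via `add_constant` and accepts a custom estimator and `StratifiedKFold` splitter — and the helper name `avg_stratified_r2` is hypothetical.

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import StratifiedKFold


def avg_stratified_r2(y: pd.Series, X: pd.DataFrame, classes: pd.Series) -> float:
    """Mean held-out R^2 across folds stratified by the binding-domain classes."""
    scores = []
    for train_idx, test_idx in StratifiedKFold(n_splits=4).split(X, classes):
        model = LinearRegression().fit(X.iloc[train_idx], y.iloc[train_idx])
        scores.append(r2_score(y.iloc[test_idx], model.predict(X.iloc[test_idx])))
    return float(np.mean(scores))
```

The univariate baseline is just this statistic computed on the single main-effect column, so the two printed numbers in the next cell are directly comparable.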
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The univariate average R-squared is: 0.004970412632932103\n", - "The final model average R-squared is 0.016951766474153418\n" + "The univariate average R-squared is: 0.0049704126329321585\n", + "The final model average R-squared is 0.010517208487239943\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/Users/ericjia/yeastdnnexplorer/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.\n", + "/home/chase/code/yeastdnnexplorer/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.\n", + " warnings.warn(\n", + "/home/chase/code/yeastdnnexplorer/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.\n", " warnings.warn(\n" ] } ], "source": [ - "y, X = generate_modeling_data(tf_of_interest, response_df, predictors_df, drop_intercept=True, formula=f\"{tf_of_interest}_LRR ~ {tf_of_interest}\")\n", "avg_r2_univariate = stratified_cv_r2(\n", - " y,\n", - " X, \n", - " all_stratification_classes,\n", - " )\n", + " lassocv_ols_all_data['response'],\n", + " lassocv_ols_all_data['predictors'][[model_tf]], \n", + " lassocv_ols_all_data['classes'],\n", + ")\n", + "\n", + "final_model_avg_r_squared = stratified_cv_r2(\n", + " lassocv_ols_all_data['response'],\n", + " full_X[list(final_features)],\n", + " lassocv_ols_all_data['classes'],\n", + ")\n", "\n", "print(f\"The univariate average R-squared is: {avg_r2_univariate}\")\n", "print(f\"The final model average R-squared is {final_model_avg_r_squared}\")" @@ -839,15 +448,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As we can see, the final mdel we achieved through Workflow 2 demonstrates a higher average R-squared achieved by 4-fold CV. " + "As we can see, the final model we achieved through Workflow 1 demonstrates a higher average R-squared achieved by 4-fold CV. " ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -866,7 +468,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/yeastdnnexplorer/__main__.py b/yeastdnnexplorer/__main__.py index 6e67682..e0db7a5 100644 --- a/yeastdnnexplorer/__main__.py +++ b/yeastdnnexplorer/__main__.py @@ -12,10 +12,14 @@ from sklearn.linear_model import LassoCV from yeastdnnexplorer.ml_models.lasso_modeling import ( + OLSFeatureSelector, bootstrap_stratified_cv_modeling, generate_modeling_data, + get_interactor_importance, + get_significant_predictors, stratification_classification, stratified_cv_modeling, + stratified_cv_r2, ) from yeastdnnexplorer.utils import LogLevel, configure_logger @@ -185,6 +189,160 @@ def run_lasso_bootstrap(args: argparse.Namespace) -> None: bootstrap_alphas.to_csv(bootstrap_alphas_path, index=False) +def find_interactors_workflow(args: argparse.Namespace) -> None: + """ + Run the find_interactors_workflow with the specified arguments. + + :param args: The parsed command-line arguments. 
+ + """ + output_dirpath = args.output_dir + if os.path.exists(output_dirpath): + raise FileExistsError( + f"File {output_dirpath} already exists. " + "Please specify a different `output_dir`." + ) + else: + os.makedirs(output_dirpath, exist_ok=True) + if not os.path.exists(args.response_file): + raise FileNotFoundError(f"File {args.response_file} does not exist.") + if not os.path.exists(args.predictors_file): + raise FileNotFoundError(f"File {args.predictors_file} does not exist.") + + # Load data + response_df = pd.read_csv(args.response_file, index_col=0) + predictors_df = pd.read_csv(args.predictors_file, index_col=0) + + # Step 1: Run LassoCV, possibly with the bootstrap depending on args.method + lasso_res = { + "all": get_significant_predictors( + args.method, + args.response_tf, + response_df, + predictors_df, + ci_percentile=args.all_ci_percentile, + n_bootstraps=args.n_bootstraps, + add_max_lrb=True, + ), + "top": get_significant_predictors( + args.method, + args.response_tf, + response_df, + predictors_df, + ci_percentile=args.top_ci_percentile, + n_bootstraps=args.n_bootstraps, + add_max_lrb=True, + quantile_threshold=args.data_quantile, + ), + } + + # Step 2: find the intersect coefficients between the all and top models. This is + # performed differently depending on args.method (see the tutorial) + if args.method == "lassocv_ols": + lassocv_ols_intersect_coefs = set( + lasso_res["all"]["sig_coefs"].keys() + ).intersection(set(lasso_res["top"]["sig_coefs"].keys())) + + # Initialize the selector + selector_all = OLSFeatureSelector(p_value_threshold=args.all_pval_threshold) + + # Transform the data to select only significant features + selector_all.refine_features( + lasso_res["all"]["predictors"][list(lassocv_ols_intersect_coefs)], + lasso_res["all"]["response"], + ) + + selector_top10 = OLSFeatureSelector(p_value_threshold=args.top_pval_threshold) + + _ = selector_top10.refine_features( + lasso_res["top"]["predictors"].loc[ + lasso_res["top"]["response"].index, list(lassocv_ols_intersect_coefs) + ], + lasso_res["top"]["response"], + ) + + final_features = set( + selector_all.get_significant_features(drop_intercept=True) + ).intersection(selector_top10.get_significant_features(drop_intercept=True)) + + else: + final_features = set(lasso_res["all"]["sig_coefs"].keys()).intersection( + set(lasso_res["top"]["sig_coefs"].keys()) + ) + + # Step 3: determine if the interactor predictor is significant compared to its + # main effect + # get the additional main effects which will be tested from the final_features + main_effects = [] + for term in final_features: + if ":" in term: + main_effects.append(term.split(":")[1]) + else: + main_effects.append(term) + + # combine these main effects with the final_features + interactor_terms_and_main_effects = list(final_features) + main_effects + + # generate a model matrix with the intersect terms and the main effects. This full + # model will not be used for modeling -- subsets of the columns will be, however. + _, full_X = generate_modeling_data( + args.response_tf, + response_df, + predictors_df, + formula=f"~ {' + '.join(interactor_terms_and_main_effects)}", + drop_intercept=False, + ) + + # Add the max_lrb column, just in case it is present in the final_predictors. In this + # case, it is not. 
+ model_tf = re.sub("_rep\\d+", "", args.response_tf) + max_lrb = predictors_df.drop(columns=model_tf).max(axis=1) + full_X["max_lrb"] = max_lrb + + # Currently, this function tests each interactor term in the final_features + # with two variants by replacing the interaction term with the main effect only, and + # with the main effect + interactor. If either of the variants has a higher avg + # r-squared than the intersect_model, then that variant is returned. In this case, + # the original final_features are the best model. + full_avg_rsquared, interactor_results = get_interactor_importance( + lasso_res["all"]["response"], + full_X, + lasso_res["all"]["classes"], + final_features, + ) + + # use the interactor_results to update the final_features + for interactor_variant in interactor_results: + k = interactor_variant["interactor"] + v = interactor_variant["variant"] + final_features.remove(k) + final_features.add(v) + + # Step 4: compare the results of the final model with a univariate model + avg_r2_univariate = stratified_cv_r2( + lasso_res["all"]["response"], + lasso_res["all"]["predictors"][[model_tf]], + lasso_res["all"]["classes"], + ) + + final_model_avg_r_squared = stratified_cv_r2( + lasso_res["all"]["response"], + full_X[list(final_features)], + lasso_res["all"]["classes"], + ) + + output_dict = { + "response_tf": args.response_tf, + "final_features": list(final_features), + "avg_r2_univariate": avg_r2_univariate, + "final_model_avg_r_squared": final_model_avg_r_squared, + } + + output_path = os.path.join(args.output_dir, f"{args.response_tf}_output.json") + with open(output_path, "w") as f: + json.dump(output_dict, f, indent=4) + + class CustomHelpFormatter(argparse.HelpFormatter): """Custom help formatter to format the subcommands and general options sections.""" @@ -325,6 +483,104 @@ def main() -> None: lasso_parser.set_defaults(func=run_lasso_bootstrap) + # Find Interactors Workflow command + find_interactors_parser = subparsers.add_parser( + "find_interactors_workflow", + help="Run the find interactors workflow", + description="Run the find interactors workflow", + formatter_class=CustomHelpFormatter, + ) + + # Input arguments + input_group = find_interactors_parser.add_argument_group("Input") + input_group.add_argument( + "--response_file", + type=str, + required=True, + help="Path to the response CSV file. NOTE: the index column must be " + "present and the first column in the CSV. Additionally, the index values " + "should be either symbols or locus tags, matching the index values in both " + "response and predictors files. The perturbed gene will be removed from " + "the model data only if column names in response data match the index " + "format (e.g., symbol or locus tag).", + ) + input_group.add_argument( + "--predictors_file", + type=str, + required=True, + help="Path to the predictors CSV file. NOTE: the index column must be " + "present and the first column in the CSV. Additionally, the index values " + "should be either symbols or locus tags, matching the index values in both " + "response and predictors files. 
The perturbed gene will be removed from the " + "model data only if column names in predictors data match the index " + "format (e.g., symbol or locus tag).", + ) + input_group.add_argument( + "--response_tf", + type=str, + required=True, + help="The response TF to use for the modeling.", + ) + input_group.add_argument( + "--method", + type=str, + required=True, + choices=["bootstrap_lassocv", "lassocv_ols"], + help="The method to use for modeling.", + ) + input_group.add_argument( + "--all_ci_percentile", + type=float, + default=99.8, + help="The percentile to use for the all model CI. This will only be used " + "if the method is `bootstrap_lassocv`.", + ) + input_group.add_argument( + "--top_ci_percentile", + type=float, + default=90.0, + help="The percentile to use for the top model CI. This will only be used " + "if the method is `bootstrap_lassocv`.", + ) + input_group.add_argument( + "--all_pval_threshold", + type=float, + default=0.001, + help="The p-value threshold to use for the all model. This will only be used " + "if the method is `lassocv_ols`.", + ) + input_group.add_argument( + "--top_pval_threshold", + type=float, + default=0.01, + help="The p-value threshold to use for the top model. This will only be used " + "if the method is `lassocv_ols`.", + ) + input_group.add_argument( + "--data_quantile", + type=float, + default=0.1, + help="The quantile threshold to use for the `top` data. See the tutorial for " + "more information.", + ) + input_group.add_argument( + "--n_bootstraps", + type=int, + default=1000, + help="Number of bootstrap samples to generate.", + ) + + # output arguments + output_group = find_interactors_parser.add_argument_group("Output") + output_group.add_argument( + "--output_dir", + type=str, + default="./find_interactors_output", + help="Path to the output directory where results will be saved.", + ) + + find_interactors_parser.set_defaults(func=find_interactors_workflow) + # Add the general arguments to the subcommand parsers add_general_arguments_to_subparsers(subparsers, [log_level_argument]) diff --git a/yeastdnnexplorer/ml_models/lasso_modeling.py b/yeastdnnexplorer/ml_models/lasso_modeling.py index 39a36eb..536eaf5 100644 --- a/yeastdnnexplorer/ml_models/lasso_modeling.py +++ b/yeastdnnexplorer/ml_models/lasso_modeling.py @@ -1,20 +1,20 @@ import logging import re import warnings -from typing import Any +from typing import Any, Literal, Union import matplotlib.pyplot as plt import numpy as np import pandas as pd import patsy as pt import seaborn as sns -import statsmodels.api as sm from matplotlib.figure import Figure -from sklearn.base import BaseEstimator, clone +from sklearn.base import BaseEstimator, TransformerMixin, clone from sklearn.linear_model import LassoCV, LinearRegression from sklearn.metrics import r2_score from sklearn.model_selection import StratifiedKFold from sklearn.utils import resample +from statsmodels.api import OLS, add_constant logger = logging.getLogger("main") @@ -540,6 +540,127 @@ def examine_bootstrap_coefficients( return fig, significant_coefs_dict +def get_significant_predictors( + method: Literal["lassocv_ols", "bootstrap_lassocv"], + perturbed_tf: str, + response_df: pd.DataFrame, + predictors_df: pd.DataFrame, + add_max_lrb: bool, + **kwargs: Any, +) -> dict[str, Union[set[str], pd.DataFrame, np.ndarray]]: + """ + This function is used to get the significant predictors for a given TF using + bootstrapped LassoCV. 
It wraps `generate_modeling_data`,
+    `stratification_classification`, `stratified_cv_modeling`,
+    `bootstrap_stratified_cv_modeling` and `examine_bootstrap_coefficients`.
+
+    :param method: This must be 'lassocv_ols', which will conduct a single lassocv call
+        followed by pruning non-zero coefficients by p-value until all are significant
+        at a given threshold, or 'bootstrap_lassocv', which will conduct bootstrapped
+        lassoCV and return only coefficients which are deemed significant by
+        ci_percentile and threshold (see `examine_bootstrap_coefficients` for more info)
+    :param perturbed_tf: the TF for which the significant predictors are to be
+        identified
+    :param response_df: The DataFrame containing the response values
+    :param predictors_df: The DataFrame containing the predictor values
+    :param add_max_lrb: A boolean to add/not add in the max_LRB term for a response TF
+        into the formula that we perform bootstrapping on
+    :param kwargs: Additional arguments to be passed to the function. Expected arguments
+        are 'quantile_threshold' from generate_modeling_data() and 'ci_percentile' from
+        examine_bootstrap_coefficients()
+
+    :return: a dictionary with keys 'sig_coefs', 'predictors', 'response', 'classes' where:
+        - 'sig_coefs' is a dictionary of significant coefficients
+        - 'predictors' is the predictor matrix X, including the max_lrb column when
+          `add_max_lrb` is True
+        - 'response' is the response variable
+        - 'classes' is the stratification classes for the data
+
+    """
+    if method not in ["lassocv_ols", "bootstrap_lassocv"]:
+        raise ValueError(
+            f"method {method} unrecognized. "
+            "Must be one of ['lassocv_ols', 'bootstrap_lassocv']"
+        )
+
+    y, X = generate_modeling_data(
+        perturbed_tf,
+        response_df,
+        predictors_df,
+        quantile_threshold=kwargs.get("quantile_threshold", None),
+        drop_intercept=True,
+    )
+
+    # NOTE: fit_intercept is set to `true`
+    lassoCV_estimator = LassoCV(
+        fit_intercept=True,
+        max_iter=10000,
+        selection="random",
+        random_state=42,
+        n_jobs=4,
+    )
+
+    predictor_variable = re.sub(r"_rep\d+", "", perturbed_tf)
+
+    stratification_classes = stratification_classification(
+        X[predictor_variable].squeeze(), y.squeeze()
+    )
+
+    if add_max_lrb:
+        # add a column to X which is the rowMax excluding column `predictor_variable`
+        # called max_lrb
+        max_lrb = X.drop(columns=predictor_variable).max(axis=1)
+        X["max_lrb"] = max_lrb
+
+    # Fit the model to the data in order to extract the alphas_ which are generated
+    # during the fitting process
+    lasso_model = stratified_cv_modeling(
+        y, X, stratification_classes, lassoCV_estimator
+    )
+
+    if method == "lassocv_ols":
+        # return a list of the non-zero features that survived the fitting
+        non_zero_indices = lasso_model.coef_ != 0
+        non_zero_features = X.columns[non_zero_indices]
+        sig_coef_dict = {
+            k: v
+            for k, v in zip(non_zero_features, lasso_model.coef_[non_zero_indices])
+        }
+
+    elif method == "bootstrap_lassocv":
+
+        # set the alphas_ attribute of the lassoCV_estimator to the alphas_ attribute
+        # of the lasso_model fit on the whole data.
This will allow the + # bootstrap_stratified_cv_modeling function to use the same set of lambdas + lassoCV_estimator.alphas_ = lasso_model.alphas_ + + logging.info("running bootstraps") + bootstrap_lasso_output = bootstrap_stratified_cv_modeling( + y=y, + X=X, + estimator=lassoCV_estimator, + ci_percentile=kwargs.get("ci_percentile", 95.0), + n_bootstraps=kwargs.get("n_bootstraps", 1000), + max_iter=10000, + fit_intercept=True, + selection="random", + random_state=42, + ) + + sig_coef_plt, sig_coef_dict = examine_bootstrap_coefficients( + bootstrap_lasso_output, ci_level=kwargs.get("ci_percentile", 95.0) + ) + + plt.close(sig_coef_plt) + + else: + ValueError(f"method {method} not recognized") + + return { + "sig_coefs": sig_coef_dict, + "predictors": X, + "response": y, + "classes": stratification_classes, + } + + def stratified_cv_r2( y: pd.DataFrame, X: pd.DataFrame, @@ -562,17 +683,18 @@ def stratified_cv_r2( :return: the average r-squared value for the stratified CV """ - # Check if the X matrix already has an intercept - if "Intercept" not in X.columns: - X = X.copy() # Ensure we don't modify the original DataFrame - X["Intercept"] = 1.0 + # If there is no constant term, add one + X_with_intercept = add_constant(X, has_constant="skip") estimator_local = clone(estimator) r2_scores = [] - for train_idx, test_idx in skf.split(X, classes): + for train_idx, test_idx in skf.split(X_with_intercept, classes): # Use train and test indices to split X and y - X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] + X_train, X_test = ( + X_with_intercept.iloc[train_idx], + X_with_intercept.iloc[test_idx], + ) y_train, y_test = y.iloc[train_idx], y.iloc[test_idx] # Fit the model @@ -615,7 +737,12 @@ def try_interactor_variants( if stratification_classes is None: raise ValueError("stratification_classes must be passed as a keyword argument") + # the main effect is assumed to be the second term in the interactor main_effect = interactor.split(":")[1] + + # Add different ways of replacing the interactor term here. if only [main_effect], + # then the only variant tested will be replacing the interactor term with the main + # effect interactor_formula_variants = [main_effect] output = [] @@ -688,111 +815,132 @@ def get_interactor_importance( stratification_classes=stratification_classes, ) - if interactor_variant_results[0]["avg_r2"] > input_model_avg_rsquared: - interactor_results.append(interactor_variant_results[0]) - - return input_model_avg_rsquared, interactor_results - - -def select_significant_features( - feature_set: set, y: pd.DataFrame, X: pd.DataFrame, p_value_threshold: float -) -> list: - """ - Select significant features from a given set of predictors using OLS. - - :param feature_set: Set of predictor features to be filtered on. - :param y: A DataFrame containing the response data for the perturbed TF. - :param X: A DataFrame containing all of the predictor data which we will identify - the relevant subset for fitting the model using feature_set - :param p_value_threshold: A threshold for qualifying significance for features. - :return: List of significant predictors with original names. 
- - """ + # Find the variant with the maximum avg_r2 + best_variant = max(interactor_variant_results, key=lambda x: x["avg_r2"]) - X = X.loc[:, list(feature_set)] - # Add in the intercept term - X["Intercept"] = 1.0 - # Fit the OLS model and identify significant features - model = sm.OLS(y, X).fit() - significant_features = [ - feature - for feature, pval in model.pvalues.items() - if pval < p_value_threshold and feature != "Intercept" - ] + # Compare its avg_r2 to the input model's avg_r2 + if best_variant["avg_r2"] > input_model_avg_rsquared: + interactor_results.append(best_variant) - return significant_features + return input_model_avg_rsquared, interactor_results -def backwards_OLS_feature_selection( - perturbed_tf: str, - intersect_coefficients: set[str], - response_df: pd.DataFrame, - predictors_df: pd.DataFrame, - quantile_thresholds: list[float | None], - p_value_thresholds: list[float], -) -> set[str]: +class OLSFeatureSelector(BaseEstimator, TransformerMixin): """ - Perform backward feature selection using OLS to iteratively filter down a set of - input features based on multiple quantile and p-value thresholds. This now takes in - two sets of identical sizes in which a desired quantile on the data, as well as the - corresponding p-value threshold to use on this data are given. This way, it can - handle any combination of quantiles and thresholds and perform backwards OLS feature - selection on all of them. - - :param perturbed_tf: The name of the response TF. - :param intersect_coefficients: The initial intersected set of predictor features - passed in from Step 2. - :param response_df: A DataFrame containing the response data for the response TF. - :param predictors_df: A DataFrame containing the predictor data for the response TF. - :param quantile_thresholds: A list of quantile thresholds to filter the data. - Each value should be a float between 0 and 1, or None for no filtering. - :param p_value_thresholds: A list of corresponding p-value thresholds for each - quantile. The length must match `quantile_thresholds`. - - :return: The final refined set of significant predictors. - - :raises ValueError: If `quantile_thresholds` and `p_value_thresholds` are not the - same length. - + This class performs iterative feature selection using OLS. It removes non-significant + features until all remaining features are significant. """ - # Ensure input lists are of the same length - if len(quantile_thresholds) != len(p_value_thresholds): - raise ValueError( - "quantile_thresholds and p_value_thresholds must have the same length." - ) - curr_feature_set = intersect_coefficients - - for quantile, p_value_threshold in zip(quantile_thresholds, p_value_thresholds): - # Generate modeling data for the current quantile - y, X = generate_modeling_data( - perturbed_tf, - response_df, - predictors_df, - quantile_threshold=quantile, - drop_intercept=True, + def __init__(self, p_value_threshold=0.05): + """ + Initialize the OLSFeatureSelector. + + :param p_value_threshold: The threshold for significance of features. + """ + self.p_value_threshold = p_value_threshold + self.significant_features_ = [] + self.summary_ = None + + def fit(self, X, y, **kwargs) -> "OLSFeatureSelector": + """ + Fit the OLS model and identify significant features. Significant features are + selected based based on coef p-value <= p_value_threshold. + + :param X: A DataFrame of predictors. + :param y: A Series of the response variable. + :param kwargs: Optional arguments for `add_constant()`. 
+ :return: self + """ + if not isinstance(X, pd.DataFrame): + raise ValueError(f"X must be a DataFrame, but got {type(X)}.") + + # Add an intercept term + X_with_intercept = add_constant( + X, has_constant=kwargs.get("has_constant", "skip") ) - y = y.add_suffix("_LRR") - - # Adding the max_lrb column to the data - X["max_lrb"] = predictors_df.drop(columns=perturbed_tf).max(axis=1) - - # Initialize variables for the while loop - prev_set_size = 0 - curr_set_size = len(curr_feature_set) + # Fit the OLS model + model = OLS(y, X_with_intercept).fit() - # Perform iterative feature selection - while curr_set_size != prev_set_size and curr_set_size > 0: - curr_feature_set = set( - select_significant_features( - curr_feature_set, - y, - X, - p_value_threshold=p_value_threshold, - ) - ) - prev_set_size = curr_set_size - curr_set_size = len(curr_feature_set) - - return curr_feature_set + # Save the summary table + summary_data = { + "coef": model.params, + "std_err": model.bse, + "t": model.tvalues, + "pvalue": model.pvalues, + } + self.summary_ = pd.DataFrame(summary_data) + + # Select significant features based on p-values + self.significant_features_ = [ + feature + for feature, pval in model.pvalues.items() + if pval <= self.p_value_threshold and feature != "const" + ] + return self + + def transform(self, X) -> pd.DataFrame: + """ + Iteratively apply OLS to remove non-significant features. + + :param X: A DataFrame of predictors. + :return: A DataFrame with only significant features. + """ + if not self.significant_features_: + raise ValueError("The model has not been fitted yet. Call `fit` first.") + + # Ensure the input DataFrame contains all required columns + missing_features = set(self.significant_features_) - set(X.columns) + if missing_features: + raise ValueError(f"Missing required features: {missing_features}") + + return X[self.significant_features_] + + def refine_features(self, X, y) -> pd.DataFrame: + """ + Iteratively fit the selector and transform the data in one step. + + :param X: A DataFrame of predictors. + :param y: A Series of the response variable. + + :return: A DataFrame with only significant predictors. + """ + remaining_predictors = X.copy() + + while True: + # Fit the model + self.fit(remaining_predictors, y) + + # If all predictors are significant, stop iteration + if set(self.significant_features_) == set(remaining_predictors.columns): + break + + # Retain only significant predictors for the next iteration + remaining_predictors = remaining_predictors[self.significant_features_] + + return remaining_predictors + + def get_significant_features(self, drop_intercept=True) -> list: + """ + Get the list of significant features. + + param drop_intercept: Whether to exclude the intercept term from the list. + NOTE: this only looks for a feature called "Intercept" + + :return: List of significant feature names. + """ + if not self.significant_features_: + raise ValueError("The model has not been fitted yet. Call `fit` first.") + if drop_intercept and "Intercept" in self.significant_features_: + return [f for f in self.significant_features_ if f != "Intercept"] + return self.significant_features_ + + def get_summary(self) -> pd.DataFrame: + """ + Get the OLS model summary as a DataFrame. + + :return: A DataFrame containing coefficients, standard errors, t-values, and p-values. + """ + if self.summary_ is None: + raise ValueError("The model has not been fitted yet. 
Call `fit` first.") + return self.summary_ diff --git a/yeastdnnexplorer/tests/ml_models/test_lasso_modeling.py b/yeastdnnexplorer/tests/ml_models/test_lasso_modeling.py index 2dbb432..1c7f75f 100644 --- a/yeastdnnexplorer/tests/ml_models/test_lasso_modeling.py +++ b/yeastdnnexplorer/tests/ml_models/test_lasso_modeling.py @@ -6,12 +6,11 @@ # Import the functions from the module being tested from yeastdnnexplorer.ml_models.lasso_modeling import ( - backwards_OLS_feature_selection, + OLSFeatureSelector, bootstrap_stratified_cv_modeling, examine_bootstrap_coefficients, generate_modeling_data, get_interactor_importance, - select_significant_features, stratification_classification, stratified_cv_modeling, stratified_cv_r2, @@ -155,27 +154,6 @@ def test_select_significant_features(sample_data): drop_intercept=True, ) - # Combine y and X into a single DataFrame for Patsy - y = y.add_suffix("_LRR") - feature_set = {"tf1", "tf1:tf2", "tf1:tf3"} - significant_features = select_significant_features(feature_set, y, X, 0.05) - assert isinstance(significant_features, list), "The output should be a list." - assert all( - isinstance(feature, str) for feature in significant_features - ), "All features should be strings." - - -# test for backwards_OLS_Feature_selection -def test_backwards_OLS_feature_selection(sample_data): - response_df, predictors_df = sample_data - intersect_coefficients = {"tf1", "tf1:tf2", "tf1:tf3"} - final_features = backwards_OLS_feature_selection( - "tf1", intersect_coefficients, response_df, predictors_df, [None], [0.05] - ) - assert ( - final_features or len(final_features) == 0 - ), "The set can be empty or have valid features." - # test for get_interactor_importance def test_get_interactor_importance(sample_data): @@ -247,3 +225,90 @@ def test_stratified_cv_r2(sample_data): r2 = stratified_cv_r2(y, X, classes) assert isinstance(r2, float), "The result should be a float." assert -1 <= r2 <= 1, "R² should be between 0 and 1." 
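The tests added below exercise the new `OLSFeatureSelector` end to end. A minimal usage sketch, consistent with those tests (synthetic data; the printed output is indicative only):

```python
import numpy as np
import pandas as pd

from yeastdnnexplorer.ml_models.lasso_modeling import OLSFeatureSelector

# Synthetic data: the response depends on tf1 and tf3 only, so tf2 should be
# pruned by the iterative OLS refinement.
rng = np.random.default_rng(42)
X = pd.DataFrame(rng.random((100, 3)), columns=["tf1", "tf2", "tf3"])
y = 2 * X["tf1"] + 3 * X["tf3"] + rng.random(100) * 0.1

selector = OLSFeatureSelector(p_value_threshold=0.05)
refined = selector.refine_features(X, y)  # refit OLS, drop weak predictors, repeat

print(selector.get_significant_features())  # expected: ["tf1", "tf3"]
print(selector.get_summary())  # coef / std_err / t / pvalue table from the last fit
```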
+ + +def test_ols_feature_selector_fit(): + # Create sample data + np.random.seed(42) + X = pd.DataFrame( + { + "feature1": np.random.rand(100), + "feature2": np.random.rand(100), + "feature3": np.random.rand(100), + } + ) + y = 2 * X["feature1"] + 3 * X["feature3"] + np.random.rand(100) * 0.1 + + # Initialize selector + selector = OLSFeatureSelector(p_value_threshold=0.05) + + # Fit the selector + selector.fit(X, y) + + # Check that the significant features include "feature1" + assert "feature2" not in selector.get_significant_features() + + # Check that the summary is not None + summary = selector.get_summary() + assert summary is not None + assert "pvalue" in summary.columns + + +def test_ols_feature_selector_transform(): + # Create sample data + np.random.seed(42) + X = pd.DataFrame( + { + "feature1": np.random.rand(100), + "feature2": np.random.rand(100), + "feature3": np.random.rand(100), + } + ) + y = 2 * X["feature1"] + 3 * X["feature3"] + np.random.rand(100) * 0.1 + + # Initialize and fit selector + selector = OLSFeatureSelector(p_value_threshold=0.05) + selector.fit(X, y) + + # Transform X + X_transformed = selector.transform(X) + + # Check that the transformed X contains only significant features + assert X_transformed.shape[1] == len(selector.get_significant_features()) + assert "feature2" not in X_transformed.columns + + +def test_ols_feature_selector_refine_features(): + # Create sample data + np.random.seed(42) + X = pd.DataFrame( + { + "feature1": np.random.rand(100), + "feature2": np.random.rand(100), + "feature3": np.random.rand(100), + } + ) + y = 2 * X["feature1"] + 3 * X["feature3"] + np.random.rand(100) * 0.1 + + # Initialize selector + selector = OLSFeatureSelector(p_value_threshold=0.05) + + # Refine features + refined_X = selector.refine_features(X, y) + + # Check that the refined X contains only significant features + assert refined_X.shape[1] == len(selector.get_significant_features()) + assert "feature2" not in refined_X.columns + + +def test_ols_feature_selector_errors(): + # Initialize selector + selector = OLSFeatureSelector(p_value_threshold=0.05) + + # Test transform before fit + with pytest.raises(ValueError, match="The model has not been fitted yet"): + selector.transform(pd.DataFrame({"feature1": [1, 2, 3]})) + + # Test get_summary before fit + with pytest.raises(ValueError, match="The model has not been fitted yet"): + selector.get_summary() From 3bd7c52d582010f96353f394904a7bc6ce84681a Mon Sep 17 00:00:00 2001 From: chasem Date: Tue, 19 Nov 2024 13:15:07 -0600 Subject: [PATCH 2/7] removing macos 12 --- .github/workflows/ci.yml | 1 - yeastdnnexplorer/__main__.py | 55 +++++++++++++++----- yeastdnnexplorer/ml_models/lasso_modeling.py | 24 ++++++--- 3 files changed, 57 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c6c627b..40e5cd0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,7 +37,6 @@ jobs: ubuntu-22.04, ubuntu-20.04, macos-13, - macos-12, windows-2022, windows-2019, ] diff --git a/yeastdnnexplorer/__main__.py b/yeastdnnexplorer/__main__.py index e0db7a5..ca5f49d 100644 --- a/yeastdnnexplorer/__main__.py +++ b/yeastdnnexplorer/__main__.py @@ -236,29 +236,58 @@ def find_interactors_workflow(args: argparse.Namespace) -> None: ), } + # Ensure lasso_res["all"]["sig_coefs"] and + # lasso_res["top"]["sig_coefs"] are dictionaries + all_sig_coefs = lasso_res["all"]["sig_coefs"] + top_sig_coefs = lasso_res["top"]["sig_coefs"] + + # intersect with type checking + if 
isinstance(all_sig_coefs, dict) and isinstance(top_sig_coefs, dict): + lasso_intersect_coefs = set(all_sig_coefs.keys()).intersection( + set(top_sig_coefs.keys()) + ) + else: + raise TypeError( + "Expected 'sig_coefs' to be dictionaries in 'all' and 'top', " + f"but got {type(all_sig_coefs)} and {type(top_sig_coefs)}." + ) + + # extract the predictors and ensure that they are dataframes + all_predictors = lasso_res["all"]["predictors"] + top_predictors = lasso_res["top"]["predictors"] + top_response = lasso_res["top"]["response"] + + if not isinstance(all_predictors, pd.DataFrame) or not isinstance( + top_predictors, pd.DataFrame + ): + raise TypeError( + "Expected 'predictors' to be dataframes in 'all' and 'top', " + f"but got {type(all_predictors)} and {type(top_predictors)}." + ) + if not isinstance(top_response, pd.DataFrame): + raise TypeError( + "Expected 'response' to be a dataframe in 'top', " + f"but got {type(top_response)}." + ) + # Step 2: find the intersect coefficients between the all and top models. This is # performed differently depending on args.method (see the tutorial) if args.method == "lassocv_ols": - lassocv_ols_intersect_coefs = set( - lasso_res["all"]["sig_coefs"].keys() - ).intersection(set(lasso_res["top"]["sig_coefs"].keys())) # Initialize the selector selector_all = OLSFeatureSelector(p_value_threshold=args.all_pval_threshold) # Transform the data to select only significant features selector_all.refine_features( - lasso_res["all"]["predictors"][list(lassocv_ols_intersect_coefs)], + all_predictors[list(lasso_intersect_coefs)], lasso_res["all"]["response"], ) selector_top10 = OLSFeatureSelector(p_value_threshold=args.top_pval_threshold) _ = selector_top10.refine_features( - lasso_res["top"]["predictors"].loc[ - lasso_res["top"]["response"].index, list(lassocv_ols_intersect_coefs) - ], - lasso_res["top"]["response"], + top_predictors.loc[top_response.index, list(lasso_intersect_coefs)], + top_response, ) final_features = set( @@ -266,9 +295,7 @@ def find_interactors_workflow(args: argparse.Namespace) -> None: ).intersection(selector_top10.get_significant_features(drop_intercept=True)) else: - final_features = set(lasso_res["all"]["sig_coefs"].keys()).intersection( - set(lasso_res["top"]["sig_coefs"].keys()) - ) + final_features = lasso_intersect_coefs # Step 3: determine if the interactor predictor is significant compared to its # main effect @@ -293,8 +320,8 @@ def find_interactors_workflow(args: argparse.Namespace) -> None: drop_intercept=False, ) - # Add the max_lrb column, just in case it is present in the final_predictors. In this - # case, it is not. + # Add the max_lrb column, just in case it is present in the final_predictors. + # In this case, it is not. 
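Step 3 below delegates the swap-and-compare test to `get_interactor_importance`; the idea reduces to the following sketch. The helper name `prefers_main_effect` is illustrative rather than part of the package, plain column selection stands in for the formula-based variants the module actually builds, and `X` is assumed to carry columns for both the interaction terms and their main effects:

```python
from yeastdnnexplorer.ml_models.lasso_modeling import stratified_cv_r2

def prefers_main_effect(y, X, features, interactor, classes) -> bool:
    """True if swapping `interactor` for its main effect raises the avg CV R^2."""
    main_effect = interactor.split(":")[1]  # main effect = second term, e.g. "tf2" in "tf1:tf2"
    baseline = stratified_cv_r2(y, X[features], classes)
    swapped = [main_effect if f == interactor else f for f in features]
    variant = stratified_cv_r2(y, X[swapped], classes)
    return variant > baseline
```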
model_tf = re.sub("_rep\\d+", "", args.response_tf) max_lrb = predictors_df.drop(columns=model_tf).max(axis=1) full_X["max_lrb"] = max_lrb @@ -321,7 +348,7 @@ def find_interactors_workflow(args: argparse.Namespace) -> None: # Step 4: compare the results of the final model with a univariate model avg_r2_univariate = stratified_cv_r2( lasso_res["all"]["response"], - lasso_res["all"]["predictors"][[model_tf]], + all_predictors[[model_tf]], lasso_res["all"]["classes"], ) diff --git a/yeastdnnexplorer/ml_models/lasso_modeling.py b/yeastdnnexplorer/ml_models/lasso_modeling.py index 536eaf5..a4fb381 100644 --- a/yeastdnnexplorer/ml_models/lasso_modeling.py +++ b/yeastdnnexplorer/ml_models/lasso_modeling.py @@ -1,7 +1,7 @@ import logging import re import warnings -from typing import Any, Literal, Union +from typing import Any, Literal import matplotlib.pyplot as plt import numpy as np @@ -547,7 +547,7 @@ def get_significant_predictors( predictors_df: pd.DataFrame, add_max_lrb: bool, **kwargs: Any, -) -> dict[str, Union[set[str], pd.DataFrame, np.ndarray]]: +) -> dict[str, set[str] | pd.DataFrame | np.ndarray]: """ This function is used to get the significant predictors for a given TF using bootstrapped LassoCV. It wraps `generate_modeling_data`, @@ -626,8 +626,8 @@ def get_significant_predictors( elif method == "bootstrap_lassocv": - # set the alphas_ attribute of the lassoCV_estimator to the alphas_ attribute of the - # lasso_model fit on the whole data. This will allow the + # set the alphas_ attribute of the lassoCV_estimator to the alphas_ + # attribute of the lasso_model fit on the whole data. This will allow the # bootstrap_stratified_cv_modeling function to use the same set of lambdas lassoCV_estimator.alphas_ = lasso_model.alphas_ @@ -827,8 +827,10 @@ def get_interactor_importance( class OLSFeatureSelector(BaseEstimator, TransformerMixin): """ - This class performs iterative feature selection using OLS. It removes non-significant - features until all remaining features are significant. + This class performs iterative feature selection using OLS. + + It removes non-significant features until all remaining features are significant. + """ def __init__(self, p_value_threshold=0.05): @@ -836,6 +838,7 @@ def __init__(self, p_value_threshold=0.05): Initialize the OLSFeatureSelector. :param p_value_threshold: The threshold for significance of features. + """ self.p_value_threshold = p_value_threshold self.significant_features_ = [] @@ -850,6 +853,7 @@ def fit(self, X, y, **kwargs) -> "OLSFeatureSelector": :param y: A Series of the response variable. :param kwargs: Optional arguments for `add_constant()`. :return: self + """ if not isinstance(X, pd.DataFrame): raise ValueError(f"X must be a DataFrame, but got {type(X)}.") @@ -885,6 +889,7 @@ def transform(self, X) -> pd.DataFrame: :param X: A DataFrame of predictors. :return: A DataFrame with only significant features. + """ if not self.significant_features_: raise ValueError("The model has not been fitted yet. Call `fit` first.") @@ -902,8 +907,8 @@ def refine_features(self, X, y) -> pd.DataFrame: :param X: A DataFrame of predictors. :param y: A Series of the response variable. - :return: A DataFrame with only significant predictors. + """ remaining_predictors = X.copy() @@ -928,6 +933,7 @@ def get_significant_features(self, drop_intercept=True) -> list: NOTE: this only looks for a feature called "Intercept" :return: List of significant feature names. + """ if not self.significant_features_: raise ValueError("The model has not been fitted yet. 
Call `fit` first.") @@ -939,7 +945,9 @@ def get_summary(self) -> pd.DataFrame: """ Get the OLS model summary as a DataFrame. - :return: A DataFrame containing coefficients, standard errors, t-values, and p-values. + :return: A DataFrame containing coefficients, standard errors, t-values, and + p-values. + """ if self.summary_ is None: raise ValueError("The model has not been fitted yet. Call `fit` first.") From 3aea0974f683872f816adb1567ef0ec520ab65db Mon Sep 17 00:00:00 2001 From: chasem Date: Tue, 19 Nov 2024 13:21:11 -0600 Subject: [PATCH 3/7] adding cmd line utility instructions to tutorial --- .../interactor_modeling_workflow.ipynb | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/docs/tutorials/interactor_modeling_workflow.ipynb b/docs/tutorials/interactor_modeling_workflow.ipynb index 0386d3f..3068393 100644 --- a/docs/tutorials/interactor_modeling_workflow.ipynb +++ b/docs/tutorials/interactor_modeling_workflow.ipynb @@ -450,6 +450,46 @@ "source": [ "As we can see, the final model we achieved through Workflow 1 demonstrates a higher average R-squared achieved by 4-fold CV. " ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using the cmd line interface\n", + "\n", + "There is a cmd line utility that performs this analysis. See\n", + "`python -m yeastdnnexplorer find_interactors_workflow --help` for more information. An\n", + "example command looks like this:\n", + "\n", + "```bash\n", + "python -m yeastdnnexplorer find_interactors_workflow \\\n", + " --response_file ~/htcf_local/lasso_bootstrap/erics_tfs/response_dataframe_20241105.csv \\\n", + " --predictors_file ~/htcf_local/lasso_bootstrap/erics_tfs/predictors_dataframe_20241105.csv \\\n", + " --method bootstrap_lassocv \\\n", + " --response_tf CBF1\n", + "```\n", + "\n", + "the output, in your `$PWD` would look like this:\n", + "\n", + "```raw\n", + "├── find_interactors_output\n", + "│ └── CBF1_output.json\n", + "```\n", + "\n", + "Where CBF1_output.json looks like:\n", + "\n", + "```json\n", + "{\n", + " \"response_tf\": \"CBF1\",\n", + " \"final_features\": [\n", + " \"CBF1:MET28\",\n", + " \"GAL4\"\n", + " ],\n", + " \"avg_r2_univariate\": 0.0049704126329321585,\n", + " \"final_model_avg_r_squared\": 0.022854644322078704\n", + "}\n", + "```" + ] } ], "metadata": { From 99a05d7aec4ceff028cd842052df3f56d58658f0 Mon Sep 17 00:00:00 2001 From: chasem Date: Tue, 19 Nov 2024 13:26:46 -0600 Subject: [PATCH 4/7] removing some output from the notebook for publishing purposes --- docs/tutorials/interactor_modeling_workflow.ipynb | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/docs/tutorials/interactor_modeling_workflow.ipynb b/docs/tutorials/interactor_modeling_workflow.ipynb index 3068393..a323d43 100644 --- a/docs/tutorials/interactor_modeling_workflow.ipynb +++ b/docs/tutorials/interactor_modeling_workflow.ipynb @@ -329,18 +329,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/chase/code/yeastdnnexplorer/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.\n", - " warnings.warn(\n", - "/home/chase/code/yeastdnnexplorer/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=4.\n", - " warnings.warn(\n" - ] - } - ], + "outputs": 
[], "source": [ "\n", "# in this case, the lassocv_ols_final_features are the bootstrap_lassocv_final_features\n", From 15f098f3fa939d8d71e7812f2b6674294c01d4b7 Mon Sep 17 00:00:00 2001 From: chasem Date: Tue, 19 Nov 2024 13:31:13 -0600 Subject: [PATCH 5/7] adding new lasso_modeling objects to docs --- docs/ml_models/lasso_modeling.md | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/docs/ml_models/lasso_modeling.md b/docs/ml_models/lasso_modeling.md index b5fe33a..d93b996 100644 --- a/docs/ml_models/lasso_modeling.md +++ b/docs/ml_models/lasso_modeling.md @@ -18,4 +18,24 @@
## examine_bootstrap_coefficients
-::: yeastdnnexplorer.ml_models.lasso_modeling.examine_bootstrap_coefficients
\ No newline at end of file
+::: yeastdnnexplorer.ml_models.lasso_modeling.examine_bootstrap_coefficients
+
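A short call sketch for `examine_bootstrap_coefficients`, assuming `bootstrap_output` holds the result of `bootstrap_stratified_cv_modeling()` (the variable name and output path are illustrative):

```python
from yeastdnnexplorer.ml_models.lasso_modeling import examine_bootstrap_coefficients

# `bootstrap_output` is assumed to come from bootstrap_stratified_cv_modeling()
fig, sig_coefs = examine_bootstrap_coefficients(bootstrap_output, ci_level=99.8)
print(sig_coefs)  # coefficients whose 99.8% CI is judged significant
fig.savefig("bootstrap_coefficients.png")  # illustrative output path
```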
+## get_significant_predictors
+
+::: yeastdnnexplorer.ml_models.lasso_modeling.get_significant_predictors
+
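A call sketch for `get_significant_predictors`, mirroring how the workflow entry point invokes it; `response_df` and `predictors_df` are assumed to be the response and predictor dataframes described in the tutorial:

```python
from yeastdnnexplorer.ml_models.lasso_modeling import get_significant_predictors

res = get_significant_predictors(
    "bootstrap_lassocv",  # or "lassocv_ols"
    "CBF1",               # the perturbed/response TF
    response_df,
    predictors_df,
    add_max_lrb=True,
    ci_percentile=99.8,   # forwarded to examine_bootstrap_coefficients
    n_bootstraps=1000,
)
sig_coefs = res["sig_coefs"]  # significant coefficients
classes = res["classes"]      # stratification labels reused downstream
```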
+## stratified_cv_r2
+
+::: yeastdnnexplorer.ml_models.lasso_modeling.stratified_cv_r2
+
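Similarly, a sketch for `stratified_cv_r2`, which scores a cross-validated OLS fit on the same stratified folds; the TF name is illustrative:

```python
from yeastdnnexplorer.ml_models.lasso_modeling import (
    generate_modeling_data,
    stratification_classification,
    stratified_cv_r2,
)

y, X = generate_modeling_data(
    "CBF1", response_df, predictors_df, quantile_threshold=None, drop_intercept=True
)
classes = stratification_classification(X["CBF1"].squeeze(), y.squeeze())
avg_r2 = stratified_cv_r2(y, X, classes)  # mean R^2 over the stratified folds
```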
+## try_interactor_variants
+
+::: yeastdnnexplorer.ml_models.lasso_modeling.try_interactor_variants
+
+## get_interactor_importance
+
+::: yeastdnnexplorer.ml_models.lasso_modeling.get_interactor_importance
+
+## OLSFeatureSelector
+ +::: yeastdnnexplorer.ml_models.lasso_modeling.OLSFeatureSelector \ No newline at end of file From 8618ba623adf92e019ef2a5a0c54c78400eed973 Mon Sep 17 00:00:00 2001 From: chasem Date: Tue, 19 Nov 2024 13:36:42 -0600 Subject: [PATCH 6/7] fixing docstring on get_significant_predictors --- yeastdnnexplorer/ml_models/lasso_modeling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yeastdnnexplorer/ml_models/lasso_modeling.py b/yeastdnnexplorer/ml_models/lasso_modeling.py index a4fb381..1895bac 100644 --- a/yeastdnnexplorer/ml_models/lasso_modeling.py +++ b/yeastdnnexplorer/ml_models/lasso_modeling.py @@ -550,9 +550,9 @@ def get_significant_predictors( ) -> dict[str, set[str] | pd.DataFrame | np.ndarray]: """ This function is used to get the significant predictors for a given TF using - bootstrapped LassoCV. It wraps `generate_modeling_data`, - `stratification_classification`, `stratified_cv_modeling`, - `bootstrap_stratified_cv_modeling` and `examine_bootstrap_coefficients`. + one of two methods, either the bootstrapped LassoCV, in which case we look for + intervals that do not cross 0, or direct LassoCV with a selection on non-zero + coefficients. :param method: This must be 'lassocv_ols', which will conduct a single lassocv call followed by pruning non zero coefficients by pvalue until all are significant From 724582196ca064e3ffcf2db29394dbadb77b1634 Mon Sep 17 00:00:00 2001 From: chasem Date: Tue, 19 Nov 2024 13:40:33 -0600 Subject: [PATCH 7/7] linting the docstring --- yeastdnnexplorer/ml_models/lasso_modeling.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/yeastdnnexplorer/ml_models/lasso_modeling.py b/yeastdnnexplorer/ml_models/lasso_modeling.py index 1895bac..91be962 100644 --- a/yeastdnnexplorer/ml_models/lasso_modeling.py +++ b/yeastdnnexplorer/ml_models/lasso_modeling.py @@ -549,10 +549,9 @@ def get_significant_predictors( **kwargs: Any, ) -> dict[str, set[str] | pd.DataFrame | np.ndarray]: """ - This function is used to get the significant predictors for a given TF using - one of two methods, either the bootstrapped LassoCV, in which case we look for - intervals that do not cross 0, or direct LassoCV with a selection on non-zero - coefficients. + This function is used to get the significant predictors for a given TF using one of + two methods, either the bootstrapped LassoCV, in which case we look for intervals + that do not cross 0, or direct LassoCV with a selection on non-zero coefficients. :param method: This must be 'lassocv_ols', which will conduct a single lassocv call followed by pruning non zero coefficients by pvalue until all are significant