From f8a5a2cf0959c9eea25fae016b11cdfcda3ff137 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Wed, 13 Dec 2023 10:16:58 +0000 Subject: [PATCH 1/5] feat(L2GFeatureMatrix): add `features_list` as attribute --- src/otg/dataset/l2g_feature_matrix.py | 27 ++++++++++++++++++++++----- src/otg/dataset/l2g_prediction.py | 8 ++++++-- src/otg/l2g.py | 2 ++ 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/otg/dataset/l2g_feature_matrix.py b/src/otg/dataset/l2g_feature_matrix.py index 578211756..0b25b6e60 100644 --- a/src/otg/dataset/l2g_feature_matrix.py +++ b/src/otg/dataset/l2g_feature_matrix.py @@ -21,11 +21,25 @@ @dataclass class L2GFeatureMatrix(Dataset): - """Dataset with features for Locus to Gene prediction.""" + """Dataset with features for Locus to Gene prediction. + + Attributes: + features_list (list[str] | None): List of features to use. If None, all possible features are used. + """ + + features_list: list[str] | None = None + + def __post_init__(self: L2GFeatureMatrix) -> None: + """Post-initialisation to set the features list. If not provided, all columns except the fixed ones are used.""" + fixed_cols = ["studyLocusId", "geneId", "goldStandardSet"] + self.features_list = self.features_list or [ + col for col in self._df.columns if col not in fixed_cols + ] @classmethod def generate_features( cls: Type[L2GFeatureMatrix], + features_list: list[str], study_locus: StudyLocus, study_index: StudyIndex, variant_gene: V2G, @@ -34,6 +48,7 @@ def generate_features( """Generate features from the OTG datasets. 
Args: + features_list (list[str]): List of features to generate study_locus (StudyLocus): Study locus dataset study_index (StudyIndex): Study index dataset variant_gene (V2G): Variant to gene dataset @@ -65,6 +80,7 @@ def generate_features( fm, ["studyLocusId", "geneId"], "featureName", "featureValue" ), _schema=cls.get_schema(), + features_list=features_list, ) raise ValueError("L2G Feature matrix is empty") @@ -93,18 +109,19 @@ def fill_na( return self def select_features( - self: L2GFeatureMatrix, features_list: list[str] + self: L2GFeatureMatrix, features_list: list[str] | None ) -> L2GFeatureMatrix: """Select a subset of features from the feature matrix. Args: - features_list (list[str]): List of features to select + features_list (list[str] | None): List of features to select Returns: L2GFeatureMatrix: L2G feature matrix dataset """ - fixed_rows = ["studyLocusId", "geneId", "goldStandardSet"] - self.df = self._df.select(fixed_rows + features_list) + features_list = features_list or self.features_list + fixed_cols = ["studyLocusId", "geneId", "goldStandardSet"] + self.df = self._df.select(fixed_cols + features_list) # type: ignore return self def train_test_split( diff --git a/src/otg/dataset/l2g_prediction.py b/src/otg/dataset/l2g_prediction.py index a588818cd..67e522756 100644 --- a/src/otg/dataset/l2g_prediction.py +++ b/src/otg/dataset/l2g_prediction.py @@ -44,6 +44,7 @@ def get_schema(cls: type[L2GPrediction]) -> StructType: def from_credible_set( cls: Type[L2GPrediction], model_path: str, + features_list: list[str], study_locus: StudyLocus, study_index: StudyIndex, v2g: V2G, @@ -53,6 +54,7 @@ def from_credible_set( Args: model_path (str): Path to the fitted model + features_list (list[str]): List of features to use for the model study_locus (StudyLocus): Study locus dataset study_index (StudyIndex): Study index dataset v2g (V2G): Variant to gene dataset @@ -61,6 +63,7 @@ def from_credible_set( L2GPrediction: L2G dataset """ fm = 
L2GFeatureMatrix.generate_features( + features_list=features_list, study_locus=study_locus, study_index=study_index, variant_gene=v2g, @@ -71,8 +74,9 @@ def from_credible_set( _df=( LocusToGeneModel.load_from_disk( model_path, - features_list=fm.df.drop("studyLocusId", "geneId").columns, - ).predict(fm) + features_list=features_list, + ) + .predict(fm) # the probability of the positive class is the second element inside the probability array # - this is selected as the L2G probability .select( diff --git a/src/otg/l2g.py b/src/otg/l2g.py index 35692ada5..c93906d22 100644 --- a/src/otg/l2g.py +++ b/src/otg/l2g.py @@ -137,6 +137,7 @@ def __post_init__(self: LocusToGeneStep) -> None: ) fm = L2GFeatureMatrix.generate_features( + features_list=self.features_list, study_locus=credible_set, study_index=studies, variant_gene=v2g, @@ -193,6 +194,7 @@ def __post_init__(self: LocusToGeneStep) -> None: ) predictions = L2GPrediction.from_credible_set( self.model_path, + self.features_list, credible_set, studies, v2g, From be480cd07ccf1cc31eb8936e34a3522407611a0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Wed, 13 Dec 2023 10:17:49 +0000 Subject: [PATCH 2/5] fix: log wandb table --- src/otg/method/l2g/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otg/method/l2g/model.py b/src/otg/method/l2g/model.py index ad9aa8c75..ebaaea3af 100644 --- a/src/otg/method/l2g/model.py +++ b/src/otg/method/l2g/model.py @@ -132,7 +132,7 @@ def log_to_wandb( } wandb_run.log(gs_counts_dict) training_table = wandb.Table(dataframe=training_data.df.toPandas()) - wandb_run.log({"trainingSet": wandb.Table(dataframe=training_table)}) + wandb_run.log({"trainingSet": training_table}) @classmethod def load_from_disk( From e69c47e747e088f865755f068ba1f0ac88503a38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Wed, 13 Dec 2023 10:45:19 +0000 Subject: [PATCH 3/5] feat(L2GFeatureMatrix): track missingness rate for each feature --- 
src/otg/dataset/l2g_feature_matrix.py | 20 ++++++++++++++++++++ src/otg/method/l2g/model.py | 11 ++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/otg/dataset/l2g_feature_matrix.py b/src/otg/dataset/l2g_feature_matrix.py index 0b25b6e60..c966b87c7 100644 --- a/src/otg/dataset/l2g_feature_matrix.py +++ b/src/otg/dataset/l2g_feature_matrix.py @@ -93,6 +93,26 @@ def get_schema(cls: type[L2GFeatureMatrix]) -> StructType: """ return parse_spark_schema("l2g_feature_matrix.json") + def calculate_feature_missingness_rate( + self: L2GFeatureMatrix, + ) -> dict[str, float]: + """Calculate the proportion of missing values in each feature. + + Returns: + dict[str, float]: Dictionary of feature names and their missingness rate. + + Raises: + ValueError: If no features are found. + """ + total_count = self._df.count() + if not self.features_list: + raise ValueError("No features found") + + return { + feature: (self._df.filter(self._df[feature].isNull()).count() / total_count) + for feature in self.features_list + } + def fill_na( self: L2GFeatureMatrix, value: float = 0.0, subset: list[str] | None = None ) -> L2GFeatureMatrix: diff --git a/src/otg/method/l2g/model.py b/src/otg/method/l2g/model.py index ebaaea3af..7429c6770 100644 --- a/src/otg/method/l2g/model.py +++ b/src/otg/method/l2g/model.py @@ -125,14 +125,19 @@ def log_to_wandb( wandb_evaluator.evaluate(results) ## Track feature importance wandb_run.log({"importances": self.get_feature_importance()}) - ## Track training set metadata + ## Track training set + training_table = wandb.Table(dataframe=training_data.df.toPandas()) + wandb_run.log({"trainingSet": training_table}) + # Count number of positive and negative labels gs_counts_dict = { "goldStandard" + row["goldStandardSet"].capitalize(): row["count"] for row in training_data.df.groupBy("goldStandardSet").count().collect() } wandb_run.log(gs_counts_dict) - training_table = wandb.Table(dataframe=training_data.df.toPandas()) - 
wandb_run.log({"trainingSet": training_table}) + # Missingness rates + wandb_run.log( + "missingnessRates", training_data.calculate_feature_missingness_rate() + ) @classmethod def load_from_disk( From dc4bb532f15174d73a702abfd86ceb583e24cda5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Wed, 13 Dec 2023 11:05:56 +0000 Subject: [PATCH 4/5] fix(LocusToGeneModel): pass missingness rates to wandb_run.log as a dict --- src/otg/method/l2g/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otg/method/l2g/model.py b/src/otg/method/l2g/model.py index 7429c6770..23405e6b8 100644 --- a/src/otg/method/l2g/model.py +++ b/src/otg/method/l2g/model.py @@ -136,7 +136,7 @@ def log_to_wandb( wandb_run.log(gs_counts_dict) # Missingness rates wandb_run.log( - "missingnessRates", training_data.calculate_feature_missingness_rate() + {"missingnessRates": training_data.calculate_feature_missingness_rate()} ) @classmethod From bd67c78540c86723f4480322fa3b0643bbe69925 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Wed, 13 Dec 2023 11:09:20 +0000 Subject: [PATCH 5/5] chore(LocusToGeneModel): remove evaluation outside experiment tracking --- src/otg/method/l2g/model.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/src/otg/method/l2g/model.py b/src/otg/method/l2g/model.py index 23405e6b8..61deb3066 100644 --- a/src/otg/method/l2g/model.py +++ b/src/otg/method/l2g/model.py @@ -223,30 +223,7 @@ def evaluate( labelCol="label", predictionCol="prediction" ) - print("Evaluating model...") # noqa: T201 - print( # noqa: T201 - "... Area under ROC curve:", - binary_evaluator.evaluate( - results, {binary_evaluator.metricName: "areaUnderROC"} - ), - ) - print( # noqa: T201 - "... Area under Precision-Recall curve:", - binary_evaluator.evaluate( - results, {binary_evaluator.metricName: "areaUnderPR"} - ), - ) - print( # noqa: T201 - "... 
Accuracy:", - multi_evaluator.evaluate(results, {multi_evaluator.metricName: "accuracy"}), - ) - print( # noqa: T201 - "... F1 score:", - multi_evaluator.evaluate(results, {multi_evaluator.metricName: "f1"}), - ) - if wandb_run_name and training_data: - print("Logging to W&B...") # noqa: T201 run = wandb.init( project=self.wandb_l2g_project_name, config=hyperparameters,