Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: track feature missingness rates #335

Merged
merged 7 commits
Dec 13, 2023
47 changes: 42 additions & 5 deletions src/otg/dataset/l2g_feature_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,25 @@

@dataclass
class L2GFeatureMatrix(Dataset):
"""Dataset with features for Locus to Gene prediction."""
"""Dataset with features for Locus to Gene prediction.

Attributes:
features_list (list[str] | None): List of features to use. If None, all possible features are used.
"""

features_list: list[str] | None = None

def __post_init__(self: L2GFeatureMatrix) -> None:
    """Derive the feature list when one was not supplied.

    Every dataframe column that is not one of the fixed identifier/label
    columns is treated as a feature.
    """
    if not self.features_list:
        non_feature_cols = {"studyLocusId", "geneId", "goldStandardSet"}
        self.features_list = [
            column for column in self._df.columns if column not in non_feature_cols
        ]

@classmethod
def generate_features(
cls: Type[L2GFeatureMatrix],
features_list: list[str],
study_locus: StudyLocus,
study_index: StudyIndex,
variant_gene: V2G,
Expand All @@ -34,6 +48,7 @@ def generate_features(
"""Generate features from the OTG datasets.

Args:
features_list (list[str]): List of features to generate
study_locus (StudyLocus): Study locus dataset
study_index (StudyIndex): Study index dataset
variant_gene (V2G): Variant to gene dataset
Expand Down Expand Up @@ -65,6 +80,7 @@ def generate_features(
fm, ["studyLocusId", "geneId"], "featureName", "featureValue"
),
_schema=cls.get_schema(),
features_list=features_list,
)
raise ValueError("L2G Feature matrix is empty")

Expand All @@ -77,6 +93,26 @@ def get_schema(cls: type[L2GFeatureMatrix]) -> StructType:
"""
return parse_spark_schema("l2g_feature_matrix.json")

def calculate_feature_missingness_rate(
    self: L2GFeatureMatrix,
) -> dict[str, float]:
    """Calculate the proportion of missing values in each feature.

    Returns:
        dict[str, float]: Dictionary of feature names and their missingness rate.

    Raises:
        ValueError: If no features are found, or if the feature matrix has no
            rows (a rate is undefined for an empty dataset).
    """
    # Validate before triggering any Spark action: raising early avoids a
    # wasted full-table count when the feature list is empty.
    if not self.features_list:
        raise ValueError("No features found")

    total_count = self._df.count()
    # Guard the division: an empty matrix previously surfaced as an opaque
    # ZeroDivisionError.
    if total_count == 0:
        raise ValueError("L2G Feature matrix is empty")

    # NOTE(review): one Spark job per feature; acceptable for a small feature
    # set, but could be collapsed into a single aggregation if it gets slow.
    return {
        feature: (self._df.filter(self._df[feature].isNull()).count() / total_count)
        for feature in self.features_list
    }

def fill_na(
self: L2GFeatureMatrix, value: float = 0.0, subset: list[str] | None = None
) -> L2GFeatureMatrix:
Expand All @@ -93,18 +129,19 @@ def fill_na(
return self

def select_features(
    self: L2GFeatureMatrix, features_list: list[str] | None
) -> L2GFeatureMatrix:
    """Select a subset of features from the feature matrix.

    Args:
        features_list (list[str] | None): List of features to select. When
            None, falls back to the instance's own ``features_list``.

    Returns:
        L2GFeatureMatrix: L2G feature matrix dataset

    Raises:
        ValueError: If no features are available to select.
    """
    features_list = features_list or self.features_list
    # Previously a double-None fallback crashed with a TypeError when the
    # None was concatenated with the fixed columns (hidden by a
    # `# type: ignore`); fail with a clear message instead.
    if not features_list:
        raise ValueError("No features to select")
    fixed_cols = ["studyLocusId", "geneId", "goldStandardSet"]
    self.df = self._df.select(fixed_cols + features_list)
    return self

def train_test_split(
Expand Down
8 changes: 6 additions & 2 deletions src/otg/dataset/l2g_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def get_schema(cls: type[L2GPrediction]) -> StructType:
def from_credible_set(
cls: Type[L2GPrediction],
model_path: str,
features_list: list[str],
study_locus: StudyLocus,
study_index: StudyIndex,
v2g: V2G,
Expand All @@ -53,6 +54,7 @@ def from_credible_set(

Args:
model_path (str): Path to the fitted model
features_list (list[str]): List of features to use for the model
study_locus (StudyLocus): Study locus dataset
study_index (StudyIndex): Study index dataset
v2g (V2G): Variant to gene dataset
Expand All @@ -61,6 +63,7 @@ def from_credible_set(
L2GPrediction: L2G dataset
"""
fm = L2GFeatureMatrix.generate_features(
features_list=features_list,
study_locus=study_locus,
study_index=study_index,
variant_gene=v2g,
Expand All @@ -71,8 +74,9 @@ def from_credible_set(
_df=(
LocusToGeneModel.load_from_disk(
model_path,
features_list=fm.df.drop("studyLocusId", "geneId").columns,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

much cleaner

).predict(fm)
features_list=features_list,
)
.predict(fm)
# the probability of the positive class is the second element inside the probability array
# - this is selected as the L2G probability
.select(
Expand Down
2 changes: 2 additions & 0 deletions src/otg/l2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ def __post_init__(self: LocusToGeneStep) -> None:
)

fm = L2GFeatureMatrix.generate_features(
features_list=self.features_list,
study_locus=credible_set,
study_index=studies,
variant_gene=v2g,
Expand Down Expand Up @@ -193,6 +194,7 @@ def __post_init__(self: LocusToGeneStep) -> None:
)
predictions = L2GPrediction.from_credible_set(
self.model_path,
self.features_list,
credible_set,
studies,
v2g,
Expand Down
34 changes: 8 additions & 26 deletions src/otg/method/l2g/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,19 @@ def log_to_wandb(
wandb_evaluator.evaluate(results)
## Track feature importance
wandb_run.log({"importances": self.get_feature_importance()})
## Track training set metadata
## Track training set
training_table = wandb.Table(dataframe=training_data.df.toPandas())
wandb_run.log({"trainingSet": training_table})
# Count number of positive and negative labels
gs_counts_dict = {
"goldStandard" + row["goldStandardSet"].capitalize(): row["count"]
for row in training_data.df.groupBy("goldStandardSet").count().collect()
}
wandb_run.log(gs_counts_dict)
training_table = wandb.Table(dataframe=training_data.df.toPandas())
wandb_run.log({"trainingSet": wandb.Table(dataframe=training_table)})
# Missingness rates
wandb_run.log(
{"missingnessRates": training_data.calculate_feature_missingness_rate()}
)

@classmethod
def load_from_disk(
Expand Down Expand Up @@ -218,30 +223,7 @@ def evaluate(
labelCol="label", predictionCol="prediction"
)

print("Evaluating model...") # noqa: T201
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice cleanup

print( # noqa: T201
"... Area under ROC curve:",
binary_evaluator.evaluate(
results, {binary_evaluator.metricName: "areaUnderROC"}
),
)
print( # noqa: T201
"... Area under Precision-Recall curve:",
binary_evaluator.evaluate(
results, {binary_evaluator.metricName: "areaUnderPR"}
),
)
print( # noqa: T201
"... Accuracy:",
multi_evaluator.evaluate(results, {multi_evaluator.metricName: "accuracy"}),
)
print( # noqa: T201
"... F1 score:",
multi_evaluator.evaluate(results, {multi_evaluator.metricName: "f1"}),
)

if wandb_run_name and training_data:
print("Logging to W&B...") # noqa: T201
run = wandb.init(
project=self.wandb_l2g_project_name,
config=hyperparameters,
Expand Down