Commit

Merge branch 'main' into jakdam-fa-adapt-eval
bokajgd committed Sep 4, 2023
2 parents a448ab4 + 4008cb6 commit 3bca38b
Showing 28 changed files with 201 additions and 194 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,20 @@

<!--next-version-placeholder-->

## v0.106.0 (2023-08-31)

### Feature

* First version ([`2fc715c`](https://github.com/Aarhus-Psychiatry-Research/psycop-common/commit/2fc715c099c94b9dd470595f73546aa3b9c0786b))

### Fix

* Possibly unbound variable ([`864f59b`](https://github.com/Aarhus-Psychiatry-Research/psycop-common/commit/864f59bfa20d27e5b8ec28a5b9d84a3a2ac487ba))

### Documentation

* Point to patient object tests ([`d727664`](https://github.com/Aarhus-Psychiatry-Research/psycop-common/commit/d7276642d5a650110a6693c6428058fa3784f432))

## v0.105.0 (2023-08-30)

### Feature
8 changes: 5 additions & 3 deletions psycop/common/data_structures/patient.py
@@ -58,7 +58,7 @@ def to_prediction_times(
        self,
        lookbehind: dt.timedelta,
        lookahead: dt.timedelta,
-        outcome_timestamp: dt.datetime,
+        outcome_timestamp: dt.datetime | None,
        prediction_timestamps: Sequence[dt.datetime],
    ) -> list[PredictionTime]:
        """Creates prediction times for a boolean outcome. E.g. for the task of predicting whether a patient will be diagnosed with diabetes within the next year, this function will return a list of PredictionTime objects, each of which contains the patient's data for a specific prediction time (predictors, prediction timestamp and whether the outcome occurs within the lookahead)."""
@@ -73,8 +73,10 @@ def to_prediction_times(
                end=prediction_timestamp,
            )

-            outcome_within_lookahead = outcome_timestamp <= (
-                prediction_timestamp + lookahead
+            outcome_within_lookahead = (
+                outcome_timestamp <= (prediction_timestamp + lookahead)
+                if outcome_timestamp is not None
+                else False
            )

# 2. Return prediction sequences
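For illustration only (not part of the commit): a minimal sketch of what the widened signature allows. When a patient has no recorded outcome, outcome_timestamp can now be None and the label resolves to False. The values below are made up; the conditional mirrors the one introduced in the hunk above.

import datetime as dt

# Illustrative values only; not taken from the repository.
lookahead = dt.timedelta(days=365)
prediction_timestamp = dt.datetime(2021, 1, 1)
outcome_timestamp = None  # patient never experiences the outcome

# Same conditional as in the diff above.
outcome_within_lookahead = (
    outcome_timestamp <= (prediction_timestamp + lookahead)
    if outcome_timestamp is not None
    else False
)
assert outcome_within_lookahead is False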
65 changes: 65 additions & 0 deletions psycop/common/feature_generation/sequences/cohort_definer_to_prediction_times.py
@@ -0,0 +1,65 @@
import datetime as dt
from collections import defaultdict

import polars as pl

from psycop.common.cohort_definition import CohortDefiner
from psycop.common.data_structures.patient import Patient
from psycop.common.data_structures.prediction_time import PredictionTime


class CohortToPredictionTimes:
    def __init__(self, cohort_definer: CohortDefiner, patient_objects: list[Patient]):
        self.cohort_definer = cohort_definer
        self.patients = patient_objects

    @staticmethod
    def _polars_dataframe_to_patient_timestamp_mapping(
        dataframe: pl.DataFrame,
        id_col_name: str,
        patient_timestamp_col_name: str,
    ) -> dict[str | int, list[dt.datetime]]:
        timestamp_dicts = dataframe.iter_rows(named=True)

        patient_to_prediction_times = defaultdict(list)
        for prediction_time_dict in timestamp_dicts:
            patient_id = prediction_time_dict[id_col_name]
            patient_to_prediction_times[patient_id].append(
                prediction_time_dict[patient_timestamp_col_name],
            )

        return patient_to_prediction_times

    def create_prediction_times(
        self,
        lookbehind: dt.timedelta,
        lookahead: dt.timedelta,
    ) -> tuple[PredictionTime, ...]:
        outcome_timestamps = self._polars_dataframe_to_patient_timestamp_mapping(
            dataframe=self.cohort_definer.get_outcome_timestamps(),
            id_col_name="dw_ek_borger",
            patient_timestamp_col_name="timestamp",
        )
        prediction_timestamps = self._polars_dataframe_to_patient_timestamp_mapping(
            dataframe=self.cohort_definer.get_filtered_prediction_times_bundle().prediction_times,
            id_col_name="dw_ek_borger",
            patient_timestamp_col_name="timestamp",
        )

        prediction_times = []
        for patient in self.patients:
            pt_outcome_timestamps = outcome_timestamps.get(patient.patient_id)

            if pt_outcome_timestamps is not None:
                outcome_timestamp = pt_outcome_timestamps[0]
            else:
                outcome_timestamp = None

            prediction_times += patient.to_prediction_times(
                lookbehind=lookbehind,
                lookahead=lookahead,
                outcome_timestamp=outcome_timestamp,
                prediction_timestamps=prediction_timestamps[patient.patient_id],
            )

        return tuple(prediction_times)
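A rough, standalone illustration of the grouping done by _polars_dataframe_to_patient_timestamp_mapping above: rows are iterated as dicts and each timestamp is appended to its patient's list via a defaultdict. The column names match the ones used in the new file; the data is invented.

import datetime as dt
from collections import defaultdict

import polars as pl

# Made-up example data; column names mirror the new file above.
df = pl.DataFrame(
    {
        "dw_ek_borger": [1, 1, 2],
        "timestamp": [
            dt.datetime(2021, 1, 1),
            dt.datetime(2021, 6, 1),
            dt.datetime(2022, 1, 1),
        ],
    }
)

# Same grouping strategy as the helper: one list of timestamps per patient id.
patient_to_timestamps: dict[int, list[dt.datetime]] = defaultdict(list)
for row in df.iter_rows(named=True):
    patient_to_timestamps[row["dw_ek_borger"]].append(row["timestamp"])

assert patient_to_timestamps[1] == [dt.datetime(2021, 1, 1), dt.datetime(2021, 6, 1)]
assert patient_to_timestamps[2] == [dt.datetime(2022, 1, 1)]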
@@ -0,0 +1,53 @@
import datetime as dt

import polars as pl

from psycop.common.cohort_definition import CohortDefiner, FilteredPredictionTimeBundle
from psycop.common.data_structures.test_patient import get_test_patient
from psycop.common.feature_generation.sequences.cohort_definer_to_prediction_times import (
    CohortToPredictionTimes,
)
from psycop.common.test_utils.str_to_df import str_to_pl_df


class MockCohortDefiner(CohortDefiner):
    @staticmethod
    def get_filtered_prediction_times_bundle() -> FilteredPredictionTimeBundle:
        df = str_to_pl_df(
            """dw_ek_borger,timestamp
            1,2021-01-01
            2,2022-01-01
            """,
        )
        return FilteredPredictionTimeBundle(
            prediction_times=df,
            filter_steps=[],
        )

    @staticmethod
    def get_outcome_timestamps() -> pl.DataFrame:
        df = str_to_pl_df(
            """dw_ek_borger,timestamp
            1,2021-01-02
            """,
        )
        return df


def test_polars_dataframe_to_dict():
    """Test that each prediction time is mapped to the correct patient."""
    prediction_times = CohortToPredictionTimes(
        cohort_definer=MockCohortDefiner(),
        patient_objects=[
            get_test_patient(patient_id=1),
            get_test_patient(patient_id=2),
        ],
    ).create_prediction_times(
        lookbehind=dt.timedelta(days=1),
        lookahead=dt.timedelta(days=1),
    )

    assert len(prediction_times) == 2
    patient_1 = list(filter(lambda x: x.patient.patient_id == 1, prediction_times))[0]
    assert patient_1.prediction_timestamp == dt.datetime(2021, 1, 1)
    # The rest of the prediction time creation logic is tested in the patient object tests

This file was deleted.

@@ -5,7 +5,7 @@
import polars as pl

from psycop.projects.restraint.model_evaluation.config import (
-    EVAL_RUN,
+    BEST_DEV_RUN,
    FIGURES_PATH,
    TABLES_PATH,
    TEXT_EVAL_RUN,
@@ -29,7 +29,7 @@ def shap_dependency_pipeline(
    top_n_shap_scatter: Optional[int] = 20,
):
    if model == "baseline":
-        run, f_path, t_path = EVAL_RUN, FIGURES_PATH, TABLES_PATH
+        run, f_path, t_path = BEST_DEV_RUN, FIGURES_PATH, TABLES_PATH
    elif model == "text":
        run, f_path, t_path = TEXT_EVAL_RUN, TEXT_FIGURES_PATH, TEXT_TABLES_PATH
    else:
@@ -7,7 +7,10 @@
    infer_outcome_col_name,
    infer_predictor_col_name,
)
-from psycop.projects.restraint.model_evaluation.config import EVAL_RUN, TEXT_EVAL_RUN
+from psycop.projects.restraint.model_evaluation.config import (
+    BEST_DEV_RUN,
+    TEXT_EVAL_RUN,
+)
from psycop.projects.restraint.model_evaluation.data.load_true_data import (
    load_file_from_pkl,
    load_fullconfig,
@@ -31,7 +34,7 @@ def shap_summary_pipeline(model: Literal["baseline", "text"], top_n: int = 20):
        top_n (int, optional): How many features to include in gain plot. Defaults to 20
    """
    if model == "baseline":
-        run = EVAL_RUN
+        run = BEST_DEV_RUN
    elif model == "text":
        run = TEXT_EVAL_RUN
    else:
@@ -3,19 +3,18 @@
import pandas as pd
import plotnine as pn

+from psycop.common.model_evaluation.confusion_matrix import confusion_matrix
from psycop.common.model_evaluation.confusion_matrix.confusion_matrix import (
    ConfusionMatrix,
-    get_confusion_matrix_cells_from_df,
)
from psycop.common.test_utils.str_to_df import str_to_df
from psycop.projects.restraint.model_evaluation.config import (
+    BEST_DEV_RUN,
    COLOURS,
-    MODEL_NAME,
    PN_THEME,
    TABLES_PATH,
)
-from psycop.projects.restraint.utils.best_runs import Run
+from psycop.projects.restraint.utils.best_runs import Run, df_to_eval_dataset


def plotnine_confusion_matrix(matrix: ConfusionMatrix, x_title: str) -> pn.ggplot:
@@ -63,7 +62,7 @@ def confusion_matrix_metrics(
    Creates a confusion matrix dataframe with PPV, NPV, SENS, and SPEC.
    """
    # Calculate the confusion matrix using sklearn
-    cm = get_confusion_matrix_cells_from_df(df)
+    cm = confusion_matrix.get_confusion_matrix_cells_from_df(df)

    # Extract the TP, FP, TN, and FN values from the confusion matrix
@@ -92,11 +91,15 @@ def confusion_matrix_metrics(
def confusion_matrix_pipeline(run: Run, path: Path):
    eval_ds = run.get_eval_dataset()

+    df_to_eval_dataset(eval_ds, custom_columns=None)  # type: ignore
    df = pd.DataFrame(
        {
-            "true": eval_ds.y,
+            "true": eval_ds["outcome_coercion_type_within_2_days"].replace(  # type: ignore
+                {1: 0, 2: 0, 3: 1},
+            ),
            "pred": eval_ds.get_predictions_for_positive_rate(
                desired_positive_rate=run.pos_rate,
+                y_hat_probs_column="y_hat_prob",
            )[0],
        },
    )
@@ -109,7 +112,7 @@ def confusion_matrix_pipeline(run: Run, path: Path):
    conf_matrix.to_csv(path / "confusion_matrix.csv")
    metrics_df.to_csv(path / "confusion_matrix_metrics.csv")

-    plotnine_confusion_matrix(cm, f"Confusion Matrix for {MODEL_NAME[run.name]}").save(
+    plotnine_confusion_matrix(cm, "Confusion Matrix").save(
        path / "confusion_matrix.png",
    )
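For illustration only (not part of the commit): the new "true" column above appears to binarize the coercion-type codes with pandas Series.replace, mapping 1 and 2 to 0 and 3 to 1. The sketch below uses made-up labels in place of eval_ds["outcome_coercion_type_within_2_days"]; only the mapping is taken from the diff.

import pandas as pd

# Made-up class labels standing in for the coercion-type column.
coercion_type = pd.Series([1, 2, 3, 3, 1])

# Mapping used in the diff above: classes 1 and 2 become 0, class 3 becomes 1.
binary_outcome = coercion_type.replace({1: 0, 2: 0, 3: 1})

assert binary_outcome.tolist() == [0, 0, 1, 1, 0]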
@@ -3,8 +3,8 @@
import plotnine as pn

from psycop.projects.restraint.model_evaluation.config import (
+    BEST_DEV_RUN,
    COLOURS,
-    EVAL_RUN,
    FIGURES_PATH,
    PN_THEME,
)
@@ -43,4 +43,4 @@ def incidence_by_time_until_outcome_pipeline(run: Run, path: Path):


if __name__ == "__main__":
-    incidence_by_time_until_outcome_pipeline(EVAL_RUN, FIGURES_PATH)
+    incidence_by_time_until_outcome_pipeline(BEST_DEV_RUN, FIGURES_PATH)