Commit

Merge branch 'main' into jakdam-fa-adapt-eval
bokajgd committed Sep 4, 2023
2 parents a448ab4 + 4008cb6 commit 3bca38b
Showing 28 changed files with 201 additions and 194 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,20 @@

<!--next-version-placeholder-->

## v0.106.0 (2023-08-31)

### Feature

* First version ([`2fc715c`](https://github.com/Aarhus-Psychiatry-Research/psycop-common/commit/2fc715c099c94b9dd470595f73546aa3b9c0786b))

### Fix

* Possibly unbound variable ([`864f59b`](https://github.com/Aarhus-Psychiatry-Research/psycop-common/commit/864f59bfa20d27e5b8ec28a5b9d84a3a2ac487ba))

### Documentation

* Point to patient object tests ([`d727664`](https://github.com/Aarhus-Psychiatry-Research/psycop-common/commit/d7276642d5a650110a6693c6428058fa3784f432))

## v0.105.0 (2023-08-30)

### Feature
8 changes: 5 additions & 3 deletions psycop/common/data_structures/patient.py
@@ -58,7 +58,7 @@ def to_prediction_times(
        self,
        lookbehind: dt.timedelta,
        lookahead: dt.timedelta,
-        outcome_timestamp: dt.datetime,
+        outcome_timestamp: dt.datetime | None,
        prediction_timestamps: Sequence[dt.datetime],
    ) -> list[PredictionTime]:
        """Creates prediction times for a boolean outcome. E.g. for the task of predicting whether a patient will be diagnosed with diabetes within the next year, this function will return a list of PredictionTime objects, each of which contains the patient's data for a specific prediction time (predictors, prediction timestamp and whether the outcome occurs within the lookahead)."""
@@ -73,8 +73,10 @@ def to_prediction_times(
                end=prediction_timestamp,
            )

-            outcome_within_lookahead = outcome_timestamp <= (
-                prediction_timestamp + lookahead
+            outcome_within_lookahead = (
+                outcome_timestamp <= (prediction_timestamp + lookahead)
+                if outcome_timestamp is not None
+                else False
            )

# 2. Return prediction sequences
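For illustration only (not part of the commit): a minimal sketch of what the widened signature allows. When a patient has no recorded outcome, outcome_timestamp can now be None and the label resolves to False. The values below are made up; the conditional mirrors the one introduced in the hunk above.

import datetime as dt

# Illustrative values only; not taken from the repository.
lookahead = dt.timedelta(days=365)
prediction_timestamp = dt.datetime(2021, 1, 1)
outcome_timestamp = None  # patient never experiences the outcome

# Same conditional as in the diff above.
outcome_within_lookahead = (
    outcome_timestamp <= (prediction_timestamp + lookahead)
    if outcome_timestamp is not None
    else False
)
assert outcome_within_lookahead is False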
65 changes: 65 additions & 0 deletions psycop/common/feature_generation/sequences/cohort_definer_to_prediction_times.py
@@ -0,0 +1,65 @@
import datetime as dt
from collections import defaultdict

import polars as pl

from psycop.common.cohort_definition import CohortDefiner
from psycop.common.data_structures.patient import Patient
from psycop.common.data_structures.prediction_time import PredictionTime


class CohortToPredictionTimes:
    def __init__(self, cohort_definer: CohortDefiner, patient_objects: list[Patient]):
        self.cohort_definer = cohort_definer
        self.patients = patient_objects

    @staticmethod
    def _polars_dataframe_to_patient_timestamp_mapping(
        dataframe: pl.DataFrame,
        id_col_name: str,
        patient_timestamp_col_name: str,
    ) -> dict[str | int, list[dt.datetime]]:
        timestamp_dicts = dataframe.iter_rows(named=True)

        patient_to_prediction_times = defaultdict(list)
        for prediction_time_dict in timestamp_dicts:
            patient_id = prediction_time_dict[id_col_name]
            patient_to_prediction_times[patient_id].append(
                prediction_time_dict[patient_timestamp_col_name],
            )

        return patient_to_prediction_times

    def create_prediction_times(
        self,
        lookbehind: dt.timedelta,
        lookahead: dt.timedelta,
    ) -> tuple[PredictionTime, ...]:
        outcome_timestamps = self._polars_dataframe_to_patient_timestamp_mapping(
            dataframe=self.cohort_definer.get_outcome_timestamps(),
            id_col_name="dw_ek_borger",
            patient_timestamp_col_name="timestamp",
        )
        prediction_timestamps = self._polars_dataframe_to_patient_timestamp_mapping(
            dataframe=self.cohort_definer.get_filtered_prediction_times_bundle().prediction_times,
            id_col_name="dw_ek_borger",
            patient_timestamp_col_name="timestamp",
        )

        prediction_times = []
        for patient in self.patients:
            pt_outcome_timestamps = outcome_timestamps.get(patient.patient_id)

            if pt_outcome_timestamps is not None:
                outcome_timestamp = pt_outcome_timestamps[0]
            else:
                outcome_timestamp = None

            prediction_times += patient.to_prediction_times(
                lookbehind=lookbehind,
                lookahead=lookahead,
                outcome_timestamp=outcome_timestamp,
                prediction_timestamps=prediction_timestamps[patient.patient_id],
            )

        return tuple(prediction_times)
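A rough, standalone illustration of the grouping done by _polars_dataframe_to_patient_timestamp_mapping above: rows are iterated as dicts and each timestamp is appended to its patient's list via a defaultdict. The column names match the ones used in the new file; the data is invented.

import datetime as dt
from collections import defaultdict

import polars as pl

# Made-up example data; column names mirror the new file above.
df = pl.DataFrame(
    {
        "dw_ek_borger": [1, 1, 2],
        "timestamp": [
            dt.datetime(2021, 1, 1),
            dt.datetime(2021, 6, 1),
            dt.datetime(2022, 1, 1),
        ],
    }
)

# Same grouping strategy as the helper: one list of timestamps per patient id.
patient_to_timestamps: dict[int, list[dt.datetime]] = defaultdict(list)
for row in df.iter_rows(named=True):
    patient_to_timestamps[row["dw_ek_borger"]].append(row["timestamp"])

assert patient_to_timestamps[1] == [dt.datetime(2021, 1, 1), dt.datetime(2021, 6, 1)]
assert patient_to_timestamps[2] == [dt.datetime(2022, 1, 1)]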
@@ -0,0 +1,53 @@
import datetime as dt

import polars as pl

from psycop.common.cohort_definition import CohortDefiner, FilteredPredictionTimeBundle
from psycop.common.data_structures.test_patient import get_test_patient
from psycop.common.feature_generation.sequences.cohort_definer_to_prediction_times import (
    CohortToPredictionTimes,
)
from psycop.common.test_utils.str_to_df import str_to_pl_df


class MockCohortDefiner(CohortDefiner):
    @staticmethod
    def get_filtered_prediction_times_bundle() -> FilteredPredictionTimeBundle:
        df = str_to_pl_df(
            """dw_ek_borger,timestamp
            1,2021-01-01
            2,2022-01-01
            """,
        )
        return FilteredPredictionTimeBundle(
            prediction_times=df,
            filter_steps=[],
        )

    @staticmethod
    def get_outcome_timestamps() -> pl.DataFrame:
        df = str_to_pl_df(
            """dw_ek_borger,timestamp
            1,2021-01-02
            """,
        )
        return df


def test_polars_dataframe_to_dict():
    """Test that each prediction time is mapped to the correct patient."""
    prediction_times = CohortToPredictionTimes(
        cohort_definer=MockCohortDefiner(),
        patient_objects=[
            get_test_patient(patient_id=1),
            get_test_patient(patient_id=2),
        ],
    ).create_prediction_times(
        lookbehind=dt.timedelta(days=1),
        lookahead=dt.timedelta(days=1),
    )

    assert len(prediction_times) == 2
    patient_1 = list(filter(lambda x: x.patient.patient_id == 1, prediction_times))[0]
    assert patient_1.prediction_timestamp == dt.datetime(2021, 1, 1)
    # The rest of the prediction time creation logic is tested in the patient object tests

This file was deleted.

@@ -5,7 +5,7 @@
import polars as pl

from psycop.projects.restraint.model_evaluation.config import (
-    EVAL_RUN,
+    BEST_DEV_RUN,
    FIGURES_PATH,
    TABLES_PATH,
    TEXT_EVAL_RUN,
@@ -29,7 +29,7 @@ def shap_dependency_pipeline(
    top_n_shap_scatter: Optional[int] = 20,
):
    if model == "baseline":
-        run, f_path, t_path = EVAL_RUN, FIGURES_PATH, TABLES_PATH
+        run, f_path, t_path = BEST_DEV_RUN, FIGURES_PATH, TABLES_PATH
    elif model == "text":
        run, f_path, t_path = TEXT_EVAL_RUN, TEXT_FIGURES_PATH, TEXT_TABLES_PATH
    else:
@@ -7,7 +7,10 @@
    infer_outcome_col_name,
    infer_predictor_col_name,
)
-from psycop.projects.restraint.model_evaluation.config import EVAL_RUN, TEXT_EVAL_RUN
+from psycop.projects.restraint.model_evaluation.config import (
+    BEST_DEV_RUN,
+    TEXT_EVAL_RUN,
+)
from psycop.projects.restraint.model_evaluation.data.load_true_data import (
    load_file_from_pkl,
    load_fullconfig,
@@ -31,7 +34,7 @@ def shap_summary_pipeline(model: Literal["baseline", "text"], top_n: int = 20):
        top_n (int, optional): How many features to include in gain plot. Defaults to 20
    """
    if model == "baseline":
-        run = EVAL_RUN
+        run = BEST_DEV_RUN
    elif model == "text":
        run = TEXT_EVAL_RUN
    else:
@@ -3,19 +3,18 @@
import pandas as pd
import plotnine as pn

+from psycop.common.model_evaluation.confusion_matrix import confusion_matrix
from psycop.common.model_evaluation.confusion_matrix.confusion_matrix import (
    ConfusionMatrix,
-    get_confusion_matrix_cells_from_df,
)
from psycop.common.test_utils.str_to_df import str_to_df
from psycop.projects.restraint.model_evaluation.config import (
+    BEST_DEV_RUN,
    COLOURS,
-    MODEL_NAME,
    PN_THEME,
    TABLES_PATH,
)
-from psycop.projects.restraint.utils.best_runs import Run
+from psycop.projects.restraint.utils.best_runs import Run, df_to_eval_dataset


def plotnine_confusion_matrix(matrix: ConfusionMatrix, x_title: str) -> pn.ggplot:
@@ -63,7 +62,7 @@ def confusion_matrix_metrics(
    Creates a confusion matrix dataframe with PPV, NPV, SENS, and SPEC.
    """
    # Calculate the confusion matrix using sklearn
-    cm = get_confusion_matrix_cells_from_df(df)
+    cm = confusion_matrix.get_confusion_matrix_cells_from_df(df)

    # Extract the TP, FP, TN, and FN values from the confusion matrix
@@ -92,11 +91,15 @@ def confusion_matrix_metrics(
def confusion_matrix_pipeline(run: Run, path: Path):
    eval_ds = run.get_eval_dataset()

+    df_to_eval_dataset(eval_ds, custom_columns=None)  # type: ignore
    df = pd.DataFrame(
        {
-            "true": eval_ds.y,
+            "true": eval_ds["outcome_coercion_type_within_2_days"].replace(  # type: ignore
+                {1: 0, 2: 0, 3: 1},
+            ),
            "pred": eval_ds.get_predictions_for_positive_rate(
                desired_positive_rate=run.pos_rate,
+                y_hat_probs_column="y_hat_prob",
            )[0],
        },
    )
@@ -109,7 +112,7 @@ def confusion_matrix_pipeline(run: Run, path: Path):
    conf_matrix.to_csv(path / "confusion_matrix.csv")
    metrics_df.to_csv(path / "confusion_matrix_metrics.csv")

-    plotnine_confusion_matrix(cm, f"Confusion Matrix for {MODEL_NAME[run.name]}").save(
+    plotnine_confusion_matrix(cm, "Confusion Matrix").save(
        path / "confusion_matrix.png",
    )
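For illustration only (not part of the commit): the new "true" column above appears to binarize the coercion-type codes with pandas Series.replace, mapping 1 and 2 to 0 and 3 to 1. The sketch below uses made-up labels in place of eval_ds["outcome_coercion_type_within_2_days"]; only the mapping is taken from the diff.

import pandas as pd

# Made-up class labels standing in for the coercion-type column.
coercion_type = pd.Series([1, 2, 3, 3, 1])

# Mapping used in the diff above: classes 1 and 2 become 0, class 3 becomes 1.
binary_outcome = coercion_type.replace({1: 0, 2: 0, 3: 1})

assert binary_outcome.tolist() == [0, 0, 1, 1, 0]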
@@ -3,8 +3,8 @@
import plotnine as pn

from psycop.projects.restraint.model_evaluation.config import (
+    BEST_DEV_RUN,
    COLOURS,
-    EVAL_RUN,
    FIGURES_PATH,
    PN_THEME,
)
@@ -43,4 +43,4 @@ def incidence_by_time_until_outcome_pipeline(run: Run, path: Path):


if __name__ == "__main__":
-    incidence_by_time_until_outcome_pipeline(EVAL_RUN, FIGURES_PATH)
+    incidence_by_time_until_outcome_pipeline(BEST_DEV_RUN, FIGURES_PATH)