From 6678f36bb0e2c36e8e1da8bdfdd65af1c4fc1e7c Mon Sep 17 00:00:00 2001 From: sarakolding Date: Tue, 16 Jan 2024 14:07:20 +0000 Subject: [PATCH 1/7] first stab --- .vscode/settings.json | 2 +- psycop/common/cohort_definition.py | 57 +++++++++++++++++-- .../loaders/raw/load_ids.py | 2 +- .../sequences/prediction_times_from_cohort.py | 4 +- .../test_prediction_time_from_cohort.py | 13 +++-- psycop/common/test_cohort_definition.py | 2 +- .../cancer_cohort_definer.py | 6 +- .../cancer/feature_generation/main.py | 2 +- .../feature_generation/specify_features.py | 4 +- psycop/projects/cancer/main.py | 2 +- .../clozapine_cohort_definition.py | 7 ++- .../clozapine/feature_generation/main.py | 2 +- .../modules/specify_features.py | 4 +- .../cvd_cohort_definition.py | 7 ++- .../projects/cvd/feature_generation/main.py | 2 +- .../feature_generation/specify_features.py | 2 +- ..._admissions_inpatient_cohort_definition.py | 9 +-- .../feature_generation/main.py | 4 +- .../check_age_distribution.py | 4 +- .../scz_bp_prediction_time_loader.py | 11 ++-- .../scz_bp_generate_features.py | 2 +- .../scz_bp_specify_features.py | 2 +- .../cohort_definition/t2d_cohort_definer.py | 5 +- .../projects/t2d/feature_generation/main.py | 2 +- psycop/projects/t2d/main.py | 2 +- .../table_one/table_one.py | 2 +- 26 files changed, 111 insertions(+), 50 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 114b28c0d..7e3020ecb 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,7 +1,7 @@ { "python.analysis.typeCheckingMode": "strict", "python.testing.pytestArgs": [ - "psycop" + "." 
], "githubIssues.queries": [ { diff --git a/psycop/common/cohort_definition.py b/psycop/common/cohort_definition.py index 81fc7aa53..c5017b0e0 100644 --- a/psycop/common/cohort_definition.py +++ b/psycop/common/cohort_definition.py @@ -1,11 +1,60 @@ from abc import ABC, abstractmethod -from collections.abc import Iterable +from collections.abc import Iterable, Sequence +from dataclasses import dataclass from typing import Protocol, runtime_checkable import polars as pl from wasabi import Printer from psycop.common.global_utils.pydantic_basemodel import PSYCOPBaseModel +from psycop.common.types.validated_frame import ValidatedFrame +from psycop.common.types.validator_rules import ( + ColumnExistsRule, + ColumnTypeRule, + ValidatorRule, +) + + +@dataclass(frozen=True) +class PredictionTimeFrame(ValidatedFrame[pl.DataFrame]): + """ValidatedFrame with extra validation for prediction times""" + + frame: pl.DataFrame + + entity_id_col_name: str = "dw_ek_borger" + entity_id_col_rules: Sequence[ValidatorRule] = ( + ColumnExistsRule(), + ColumnTypeRule(expected_type=pl.Int64), + ) + + timestamp_col_name: str = "timestamp" + timestamp_col_rules: Sequence[ValidatorRule] = ( + ColumnExistsRule(), + ColumnTypeRule(expected_type=pl.Datetime), + ) + + allow_extra_columns: bool = True + + +@dataclass(frozen=True) +class OutcomeTimestampFrame(ValidatedFrame[pl.DataFrame]): + """ValidatedFrame with extra validation for outcome timestamps""" + + frame: pl.DataFrame + + entity_id_col_name: str = "dw_ek_borger" + entity_id_col_rules: Sequence[ValidatorRule] = ( + ColumnExistsRule(), + ColumnTypeRule(expected_type=pl.Int64), + ) + + timestamp_col_name: str = "timestamp" + timestamp_col_rules: Sequence[ValidatorRule] = ( + ColumnExistsRule(), + ColumnTypeRule(expected_type=pl.Datetime), + ) + + allow_extra_columns: bool = True @runtime_checkable @@ -34,7 +83,7 @@ def n_dropped_ids(self) -> int: class FilteredPredictionTimeBundle(PSYCOPBaseModel): - prediction_times: pl.DataFrame + 
prediction_times: PredictionTimeFrame filter_steps: list[StepDelta] @@ -46,7 +95,7 @@ def get_filtered_prediction_times_bundle() -> FilteredPredictionTimeBundle: @staticmethod @abstractmethod - def get_outcome_timestamps() -> pl.DataFrame: + def get_outcome_timestamps() -> OutcomeTimestampFrame: ... @@ -96,6 +145,6 @@ def filter_prediction_times( prediction_times = prediction_times.drop("date_of_birth") return FilteredPredictionTimeBundle( - prediction_times=prediction_times.collect(), + prediction_times=PredictionTimeFrame(frame=prediction_times.collect()), filter_steps=stepdeltas, ) diff --git a/psycop/common/feature_generation/loaders/raw/load_ids.py b/psycop/common/feature_generation/loaders/raw/load_ids.py index 9dd1c7be1..d56ec586c 100644 --- a/psycop/common/feature_generation/loaders/raw/load_ids.py +++ b/psycop/common/feature_generation/loaders/raw/load_ids.py @@ -32,7 +32,7 @@ class SplitFrame(ValidatedFrame[pl.LazyFrame]): id_col_name: str = "dw_ek_borger" id_col_rules: Sequence[ValidatorRule] = ( ColumnExistsRule(), - ColumnTypeRule(expected_type=pl.Utf8), + ColumnTypeRule(expected_type=pl.Int64), ) diff --git a/psycop/common/feature_generation/sequences/prediction_times_from_cohort.py b/psycop/common/feature_generation/sequences/prediction_times_from_cohort.py index 0645a3737..397a461c3 100644 --- a/psycop/common/feature_generation/sequences/prediction_times_from_cohort.py +++ b/psycop/common/feature_generation/sequences/prediction_times_from_cohort.py @@ -94,13 +94,13 @@ def create_prediction_times( lookahead: dt.timedelta, ) -> tuple[PredictionTime, ...]: outcome_timestamps = self._polars_dataframe_to_patient_timestamp_mapping( - dataframe=self.cohort_definer.get_outcome_timestamps(), + dataframe=self.cohort_definer.get_outcome_timestamps().frame, id_col_name="dw_ek_borger", patient_timestamp_col_name="timestamp", ) naive_prediction_times = ( - self.cohort_definer.get_filtered_prediction_times_bundle().prediction_times + 
self.cohort_definer.get_filtered_prediction_times_bundle().prediction_times.frame ).lazy() prediction_times_for_split = self.split_filter.apply( naive_prediction_times, diff --git a/psycop/common/feature_generation/sequences/test_prediction_time_from_cohort.py b/psycop/common/feature_generation/sequences/test_prediction_time_from_cohort.py index 6131043c0..5b9971331 100644 --- a/psycop/common/feature_generation/sequences/test_prediction_time_from_cohort.py +++ b/psycop/common/feature_generation/sequences/test_prediction_time_from_cohort.py @@ -2,7 +2,12 @@ import polars as pl -from psycop.common.cohort_definition import CohortDefiner, FilteredPredictionTimeBundle +from psycop.common.cohort_definition import ( + CohortDefiner, + FilteredPredictionTimeBundle, + OutcomeTimestampFrame, + PredictionTimeFrame, +) from psycop.common.data_structures.test_patient import get_test_patient from psycop.common.feature_generation.sequences.prediction_times_from_cohort import ( PredictionTimesFromCohort, @@ -23,18 +28,18 @@ def get_filtered_prediction_times_bundle() -> FilteredPredictionTimeBundle: """, ) return FilteredPredictionTimeBundle( - prediction_times=df, + prediction_times=PredictionTimeFrame(df), filter_steps=[], ) @staticmethod - def get_outcome_timestamps() -> pl.DataFrame: + def get_outcome_timestamps() -> OutcomeTimestampFrame: df = str_to_pl_df( """dw_ek_borger,timestamp 1,2021-01-02 """, ) - return df + return OutcomeTimestampFrame(frame=df) def test_polars_dataframe_to_dict(): diff --git a/psycop/common/test_cohort_definition.py b/psycop/common/test_cohort_definition.py index a6506be11..87672d550 100644 --- a/psycop/common/test_cohort_definition.py +++ b/psycop/common/test_cohort_definition.py @@ -36,4 +36,4 @@ def apply(self, df: pl.LazyFrame) -> pl.LazyFrame: entity_id_col_name="entity_id", ) - assert len(filtered.prediction_times) == 1 + assert len(filtered.prediction_times.frame) == 1 diff --git 
a/psycop/projects/cancer/feature_generation/cohort_definition/cancer_cohort_definer.py b/psycop/projects/cancer/feature_generation/cohort_definition/cancer_cohort_definer.py index fde053759..615bf0b5d 100644 --- a/psycop/projects/cancer/feature_generation/cohort_definition/cancer_cohort_definer.py +++ b/psycop/projects/cancer/feature_generation/cohort_definition/cancer_cohort_definer.py @@ -4,10 +4,12 @@ CohortDefiner, FilteredPredictionTimeBundle, filter_prediction_times, + OutcomeTimestampFrame ) from psycop.common.feature_generation.loaders.raw.load_visits import ( physical_visits_to_psychiatry, ) + from psycop.projects.cancer.feature_generation.cohort_definition.eligible_prediction_times.single_filters import ( CancerMinAgeFilter, CancerMinDateFilter, @@ -41,8 +43,8 @@ def get_filtered_prediction_times_bundle() -> FilteredPredictionTimeBundle: ) @staticmethod - def get_outcome_timestamps() -> pl.DataFrame: - return pl.from_pandas(get_first_cancer_diagnosis()) + def get_outcome_timestamps() -> OutcomeTimestampFrame: + return OutcomeTimestampFrame(frame=pl.from_pandas(get_first_cancer_diagnosis())) if __name__ == "__main__": diff --git a/psycop/projects/cancer/feature_generation/main.py b/psycop/projects/cancer/feature_generation/main.py index 8a4569d8c..bf8717ec1 100644 --- a/psycop/projects/cancer/feature_generation/main.py +++ b/psycop/projects/cancer/feature_generation/main.py @@ -23,7 +23,7 @@ init_wandb_and_generate_feature_set( project_info=get_cancer_project_info(), - eligible_prediction_times=CancerCohortDefiner.get_filtered_prediction_times_bundle().prediction_times.to_pandas(), + eligible_prediction_times=CancerCohortDefiner.get_filtered_prediction_times_bundle().prediction_times.frame.to_pandas(), feature_specs=get_cancer_feature_specifications(), generate_in_chunks=True, chunksize=10, diff --git a/psycop/projects/cancer/feature_generation/specify_features.py b/psycop/projects/cancer/feature_generation/specify_features.py index 0cf0dc85e..e5d4611ec 
100644 --- a/psycop/projects/cancer/feature_generation/specify_features.py +++ b/psycop/projects/cancer/feature_generation/specify_features.py @@ -122,7 +122,7 @@ def _get_outcome_specs(self) -> list[OutcomeSpec]: return [ OutcomeSpec( feature_base_name="first_cancer_diagnosis", - timeseries_df=CancerCohortDefiner.get_outcome_timestamps().to_pandas(), + timeseries_df=CancerCohortDefiner.get_outcome_timestamps().frame.to_pandas(), lookahead_days=365, aggregation_fn=maximum, fallback=0, @@ -134,7 +134,7 @@ def _get_outcome_specs(self) -> list[OutcomeSpec]: return OutcomeGroupSpec( named_dataframes=[ NamedDataframe( - df=CancerCohortDefiner.get_outcome_timestamps().to_pandas(), + df=CancerCohortDefiner.get_outcome_timestamps().frame.to_pandas(), name="first_cancer_diagnosis", ), ], diff --git a/psycop/projects/cancer/main.py b/psycop/projects/cancer/main.py index fba1f2b4b..e40d70400 100644 --- a/psycop/projects/cancer/main.py +++ b/psycop/projects/cancer/main.py @@ -15,7 +15,7 @@ if __name__ == "__main__": feature_set_path = init_wandb_and_generate_feature_set( project_info=get_cancer_project_info(), - eligible_prediction_times=CancerCohortDefiner.get_filtered_prediction_times_bundle().prediction_times.to_pandas(), + eligible_prediction_times=CancerCohortDefiner.get_filtered_prediction_times_bundle().prediction_times.frame.to_pandas(), feature_specs=get_cancer_feature_specifications(), generate_in_chunks=True, chunksize=10, diff --git a/psycop/projects/clozapine/feature_generation/cohort_definition/clozapine_cohort_definition.py b/psycop/projects/clozapine/feature_generation/cohort_definition/clozapine_cohort_definition.py index 9ef49de59..cd08cf2e2 100644 --- a/psycop/projects/clozapine/feature_generation/cohort_definition/clozapine_cohort_definition.py +++ b/psycop/projects/clozapine/feature_generation/cohort_definition/clozapine_cohort_definition.py @@ -3,6 +3,7 @@ from psycop.common.cohort_definition import ( CohortDefiner, FilteredPredictionTimeBundle, + 
OutcomeTimestampFrame, filter_prediction_times, ) from psycop.common.feature_generation.loaders.raw.load_visits import ( @@ -45,12 +46,12 @@ def get_filtered_prediction_times_bundle() -> FilteredPredictionTimeBundle: return result @staticmethod - def get_outcome_timestamps() -> pl.DataFrame: - return ( + def get_outcome_timestamps() -> OutcomeTimestampFrame: + return OutcomeTimestampFrame(frame=( pl.from_pandas(get_first_clozapine_prescription()) .with_columns(value=pl.lit(1)) .select(["dw_ek_borger", "timestamp", "value"]) - ) + )) if __name__ == "__main__": diff --git a/psycop/projects/clozapine/feature_generation/main.py b/psycop/projects/clozapine/feature_generation/main.py index e8d02b312..21791eea6 100644 --- a/psycop/projects/clozapine/feature_generation/main.py +++ b/psycop/projects/clozapine/feature_generation/main.py @@ -94,7 +94,7 @@ def main( if generate_in_chunks: flattened_df = ChunkedFeatureGenerator.create_flattened_dataset_with_chunking( project_info=project_info, - eligible_prediction_times=ClozapineCohortDefiner.get_filtered_prediction_times_bundle().prediction_times.to_pandas(), + eligible_prediction_times=ClozapineCohortDefiner.get_filtered_prediction_times_bundle().prediction_times.frame.to_pandas(), feature_specs=feature_specs, # type: ignore chunksize=chunksize, ) diff --git a/psycop/projects/clozapine/feature_generation/modules/specify_features.py b/psycop/projects/clozapine/feature_generation/modules/specify_features.py index 59d2fcd03..24c4d2b8e 100644 --- a/psycop/projects/clozapine/feature_generation/modules/specify_features.py +++ b/psycop/projects/clozapine/feature_generation/modules/specify_features.py @@ -158,7 +158,7 @@ def _get_outcome_specs(self) -> list[OutcomeSpec]: return OutcomeGroupSpec( named_dataframes=[ NamedDataframe( - df=ClozapineCohortDefiner.get_outcome_timestamps().to_pandas(), + df=ClozapineCohortDefiner.get_outcome_timestamps().frame.to_pandas(), name="first_clozapine_prescription", ), ], @@ -176,7 +176,7 @@ def 
_get_outcome_timestamp_specs(self) -> list[OutcomeSpec]: return OutcomeGroupSpec( named_dataframes=[ NamedDataframe( - df=ClozapineCohortDefiner.get_outcome_timestamps().to_pandas(), + df=ClozapineCohortDefiner.get_outcome_timestamps().frame.to_pandas(), name="first_clozapine_prescription", ), ], diff --git a/psycop/projects/cvd/feature_generation/cohort_definition/cvd_cohort_definition.py b/psycop/projects/cvd/feature_generation/cohort_definition/cvd_cohort_definition.py index 8b29ba7ae..2881a19ab 100644 --- a/psycop/projects/cvd/feature_generation/cohort_definition/cvd_cohort_definition.py +++ b/psycop/projects/cvd/feature_generation/cohort_definition/cvd_cohort_definition.py @@ -3,6 +3,7 @@ from psycop.common.cohort_definition import ( CohortDefiner, FilteredPredictionTimeBundle, + OutcomeTimestampFrame, filter_prediction_times, ) from psycop.common.feature_generation.loaders.raw.load_visits import ( @@ -45,12 +46,12 @@ def get_filtered_prediction_times_bundle() -> FilteredPredictionTimeBundle: return result @staticmethod - def get_outcome_timestamps() -> pl.DataFrame: - return ( + def get_outcome_timestamps() -> OutcomeTimestampFrame: + return OutcomeTimestampFrame(frame=( pl.from_pandas(get_first_cvd_indicator()) .with_columns(value=pl.lit(1)) .select(["dw_ek_borger", "timestamp", "value"]) - ) + )) if __name__ == "__main__": diff --git a/psycop/projects/cvd/feature_generation/main.py b/psycop/projects/cvd/feature_generation/main.py index 5579f3329..8e7bd9bc7 100644 --- a/psycop/projects/cvd/feature_generation/main.py +++ b/psycop/projects/cvd/feature_generation/main.py @@ -24,7 +24,7 @@ def get_cvd_project_info() -> ProjectInfo: if __name__ == "__main__": project_info = get_cvd_project_info() eligible_prediction_times = ( - CVDCohortDefiner.get_filtered_prediction_times_bundle().prediction_times.to_pandas() + CVDCohortDefiner.get_filtered_prediction_times_bundle().prediction_times.frame.to_pandas() ) feature_specs = 
CVDFeatureSpecifier().get_feature_specs(layer=3) diff --git a/psycop/projects/cvd/feature_generation/specify_features.py b/psycop/projects/cvd/feature_generation/specify_features.py index cc87b6223..eeafe4e95 100644 --- a/psycop/projects/cvd/feature_generation/specify_features.py +++ b/psycop/projects/cvd/feature_generation/specify_features.py @@ -63,7 +63,7 @@ def _get_outcome_specs(self) -> list[OutcomeSpec]: return OutcomeGroupSpec( named_dataframes=[ NamedDataframe( - df=CVDCohortDefiner.get_outcome_timestamps().to_pandas(), + df=CVDCohortDefiner.get_outcome_timestamps().frame.to_pandas(), name="score2_cvd", ), ], diff --git a/psycop/projects/forced_admission_inpatient/cohort/forced_admissions_inpatient_cohort_definition.py b/psycop/projects/forced_admission_inpatient/cohort/forced_admissions_inpatient_cohort_definition.py index 3e704038e..f16c31c28 100644 --- a/psycop/projects/forced_admission_inpatient/cohort/forced_admissions_inpatient_cohort_definition.py +++ b/psycop/projects/forced_admission_inpatient/cohort/forced_admissions_inpatient_cohort_definition.py @@ -3,6 +3,7 @@ from psycop.common.cohort_definition import ( CohortDefiner, FilteredPredictionTimeBundle, + OutcomeTimestampFrame, filter_prediction_times, ) from psycop.projects.forced_admission_inpatient.cohort.extract_admissions_and_visits.get_forced_admissions import ( @@ -50,8 +51,8 @@ def get_filtered_prediction_times_bundle( ) @staticmethod - def get_outcome_timestamps() -> pl.DataFrame: - return pl.from_pandas(forced_admissions_onset_timestamps()) + def get_outcome_timestamps() -> OutcomeTimestampFrame: + return OutcomeTimestampFrame(frame=pl.from_pandas(forced_admissions_onset_timestamps())) if __name__ == "__main__": @@ -65,8 +66,8 @@ def get_outcome_timestamps() -> pl.DataFrame: ) ) - df = bundle.prediction_times.to_pandas() + df = bundle.prediction_times.frame.to_pandas() - df_no_washout = bundle_no_washout.prediction_times.to_pandas() + df_no_washout = 
bundle_no_washout.prediction_times.frame.to_pandas() outcome_timestamps = ForcedAdmissionsInpatientCohortDefiner.get_outcome_timestamps() diff --git a/psycop/projects/forced_admission_inpatient/feature_generation/main.py b/psycop/projects/forced_admission_inpatient/feature_generation/main.py index 8057c93fd..2a862beb6 100644 --- a/psycop/projects/forced_admission_inpatient/feature_generation/main.py +++ b/psycop/projects/forced_admission_inpatient/feature_generation/main.py @@ -100,7 +100,7 @@ def main( project_info=project_info, eligible_prediction_times=ForcedAdmissionsInpatientCohortDefiner.get_filtered_prediction_times_bundle( washout_on_prior_forced_admissions=washout_on_prior_forced_admissions, - ).prediction_times.to_pandas(), + ).prediction_times.frame.to_pandas(), feature_specs=feature_specs, # type: ignore chunksize=chunksize, ) @@ -110,7 +110,7 @@ def main( feature_specs=feature_specs, # type: ignore prediction_times_df=ForcedAdmissionsInpatientCohortDefiner.get_filtered_prediction_times_bundle( washout_on_prior_forced_admissions=washout_on_prior_forced_admissions, - ).prediction_times.to_pandas(), + ).prediction_times.frame.to_pandas(), drop_pred_times_with_insufficient_look_distance=False, project_info=project_info, ) diff --git a/psycop/projects/scz_bp/feature_generation/eligible_prediction_times/check_age_distribution.py b/psycop/projects/scz_bp/feature_generation/eligible_prediction_times/check_age_distribution.py index 97dd2ff02..dd6ba47ef 100644 --- a/psycop/projects/scz_bp/feature_generation/eligible_prediction_times/check_age_distribution.py +++ b/psycop/projects/scz_bp/feature_generation/eligible_prediction_times/check_age_distribution.py @@ -11,11 +11,11 @@ if __name__ == "__main__": pred_times = SczBpCohort.get_filtered_prediction_times_bundle().prediction_times - outcome_timestamps = SczBpCohort.get_outcome_timestamps().lazy() + outcome_timestamps = SczBpCohort.get_outcome_timestamps().frame.lazy() outcome_with_age = 
SczBpAddAge().apply(outcome_timestamps) first_eligible_outcome = ( - pred_times.join( + pred_times.frame.join( outcome_with_age.collect(), how="left", on="dw_ek_borger", diff --git a/psycop/projects/scz_bp/feature_generation/eligible_prediction_times/scz_bp_prediction_time_loader.py b/psycop/projects/scz_bp/feature_generation/eligible_prediction_times/scz_bp_prediction_time_loader.py index caea4b99b..4a06f0475 100644 --- a/psycop/projects/scz_bp/feature_generation/eligible_prediction_times/scz_bp_prediction_time_loader.py +++ b/psycop/projects/scz_bp/feature_generation/eligible_prediction_times/scz_bp_prediction_time_loader.py @@ -7,6 +7,7 @@ from psycop.common.cohort_definition import ( CohortDefiner, FilteredPredictionTimeBundle, + OutcomeTimestampFrame, PredictionTimeFilter, filter_prediction_times, ) @@ -63,12 +64,12 @@ def get_filtered_prediction_times_bundle() -> FilteredPredictionTimeBundle: return filtered_prediction_time_bundle @staticmethod - def get_outcome_timestamps() -> pl.DataFrame: - return get_first_scz_or_bp_diagnosis().select( + def get_outcome_timestamps() -> OutcomeTimestampFrame: + return OutcomeTimestampFrame(frame=get_first_scz_or_bp_diagnosis().select( "dw_ek_borger", "timestamp", "value", - ) + )) @staticmethod def _get_filtering_steps() -> Iterable[PredictionTimeFilter]: @@ -87,9 +88,9 @@ def _get_filtering_steps() -> Iterable[PredictionTimeFilter]: f"{stepdelta.step_name} dropped {stepdelta.n_dropped_prediction_times}, remaining: {stepdelta.n_prediction_times_after}", ) - print(f"Remaining: {filtered_prediction_times.prediction_times.shape[0]}") + print(f"Remaining: {filtered_prediction_times.prediction_times.frame.shape[0]}") diag = get_first_scz_or_bp_diagnosis().select("dw_ek_borger", "source") - pos = filtered_prediction_times.prediction_times.join(diag, on="dw_ek_borger") + pos = filtered_prediction_times.prediction_times.frame.join(diag, on="dw_ek_borger") pos.groupby("source").agg(pl.col("dw_ek_borger").unique().len()) diff --git 
a/psycop/projects/scz_bp/feature_generation/scz_bp_generate_features.py b/psycop/projects/scz_bp/feature_generation/scz_bp_generate_features.py index 6ccd075b7..340f257a9 100644 --- a/psycop/projects/scz_bp/feature_generation/scz_bp_generate_features.py +++ b/psycop/projects/scz_bp/feature_generation/scz_bp_generate_features.py @@ -23,7 +23,7 @@ def get_scz_bp_project_info() -> ProjectInfo: if __name__ == "__main__": init_wandb_and_generate_feature_set( project_info=get_scz_bp_project_info(), - eligible_prediction_times=SczBpCohort.get_filtered_prediction_times_bundle().prediction_times.to_pandas(), + eligible_prediction_times=SczBpCohort.get_filtered_prediction_times_bundle().prediction_times.frame.to_pandas(), feature_specs=SczBpFeatureSpecifier().get_feature_specs( max_layer=3, lookbehind_days=[730], diff --git a/psycop/projects/scz_bp/feature_generation/scz_bp_specify_features.py b/psycop/projects/scz_bp/feature_generation/scz_bp_specify_features.py index ff5620e63..948c86c59 100644 --- a/psycop/projects/scz_bp/feature_generation/scz_bp_specify_features.py +++ b/psycop/projects/scz_bp/feature_generation/scz_bp_specify_features.py @@ -79,7 +79,7 @@ def _get_outcome_specs(self) -> list[OutcomeSpec]: return OutcomeGroupSpec( named_dataframes=[ NamedDataframe( - df=SczBpCohort.get_outcome_timestamps().to_pandas(), + df=SczBpCohort.get_outcome_timestamps().frame.to_pandas(), name="first_scz_or_bp", ), ], diff --git a/psycop/projects/t2d/feature_generation/cohort_definition/t2d_cohort_definer.py b/psycop/projects/t2d/feature_generation/cohort_definition/t2d_cohort_definer.py index 44d9608e2..36b66b59c 100644 --- a/psycop/projects/t2d/feature_generation/cohort_definition/t2d_cohort_definer.py +++ b/psycop/projects/t2d/feature_generation/cohort_definition/t2d_cohort_definer.py @@ -4,6 +4,7 @@ from psycop.common.cohort_definition import ( CohortDefiner, FilteredPredictionTimeBundle, + OutcomeTimestampFrame, filter_prediction_times, ) from 
psycop.common.feature_generation.loaders.raw.load_demographic import birthdays @@ -52,8 +53,8 @@ def get_filtered_prediction_times_bundle() -> FilteredPredictionTimeBundle: ) @staticmethod - def get_outcome_timestamps() -> pl.DataFrame: - return pl.from_pandas(get_first_diabetes_indicator()) + def get_outcome_timestamps() -> OutcomeTimestampFrame: + return OutcomeTimestampFrame(frame=pl.from_pandas(get_first_diabetes_indicator())) if __name__ == "__main__": diff --git a/psycop/projects/t2d/feature_generation/main.py b/psycop/projects/t2d/feature_generation/main.py index db7fc87eb..99162d152 100644 --- a/psycop/projects/t2d/feature_generation/main.py +++ b/psycop/projects/t2d/feature_generation/main.py @@ -15,6 +15,6 @@ if __name__ == "__main__": init_wandb_and_generate_feature_set( project_info=get_t2d_project_info(), - eligible_prediction_times=T2DCohortDefiner.get_filtered_prediction_times_bundle().prediction_times.to_pandas(), + eligible_prediction_times=T2DCohortDefiner.get_filtered_prediction_times_bundle().prediction_times.frame.to_pandas(), feature_specs=get_t2d_feature_specifications(), ) diff --git a/psycop/projects/t2d/main.py b/psycop/projects/t2d/main.py index b3902849e..1a263ce55 100644 --- a/psycop/projects/t2d/main.py +++ b/psycop/projects/t2d/main.py @@ -15,7 +15,7 @@ if __name__ == "__main__": feature_set_path = init_wandb_and_generate_feature_set( project_info=get_t2d_project_info(), - eligible_prediction_times=T2DCohortDefiner.get_filtered_prediction_times_bundle().prediction_times.to_pandas(), + eligible_prediction_times=T2DCohortDefiner.get_filtered_prediction_times_bundle().prediction_times.frame.to_pandas(), feature_specs=get_t2d_feature_specifications(), ) train_models_in_parallel(dataset_override_path=feature_set_path) diff --git a/psycop/projects/t2d/paper_outputs/dataset_description/table_one/table_one.py b/psycop/projects/t2d/paper_outputs/dataset_description/table_one/table_one.py index 32eb0a850..b2857003d 100644 --- 
a/psycop/projects/t2d/paper_outputs/dataset_description/table_one/table_one.py +++ b/psycop/projects/t2d/paper_outputs/dataset_description/table_one/table_one.py @@ -38,7 +38,7 @@ # %% -pred_times_to_keep_with_uuid = pred_times_to_keep.lazy().with_columns( +pred_times_to_keep_with_uuid = pred_times_to_keep.frame.lazy().with_columns( pred_time_uuid=pl.col("dw_ek_borger").cast(pl.Utf8) + "-" + pl.col("timestamp").dt.strftime(format="%Y-%m-%d-%H-%M-%S"), From c9f5b7b261912a0ca2865d3eea0e867fddb93401 Mon Sep 17 00:00:00 2001 From: sarakolding Date: Tue, 16 Jan 2024 14:07:30 +0000 Subject: [PATCH 2/7] style: auto-fixes from pre-commit --- .../sequences/test_prediction_time_from_cohort.py | 2 -- .../cohort_definition/cancer_cohort_definer.py | 3 +-- .../cohort_definition/clozapine_cohort_definition.py | 12 +++++++----- .../cohort_definition/cvd_cohort_definition.py | 12 +++++++----- .../forced_admissions_inpatient_cohort_definition.py | 4 +++- .../scz_bp_prediction_time_loader.py | 12 +++++++----- .../cohort_definition/t2d_cohort_definer.py | 4 +++- 7 files changed, 28 insertions(+), 21 deletions(-) diff --git a/psycop/common/feature_generation/sequences/test_prediction_time_from_cohort.py b/psycop/common/feature_generation/sequences/test_prediction_time_from_cohort.py index 5b9971331..e231efda5 100644 --- a/psycop/common/feature_generation/sequences/test_prediction_time_from_cohort.py +++ b/psycop/common/feature_generation/sequences/test_prediction_time_from_cohort.py @@ -1,7 +1,5 @@ import datetime as dt -import polars as pl - from psycop.common.cohort_definition import ( CohortDefiner, FilteredPredictionTimeBundle, diff --git a/psycop/projects/cancer/feature_generation/cohort_definition/cancer_cohort_definer.py b/psycop/projects/cancer/feature_generation/cohort_definition/cancer_cohort_definer.py index 615bf0b5d..5b91022fb 100644 --- a/psycop/projects/cancer/feature_generation/cohort_definition/cancer_cohort_definer.py +++ 
b/psycop/projects/cancer/feature_generation/cohort_definition/cancer_cohort_definer.py @@ -3,13 +3,12 @@ from psycop.common.cohort_definition import ( CohortDefiner, FilteredPredictionTimeBundle, + OutcomeTimestampFrame, filter_prediction_times, - OutcomeTimestampFrame ) from psycop.common.feature_generation.loaders.raw.load_visits import ( physical_visits_to_psychiatry, ) - from psycop.projects.cancer.feature_generation.cohort_definition.eligible_prediction_times.single_filters import ( CancerMinAgeFilter, CancerMinDateFilter, diff --git a/psycop/projects/clozapine/feature_generation/cohort_definition/clozapine_cohort_definition.py b/psycop/projects/clozapine/feature_generation/cohort_definition/clozapine_cohort_definition.py index cd08cf2e2..8363d6357 100644 --- a/psycop/projects/clozapine/feature_generation/cohort_definition/clozapine_cohort_definition.py +++ b/psycop/projects/clozapine/feature_generation/cohort_definition/clozapine_cohort_definition.py @@ -47,11 +47,13 @@ def get_filtered_prediction_times_bundle() -> FilteredPredictionTimeBundle: @staticmethod def get_outcome_timestamps() -> OutcomeTimestampFrame: - return OutcomeTimestampFrame(frame=( - pl.from_pandas(get_first_clozapine_prescription()) - .with_columns(value=pl.lit(1)) - .select(["dw_ek_borger", "timestamp", "value"]) - )) + return OutcomeTimestampFrame( + frame=( + pl.from_pandas(get_first_clozapine_prescription()) + .with_columns(value=pl.lit(1)) + .select(["dw_ek_borger", "timestamp", "value"]) + ), + ) if __name__ == "__main__": diff --git a/psycop/projects/cvd/feature_generation/cohort_definition/cvd_cohort_definition.py b/psycop/projects/cvd/feature_generation/cohort_definition/cvd_cohort_definition.py index 2881a19ab..262920b0a 100644 --- a/psycop/projects/cvd/feature_generation/cohort_definition/cvd_cohort_definition.py +++ b/psycop/projects/cvd/feature_generation/cohort_definition/cvd_cohort_definition.py @@ -47,11 +47,13 @@ def get_filtered_prediction_times_bundle() -> 
FilteredPredictionTimeBundle: @staticmethod def get_outcome_timestamps() -> OutcomeTimestampFrame: - return OutcomeTimestampFrame(frame=( - pl.from_pandas(get_first_cvd_indicator()) - .with_columns(value=pl.lit(1)) - .select(["dw_ek_borger", "timestamp", "value"]) - )) + return OutcomeTimestampFrame( + frame=( + pl.from_pandas(get_first_cvd_indicator()) + .with_columns(value=pl.lit(1)) + .select(["dw_ek_borger", "timestamp", "value"]) + ), + ) if __name__ == "__main__": diff --git a/psycop/projects/forced_admission_inpatient/cohort/forced_admissions_inpatient_cohort_definition.py b/psycop/projects/forced_admission_inpatient/cohort/forced_admissions_inpatient_cohort_definition.py index f16c31c28..4cb94f748 100644 --- a/psycop/projects/forced_admission_inpatient/cohort/forced_admissions_inpatient_cohort_definition.py +++ b/psycop/projects/forced_admission_inpatient/cohort/forced_admissions_inpatient_cohort_definition.py @@ -52,7 +52,9 @@ def get_filtered_prediction_times_bundle( @staticmethod def get_outcome_timestamps() -> OutcomeTimestampFrame: - return OutcomeTimestampFrame(frame=pl.from_pandas(forced_admissions_onset_timestamps())) + return OutcomeTimestampFrame( + frame=pl.from_pandas(forced_admissions_onset_timestamps()), + ) if __name__ == "__main__": diff --git a/psycop/projects/scz_bp/feature_generation/eligible_prediction_times/scz_bp_prediction_time_loader.py b/psycop/projects/scz_bp/feature_generation/eligible_prediction_times/scz_bp_prediction_time_loader.py index 4a06f0475..0fb1b3439 100644 --- a/psycop/projects/scz_bp/feature_generation/eligible_prediction_times/scz_bp_prediction_time_loader.py +++ b/psycop/projects/scz_bp/feature_generation/eligible_prediction_times/scz_bp_prediction_time_loader.py @@ -65,11 +65,13 @@ def get_filtered_prediction_times_bundle() -> FilteredPredictionTimeBundle: @staticmethod def get_outcome_timestamps() -> OutcomeTimestampFrame: - return OutcomeTimestampFrame(frame=get_first_scz_or_bp_diagnosis().select( - 
"dw_ek_borger", - "timestamp", - "value", - )) + return OutcomeTimestampFrame( + frame=get_first_scz_or_bp_diagnosis().select( + "dw_ek_borger", + "timestamp", + "value", + ), + ) @staticmethod def _get_filtering_steps() -> Iterable[PredictionTimeFilter]: diff --git a/psycop/projects/t2d/feature_generation/cohort_definition/t2d_cohort_definer.py b/psycop/projects/t2d/feature_generation/cohort_definition/t2d_cohort_definer.py index 36b66b59c..60081d253 100644 --- a/psycop/projects/t2d/feature_generation/cohort_definition/t2d_cohort_definer.py +++ b/psycop/projects/t2d/feature_generation/cohort_definition/t2d_cohort_definer.py @@ -54,7 +54,9 @@ def get_filtered_prediction_times_bundle() -> FilteredPredictionTimeBundle: @staticmethod def get_outcome_timestamps() -> OutcomeTimestampFrame: - return OutcomeTimestampFrame(frame=pl.from_pandas(get_first_diabetes_indicator())) + return OutcomeTimestampFrame( + frame=pl.from_pandas(get_first_diabetes_indicator()), + ) if __name__ == "__main__": From 6048d8aae211b08d06529963459fa2f97146afa4 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Wed, 17 Jan 2024 08:41:36 +0000 Subject: [PATCH 3/7] fix: runtime_checkable --- psycop/common/cohort_definition.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/psycop/common/cohort_definition.py b/psycop/common/cohort_definition.py index c5017b0e0..0270544de 100644 --- a/psycop/common/cohort_definition.py +++ b/psycop/common/cohort_definition.py @@ -82,7 +82,8 @@ def n_dropped_ids(self) -> int: return self.n_ids_before - self.n_ids_after -class FilteredPredictionTimeBundle(PSYCOPBaseModel): +@dataclass(frozen=True) +class FilteredPredictionTimeBundle: prediction_times: PredictionTimeFrame filter_steps: list[StepDelta] From 51e0a017658d3cf90d961a28a238821de06402ae Mon Sep 17 00:00:00 2001 From: sarakolding Date: Wed, 17 Jan 2024 09:00:34 +0000 Subject: [PATCH 4/7] fix tests --- psycop/common/test_cohort_definition.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/psycop/common/test_cohort_definition.py b/psycop/common/test_cohort_definition.py index 87672d550..565ea2980 100644 --- a/psycop/common/test_cohort_definition.py +++ b/psycop/common/test_cohort_definition.py @@ -9,7 +9,7 @@ def test_filter_prediction_times(): prediction_times = str_to_pl_df( """ - entity_id, timestamp, + dw_ek_borger, timestamp, 1, 2020-01-01, 1, 2019-01-01, # Filtered because of timestamp in filter 1 1, 2018-01-01, # Filtered because of timestamp in filter 2 From 5a0de52fda36d030165b73a4b115f728adf08a2a Mon Sep 17 00:00:00 2001 From: sarakolding Date: Wed, 17 Jan 2024 09:15:00 +0000 Subject: [PATCH 5/7] change test directory back --- .vscode/settings.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 7e3020ecb..114b28c0d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,7 +1,7 @@ { "python.analysis.typeCheckingMode": "strict", "python.testing.pytestArgs": [ - "." 
+ "psycop" ], "githubIssues.queries": [ { From a588f3c997769674a3d9ff20ba6bd16dec5c0560 Mon Sep 17 00:00:00 2001 From: sarakolding Date: Wed, 17 Jan 2024 14:23:49 +0000 Subject: [PATCH 6/7] implemented requested changes --- psycop/common/cohort_definition.py | 4 ---- psycop/common/test_cohort_definition.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/psycop/common/cohort_definition.py b/psycop/common/cohort_definition.py index 0270544de..093a56596 100644 --- a/psycop/common/cohort_definition.py +++ b/psycop/common/cohort_definition.py @@ -23,13 +23,11 @@ class PredictionTimeFrame(ValidatedFrame[pl.DataFrame]): entity_id_col_name: str = "dw_ek_borger" entity_id_col_rules: Sequence[ValidatorRule] = ( - ColumnExistsRule(), ColumnTypeRule(expected_type=pl.Int64), ) timestamp_col_name: str = "timestamp" timestamp_col_rules: Sequence[ValidatorRule] = ( - ColumnExistsRule(), ColumnTypeRule(expected_type=pl.Datetime), ) @@ -44,13 +42,11 @@ class OutcomeTimestampFrame(ValidatedFrame[pl.DataFrame]): entity_id_col_name: str = "dw_ek_borger" entity_id_col_rules: Sequence[ValidatorRule] = ( - ColumnExistsRule(), ColumnTypeRule(expected_type=pl.Int64), ) timestamp_col_name: str = "timestamp" timestamp_col_rules: Sequence[ValidatorRule] = ( - ColumnExistsRule(), ColumnTypeRule(expected_type=pl.Datetime), ) diff --git a/psycop/common/test_cohort_definition.py b/psycop/common/test_cohort_definition.py index 565ea2980..501e382b7 100644 --- a/psycop/common/test_cohort_definition.py +++ b/psycop/common/test_cohort_definition.py @@ -33,7 +33,7 @@ def apply(self, df: pl.LazyFrame) -> pl.LazyFrame: RemoveYear(dt.strptime("2018", "%Y")), RemoveYear(dt.strptime("2019", "%Y")), ], - entity_id_col_name="entity_id", + entity_id_col_name="dw_ek_borger", ) assert len(filtered.prediction_times.frame) == 1 From 643db4292f86bdbfb88ee1ae0abf6e4c9a5ae026 Mon Sep 17 00:00:00 2001 From: sarakolding Date: Wed, 17 Jan 2024 14:23:51 +0000 Subject: [PATCH 7/7] style: auto-fixes from 
pre-commit --- psycop/common/cohort_definition.py | 1 - 1 file changed, 1 deletion(-) diff --git a/psycop/common/cohort_definition.py b/psycop/common/cohort_definition.py index 093a56596..2e730e10b 100644 --- a/psycop/common/cohort_definition.py +++ b/psycop/common/cohort_definition.py @@ -9,7 +9,6 @@ from psycop.common.global_utils.pydantic_basemodel import PSYCOPBaseModel from psycop.common.types.validated_frame import ValidatedFrame from psycop.common.types.validator_rules import ( - ColumnExistsRule, ColumnTypeRule, ValidatorRule, )