Skip to content

Commit

Permalink
feat: extract runs to functions, to avoid instantiation on import
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinBernstorff committed Oct 18, 2023
1 parent 80d00d0 commit afc94cb
Show file tree
Hide file tree
Showing 33 changed files with 78 additions and 66 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,6 @@ def generate_feature_importance_table(pipeline_run: PipelineRun) -> pl.DataFrame

if __name__ == "__main__":
top_100_features = generate_feature_importance_table(
pipeline_run=get_best_eval_pipeline,
pipeline_run=get_best_eval_pipeline(),
)
pass
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ def fa_auroc_plot(run: PipelineRun) -> pn.ggplot:


if __name__ == "__main__":
fa_auroc_plot(run=get_best_eval_pipeline)
fa_auroc_plot(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,4 @@ def fa_confusion_matrix_plot(run: PipelineRun) -> pn.ggplot:


if __name__ == "__main__":
fa_confusion_matrix_plot(run=get_best_eval_pipeline)
fa_confusion_matrix_plot(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,4 @@ def fa_output_performance_by_ppr(run: PipelineRun):
get_best_eval_pipeline,
)

fa_output_performance_by_ppr(run=get_best_eval_pipeline)
fa_output_performance_by_ppr(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,4 @@ def fa_sensitivity_by_time_to_event(pipeline_run: PipelineRun) -> pn.ggplot:
get_best_eval_pipeline,
)

fa_sensitivity_by_time_to_event(pipeline_run=get_best_eval_pipeline)
fa_sensitivity_by_time_to_event(pipeline_run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,21 @@
PipelineRun,
)

get_best_dev_pipeline = PipelineRun(
group=DEVELOPMENT_GROUP,
name=DEVELOPMENT_GROUP.get_best_runs_by_lookahead()[
0,
2,
], # [0,2] for best logistic regression and [1,2] for best xgboost
pos_rate=BEST_POS_RATE,
create_output_paths_on_init=False,
)

get_best_eval_pipeline = test_selected_model_pipeline(
pipeline_to_test=get_best_dev_pipeline,
datasets_for_evaluation=["val_with_washout"],
)
def get_best_dev_pipeline() -> PipelineRun:
    """Build the ``PipelineRun`` for the selected development-group run.

    The run is constructed on every call instead of at module import, so
    importing this module does not instantiate anything.

    Returns:
        A ``PipelineRun`` in ``DEVELOPMENT_GROUP`` whose name is taken from
        ``DEVELOPMENT_GROUP.get_best_runs_by_lookahead()`` at index ``[0, 2]``,
        using ``BEST_POS_RATE`` and with ``create_output_paths_on_init=False``.
    """
    # [0,2] for best logistic regression and [1,2] for best xgboost
    best_run_name = DEVELOPMENT_GROUP.get_best_runs_by_lookahead()[0, 2]

    return PipelineRun(
        group=DEVELOPMENT_GROUP,
        name=best_run_name,
        pos_rate=BEST_POS_RATE,
        create_output_paths_on_init=False,
    )


def get_best_eval_pipeline() -> PipelineRun:
    """Return the evaluation run derived from the best development pipeline.

    A fresh development pipeline is obtained from ``get_best_dev_pipeline()``
    and handed to ``test_selected_model_pipeline`` together with the
    ``"val_with_washout"`` evaluation dataset. Nothing is computed at import
    time; the work happens each time this function is called.

    Returns:
        Whatever ``test_selected_model_pipeline`` returns for that pipeline
        (annotated as ``PipelineRun``).
    """
    dev_pipeline = get_best_dev_pipeline()
    evaluation_datasets = ["val_with_washout"]

    return test_selected_model_pipeline(
        pipeline_to_test=dev_pipeline,
        datasets_for_evaluation=evaluation_datasets,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,20 @@

model_train_df = pl.concat(
[
get_best_eval_pipeline.inputs.get_flattened_split_as_lazyframe(split="train"),
get_best_eval_pipeline().inputs.get_flattened_split_as_lazyframe(split="train"),
],
how="vertical",
).with_columns(dataset=pl.format("0. train"))


val_dataset = get_best_eval_pipeline.inputs.get_flattened_split_as_lazyframe(
split="val",
).with_columns(
dataset=pl.format("val"),
val_dataset = (
get_best_eval_pipeline()
.inputs.get_flattened_split_as_lazyframe(
split="val",
)
.with_columns(
dataset=pl.format("val"),
)
)

flattened_combined = pl.concat([model_train_df, val_dataset], how="vertical").rename(
Expand Down Expand Up @@ -156,9 +160,9 @@
############
combined = pd.concat([visit_table_one, patient_table_one])

get_best_eval_pipeline.paper_outputs.paths.tables.mkdir(parents=True, exist_ok=True)
get_best_eval_pipeline().paper_outputs.paths.tables.mkdir(parents=True, exist_ok=True)
combined.to_csv(
get_best_eval_pipeline.paper_outputs.paths.tables / "descriptive_stats_table.csv",
get_best_eval_pipeline().paper_outputs.paths.tables / "descriptive_stats_table.csv",
)

# %%
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def full_eval_for_supplementary(
)

# Do not add the main pipeline's eval to supplementary
if run.name != get_best_eval_pipeline.name:
if run.name != get_best_eval_pipeline().name:
artifacts += run_artifacts

combined_supplementary_md = create_supplementary_from_markdown_artifacts(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@
)
from psycop.projects.t2d.paper_outputs.selected_runs import get_best_eval_pipeline

out_dir = get_best_eval_pipeline.paper_outputs.paths.tables / "feature_description"
out_dir = get_best_eval_pipeline().paper_outputs.paths.tables / "feature_description"
out_dir.mkdir(parents=True, exist_ok=True)

df = generate_feature_description_df(
df=get_best_eval_pipeline.inputs.get_flattened_split_as_pd(split="train"),
df=get_best_eval_pipeline().inputs.get_flattened_split_as_pd(split="train"),
predictor_specs=selected_specs, # type: ignore
)

Expand Down Expand Up @@ -89,7 +89,7 @@ def prettify_feature_description_df(
from psycop.projects.t2d.paper_outputs.selected_runs import get_best_eval_pipeline

predictor_description_path = (
get_best_eval_pipeline.paper_outputs.paths.tables / "predictor_description.csv"
get_best_eval_pipeline().paper_outputs.paths.tables / "predictor_description.csv"
)

prettified.to_csv(predictor_description_path)
Expand Down Expand Up @@ -119,7 +119,7 @@ def prettify_feature_description_df(
).get_markdown()

with (
get_best_eval_pipeline.paper_outputs.paths.tables / "predictor_description.md"
get_best_eval_pipeline().paper_outputs.paths.tables / "predictor_description.md"
).open(
"+w",
) as f:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def get_dataset(self, run: T2DPipelineRun) -> pl.DataFrame:
)

lookahead_days = (
get_best_eval_pipeline.inputs.cfg.preprocessing.pre_split.min_lookahead_days
get_best_eval_pipeline().inputs.cfg.preprocessing.pre_split.min_lookahead_days
)

hba1c_timestamps = hba1c()
Expand Down Expand Up @@ -239,8 +239,8 @@ def get_plot(self, run: T2DPipelineRun) -> pn.ggplot:


if __name__ == "__main__":
pipeline = get_best_eval_pipeline
plot = MeasurementsWithinLookaheadPlot().get_plot(run=get_best_eval_pipeline)
pipeline = get_best_eval_pipeline()
plot = MeasurementsWithinLookaheadPlot().get_plot(run=get_best_eval_pipeline())
size = (6.5, 8)

plot.save(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@
# %%
from psycop.projects.t2d.paper_outputs.selected_runs import get_best_eval_pipeline

get_best_eval_pipeline.paper_outputs.paths.figures.mkdir(parents=True, exist_ok=True)
get_best_eval_pipeline().paper_outputs.paths.figures.mkdir(parents=True, exist_ok=True)

save_path = (
get_best_eval_pipeline.paper_outputs.paths.figures
get_best_eval_pipeline().paper_outputs.paths.figures
/ "diabetes_incidence_by_time.png"
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
msg = Printer(timestamp=True)

if __name__ == "__main__":
pipeline_inputs = get_best_eval_pipeline.inputs
pipeline_inputs = get_best_eval_pipeline().inputs

flattened_dataset = pl.concat(
[
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,20 @@

model_train_df = pl.concat(
[
get_best_eval_pipeline.inputs.get_flattened_split_as_lazyframe(split="train"),
get_best_eval_pipeline.inputs.get_flattened_split_as_lazyframe(split="val"),
get_best_eval_pipeline().inputs.get_flattened_split_as_lazyframe(split="train"),
get_best_eval_pipeline().inputs.get_flattened_split_as_lazyframe(split="val"),
],
how="vertical",
).with_columns(dataset=pl.format("0. train"))

test_dataset = get_best_eval_pipeline.inputs.get_flattened_split_as_lazyframe(
split="test",
).with_columns(
dataset=pl.format("test"),
test_dataset = (
get_best_eval_pipeline()
.inputs.get_flattened_split_as_lazyframe(
split="test",
)
.with_columns(
dataset=pl.format("test"),
)
)

flattened_combined = pl.concat([model_train_df, test_dataset], how="vertical").rename(
Expand Down Expand Up @@ -209,9 +213,9 @@
############
combined = pd.concat([visit_table_one, patient_table_one])

get_best_eval_pipeline.paper_outputs.paths.tables.mkdir(parents=True, exist_ok=True)
get_best_eval_pipeline().paper_outputs.paths.tables.mkdir(parents=True, exist_ok=True)
combined.to_csv(
get_best_eval_pipeline.paper_outputs.paths.tables / "descriptive_stats_table.csv",
get_best_eval_pipeline().paper_outputs.paths.tables / "descriptive_stats_table.csv",
)

# %%
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def time_from_first_pos_pred_to_next_hba1c(
if __name__ == "__main__":
from psycop.projects.t2d.paper_outputs.selected_runs import get_best_eval_pipeline

pipeline = get_best_eval_pipeline
pipeline = get_best_eval_pipeline()
eval_ds = pipeline.pipeline_outputs.get_eval_dataset()

positive_predictions = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,6 @@ def generate_feature_importance_table(pipeline_run: T2DPipelineRun) -> pl.DataFr

if __name__ == "__main__":
top_100_features = generate_feature_importance_table(
pipeline_run=get_best_eval_pipeline,
pipeline_run=get_best_eval_pipeline(),
)
pass
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def get_top_i_features_by_mean_abs_shap(
from psycop.projects.t2d.paper_outputs.selected_runs import get_best_eval_pipeline

shap_bundle = get_shap_bundle_for_best_run(
run=get_best_eval_pipeline,
run=get_best_eval_pipeline(),
n_rows=1_000,
cache_ver=0.1,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
plotting_df = pl.from_pandas(long_shap_df)

shap_figures_path = (
get_best_eval_pipeline.paper_outputs.paths.figures / "shap_plot.png"
get_best_eval_pipeline().paper_outputs.paths.figures / "shap_plot.png"
)
# NOTE(review): shap_figures_path ends in "shap_plot.png", so this call creates
# a *directory* with that name. If the path is meant to be the output file,
# the parent directory is probably what should be created — TODO confirm
# against how shap_figures_path is used further down.
shap_figures_path.mkdir(exist_ok=True, parents=True)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ def t2d_auroc_plot(run: T2DPipelineRun) -> pn.ggplot:
if __name__ == "__main__":
from psycop.projects.t2d.paper_outputs.selected_runs import get_best_eval_pipeline

t2d_auroc_plot(run=get_best_eval_pipeline)
t2d_auroc_plot(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,4 @@ def t2d_confusion_matrix_plot(run: T2DPipelineRun) -> pn.ggplot:
if __name__ == "__main__":
from psycop.projects.t2d.paper_outputs.selected_runs import get_best_eval_pipeline

t2d_confusion_matrix_plot(run=get_best_eval_pipeline)
t2d_confusion_matrix_plot(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ def t2d_first_pred_to_event(run: T2DPipelineRun) -> pn.ggplot:
if __name__ == "__main__":
from psycop.projects.t2d.paper_outputs.selected_runs import get_best_eval_pipeline

t2d_first_pred_to_event(run=get_best_eval_pipeline).save(
get_best_eval_pipeline.paper_outputs.paths.figures
t2d_first_pred_to_event(run=get_best_eval_pipeline()).save(
get_best_eval_pipeline().paper_outputs.paths.figures
/ "time_from_pred_to_event.png",
width=5,
height=5,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@ def t2d_create_main_performance_figure(run: T2DPipelineRun) -> None:
if __name__ == "__main__":
from psycop.projects.t2d.paper_outputs.selected_runs import get_best_eval_pipeline

t2d_create_main_performance_figure(run=get_best_eval_pipeline)
t2d_create_main_performance_figure(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,4 @@ def t2d_output_performance_by_ppr(run: T2DPipelineRun):
if __name__ == "__main__":
from psycop.projects.t2d.paper_outputs.selected_runs import get_best_eval_pipeline

t2d_output_performance_by_ppr(run=get_best_eval_pipeline)
t2d_output_performance_by_ppr(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -103,4 +103,4 @@ def t2d_sensitivity_by_time_to_event(run: T2DPipelineRun) -> pn.ggplot:
if __name__ == "__main__":
from psycop.projects.t2d.paper_outputs.selected_runs import get_best_eval_pipeline

t2d_sensitivity_by_time_to_event(run=get_best_eval_pipeline)
t2d_sensitivity_by_time_to_event(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ def t2d_create_main_robustness_figure(run: T2DPipelineRun) -> None:


if __name__ == "__main__":
t2d_create_main_robustness_figure(run=get_best_eval_pipeline)
t2d_create_main_robustness_figure(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@ def t2d_auroc_by_age(run: T2DPipelineRun) -> pn.ggplot:


if __name__ == "__main__":
t2d_auroc_by_age(run=get_best_eval_pipeline)
t2d_auroc_by_age(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@ def t2d_auroc_by_quarter(run: T2DPipelineRun) -> pn.ggplot:


if __name__ == "__main__":
t2d_auroc_by_quarter(run=get_best_eval_pipeline)
t2d_auroc_by_quarter(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,5 @@ def t2d_auroc_by_month_of_year(run: T2DPipelineRun) -> pn.ggplot:


if __name__ == "__main__":
t2d_auroc_by_day_of_week(run=get_best_eval_pipeline)
t2d_auroc_by_month_of_year(run=get_best_eval_pipeline)
t2d_auroc_by_day_of_week(run=get_best_eval_pipeline())
t2d_auroc_by_month_of_year(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@ def t2d_auroc_by_n_hba1c(


if __name__ == "__main__":
t2d_auroc_by_n_hba1c(run=get_best_eval_pipeline)
t2d_auroc_by_n_hba1c(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@ def t2d_auroc_by_sex(run: T2DPipelineRun) -> pn.ggplot:


if __name__ == "__main__":
t2d_auroc_by_sex(run=get_best_eval_pipeline)
t2d_auroc_by_sex(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,4 @@ def t2d_auroc_by_time_from_first_visit(run: T2DPipelineRun) -> pn.ggplot:


if __name__ == "__main__":
t2d_auroc_by_time_from_first_visit(run=get_best_eval_pipeline)
t2d_auroc_by_time_from_first_visit(run=get_best_eval_pipeline())
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def modify_features(
)

evaluate_pipeline_with_modified_dataset(
run=get_best_eval_pipeline,
run=get_best_eval_pipeline(),
feature_modifier=CreateBooleanDataset(),
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from psycop.projects.t2d.paper_outputs.selected_runs import get_best_eval_pipeline

if __name__ == "__main__":
evaluation_dataset = get_best_eval_pipeline.pipeline_outputs.get_eval_dataset()
evaluation_dataset = get_best_eval_pipeline().pipeline_outputs.get_eval_dataset()

eval_df = pd.DataFrame(
{
Expand Down
2 changes: 1 addition & 1 deletion psycop/projects/t2d/paper_outputs/run_pipeline_on_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,4 @@ def test_pipeline(
if __name__ == "__main__":
from psycop.projects.t2d.paper_outputs.selected_runs import get_best_dev_pipeline

eval_run = test_pipeline(pipeline_to_test=get_best_dev_pipeline)
eval_run = test_pipeline(pipeline_to_test=get_best_dev_pipeline())

0 comments on commit afc94cb

Please sign in to comment.