From b242b05b14990856b2fd683507ae935cbeb0d12f Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 11:11:14 +0100 Subject: [PATCH 01/21] misc. --- .../cvd/model_training/cvd_baseline.cfg | 46 +++++++++++++++++++ .../cvd/model_training/train_model_e2e_v2.py | 13 ++++++ 2 files changed, 59 insertions(+) create mode 100644 psycop/projects/cvd/model_training/cvd_baseline.cfg create mode 100644 psycop/projects/cvd/model_training/train_model_e2e_v2.py diff --git a/psycop/projects/cvd/model_training/cvd_baseline.cfg b/psycop/projects/cvd/model_training/cvd_baseline.cfg new file mode 100644 index 000000000..7bc08ac43 --- /dev/null +++ b/psycop/projects/cvd/model_training/cvd_baseline.cfg @@ -0,0 +1,46 @@ +[project_info] +experiment_path = /. + +[logger] +@loggers = "terminal_logger" + +[trainer] +@trainers = "crossval_trainer" +outcome_col_name = "outcome" +n_splits = 5 +group_col_name = "dw_ek_borger" + +[trainer.metric] +@metrics = "binary_auroc" + +[trainer.training_data] +@data = "minimal_test_data" + +[trainer.validation_data] +@data = "minimal_test_data" + +[trainer.logger] +@loggers = "terminal_logger" + +[trainer.preprocessing_pipeline] +@preprocessing = "baseline_preprocessing_pipeline" + +[trainer.preprocessing_pipeline.*.age_filter] +@preprocessing = "age_filter" +min_age = 0 +max_age = 99 +age_col_name = "pred_age" + +[trainer.task] +@tasks = "binary_classification" +pred_time_uuid_col_name = "pred_time_uuid" + +[trainer.task.task_pipe] +@task_pipelines = "binary_classification_pipeline" + +[trainer.task.task_pipe.sklearn_pipe] +@task_pipelines = "pipe_constructor" + +[trainer.task.task_pipe.sklearn_pipe.*.logistic_regression] +@estimator_steps = "logistic_regression" + diff --git a/psycop/projects/cvd/model_training/train_model_e2e_v2.py b/psycop/projects/cvd/model_training/train_model_e2e_v2.py new file mode 100644 index 000000000..a290fefce --- /dev/null +++ b/psycop/projects/cvd/model_training/train_model_e2e_v2.py @@ -0,0 +1,13 @@ +from pathlib import Path + +from psycop.common.model_training_v2.config.baseline_pipeline import ( + train_baseline_model, +) +from psycop.common.model_training_v2.config.config_utils import ( + load_baseline_config, +) + +if __name__ == "__main__": + config = load_baseline_config(Path(__file__).parent / "cvd_baseline.cfg") + train_baseline_model(config) + From 6a47c600badfddfc43b9bb4fd0ebb186677eb972 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 11:17:30 +0100 Subject: [PATCH 02/21] misc. --- .../cvd/model_training/cvd_baseline.cfg | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/psycop/projects/cvd/model_training/cvd_baseline.cfg b/psycop/projects/cvd/model_training/cvd_baseline.cfg index 7bc08ac43..ccc50cb14 100644 --- a/psycop/projects/cvd/model_training/cvd_baseline.cfg +++ b/psycop/projects/cvd/model_training/cvd_baseline.cfg @@ -22,18 +22,32 @@ group_col_name = "dw_ek_borger" [trainer.logger] @loggers = "terminal_logger" +################# +# Preprocessing # +################# [trainer.preprocessing_pipeline] @preprocessing = "baseline_preprocessing_pipeline" +[trainer.preprocessing_pipeline.*.col_validator] +@preprocessing = "column_exists_validator" +age = "pred_age" +pred_time_uuid = "pred_time_uuid" + +[trainer.preprocessing_pipeline.*.prefix_count_validator] +outcome_prefix = ["outc_", 1] + [trainer.preprocessing_pipeline.*.age_filter] @preprocessing = "age_filter" min_age = 0 max_age = 99 -age_col_name = "pred_age" - +age_col_name = ${trainer.preprocessing_pipeline.*.col_validator.age} + +######## +# Task # +######## [trainer.task] @tasks = "binary_classification" -pred_time_uuid_col_name = "pred_time_uuid" +age_col_name = ${trainer.preprocessing_pipeline.*.col_validator.pred_time_uuid} [trainer.task.task_pipe] @task_pipelines = "binary_classification_pipeline" From 01fe0e76b62f20c0bf80f7628e7c49e0ab950aed Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 11:18:50 +0100 Subject: [PATCH 03/21] docs: add todo --- psycop/common/model_training_v2/config/baseline_pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/psycop/common/model_training_v2/config/baseline_pipeline.py b/psycop/common/model_training_v2/config/baseline_pipeline.py index ce384525d..1cb523d1c 100644 --- a/psycop/common/model_training_v2/config/baseline_pipeline.py +++ b/psycop/common/model_training_v2/config/baseline_pipeline.py @@ -8,5 +8,6 @@ def train_baseline_model(cfg: BaselineSchema) -> float: result = cfg.trainer.train() result.df.write_parquet(cfg.project_info.experiment_path / "eval_df.parquet") + # TODO: https://github.com/Aarhus-Psychiatry-Research/psycop-common/issues/447 Allow dynamic generation of experiments paths return result.metric.value From 607cf3ae0690c7fc312899d467aab0509bb5c923 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 11:35:39 +0100 Subject: [PATCH 04/21] feat: add vertical concatenator --- .../cvd/model_training/cvd_baseline.cfg | 7 ++-- .../data_loader/trainval_loader.py | 35 +++++++++++++++++++ .../model_training/populate_cvd_registry.py | 6 ++++ 3 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 psycop/projects/cvd/model_training/data_loader/trainval_loader.py create mode 100644 psycop/projects/cvd/model_training/populate_cvd_registry.py diff --git a/psycop/projects/cvd/model_training/cvd_baseline.cfg b/psycop/projects/cvd/model_training/cvd_baseline.cfg index ccc50cb14..20d12c798 100644 --- a/psycop/projects/cvd/model_training/cvd_baseline.cfg +++ b/psycop/projects/cvd/model_training/cvd_baseline.cfg @@ -13,11 +13,8 @@ group_col_name = "dw_ek_borger" [trainer.metric] @metrics = "binary_auroc" -[trainer.training_data] -@data = "minimal_test_data" - -[trainer.validation_data] -@data = "minimal_test_data" +[trainer.data] +@training_data = "minimal_test_data" [trainer.logger] @loggers = "terminal_logger" diff --git a/psycop/projects/cvd/model_training/data_loader/trainval_loader.py b/psycop/projects/cvd/model_training/data_loader/trainval_loader.py new file mode 100644 index 000000000..3a327246c --- /dev/null +++ b/psycop/projects/cvd/model_training/data_loader/trainval_loader.py @@ -0,0 +1,35 @@ +from pathlib import Path +import polars as pl +from functionalpy import Seq + +from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry +from psycop.common.model_training_v2.trainer.base_dataloader import BaselineDataLoader + +class MissingPathError(Exception): + ... + + +@BaselineRegistry.data.register("parquet_vertical_concatenator") +class ParquetVerticalConcatenator(BaselineDataLoader): + def __init__(self, *args: str): + self.dataset_paths = [Path(arg) for arg in args] + + missing_paths = Seq(self.dataset_paths).map(self._check_path_exists).flatten() + if missing_paths: + raise MissingPathError("""The following paths are missing: + {missing_paths} + """) + + def _check_path_exists(self, path: Path) -> list[MissingPathError] + if not path.exists(): + return [MissingPathError(path)] + + return [] + + def load(self) -> pl.LazyFrame: + return pl.concat(how="vertical", items= + [ + pl.scan_parquet(path) + for path in self.dataset_paths + ], + ) diff --git a/psycop/projects/cvd/model_training/populate_cvd_registry.py b/psycop/projects/cvd/model_training/populate_cvd_registry.py new file mode 100644 index 000000000..240e2d22a --- /dev/null +++ b/psycop/projects/cvd/model_training/populate_cvd_registry.py @@ -0,0 +1,6 @@ +# ruff: noqa + +def populate_with_cvd_registry() -> None: + from psycop.projects.cvd.model_training.data_loader.trainval_loader import ParquetVerticalConcatenator + +populate_with_cvd_registry() \ No newline at end of file From fc21f455b3e38c344dd408a0a78057264247add1 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 11:37:56 +0100 Subject: [PATCH 05/21] cfg: add validation paths --- psycop/projects/cvd/model_training/cvd_baseline.cfg | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/psycop/projects/cvd/model_training/cvd_baseline.cfg b/psycop/projects/cvd/model_training/cvd_baseline.cfg index 20d12c798..dbe4fe51f 100644 --- a/psycop/projects/cvd/model_training/cvd_baseline.cfg +++ b/psycop/projects/cvd/model_training/cvd_baseline.cfg @@ -14,7 +14,9 @@ group_col_name = "dw_ek_borger" @metrics = "binary_auroc" [trainer.data] -@training_data = "minimal_test_data" +@training_data = "parquet_vertical_concatenator" +train_path = "E:/shared_resources/cvd/e2e_base_test/flattened_datasets/train.parquet" +val_path = "E:/shared_resources/cvd/e2e_base_test/flattened_datasets/val.parquet" [trainer.logger] @loggers = "terminal_logger" From 20329073a857834aeaea054c4959a69f746ea9c8 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 12:15:38 +0100 Subject: [PATCH 06/21] misc. --- .../config/baseline_pipeline.py | 1 + .../preprocessing/steps/column_validator.py | 3 +- .../cvd/model_training/cvd_baseline.cfg | 30 ++++++++++++------- .../data_loader/trainval_loader.py | 13 ++++---- .../model_training/populate_cvd_registry.py | 2 ++ .../preprocessing/regex_filter.py | 24 +++++++++++++++ .../cvd/model_training/train_model_e2e_v2.py | 5 +++- 7 files changed, 61 insertions(+), 17 deletions(-) create mode 100644 psycop/projects/cvd/model_training/preprocessing/regex_filter.py diff --git a/psycop/common/model_training_v2/config/baseline_pipeline.py b/psycop/common/model_training_v2/config/baseline_pipeline.py index 1cb523d1c..46c971e7f 100644 --- a/psycop/common/model_training_v2/config/baseline_pipeline.py +++ b/psycop/common/model_training_v2/config/baseline_pipeline.py @@ -5,6 +5,7 @@ def train_baseline_model(cfg: BaselineSchema) -> float: cfg.logger.log_config( cfg.dict(), ) # Dict handling, might have to be flattened depending on the logger. Probably want all loggers to take flattened dicts. + # TODO: Currently logs the resolved objects. We want to fix that. result = cfg.trainer.train() result.df.write_parquet(cfg.project_info.experiment_path / "eval_df.parquet") diff --git a/psycop/common/model_training_v2/trainer/preprocessing/steps/column_validator.py b/psycop/common/model_training_v2/trainer/preprocessing/steps/column_validator.py index 1d51bcec2..2a75ae0d8 100644 --- a/psycop/common/model_training_v2/trainer/preprocessing/steps/column_validator.py +++ b/psycop/common/model_training_v2/trainer/preprocessing/steps/column_validator.py @@ -1,3 +1,4 @@ +from collections.abc import Sequence from dataclasses import dataclass import polars as pl @@ -76,7 +77,7 @@ def from_list( return cls(prefix=prefix, count=count) # type: ignore -@BaselineRegistry.preprocessing.register("prefix_count_validator") +@BaselineRegistry.preprocessing.register("column_prefix_count_expectation") class ColumnPrefixExpectation(PresplitStep): def __init__( self, diff --git a/psycop/projects/cvd/model_training/cvd_baseline.cfg b/psycop/projects/cvd/model_training/cvd_baseline.cfg index dbe4fe51f..82d8eeeeb 100644 --- a/psycop/projects/cvd/model_training/cvd_baseline.cfg +++ b/psycop/projects/cvd/model_training/cvd_baseline.cfg @@ -13,10 +13,9 @@ group_col_name = "dw_ek_borger" [trainer.metric] @metrics = "binary_auroc" -[trainer.data] -@training_data = "parquet_vertical_concatenator" -train_path = "E:/shared_resources/cvd/e2e_base_test/flattened_datasets/train.parquet" -val_path = "E:/shared_resources/cvd/e2e_base_test/flattened_datasets/val.parquet" +[trainer.training_data] +@data = "parquet_vertical_concatenator" +paths = ["E:/shared_resources/cvd/e2e_base_test/flattened_datasets/train.parquet", "E:/shared_resources/cvd/e2e_base_test/flattened_datasets/val.parquet"] [trainer.logger] @loggers = "terminal_logger" @@ -27,26 +26,37 @@ val_path = "E:/shared_resources/cvd/e2e_base_test/flattened_datasets/val.parquet [trainer.preprocessing_pipeline] @preprocessing = "baseline_preprocessing_pipeline" -[trainer.preprocessing_pipeline.*.col_validator] +[trainer.preprocessing_pipeline.*.columns_exist] @preprocessing = "column_exists_validator" -age = "pred_age" -pred_time_uuid = "pred_time_uuid" -[trainer.preprocessing_pipeline.*.prefix_count_validator] +[trainer.preprocessing_pipeline.*.columns_exist.*] +age = "pred_age_in_years" +pred_time_uuid = "prediction_time_uuid" + +[trainer.preprocessing_pipeline.*.regex_column_blacklist] +@preprocessing = "regex_column_blacklist" + +[trainer.preprocessing_pipeline.*.regex_column_blacklist.*] +outcome = "^outc_.+(?!.*1825.*).*" + +[trainer.preprocessing_pipeline.*.column_prefix_count_expectation] +@preprocessing = "column_prefix_count_expectation" + +[trainer.preprocessing_pipeline.*.column_prefix_count_expectation.*] outcome_prefix = ["outc_", 1] [trainer.preprocessing_pipeline.*.age_filter] @preprocessing = "age_filter" min_age = 0 max_age = 99 -age_col_name = ${trainer.preprocessing_pipeline.*.col_validator.age} +age_col_name = ${trainer.preprocessing_pipeline.*.columns_exist.*.age} ######## # Task # ######## [trainer.task] @tasks = "binary_classification" -age_col_name = ${trainer.preprocessing_pipeline.*.col_validator.pred_time_uuid} +pred_time_uuid_col_name = ${trainer.preprocessing_pipeline.*.columns_exist.*.pred_time_uuid} [trainer.task.task_pipe] @task_pipelines = "binary_classification_pipeline" diff --git a/psycop/projects/cvd/model_training/data_loader/trainval_loader.py b/psycop/projects/cvd/model_training/data_loader/trainval_loader.py index 3a327246c..b7f4b1112 100644 --- a/psycop/projects/cvd/model_training/data_loader/trainval_loader.py +++ b/psycop/projects/cvd/model_training/data_loader/trainval_loader.py @@ -1,26 +1,29 @@ +from collections.abc import Sequence from pathlib import Path + import polars as pl from functionalpy import Seq from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry from psycop.common.model_training_v2.trainer.base_dataloader import BaselineDataLoader + class MissingPathError(Exception): ... @BaselineRegistry.data.register("parquet_vertical_concatenator") class ParquetVerticalConcatenator(BaselineDataLoader): - def __init__(self, *args: str): - self.dataset_paths = [Path(arg) for arg in args] + def __init__(self, paths: Sequence[str]): + self.dataset_paths = [Path(arg) for arg in paths] - missing_paths = Seq(self.dataset_paths).map(self._check_path_exists).flatten() + missing_paths = Seq(self.dataset_paths).map(self._check_path_exists).flatten().to_list() if missing_paths: - raise MissingPathError("""The following paths are missing: + raise MissingPathError(f"""The following paths are missing: {missing_paths} """) - def _check_path_exists(self, path: Path) -> list[MissingPathError] + def _check_path_exists(self, path: Path) -> list[MissingPathError]: if not path.exists(): return [MissingPathError(path)] diff --git a/psycop/projects/cvd/model_training/populate_cvd_registry.py b/psycop/projects/cvd/model_training/populate_cvd_registry.py index 240e2d22a..66576a7e0 100644 --- a/psycop/projects/cvd/model_training/populate_cvd_registry.py +++ b/psycop/projects/cvd/model_training/populate_cvd_registry.py @@ -1,6 +1,8 @@ # ruff: noqa + def populate_with_cvd_registry() -> None: from psycop.projects.cvd.model_training.data_loader.trainval_loader import ParquetVerticalConcatenator + from psycop.projects.cvd.model_training.preprocessing.regex_filter import RegexColumnBlacklist populate_with_cvd_registry() \ No newline at end of file diff --git a/psycop/projects/cvd/model_training/preprocessing/regex_filter.py b/psycop/projects/cvd/model_training/preprocessing/regex_filter.py new file mode 100644 index 000000000..7b9d8a7c0 --- /dev/null +++ b/psycop/projects/cvd/model_training/preprocessing/regex_filter.py @@ -0,0 +1,24 @@ +from collections.abc import Sequence +from dataclasses import dataclass + +import polars as pl +from functionalpy import Seq +from polars import LazyFrame + +from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry +from psycop.common.model_training_v2.trainer.preprocessing.step import ( + PolarsFrame_T0, + PresplitStep, +) + + +@BaselineRegistry.preprocessing.register("regex_column_blacklist") +class RegexColumnBlacklist(PresplitStep): + def __init__(self, *args: str): + self.regex_blacklist = args + + def apply(self, input_df: PolarsFrame_T0) -> PolarsFrame_T0: + for blacklist in self.regex_blacklist: + input_df = input_df.select(pl.exclude(blacklist)) + + return input_df diff --git a/psycop/projects/cvd/model_training/train_model_e2e_v2.py b/psycop/projects/cvd/model_training/train_model_e2e_v2.py index a290fefce..3e5d554e5 100644 --- a/psycop/projects/cvd/model_training/train_model_e2e_v2.py +++ b/psycop/projects/cvd/model_training/train_model_e2e_v2.py @@ -6,8 +6,11 @@ from psycop.common.model_training_v2.config.config_utils import ( load_baseline_config, ) +from psycop.projects.cvd.model_training.populate_cvd_registry import ( + populate_with_cvd_registry, +) if __name__ == "__main__": + populate_with_cvd_registry() config = load_baseline_config(Path(__file__).parent / "cvd_baseline.cfg") train_baseline_model(config) - From c4366bb1df7340a0559c98034b8d1f24ed2a538d Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 12:30:10 +0100 Subject: [PATCH 07/21] misc. --- psycop/common/model_training_v2/config/config_utils.py | 9 ++++++++- psycop/projects/cvd/model_training/cvd_baseline.cfg | 4 ++-- .../cvd/model_training/preprocessing/regex_filter.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/psycop/common/model_training_v2/config/config_utils.py b/psycop/common/model_training_v2/config/config_utils.py index 0bd6fdeb1..f1f6c5a31 100644 --- a/psycop/common/model_training_v2/config/config_utils.py +++ b/psycop/common/model_training_v2/config/config_utils.py @@ -13,7 +13,6 @@ populate_baseline_registry() - def load_baseline_config(config_path: Path) -> BaselineSchema: """Loads the baseline config from disk and resolves it.""" cfg = Config().from_disk(config_path) @@ -26,3 +25,11 @@ def load_hyperparam_config(config_path: Path) -> dict[str, Any]: cfg = Config().from_disk(config_path) resolved = BaselineRegistry.resolve(cfg) return resolved + +if __name__ == "__main__": + config_str = """[section] +value='^test$' +""" + Config().from_str(config_str) + pass + diff --git a/psycop/projects/cvd/model_training/cvd_baseline.cfg b/psycop/projects/cvd/model_training/cvd_baseline.cfg index 82d8eeeeb..ace838a20 100644 --- a/psycop/projects/cvd/model_training/cvd_baseline.cfg +++ b/psycop/projects/cvd/model_training/cvd_baseline.cfg @@ -6,7 +6,7 @@ experiment_path = /. [trainer] @trainers = "crossval_trainer" -outcome_col_name = "outcome" +outcome_col_name = "outc_score2_cvd_within_1825_days_maximum_fallback_0_dichotomous" n_splits = 5 group_col_name = "dw_ek_borger" @@ -37,7 +37,7 @@ pred_time_uuid = "prediction_time_uuid" @preprocessing = "regex_column_blacklist" [trainer.preprocessing_pipeline.*.regex_column_blacklist.*] -outcome = "^outc_.+(?!.*1825.*).*" +outcome = "outc_.+(365|1095).*" [trainer.preprocessing_pipeline.*.column_prefix_count_expectation] @preprocessing = "column_prefix_count_expectation" diff --git a/psycop/projects/cvd/model_training/preprocessing/regex_filter.py b/psycop/projects/cvd/model_training/preprocessing/regex_filter.py index 7b9d8a7c0..c805a6cef 100644 --- a/psycop/projects/cvd/model_training/preprocessing/regex_filter.py +++ b/psycop/projects/cvd/model_training/preprocessing/regex_filter.py @@ -19,6 +19,6 @@ def __init__(self, *args: str): def apply(self, input_df: PolarsFrame_T0) -> PolarsFrame_T0: for blacklist in self.regex_blacklist: - input_df = input_df.select(pl.exclude(blacklist)) + input_df = input_df.select(pl.exclude(f"^{blacklist}$")) return input_df From ce4b0098edd289a715ab6a8677314a03f873abfb Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 14:00:52 +0100 Subject: [PATCH 08/21] misc. --- .../trainer/task/estimator_steps/xgboost.py | 1 + .../cvd/model_training/cvd_baseline.cfg | 5 +++- .../model_training/populate_cvd_registry.py | 3 +++ .../preprocessing/datetime_filter.py | 23 +++++++++++++++++++ 4 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 psycop/projects/cvd/model_training/preprocessing/datetime_filter.py diff --git a/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py b/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py index f03d67bb5..f2b994f24 100644 --- a/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py +++ b/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py @@ -8,6 +8,7 @@ ) +# TODO: Make function signature as good as for logistic regression def xgboost_classifier_step(**kwargs: Any) -> ModelStep: """Initialize XGBClassifier model with hparams specified as kwargs. The 'missing' hyperparameter specifies the value to be treated as missing diff --git a/psycop/projects/cvd/model_training/cvd_baseline.cfg b/psycop/projects/cvd/model_training/cvd_baseline.cfg index ace838a20..31cf7486e 100644 --- a/psycop/projects/cvd/model_training/cvd_baseline.cfg +++ b/psycop/projects/cvd/model_training/cvd_baseline.cfg @@ -39,6 +39,9 @@ pred_time_uuid = "prediction_time_uuid" [trainer.preprocessing_pipeline.*.regex_column_blacklist.*] outcome = "outc_.+(365|1095).*" +[trainer.preprocessing_pipeline.*.temporal_col_filter] +@preprocessing = "temporal_col_filter" + [trainer.preprocessing_pipeline.*.column_prefix_count_expectation] @preprocessing = "column_prefix_count_expectation" @@ -64,6 +67,6 @@ pred_time_uuid_col_name = ${trainer.preprocessing_pipeline.*.columns_exist.*.pre [trainer.task.task_pipe.sklearn_pipe] @task_pipelines = "pipe_constructor" -[trainer.task.task_pipe.sklearn_pipe.*.logistic_regression] +[trainer.task.task_pipe.sklearn_pipe.*.model] @estimator_steps = "logistic_regression" diff --git a/psycop/projects/cvd/model_training/populate_cvd_registry.py b/psycop/projects/cvd/model_training/populate_cvd_registry.py index 66576a7e0..7aba9b7a8 100644 --- a/psycop/projects/cvd/model_training/populate_cvd_registry.py +++ b/psycop/projects/cvd/model_training/populate_cvd_registry.py @@ -1,8 +1,11 @@ # ruff: noqa + + def populate_with_cvd_registry() -> None: from psycop.projects.cvd.model_training.data_loader.trainval_loader import ParquetVerticalConcatenator from psycop.projects.cvd.model_training.preprocessing.regex_filter import RegexColumnBlacklist + from psycop.projects.cvd.model_training.preprocessing.datetime_filter import TemporalColumnFilter populate_with_cvd_registry() \ No newline at end of file diff --git a/psycop/projects/cvd/model_training/preprocessing/datetime_filter.py b/psycop/projects/cvd/model_training/preprocessing/datetime_filter.py new file mode 100644 index 000000000..40e7f32bd --- /dev/null +++ b/psycop/projects/cvd/model_training/preprocessing/datetime_filter.py @@ -0,0 +1,23 @@ +from collections.abc import Sequence +from dataclasses import dataclass + +import polars as pl +import polars.selectors as cs +from functionalpy import Seq +from polars import LazyFrame + +from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry +from psycop.common.model_training_v2.trainer.preprocessing.step import ( + PolarsFrame_T0, + PresplitStep, +) + + +@BaselineRegistry.preprocessing.register("temporal_col_filter") +class TemporalColumnFilter(PresplitStep): + def __init__(self): + pass + + def apply(self, input_df: PolarsFrame_T0) -> PolarsFrame_T0: + temporal_columns = input_df.select(cs.temporal()).columns + return input_df.drop(temporal_columns) From f92f36a977a8bf18c0c64ef16dacbc25320bc58d Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 10:43:32 +0100 Subject: [PATCH 09/21] dev: populate baseline registry on import of BaselineRegistry --- psycop/common/model_training_v2/config/baseline_registry.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/psycop/common/model_training_v2/config/baseline_registry.py b/psycop/common/model_training_v2/config/baseline_registry.py index 48b120560..5a3e51abe 100644 --- a/psycop/common/model_training_v2/config/baseline_registry.py +++ b/psycop/common/model_training_v2/config/baseline_registry.py @@ -1,6 +1,10 @@ import catalogue from confection import registry +from psycop.common.model_training_v2.config.populate_registry import ( + populate_baseline_registry, +) + class RegistryWithDict(registry): def to_dict(self) -> dict[str, catalogue.Registry]: @@ -38,3 +42,5 @@ def to_dict(self) -> dict[str, catalogue.Registry]: for attribute_name in dir(self) if isinstance(getattr(self, attribute_name), catalogue.Registry) } + +populate_baseline_registry() From cd4d7660e77f45b2aa43245f9d6fc075ef2118c1 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 10:44:05 +0100 Subject: [PATCH 10/21] style: auto-fixes from pre-commit --- psycop/common/model_training_v2/config/baseline_registry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/psycop/common/model_training_v2/config/baseline_registry.py b/psycop/common/model_training_v2/config/baseline_registry.py index 5a3e51abe..e29c32c12 100644 --- a/psycop/common/model_training_v2/config/baseline_registry.py +++ b/psycop/common/model_training_v2/config/baseline_registry.py @@ -43,4 +43,5 @@ def to_dict(self) -> dict[str, catalogue.Registry]: if isinstance(getattr(self, attribute_name), catalogue.Registry) } + populate_baseline_registry() From d0925eb69eb4ceafd0bf625acb4c657ce60a1fc6 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 10:56:21 +0100 Subject: [PATCH 11/21] refactor: move terminallogger to its own file to avoid circular improt --- .../config/populate_registry.py | 2 +- .../model_training_v2/loggers/base_logger.py | 30 --------------- .../loggers/terminal_logger.py | 38 +++++++++++++++++++ .../loggers/test_multilogger.py | 2 +- .../common/model_training_v2/test_pipeline.py | 2 +- .../preprocessing/steps/test_v2_filters.py | 2 +- 6 files changed, 42 insertions(+), 34 deletions(-) create mode 100644 psycop/common/model_training_v2/loggers/terminal_logger.py diff --git a/psycop/common/model_training_v2/config/populate_registry.py b/psycop/common/model_training_v2/config/populate_registry.py index 85f16e751..68d8b8321 100644 --- a/psycop/common/model_training_v2/config/populate_registry.py +++ b/psycop/common/model_training_v2/config/populate_registry.py @@ -9,7 +9,7 @@ def populate_baseline_registry() -> None: and easier to debug for people who are not familiar with python setup hooks. """ # Loggers - from ..loggers.base_logger import TerminalLogger + from ..loggers.terminal_logger import TerminalLogger # Preprocessing from ..trainer.preprocessing.pipeline import BaselinePreprocessingPipeline diff --git a/psycop/common/model_training_v2/loggers/base_logger.py b/psycop/common/model_training_v2/loggers/base_logger.py index b206ef581..e2e0475ca 100644 --- a/psycop/common/model_training_v2/loggers/base_logger.py +++ b/psycop/common/model_training_v2/loggers/base_logger.py @@ -1,9 +1,5 @@ from typing import Any, Protocol, runtime_checkable -import wasabi - -from psycop.common.global_utils.config_utils import flatten_nested_dict -from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry from psycop.common.model_training_v2.trainer.task.base_metric import ( CalculatedMetric, ) @@ -30,29 +26,3 @@ def log_config(self, config: dict[str, Any]) -> None: ... -@BaselineRegistry.loggers.register("terminal_logger") -class TerminalLogger(BaselineLogger): - def __init__(self) -> None: - self._l = wasabi.Printer(timestamp=True) - - def info(self, message: str) -> None: - self._l.info(message) - - def good(self, message: str) -> None: - self._l.good(message) - - def warn(self, message: str) -> None: - self._l.warn(message) - - def fail(self, message: str) -> None: - self._l.fail(message) - - def log_metric(self, metric: CalculatedMetric) -> None: - self._l.divider(f"Logging metric {metric.name}") - self._l.info(f"{metric.name}: {metric.value}") - - def log_config(self, config: dict[str, Any]) -> None: - self._l.divider("Logging config") - config = flatten_nested_dict(config) - cfg_str = "\n".join([f"{k}: {v}" for k, v in config.items()]) - self._l.info(cfg_str) diff --git a/psycop/common/model_training_v2/loggers/terminal_logger.py b/psycop/common/model_training_v2/loggers/terminal_logger.py new file mode 100644 index 000000000..25db5b6a5 --- /dev/null +++ b/psycop/common/model_training_v2/loggers/terminal_logger.py @@ -0,0 +1,38 @@ +from psycop.common.global_utils.config_utils import flatten_nested_dict +from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry +from psycop.common.model_training_v2.loggers.base_logger import BaselineLogger +from psycop.common.model_training_v2.trainer.task.base_metric import CalculatedMetric + + +import wasabi + + +from typing import Any + + +@BaselineRegistry.loggers.register("terminal_logger") +class TerminalLogger(BaselineLogger): + def __init__(self) -> None: + self._l = wasabi.Printer(timestamp=True) + + def info(self, message: str) -> None: + self._l.info(message) + + def good(self, message: str) -> None: + self._l.good(message) + + def warn(self, message: str) -> None: + self._l.warn(message) + + def fail(self, message: str) -> None: + self._l.fail(message) + + def log_metric(self, metric: CalculatedMetric) -> None: + self._l.divider(f"Logging metric {metric.name}") + self._l.info(f"{metric.name}: {metric.value}") + + def log_config(self, config: dict[str, Any]) -> None: + self._l.divider("Logging config") + config = flatten_nested_dict(config) + cfg_str = "\n".join([f"{k}: {v}" for k, v in config.items()]) + self._l.info(cfg_str) \ No newline at end of file diff --git a/psycop/common/model_training_v2/loggers/test_multilogger.py b/psycop/common/model_training_v2/loggers/test_multilogger.py index db85e6aed..fcd309426 100644 --- a/psycop/common/model_training_v2/loggers/test_multilogger.py +++ b/psycop/common/model_training_v2/loggers/test_multilogger.py @@ -1,6 +1,6 @@ import pytest -from psycop.common.model_training_v2.loggers.base_logger import TerminalLogger +from psycop.common.model_training_v2.loggers.terminal_logger import TerminalLogger from psycop.common.model_training_v2.loggers.multi_logger import MultiLogger diff --git a/psycop/common/model_training_v2/test_pipeline.py b/psycop/common/model_training_v2/test_pipeline.py index 10a87dd9f..9a89f39ca 100644 --- a/psycop/common/model_training_v2/test_pipeline.py +++ b/psycop/common/model_training_v2/test_pipeline.py @@ -12,7 +12,7 @@ from psycop.common.model_training_v2.config.config_utils import ( load_baseline_config, ) -from psycop.common.model_training_v2.loggers.base_logger import ( +from psycop.common.model_training_v2.loggers.terminal_logger import ( TerminalLogger, ) from psycop.common.model_training_v2.trainer.cross_validator_trainer import ( diff --git a/psycop/common/model_training_v2/trainer/preprocessing/steps/test_v2_filters.py b/psycop/common/model_training_v2/trainer/preprocessing/steps/test_v2_filters.py index cd0410205..21ee4207a 100644 --- a/psycop/common/model_training_v2/trainer/preprocessing/steps/test_v2_filters.py +++ b/psycop/common/model_training_v2/trainer/preprocessing/steps/test_v2_filters.py @@ -1,7 +1,7 @@ import polars as pl import pytest -from psycop.common.model_training_v2.loggers.base_logger import TerminalLogger +from psycop.common.model_training_v2.loggers.terminal_logger import TerminalLogger from psycop.common.model_training_v2.trainer.preprocessing.steps.col_filters import ( LookbehindCombinationColFilter, ) From 23e94ce18630087a6c5771a043bcee61cb8c6fdd Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 10:56:26 +0100 Subject: [PATCH 12/21] style: auto-fixes from pre-commit --- .../common/model_training_v2/loggers/base_logger.py | 2 -- .../model_training_v2/loggers/terminal_logger.py | 12 +++++------- .../model_training_v2/loggers/test_multilogger.py | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/psycop/common/model_training_v2/loggers/base_logger.py b/psycop/common/model_training_v2/loggers/base_logger.py index e2e0475ca..0fb63f721 100644 --- a/psycop/common/model_training_v2/loggers/base_logger.py +++ b/psycop/common/model_training_v2/loggers/base_logger.py @@ -24,5 +24,3 @@ def log_metric(self, metric: CalculatedMetric) -> None: def log_config(self, config: dict[str, Any]) -> None: ... - - diff --git a/psycop/common/model_training_v2/loggers/terminal_logger.py b/psycop/common/model_training_v2/loggers/terminal_logger.py index 25db5b6a5..4d437d8d4 100644 --- a/psycop/common/model_training_v2/loggers/terminal_logger.py +++ b/psycop/common/model_training_v2/loggers/terminal_logger.py @@ -1,15 +1,13 @@ +from typing import Any + +import wasabi + from psycop.common.global_utils.config_utils import flatten_nested_dict from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry from psycop.common.model_training_v2.loggers.base_logger import BaselineLogger from psycop.common.model_training_v2.trainer.task.base_metric import CalculatedMetric -import wasabi - - -from typing import Any - - @BaselineRegistry.loggers.register("terminal_logger") class TerminalLogger(BaselineLogger): def __init__(self) -> None: @@ -35,4 +33,4 @@ def log_config(self, config: dict[str, Any]) -> None: self._l.divider("Logging config") config = flatten_nested_dict(config) cfg_str = "\n".join([f"{k}: {v}" for k, v in config.items()]) - self._l.info(cfg_str) \ No newline at end of file + self._l.info(cfg_str) diff --git a/psycop/common/model_training_v2/loggers/test_multilogger.py b/psycop/common/model_training_v2/loggers/test_multilogger.py index fcd309426..f68c2453d 100644 --- a/psycop/common/model_training_v2/loggers/test_multilogger.py +++ b/psycop/common/model_training_v2/loggers/test_multilogger.py @@ -1,7 +1,7 @@ import pytest -from psycop.common.model_training_v2.loggers.terminal_logger import TerminalLogger from psycop.common.model_training_v2.loggers.multi_logger import MultiLogger +from psycop.common.model_training_v2.loggers.terminal_logger import TerminalLogger def test_multilogger(capsys: pytest.CaptureFixture[str]): From 95ab2c809db91007b641eda9abbba555c717585b Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 13:11:38 +0000 Subject: [PATCH 13/21] Fixes #450 --- .../config/populate_registry.py | 3 +++ ...istic_regression.py => test_suggesters.py} | 0 .../trainer/task/estimator_steps/xgboost.py | 25 +++++++++++++------ 3 files changed, 21 insertions(+), 7 deletions(-) rename psycop/common/model_training_v2/trainer/task/estimator_steps/{test_logistic_regression.py => test_suggesters.py} (100%) diff --git a/psycop/common/model_training_v2/config/populate_registry.py b/psycop/common/model_training_v2/config/populate_registry.py index 68d8b8321..f4bbdc7ba 100644 --- a/psycop/common/model_training_v2/config/populate_registry.py +++ b/psycop/common/model_training_v2/config/populate_registry.py @@ -36,6 +36,9 @@ def populate_baseline_registry() -> None: from ..trainer.task.estimator_steps.logistic_regression import ( logistic_regression_step, ) + from ..trainer.task.estimator_steps.xgboost import ( + xgboost_classifier_step, + ) # Suggesters from ..hyperparameter_suggester.hyperparameter_suggester import SuggesterSpace diff --git a/psycop/common/model_training_v2/trainer/task/estimator_steps/test_logistic_regression.py b/psycop/common/model_training_v2/trainer/task/estimator_steps/test_suggesters.py similarity index 100% rename from psycop/common/model_training_v2/trainer/task/estimator_steps/test_logistic_regression.py rename to psycop/common/model_training_v2/trainer/task/estimator_steps/test_suggesters.py diff --git a/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py b/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py index f2b994f24..8a86543e1 100644 --- a/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py +++ b/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py @@ -1,17 +1,28 @@ -from typing import Any +from typing import Any, Literal import numpy as np from xgboost import XGBClassifier +from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry from psycop.common.model_training_v2.trainer.task.model_step import ( ModelStep, ) -# TODO: Make function signature as good as for logistic regression -def xgboost_classifier_step(**kwargs: Any) -> ModelStep: +@BaselineRegistry.estimator_steps.register("xgboost") +def xgboost_classifier_step( + tree_method: Literal["auto", "gpu_hist"], + n_estimators: int = 100, + max_depth: int = 3, +) -> ModelStep: """Initialize XGBClassifier model with hparams specified as kwargs. - The 'missing' hyperparameter specifies the value to be treated as missing - and is set to np.nan by default.""" - static_hyperparameters: dict[str, float] = {"missing": np.nan} - return ("xgboost", XGBClassifier(**kwargs, **static_hyperparameters)) + The 'missing' hyperparameter specifies the value to be treated as missing and is set to np.nan by default.""" + return ( + "xgboost", + XGBClassifier( + n_estimators=n_estimators, + max_depth=max_depth, + tree_method=tree_method, + missing=np.nan, + ), + ) From 7ae518254d6a8cb6734b0a1504e1f4a27b3c0404 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 13:11:53 +0000 Subject: [PATCH 14/21] style: auto-fixes from pre-commit --- .../trainer/task/estimator_steps/xgboost.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py b/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py index 8a86543e1..736666171 100644 --- a/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py +++ b/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py @@ -1,4 +1,4 @@ -from typing import Any, Literal +from typing import Literal import numpy as np from xgboost import XGBClassifier @@ -16,7 +16,8 @@ def xgboost_classifier_step( max_depth: int = 3, ) -> ModelStep: """Initialize XGBClassifier model with hparams specified as kwargs. - The 'missing' hyperparameter specifies the value to be treated as missing and is set to np.nan by default.""" + The 'missing' hyperparameter specifies the value to be treated as missing and is set to np.nan by default. + """ return ( "xgboost", XGBClassifier( From 3d2d9fddca5905f619f35db0a90ff2c6a6196c83 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 13:12:31 +0000 Subject: [PATCH 15/21] misc. --- .../model_training_v2/trainer/task/estimator_steps/xgboost.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py b/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py index 736666171..2a3c71dc0 100644 --- a/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py +++ b/psycop/common/model_training_v2/trainer/task/estimator_steps/xgboost.py @@ -11,7 +11,7 @@ @BaselineRegistry.estimator_steps.register("xgboost") def xgboost_classifier_step( - tree_method: Literal["auto", "gpu_hist"], + tree_method: Literal["auto", "gpu_hist"] = "gpu_hist", n_estimators: int = 100, max_depth: int = 3, ) -> ModelStep: From 51bde842e98a8300e874c73c25285d3d274f2912 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 13:14:02 +0000 Subject: [PATCH 16/21] feat: add xgboost defaults --- .../estimator_steps/xgboost/xgboost_20231117_131345.cfg | 8 ++++++++ .../model_training_v2/config/registries_testing_utils.py | 2 ++ 2 files changed, 10 insertions(+) create mode 100644 psycop/common/model_training_v2/config/historical_registry_configs/estimator_steps/xgboost/xgboost_20231117_131345.cfg diff --git a/psycop/common/model_training_v2/config/historical_registry_configs/estimator_steps/xgboost/xgboost_20231117_131345.cfg b/psycop/common/model_training_v2/config/historical_registry_configs/estimator_steps/xgboost/xgboost_20231117_131345.cfg new file mode 100644 index 000000000..3e9ecbb2c --- /dev/null +++ b/psycop/common/model_training_v2/config/historical_registry_configs/estimator_steps/xgboost/xgboost_20231117_131345.cfg @@ -0,0 +1,8 @@ +# Example cfg for xgboost +# You can find args at: +# psycop.common.model_training_v2.trainer.task.estimator_steps.xgboost +[placeholder] +@estimator_steps = "xgboost" +tree_method = "gpu_hist" +n_estimators = 100 +max_depth = 3 \ No newline at end of file diff --git a/psycop/common/model_training_v2/config/registries_testing_utils.py b/psycop/common/model_training_v2/config/registries_testing_utils.py index 5d599b762..487269ccd 100644 --- a/psycop/common/model_training_v2/config/registries_testing_utils.py +++ b/psycop/common/model_training_v2/config/registries_testing_utils.py @@ -98,6 +98,8 @@ def _timestamped_cfg_to_disk( fn.get_cfg_dir(top_level_dir=top_level_dir) / f"{fn.fn_name}_{current_datetime}.cfg" ) + filepath.parent.mkdir(exist_ok=True, parents=True) + filled_cfg.to_disk(filepath) # Prepend location to filepath From 91c5bad365e3ef79277f9c9cf219867cb8652f3b Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 14:17:06 +0100 Subject: [PATCH 17/21] misc. --- psycop/projects/cvd/model_training/cvd_baseline.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/psycop/projects/cvd/model_training/cvd_baseline.cfg b/psycop/projects/cvd/model_training/cvd_baseline.cfg index 31cf7486e..f49e07ab3 100644 --- a/psycop/projects/cvd/model_training/cvd_baseline.cfg +++ b/psycop/projects/cvd/model_training/cvd_baseline.cfg @@ -68,5 +68,5 @@ pred_time_uuid_col_name = ${trainer.preprocessing_pipeline.*.columns_exist.*.pre @task_pipelines = "pipe_constructor" [trainer.task.task_pipe.sklearn_pipe.*.model] -@estimator_steps = "logistic_regression" +@estimator_steps = "xgboost" From 2407849f71ee6fbc2cd42480d3ecae2ef3936c55 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 14:49:38 +0100 Subject: [PATCH 18/21] misc. --- .../cvd/model_training/cvd_baseline.cfg | 3 +++ .../model_training/populate_cvd_registry.py | 1 + .../preprocessing/bool_to_int.py | 26 +++++++++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 psycop/projects/cvd/model_training/preprocessing/bool_to_int.py diff --git a/psycop/projects/cvd/model_training/cvd_baseline.cfg b/psycop/projects/cvd/model_training/cvd_baseline.cfg index f49e07ab3..85763c382 100644 --- a/psycop/projects/cvd/model_training/cvd_baseline.cfg +++ b/psycop/projects/cvd/model_training/cvd_baseline.cfg @@ -26,6 +26,9 @@ paths = ["E:/shared_resources/cvd/e2e_base_test/flattened_datasets/train.parquet [trainer.preprocessing_pipeline] @preprocessing = "baseline_preprocessing_pipeline" +[trainer.preprocessing_pipeline.*.bool_to_int] +@preprocessing = "bool_to_int" + [trainer.preprocessing_pipeline.*.columns_exist] @preprocessing = "column_exists_validator" diff --git a/psycop/projects/cvd/model_training/populate_cvd_registry.py b/psycop/projects/cvd/model_training/populate_cvd_registry.py index 7aba9b7a8..a407d46ca 100644 --- a/psycop/projects/cvd/model_training/populate_cvd_registry.py +++ b/psycop/projects/cvd/model_training/populate_cvd_registry.py @@ -7,5 +7,6 @@ def populate_with_cvd_registry() -> None: from psycop.projects.cvd.model_training.data_loader.trainval_loader import ParquetVerticalConcatenator from psycop.projects.cvd.model_training.preprocessing.regex_filter import RegexColumnBlacklist from psycop.projects.cvd.model_training.preprocessing.datetime_filter import TemporalColumnFilter + from psycop.projects.cvd.model_training.preprocessing.bool_to_int import BoolToInt populate_with_cvd_registry() \ No newline at end of file diff --git a/psycop/projects/cvd/model_training/preprocessing/bool_to_int.py b/psycop/projects/cvd/model_training/preprocessing/bool_to_int.py new file mode 100644 index 000000000..aca0c44c6 --- /dev/null +++ b/psycop/projects/cvd/model_training/preprocessing/bool_to_int.py @@ -0,0 +1,26 @@ +from collections.abc import Sequence +from dataclasses import dataclass + +import polars as pl +import polars.selectors as cs +from functionalpy import Seq +from polars import Boolean, LazyFrame + +from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry +from psycop.common.model_training_v2.trainer.preprocessing.step import ( + PolarsFrame_T0, + PresplitStep, +) + + +@BaselineRegistry.preprocessing.register("bool_to_int") +class BoolToInt(PresplitStep): + def __init__(self): + pass + + def apply(self, input_df: PolarsFrame_T0) -> PolarsFrame_T0: + for col_name in input_df.columns: + if input_df.schema[col_name] == Boolean: # type: ignore + input_df = input_df.with_columns(pl.col(col_name).cast(int)) + + return input_df From 0b48a86ba5bf08e8a30907d60aec46f54a649ba6 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 13:51:34 +0000 Subject: [PATCH 19/21] style: auto-fixes from pre-commit --- .../model_training_v2/config/config_utils.py | 3 ++- .../preprocessing/steps/column_validator.py | 1 - .../data_loader/trainval_loader.py | 18 ++++++++++-------- .../model_training/populate_cvd_registry.py | 17 +++++++++++------ .../preprocessing/bool_to_int.py | 9 ++------- .../preprocessing/datetime_filter.py | 6 ------ .../preprocessing/regex_filter.py | 5 ----- 7 files changed, 25 insertions(+), 34 deletions(-) diff --git a/psycop/common/model_training_v2/config/config_utils.py b/psycop/common/model_training_v2/config/config_utils.py index f1f6c5a31..91175a01e 100644 --- a/psycop/common/model_training_v2/config/config_utils.py +++ b/psycop/common/model_training_v2/config/config_utils.py @@ -13,6 +13,7 @@ populate_baseline_registry() + def load_baseline_config(config_path: Path) -> BaselineSchema: """Loads the baseline config from disk and resolves it.""" cfg = Config().from_disk(config_path) @@ -26,10 +27,10 @@ def load_hyperparam_config(config_path: Path) -> dict[str, Any]: resolved = BaselineRegistry.resolve(cfg) return resolved + if __name__ == "__main__": config_str = """[section] value='^test$' """ Config().from_str(config_str) pass - diff --git a/psycop/common/model_training_v2/trainer/preprocessing/steps/column_validator.py b/psycop/common/model_training_v2/trainer/preprocessing/steps/column_validator.py index 2a75ae0d8..27abde545 100644 --- a/psycop/common/model_training_v2/trainer/preprocessing/steps/column_validator.py +++ b/psycop/common/model_training_v2/trainer/preprocessing/steps/column_validator.py @@ -1,4 +1,3 @@ -from collections.abc import Sequence from dataclasses import dataclass import polars as pl diff --git a/psycop/projects/cvd/model_training/data_loader/trainval_loader.py b/psycop/projects/cvd/model_training/data_loader/trainval_loader.py index b7f4b1112..7722b71bc 100644 --- a/psycop/projects/cvd/model_training/data_loader/trainval_loader.py +++ b/psycop/projects/cvd/model_training/data_loader/trainval_loader.py @@ -17,11 +17,15 @@ class ParquetVerticalConcatenator(BaselineDataLoader): def __init__(self, paths: Sequence[str]): self.dataset_paths = [Path(arg) for arg in paths] - missing_paths = Seq(self.dataset_paths).map(self._check_path_exists).flatten().to_list() + missing_paths = ( + Seq(self.dataset_paths).map(self._check_path_exists).flatten().to_list() + ) if missing_paths: - raise MissingPathError(f"""The following paths are missing: + raise MissingPathError( + f"""The following paths are missing: {missing_paths} - """) + """, + ) def _check_path_exists(self, path: Path) -> list[MissingPathError]: if not path.exists(): @@ -30,9 +34,7 @@ def _check_path_exists(self, path: Path) -> list[MissingPathError]: return [] def load(self) -> pl.LazyFrame: - return pl.concat(how="vertical", items= - [ - pl.scan_parquet(path) - for path in self.dataset_paths - ], + return pl.concat( + how="vertical", + items=[pl.scan_parquet(path) for path in self.dataset_paths], ) diff --git a/psycop/projects/cvd/model_training/populate_cvd_registry.py b/psycop/projects/cvd/model_training/populate_cvd_registry.py index a407d46ca..7275b32ee 100644 --- a/psycop/projects/cvd/model_training/populate_cvd_registry.py +++ b/psycop/projects/cvd/model_training/populate_cvd_registry.py @@ -1,12 +1,17 @@ # ruff: noqa - - def populate_with_cvd_registry() -> None: - from psycop.projects.cvd.model_training.data_loader.trainval_loader import ParquetVerticalConcatenator - from psycop.projects.cvd.model_training.preprocessing.regex_filter import RegexColumnBlacklist - from psycop.projects.cvd.model_training.preprocessing.datetime_filter import TemporalColumnFilter + from psycop.projects.cvd.model_training.data_loader.trainval_loader import ( + ParquetVerticalConcatenator, + ) + from psycop.projects.cvd.model_training.preprocessing.regex_filter import ( + RegexColumnBlacklist, + ) + from psycop.projects.cvd.model_training.preprocessing.datetime_filter import ( + TemporalColumnFilter, + ) from psycop.projects.cvd.model_training.preprocessing.bool_to_int import BoolToInt -populate_with_cvd_registry() \ No newline at end of file + +populate_with_cvd_registry() diff --git a/psycop/projects/cvd/model_training/preprocessing/bool_to_int.py b/psycop/projects/cvd/model_training/preprocessing/bool_to_int.py index aca0c44c6..fb4253b13 100644 --- a/psycop/projects/cvd/model_training/preprocessing/bool_to_int.py +++ b/psycop/projects/cvd/model_training/preprocessing/bool_to_int.py @@ -1,10 +1,5 @@ -from collections.abc import Sequence -from dataclasses import dataclass - import polars as pl -import polars.selectors as cs -from functionalpy import Seq -from polars import Boolean, LazyFrame +from polars import Boolean from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry from psycop.common.model_training_v2.trainer.preprocessing.step import ( @@ -20,7 +15,7 @@ def __init__(self): def apply(self, input_df: PolarsFrame_T0) -> PolarsFrame_T0: for col_name in input_df.columns: - if input_df.schema[col_name] == Boolean: # type: ignore + if input_df.schema[col_name] == Boolean: # type: ignore input_df = input_df.with_columns(pl.col(col_name).cast(int)) return input_df diff --git a/psycop/projects/cvd/model_training/preprocessing/datetime_filter.py b/psycop/projects/cvd/model_training/preprocessing/datetime_filter.py index 40e7f32bd..1044d73a3 100644 --- a/psycop/projects/cvd/model_training/preprocessing/datetime_filter.py +++ b/psycop/projects/cvd/model_training/preprocessing/datetime_filter.py @@ -1,10 +1,4 @@ -from collections.abc import Sequence -from dataclasses import dataclass - -import polars as pl import polars.selectors as cs -from functionalpy import Seq -from polars import LazyFrame from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry from psycop.common.model_training_v2.trainer.preprocessing.step import ( diff --git a/psycop/projects/cvd/model_training/preprocessing/regex_filter.py b/psycop/projects/cvd/model_training/preprocessing/regex_filter.py index c805a6cef..ff5319011 100644 --- a/psycop/projects/cvd/model_training/preprocessing/regex_filter.py +++ b/psycop/projects/cvd/model_training/preprocessing/regex_filter.py @@ -1,9 +1,4 @@ -from collections.abc import Sequence -from dataclasses import dataclass - import polars as pl -from functionalpy import Seq -from polars import LazyFrame from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry from psycop.common.model_training_v2.trainer.preprocessing.step import ( From 669c8b4b060a9cda577d8e23093ebb9326a2e224 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 13:53:14 +0000 Subject: [PATCH 20/21] misc. --- .../column_prefix_count_expectation_20231117_135230.cfg | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 psycop/common/model_training_v2/config/historical_registry_configs/preprocessing/column_prefix_count_expectation/column_prefix_count_expectation_20231117_135230.cfg diff --git a/psycop/common/model_training_v2/config/historical_registry_configs/preprocessing/column_prefix_count_expectation/column_prefix_count_expectation_20231117_135230.cfg b/psycop/common/model_training_v2/config/historical_registry_configs/preprocessing/column_prefix_count_expectation/column_prefix_count_expectation_20231117_135230.cfg new file mode 100644 index 000000000..78ad13e4c --- /dev/null +++ b/psycop/common/model_training_v2/config/historical_registry_configs/preprocessing/column_prefix_count_expectation/column_prefix_count_expectation_20231117_135230.cfg @@ -0,0 +1,6 @@ + +# Example cfg for column_prefix_count_expectation +# You can find args at: +# psycop.common.model_training_v2.trainer.preprocessing.steps.column_validator +[placeholder] +placeholder = ["pred", 1] \ No newline at end of file From e673388e710ea157a34e4c0c063844adac8f5d16 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Nov 2023 13:58:09 +0000 Subject: [PATCH 21/21] misc. --- psycop/common/model_training_v2/config/config_utils.py | 8 -------- .../prefix_count_validator_20231117_092246.cfg | 6 ------ 2 files changed, 14 deletions(-) delete mode 100644 psycop/common/model_training_v2/config/historical_registry_configs/preprocessing/prefix_count_validator/prefix_count_validator_20231117_092246.cfg diff --git a/psycop/common/model_training_v2/config/config_utils.py b/psycop/common/model_training_v2/config/config_utils.py index 91175a01e..0bd6fdeb1 100644 --- a/psycop/common/model_training_v2/config/config_utils.py +++ b/psycop/common/model_training_v2/config/config_utils.py @@ -26,11 +26,3 @@ def load_hyperparam_config(config_path: Path) -> dict[str, Any]: cfg = Config().from_disk(config_path) resolved = BaselineRegistry.resolve(cfg) return resolved - - -if __name__ == "__main__": - config_str = """[section] -value='^test$' -""" - Config().from_str(config_str) - pass diff --git a/psycop/common/model_training_v2/config/historical_registry_configs/preprocessing/prefix_count_validator/prefix_count_validator_20231117_092246.cfg b/psycop/common/model_training_v2/config/historical_registry_configs/preprocessing/prefix_count_validator/prefix_count_validator_20231117_092246.cfg deleted file mode 100644 index 59ea59055..000000000 --- a/psycop/common/model_training_v2/config/historical_registry_configs/preprocessing/prefix_count_validator/prefix_count_validator_20231117_092246.cfg +++ /dev/null @@ -1,6 +0,0 @@ - -# Example cfg for prefix_count_validator -# You can find args at: -# psycop.common.model_training_v2.trainer.preprocessing.steps.column_validator -[placeholder] -test = ["prefix_", 2] \ No newline at end of file