Skip to content

Commit

Permalink
feat: feature gen for sczbp text experiment
Browse files Browse the repository at this point in the history
  • Loading branch information
HLasse committed Jan 26, 2024
1 parent 0cb9c96 commit 13a3622
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,6 @@ def _get_feature_by_note_type_and_model_name(
lookbehind_days=lookbehind_days,
)

def _get_tfidf_by_size_and_note_type(
    self, note_type: str, max_features: int, lookbehind_days: list[float]
) -> Sequence[AnySpec]:
    """Build text feature specs from a stored TF-IDF embedding file.

    Reads ``text_embeddings_{note_type}_tfidf_{max_features}.parquet`` from
    ``TEXT_EMBEDDINGS_DIR`` and hands the loaded dataframe to
    ``_text_specs_from_embedding_df``.

    Args:
        note_type: Note type; used in the file name and the spec name prefix.
        max_features: Feature-count label; used in the file name and the
            spec name prefix.
        lookbehind_days: Forwarded unchanged to
            ``_text_specs_from_embedding_df``.

    Returns:
        The specs produced by ``_text_specs_from_embedding_df``.
    """
    tfidf_path = (
        TEXT_EMBEDDINGS_DIR / f"text_embeddings_{note_type}_tfidf_{max_features}.parquet"
    )
    tfidf_frame = pd.read_parquet(tfidf_path)
    spec_prefix = f"pred_{note_type}_tfidf_{max_features}_"

    return self._text_specs_from_embedding_df(
        embedded_text_df=tfidf_frame, name_prefix=spec_prefix, lookbehind_days=lookbehind_days
    )

def get_feature_specs( # type: ignore[override]
self, lookbehind_days: list[float]
) -> list[AnySpec]:
Expand All @@ -61,29 +49,22 @@ def get_feature_specs( # type: ignore[override]
self._get_outcome_specs(),
]
note_types = ["aktuelt_psykisk", "all_relevant"]
sentence_transformer_models = [
models_names = [
"dfm-encoder-large",
"e5-large",
# "e5-large",
"dfm-encoder-large-v1-finetuned",
"tfidf-500",
"tfidf-1000",
]
tfidf_max_features = [500, 1000]

for note_type in note_types:
for model_name in sentence_transformer_models:
for model_name in models_names:
feature_specs.append(
self._get_feature_by_note_type_and_model_name(
note_type=note_type, model_name=model_name, lookbehind_days=lookbehind_days
)
)

for max_features in tfidf_max_features:
feature_specs.append(
self._get_tfidf_by_size_and_note_type(
note_type=note_type,
max_features=max_features,
lookbehind_days=lookbehind_days,
)
)
# flatten the sequence of lists
features = [feature for sublist in feature_specs for feature in sublist]
return features
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from psycop.common.feature_generation.application_modules.generate_feature_set import (
generate_feature_set,
)
from psycop.common.feature_generation.application_modules.project_setup import ProjectInfo
from psycop.common.global_utils.paths import OVARTACI_SHARED_DIR
from psycop.projects.scz_bp.feature_generation.eligible_prediction_times.scz_bp_prediction_time_loader import (
SczBpCohort,
)
from psycop.projects.scz_bp.feature_generation.scz_bp_specify_features import SczBpFeatureSpecifier
from psycop.projects.scz_bp.feature_generation.text_experiment.scz_bp_text_experiment_feature_spec import (
SczBpTextExperimentFeatures,
)

if __name__ == "__main__":
    # Script entry point: generate the "text_exp_730d" feature set for the
    # scz_bp text experiment, using a single 730-day lookbehind window.
    text_exp_project = ProjectInfo(
        project_name="scz_bp", project_path=OVARTACI_SHARED_DIR / "scz_bp" / "text_exp"
    )

    # Prediction times come from the filtered cohort bundle, converted to pandas.
    cohort_bundle = SczBpCohort.get_filtered_prediction_times_bundle()
    prediction_times_df = cohort_bundle.prediction_times.frame.to_pandas()

    text_feature_specs = SczBpTextExperimentFeatures().get_feature_specs(lookbehind_days=[730])

    generate_feature_set(
        project_info=text_exp_project,
        eligible_prediction_times=prediction_times_df,
        feature_specs=text_feature_specs,
        # generate_in_chunks=True, # noqa: ERA001
        # chunksize=10, # noqa: ERA001
        feature_set_name="text_exp_730d",
    )

0 comments on commit 13a3622

Please sign in to comment.