Skip to content

Commit

Permalink
feat: adding text specs
Browse files Browse the repository at this point in the history
  • Loading branch information
bokajgd committed Aug 2, 2023
1 parent a74ca45 commit 27b37ac
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
from psycop.projects.forced_admission_inpatient.feature_generation.modules.specify_features import (
FeatureSpecifier,
)
from psycop.projects.forced_admission_inpatient.feature_generation.modules.specify_text_features import (
TextFeatureSpecifier,
)
from psycop.projects.forced_admission_inpatient.feature_generation.modules.utils import (
add_outcome_col,
)
Expand All @@ -41,14 +44,22 @@


@wandb_alert_on_exception
def main():
def main(add_text_features: bool = True, min_set_for_debug: bool = True):
"""Main function for loading, generating and evaluating a flattened
dataset."""
feature_specs = FeatureSpecifier(
project_info=project_info,
min_set_for_debug=False, # Remember to set to False when generating full dataset
min_set_for_debug=min_set_for_debug, # Remember to set to False when generating full dataset
).get_feature_specs()

if add_text_features:
text_feature_specs = TextFeatureSpecifier(
project_info=project_info,
min_set_for_debug=min_set_for_debug, # Remember to set to False when generating full dataset
).get_text_feature_specs()

feature_specs += text_feature_specs

flattened_df = create_flattened_dataset(
feature_specs=feature_specs, # type: ignore
prediction_times_df=forced_admissions_inpatient(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""Text-feature specification module."""
import logging
from typing import Callable, Union

import numpy as np
from timeseriesflattener.aggregation_fns import (
concatenate,
mean_number_of_characters,
type_token_ratio,
)
from timeseriesflattener.feature_specs.group_specs import (
NamedDataframe,
PredictorGroupSpec,
TextPredictorGroupSpec,
)
from timeseriesflattener.feature_specs.single_specs import (
PredictorSpec,
TextPredictorSpec,
)
from timeseriesflattener.text_embedding_functions import sklearn_embedding

from psycop.common.feature_generation.application_modules.project_setup import (
ProjectInfo,
)
from psycop.common.feature_generation.loaders.raw.load_text import load_aktuel_psykisk
from psycop.common.feature_generation.text_models.utils import load_text_model

log = logging.getLogger(__name__)


class TextFeatureSpecifier:
    """Build the text-based predictor specs derived from the "aktuelt psykisk" notes.

    Two kinds of specs are produced:

    * plain text predictors, where caller-supplied aggregation functions
      (e.g. ``mean_number_of_characters``) are applied to the raw text, and
    * TF-IDF embedding predictors, where the text in the lookbehind window is
      aggregated and then embedded with a pre-fitted scikit-learn model.
    """

    def __init__(self, project_info: ProjectInfo, min_set_for_debug: bool = False):
        """Store the configuration used when generating specs.

        Args:
            project_info: Project configuration. It is only stored on the
                instance; no method of this class reads it.
            min_set_for_debug: When True, ``get_text_feature_specs`` returns
                the debug spec set instead of the full one.
        """
        self.min_set_for_debug = min_set_for_debug
        self.project_info = project_info

    def _get_text_features_specs(
        self,
        resolve_multiple: list[Callable],
        interval_days: list[float],
    ) -> list[PredictorSpec]:
        """Create plain (non-embedded) text predictor specs.

        Args:
            resolve_multiple: Aggregation functions, passed on as
                ``aggregation_fns``.
            interval_days: Lookbehind windows in days, passed on as
                ``lookbehind_days``.

        Returns:
            The result of ``PredictorGroupSpec.create_combinations()``.
        """
        # The previous message claimed "mean character length", but the
        # aggregation functions are chosen by the caller.
        log.info("-------- Generating aktuelt psykisk text feature specs --------")

        # NOTE(review): this dataframe is named "aktuelt_psykisk", whereas the
        # embedding specs below use "aktuel_psykisk". The names are left as-is
        # because they presumably end up in the generated feature names --
        # confirm which spelling is intended and align them.
        group_spec = PredictorGroupSpec(
            named_dataframes=(
                NamedDataframe(df=load_aktuel_psykisk(), name="aktuelt_psykisk"),
            ),
            lookbehind_days=interval_days,
            fallback=[np.nan],
            aggregation_fns=resolve_multiple,
        )

        return group_spec.create_combinations()

    def _get_text_embedding_features_specs(
        self,
        resolve_multiple: list[Callable],
        interval_days: list[float],
    ) -> list[TextPredictorSpec]:
        """Create TF-IDF embedding predictor specs.

        Args:
            resolve_multiple: Aggregation functions, passed on as
                ``aggregation_fns``.
            interval_days: Lookbehind windows in days, passed on as
                ``lookbehind_days``.

        Returns:
            The result of ``TextPredictorGroupSpec.create_combinations()``.
        """
        # The previous message said "bow", but the model loaded here is TF-IDF.
        log.info("-------- Generating tfidf aktuelt psykisk specs --------")

        tfidf_model = load_text_model(
            filename="tfidf_psycop_train_all_sfis_preprocessed_sfi_type_Aktueltpsykisk_ngram_range_12_max_df_10_min_df_1_max_features_500.pkl",
        )

        group_spec = TextPredictorGroupSpec(
            named_dataframes=[
                NamedDataframe(df=load_aktuel_psykisk(), name="aktuel_psykisk"),
            ],
            lookbehind_days=interval_days,
            aggregation_fns=resolve_multiple,
            embedding_fn_name="tfidf",
            fallback=[np.nan],
            embedding_fn=[sklearn_embedding],
            embedding_fn_kwargs=[{"model": tfidf_model}],
        )

        return group_spec.create_combinations()

    def get_text_feature_specs(
        self,
    ) -> list[Union[TextPredictorSpec, PredictorSpec]]:
        """Generate the list of text predictor specs.

        Returns:
            When ``min_set_for_debug`` is True: the embedding specs followed
            by the plain text specs of the debug set. Otherwise: the plain
            text specs followed by the embedding specs of the full set.
        """
        log.info("-------- Generating text predictor specs --------")

        if self.min_set_for_debug:
            # NOTE(review): the debug set uses more and longer lookbehinds
            # (60/365/730 days) than the full set (7/30 days) -- confirm this
            # is intended for a "minimal" set.
            embedding_specs = self._get_text_embedding_features_specs(
                resolve_multiple=[concatenate],
                interval_days=[60, 365, 730],
            )
            text_specs = self._get_text_features_specs(
                resolve_multiple=[mean_number_of_characters],
                interval_days=[7],
            )
            # Building the combined list explicitly avoids the `type: ignore`
            # that list concatenation of two differently typed lists needed.
            return [*embedding_specs, *text_specs]

        text_specs = self._get_text_features_specs(
            resolve_multiple=[mean_number_of_characters, type_token_ratio],
            interval_days=[7],
        )
        embedding_specs = self._get_text_embedding_features_specs(
            resolve_multiple=[concatenate],
            interval_days=[7, 30],
        )

        return [*text_specs, *embedding_specs]
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Pipeline for fitting and saving BoW and TF-IDF models on a preprocessed corpus"""

from psycop.common.feature_generation.text_models.text_model_pipeline import (
text_model_pipeline,
)

if __name__ == "__main__":
    # Run the pipeline once per model type, "bow" first and then "tfidf",
    # with otherwise identical settings.
    for model_type in ("bow", "tfidf"):
        text_model_pipeline(
            model=model_type,
            corpus_name="psycop_train_all_sfis_all_years_lowercase_stopwords_and_symbols_removed",
            # "Aktuelt psykisk" = Current Subjective Mental State
            sfi_type=["Aktuelt psykisk"],
            max_features=100,
            max_df=1.0,
            min_df=1,
            ngram_range=(1, 2),
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""Preprocess sfis"""

from psycop.common.feature_generation.text_models.preprocessing import (
text_preprocessing_pipeline,
)


def main() -> str:
    """Run the text preprocessing pipeline and return its result."""
    pipeline_result = text_preprocessing_pipeline()
    return pipeline_result


# Only run the preprocessing when this file is executed as a script.
if __name__ == "__main__":
    main()

0 comments on commit 27b37ac

Please sign in to comment.