-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
162 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
110 changes: 110 additions & 0 deletions
110
...p/projects/forced_admission_inpatient/feature_generation/modules/specify_text_features.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
"""Text-feature specification module.""" | ||
import logging | ||
from typing import Callable, Union | ||
|
||
import numpy as np | ||
from timeseriesflattener.aggregation_fns import ( | ||
concatenate, | ||
mean_number_of_characters, | ||
type_token_ratio, | ||
) | ||
from timeseriesflattener.feature_specs.group_specs import ( | ||
NamedDataframe, | ||
PredictorGroupSpec, | ||
TextPredictorGroupSpec, | ||
) | ||
from timeseriesflattener.feature_specs.single_specs import ( | ||
PredictorSpec, | ||
TextPredictorSpec, | ||
) | ||
from timeseriesflattener.text_embedding_functions import sklearn_embedding | ||
|
||
from psycop.common.feature_generation.application_modules.project_setup import ( | ||
ProjectInfo, | ||
) | ||
from psycop.common.feature_generation.loaders.raw.load_text import load_aktuel_psykisk | ||
from psycop.common.feature_generation.text_models.utils import load_text_model | ||
|
||
log = logging.getLogger(__name__) | ||
|
||
|
||
class TextFeatureSpecifier:
    """Specify text-derived predictor features.

    Two families of specs are built from the dataframe returned by
    ``load_aktuel_psykisk``:

    * aggregation specs (``PredictorGroupSpec``), one per combination of
      aggregation function and lookbehind window;
    * TF-IDF embedding specs (``TextPredictorGroupSpec``), embedded with a
      pre-fitted model loaded via ``load_text_model``.
    """

    def __init__(self, project_info: ProjectInfo, min_set_for_debug: bool = False):
        """Store the configuration.

        Args:
            project_info: Project configuration; stored but not read by the
                methods of this class.
            min_set_for_debug: If True, ``get_text_feature_specs`` returns the
                debug feature set instead of the full one.
        """
        self.min_set_for_debug = min_set_for_debug
        self.project_info = project_info

    def _get_text_features_specs(
        self,
        resolve_multiple: list[Callable],
        interval_days: list[float],
    ) -> list[PredictorSpec]:
        """Get aggregation specs for the text dataframe.

        Args:
            resolve_multiple: Aggregation functions, passed on as
                ``aggregation_fns``.
            interval_days: Lookbehind windows in days.

        Returns:
            The result of ``PredictorGroupSpec.create_combinations()``.
        """
        # The previous message claimed "mean character length all sfis", but
        # the aggregation functions are chosen by the caller and only one
        # dataframe is loaded here.
        log.info("-------- Generating text aggregation specs --------")

        # NOTE(review): the dataframe is named "aktuelt_psykisk" here but
        # "aktuel_psykisk" in _get_text_embedding_features_specs; the name is
        # presumably used in the generated feature names -- confirm which
        # spelling is intended before aligning them.
        return PredictorGroupSpec(
            named_dataframes=(
                NamedDataframe(df=load_aktuel_psykisk(), name="aktuelt_psykisk"),
            ),
            lookbehind_days=interval_days,
            fallback=[np.nan],
            aggregation_fns=resolve_multiple,
        ).create_combinations()

    def _get_text_embedding_features_specs(
        self,
        resolve_multiple: list[Callable],
        interval_days: list[float],
    ) -> list[TextPredictorSpec]:
        """Get TF-IDF embedding specs for the text dataframe.

        Args:
            resolve_multiple: Aggregation functions, passed on as
                ``aggregation_fns``.
            interval_days: Lookbehind windows in days.

        Returns:
            The result of ``TextPredictorGroupSpec.create_combinations()``.
        """
        # The previous message said "bow", but the model loaded below is TF-IDF.
        log.info("-------- Generating tfidf embedding specs --------")

        # NOTE(review): the filename presumably encodes the fitting
        # configuration -- verify that a pickle with exactly this name exists.
        tfidf_model = load_text_model(
            filename="tfidf_psycop_train_all_sfis_preprocessed_sfi_type_Aktueltpsykisk_ngram_range_12_max_df_10_min_df_1_max_features_500.pkl",
        )

        return TextPredictorGroupSpec(
            named_dataframes=[
                NamedDataframe(df=load_aktuel_psykisk(), name="aktuel_psykisk"),
            ],
            lookbehind_days=interval_days,
            aggregation_fns=resolve_multiple,
            embedding_fn_name="tfidf",
            fallback=[np.nan],
            embedding_fn=[sklearn_embedding],
            embedding_fn_kwargs=[{"model": tfidf_model}],
        ).create_combinations()

    def get_text_feature_specs(
        self,
    ) -> list[Union[TextPredictorSpec, PredictorSpec]]:
        """Generate text predictor spec list.

        Returns:
            With ``min_set_for_debug`` set: the embedding specs followed by
            the aggregation specs of the debug set. Otherwise: the aggregation
            specs followed by the embedding specs of the full set.
        """
        log.info("-------- Generating text predictor specs --------")

        if self.min_set_for_debug:
            # NOTE(review): the debug set uses more and longer lookbehind
            # windows (60/365/730 days) than the full set (7/30 days) --
            # confirm this is intended for a "minimal" set.
            text_embedding_features = self._get_text_embedding_features_specs(
                resolve_multiple=[concatenate],
                interval_days=[60, 365, 730],
            )

            return text_embedding_features + self._get_text_features_specs(
                resolve_multiple=[mean_number_of_characters],
                interval_days=[7],
            )  # type: ignore

        text_features = self._get_text_features_specs(
            resolve_multiple=[mean_number_of_characters, type_token_ratio],
            interval_days=[7],
        )

        text_embedding_features = self._get_text_embedding_features_specs(
            resolve_multiple=[concatenate],
            interval_days=[7, 30],
        )

        return text_features + text_embedding_features
26 changes: 26 additions & 0 deletions
26
...ion_inpatient/feature_generation/modules/text_model_pipelines/fit_and_save_text_models.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
"""Pipeline for fitting and saving BoW and TF-IDF models on a preprocessed corpus""" | ||
|
||
from psycop.common.feature_generation.text_models.text_model_pipeline import ( | ||
text_model_pipeline, | ||
) | ||
|
||
if __name__ == "__main__":
    # Fit and save one model per type; all other settings are shared, so the
    # two runs differ only in the ``model`` argument.
    for model_type in ("bow", "tfidf"):
        text_model_pipeline(
            model=model_type,
            corpus_name="psycop_train_all_sfis_all_years_lowercase_stopwords_and_symbols_removed",
            sfi_type=["Aktuelt psykisk"],  # Current Subjective Mental State
            max_features=100,
            max_df=1.0,
            min_df=1,
            ngram_range=(1, 2),
        )
13 changes: 13 additions & 0 deletions
13
...ed_admission_inpatient/feature_generation/modules/text_model_pipelines/preprocess_sfis.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
"""Preprocess sfis""" | ||
|
||
from psycop.common.feature_generation.text_models.preprocessing import ( | ||
text_preprocessing_pipeline, | ||
) | ||
|
||
|
||
def main() -> str:
    """Run the text preprocessing pipeline with its default arguments.

    Returns:
        Whatever ``text_preprocessing_pipeline`` returns (annotated as a
        string here; its meaning is defined by that function).
    """
    pipeline_result = text_preprocessing_pipeline()
    return pipeline_result
|
||
|
||
# Only run the preprocessing when this file is executed as a script.
if __name__ == "__main__":
    main()