diff --git a/changelog.md b/changelog.md index da41ab59..21c037d9 100644 --- a/changelog.md +++ b/changelog.md @@ -1,4 +1,8 @@ # Changelog + +## v0.1.3 - 22-12-2022 + +- ConditionProbe: Update, completeness is now computed as a proportion of the number of visits. ## v0.1.2 - 14-12-2022 - ConditionProbe computes the availability of administrative data related to visits with at least one ICD-10 code recorded. diff --git a/docs/components/probe.md b/docs/components/probe.md index aa525220..e8b7afd5 100644 --- a/docs/components/probe.md +++ b/docs/components/probe.md @@ -159,7 +159,7 @@ We list hereafter the Probes that have already been implemented in the library. === "NoteProbe" - The [``NoteProbe``][edsteva.probes.note.NoteProbe] computes $c_{note}(t)$ the availability of clinical documents linked to patients' visits: + The [``NoteProbe``][edsteva.probes.note.NoteProbe] computes $c_{note}(t)$ the availability of clinical documents linked to patients' administrative visit for each care site, stay type and note type according to time: $$ c_{note}(t) = \frac{n_{with\,doc}(t)}{n_{visit}(t)} @@ -194,27 +194,30 @@ We list hereafter the Probes that have already been implemented in the library. 
| care_site_level | care_site_id | care_site_short_name | stay_type | note_type | date | n_visit | c | | :----------------------- | :----------- | :------------------- | :----------- | :-------------------- | :--------- | :------ | :----- | - | Unité Fonctionnelle (UF) | 8312056386 | Care site 1 | 'Urg_Hospit' | 'All' | 2019-05-01 | 233.0 | '0.841 | + | Unité Fonctionnelle (UF) | 8312056386 | Care site 1 | 'Urg' | 'All' | 2019-05-01 | 233.0 | 0.841 | | Unité Fonctionnelle (UF) | 8653815660 | Care site 1 | 'All' | 'CRH' | 2011-04-01 | 393.0 | 0.640 | - | Pôle/DMU | 8312027648 | Care site 2 | 'Urg' | 'CRH' | 2021-03-01 | 204.0 | 0.497 | + | Pôle/DMU | 8312027648 | Care site 2 | 'Hospit' | 'CRH' | 2021-03-01 | 204.0 | 0.497 | | Pôle/DMU | 8312056379 | Care site 2 | 'All' | 'Ordonnance' | 2018-08-01 | 22.0 | 0.274 | - | Hôpital | 8312022130 | Care site 3 | 'Hospit' | 'CR Passage Urgences' | 2022-02-01 | 9746.0 | 0.769 | + | Hôpital | 8312022130 | Care site 3 | 'Urg_Hospit' | 'CR Passage Urgences' | 2022-02-01 | 9746.0 | 0.769 | === "ConditionProbe" - The [``ConditionProbe``][edsteva.probes.condition.ConditionProbe] computes $c_{condition}(t)$ the availability of administrative data related to visits with at least one ICD-10 code recorded for each care site according to time: + The [``ConditionProbe``][edsteva.probes.condition.ConditionProbe] computes $c_{condition}(t)$ the availability of claim data in patients' administrative visit for each care site, stay type, diag type and condition type according to time: $$ - c_{condition}(t) = \frac{n_{condition}(t)}{n_{99}} + c_{condition}(t) = \frac{n_{with\,condition}(t)}{n_{visit}(t)} $$ - Where $n_{condition}(t)$ is the number of stays with at least one ICD-10 code recorded, $t$ is the month and $n_{99}$ is the $99^{th}$ percentile of $n_{condition}(t)$. + Where $n_{visit}(t)$ is the number of administrative stays, $n_{with\,condition}$ the number of stays having at least one claim code (e.g. 
ICD-10) recorded and $t$ is the month. !!!info "" - If the $99^{th}$ percentile $n_{99}$ is equal to 0, we consider that the completeness predictor $c(t)$ is also equal to 0. + If the number of visits $n_{visit}(t)$ is equal to 0, we consider that the completeness predictor $c(t)$ is also equal to 0. + + !!!Warning "Care site level" + This probe is only available at hospital level. ```python - from edsteva.probes import VisitProbe + from edsteva.probes import ConditionProbe condition = ConditionProbe() condition.compute( @@ -235,10 +238,10 @@ We list hereafter the Probes that have already been implemented in the library. condition.predictor.head() ``` - | care_site_level | care_site_id | care_site_short_name | stay_type | diag_type | condition_type | date | n_visit | c | - | :----------------------- | :----------- | :------------------- | :-------- | :-------- | :------------------- | :--------- | :------ | :---- | - | Unité Fonctionnelle (UF) | 8312056386 | Care site 1 | 'All' | 'All' | 'Pulmonary_embolism' | 2019-05-01 | 233.0 | 0.841 | - | Unité Fonctionnelle (UF) | 8312056386 | Care site 1 | 'All' | 'DP/DR' | 'Pulmonary_embolism' | 2021-04-01 | 393.0 | 0.640 | - | Pôle/DMU | 8312027648 | Care site 2 | 'Hospit' | 'All' | 'Pulmonary_embolism' | 2011-03-01 | 204.0 | 0.497 | - | Pôle/DMU | 8312027648 | Care site 2 | 'All' | 'All' | 'All' | 2018-08-01 | 22.0 | 0.274 | - | Hôpital | 8312022130 | Care site 3 | 'Hospit' | 'DP/DR' | 'Pulmonary_embolism' | 2022-02-01 | 9746.0 | 0.769 | + | care_site_level | care_site_id | care_site_short_name | stay_type | diag_type | condition_type | date | n_visit | c | + | :-------------- | :----------- | :------------------- | :-------- | :-------- | :------------------- | :--------- | :------ | :---- | + | Hôpital | 8312057527 | Care site 1 | 'All' | 'All' | 'Pulmonary_embolism' | 2019-05-01 | 233.0 | 0.841 | + | Hôpital | 8312057527 | Care site 1 | 'All' | 'DP/DR' | 'Pulmonary_embolism' | 2021-04-01 | 393.0 | 0.640 | + | Hôpital | 
8312027648 | Care site 2 | 'Hospit' | 'All' | 'Pulmonary_embolism' | 2011-03-01 | 204.0 | 0.497 | + | Hôpital | 8312027648 | Care site 2 | 'All' | 'All' | 'All' | 2018-08-01 | 22.0 | 0.274 | + | Hôpital | 8312022130 | Care site 3 | 'Hospit' | 'DP/DR' | 'Pulmonary_embolism' | 2022-02-01 | 9746.0 | 0.769 | diff --git a/docs/index.md b/docs/index.md index e9d89ffb..2047d867 100644 --- a/docs/index.md +++ b/docs/index.md @@ -478,7 +478,7 @@ The working example above describes the canonical usage workflow. However, you w === "NoteProbe" - The [``NoteProbe``][edsteva.probes.note.NoteProbe] computes $c_{note}(t)$ the availability of clinical documents linked to patients' visits for each care site, stay type and note type according to time: + The [``NoteProbe``][edsteva.probes.note.NoteProbe] computes $c_{note}(t)$ the availability of clinical documents linked to patients' administrative visit for each care site, stay type and note type according to time: $$ c_{note}(t) = \frac{n_{with\,doc}(t)}{n_{visit}(t)} @@ -521,19 +521,22 @@ The working example above describes the canonical usage workflow. However, you w === "ConditionProbe" - The [``ConditionProbe``][edsteva.probes.condition.ConditionProbe] computes $c_{condition}(t)$ the availability of administrative data related to visits with at least one ICD-10 code recorded for each care site according to time: + The [``ConditionProbe``][edsteva.probes.condition.ConditionProbe] computes $c_{condition}(t)$ the availability of claim data in patients' administrative visit for each care site, stay type, diag type and condition type according to time: $$ - c_{condition}(t) = \frac{n_{condition}(t)}{n_{99}} + c_{condition}(t) = \frac{n_{with\,condition}(t)}{n_{visit}(t)} $$ - Where $n_{condition}(t)$ is the number of stays with at least one ICD-10 code recorded, $t$ is the month and $n_{99}$ is the $99^{th}$ percentile of $n_{condition}(t)$. 
+ Where $n_{visit}(t)$ is the number of administrative stays, $n_{with\,condition}$ the number of stays having at least one claim code (e.g. ICD-10) recorded and $t$ is the month. !!!info "" - If the $99^{th}$ percentile $n_{99}$ is equal to 0, we consider that the completeness predictor $c(t)$ is also equal to 0. + If the number of visits $n_{visit}(t)$ is equal to 0, we consider that the completeness predictor $c(t)$ is also equal to 0. + + !!!Warning "Care site level" + This probe is only available at hospital level. ```python - from edsteva.probes import VisitProbe + from edsteva.probes import ConditionProbe condition = ConditionProbe() condition.compute( @@ -554,13 +557,13 @@ The working example above describes the canonical usage workflow. However, you w condition.predictor.head() ``` - | care_site_level | care_site_id | care_site_short_name | stay_type | diag_type | condition_type | date | n_visit | c | - | :----------------------- | :----------- | :------------------- | :-------- | :-------- | :------------------- | :--------- | :------ | :---- | - | Unité Fonctionnelle (UF) | 8312056386 | Care site 1 | 'All' | 'All' | 'Pulmonary_embolism' | 2019-05-01 | 233.0 | 0.841 | - | Unité Fonctionnelle (UF) | 8312056386 | Care site 1 | 'All' | 'DP/DR' | 'Pulmonary_embolism' | 2021-04-01 | 393.0 | 0.640 | - | Pôle/DMU | 8312027648 | Care site 2 | 'Hospit' | 'All' | 'Pulmonary_embolism' | 2011-03-01 | 204.0 | 0.497 | - | Pôle/DMU | 8312027648 | Care site 2 | 'All' | 'All' | 'All' | 2018-08-01 | 22.0 | 0.274 | - | Hôpital | 8312022130 | Care site 3 | 'Hospit' | 'DP/DR' | 'Pulmonary_embolism' | 2022-02-01 | 9746.0 | 0.769 | + | care_site_level | care_site_id | care_site_short_name | stay_type | diag_type | condition_type | date | n_visit | c | + | :-------------- | :----------- | :------------------- | :-------- | :-------- | :------------------- | :--------- | :------ | :---- | + | Hôpital | 8312057527 | Care site 1 | 'All' | 'All' | 'Pulmonary_embolism' | 2019-05-01 | 
233.0 | 0.841 | + | Hôpital | 8312057527 | Care site 1 | 'All' | 'DP/DR' | 'Pulmonary_embolism' | 2021-04-01 | 393.0 | 0.640 | + | Hôpital | 8312027648 | Care site 2 | 'Hospit' | 'All' | 'Pulmonary_embolism' | 2011-03-01 | 204.0 | 0.497 | + | Hôpital | 8312027648 | Care site 2 | 'All' | 'All' | 'All' | 2018-08-01 | 22.0 | 0.274 | + | Hôpital | 8312022130 | Care site 3 | 'Hospit' | 'DP/DR' | 'Pulmonary_embolism' | 2022-02-01 | 9746.0 | 0.769 | === "Model" diff --git a/edsteva/io/hive.py b/edsteva/io/hive.py index 679f25a1..96a10eed 100644 --- a/edsteva/io/hive.py +++ b/edsteva/io/hive.py @@ -8,6 +8,8 @@ from pyspark.sql import SparkSession from pyspark.sql.types import LongType, StructField, StructType +from edsteva import koalas_options + from . import settings from .i2b2_mapping import get_i2b2_table @@ -100,6 +102,7 @@ def __init__( if spark_session is not None: self.spark_session = spark_session else: + koalas_options() logger.warning( """ To improve performances when using Spark and Koalas, please call `edsteva.improve_performances()` diff --git a/edsteva/probes/condition.py b/edsteva/probes/condition.py index b36505a6..f3562de0 100644 --- a/edsteva/probes/condition.py +++ b/edsteva/probes/condition.py @@ -2,17 +2,15 @@ from typing import Dict, List, Union import pandas as pd +from loguru import logger from edsteva.probes.base import BaseProbe from edsteva.probes.utils import ( CARE_SITE_LEVEL_NAMES, concatenate_predictor_by_level, - convert_table_to_pole, - convert_table_to_uf, hospital_only, prepare_care_site, prepare_condition_occurrence, - prepare_visit_detail, prepare_visit_occurrence, ) from edsteva.utils.checks import check_tables @@ -20,7 +18,7 @@ from edsteva.utils.typing import Data -def compute_completeness(visit_predictor): +def compute_completeness(condition_predictor): partition_cols = [ "care_site_level", @@ -31,47 +29,67 @@ def compute_completeness(visit_predictor): "condition_type", "date", ] - n_visit = ( - visit_predictor.groupby( + 
n_visit_with_condition = ( + condition_predictor.groupby( partition_cols, as_index=False, dropna=False, ) - .agg({"visit_id": "count"}) - .rename(columns={"visit_id": "n_visit"}) + .agg({"has_condition": "count"}) + .rename(columns={"has_condition": "n_visit_with_condition"}) ) + partition_cols = list(set(partition_cols) - {"diag_type", "condition_type"}) - n_visit = to("pandas", n_visit) - - partition_cols = list(set(partition_cols) - {"date"}) - q_99_visit = ( - n_visit.groupby( + n_visit = ( + condition_predictor.groupby( partition_cols, as_index=False, dropna=False, - )[["n_visit"]] - .quantile(q=0.99) - .rename(columns={"n_visit": "q_99_visit"}) + ) + .agg({"visit_id": "nunique"}) + .rename(columns={"visit_id": "n_visit"}) ) - visit_predictor = n_visit.merge( - q_99_visit, + condition_predictor = n_visit_with_condition.merge( + n_visit, on=partition_cols, ) - visit_predictor["c"] = visit_predictor["q_99_visit"].where( - visit_predictor["q_99_visit"] == 0, - visit_predictor["n_visit"] / visit_predictor["q_99_visit"], + condition_predictor = to("pandas", condition_predictor) + + condition_predictor["c"] = condition_predictor["n_visit"].where( + condition_predictor["n_visit"] == 0, + condition_predictor["n_visit_with_condition"] / condition_predictor["n_visit"], ) - visit_predictor = visit_predictor.drop(columns="q_99_visit") + condition_predictor = condition_predictor.drop(columns=["n_visit_with_condition"]) - return visit_predictor + return condition_predictor -def get_hospital_visit(condition_occurrence, care_site): - hospital_visit = condition_occurrence.rename( - columns={"visit_occurrence_id": "visit_id"} - ) +def get_hospital_visit(condition_occurrence, visit_occurrence, care_site, source): + # visit/condition linkage + if source == "AREM": + # Link with visit_occurrence_source_value + condition_hospital = condition_occurrence.drop_duplicates( + ["visit_occurrence_source_value", "diag_type", "condition_type"] + ) + condition_hospital["has_condition"] = 
True + hospital_visit = condition_hospital.merge( + visit_occurrence, + on="visit_occurrence_source_value", + how="left", + ).drop(columns="visit_occurrence_source_value") + else: + condition_hospital = condition_occurrence.drop_duplicates( + ["visit_occurrence_id", "diag_type", "condition_type"] + ) + condition_hospital["has_condition"] = True + hospital_visit = condition_hospital.merge( + visit_occurrence, + on="visit_occurrence_id", + how="left", + ) + hospital_visit = hospital_visit.rename(columns={"visit_occurrence_id": "visit_id"}) hospital_visit = hospital_visit.merge(care_site, on="care_site_id") if is_koalas(hospital_visit): @@ -80,53 +98,15 @@ def get_hospital_visit(condition_occurrence, care_site): return hospital_visit -def get_uf_visit(condition_occurrence, visit_detail, care_site, care_site_relationship): - visit_detail = visit_detail.merge( - condition_occurrence[ - ["visit_occurrence_id", "stay_type", "diag_type", "condition_type"] - ], - on="visit_occurrence_id", - ).drop(columns="visit_occurrence_id") - - uf_visit = convert_table_to_uf( - table=visit_detail, - table_name="visit_detail", - care_site_relationship=care_site_relationship, - ) - uf_visit = uf_visit.merge(care_site, on="care_site_id") - uf_visit = uf_visit[uf_visit["care_site_level"] == CARE_SITE_LEVEL_NAMES["UF"]] - if is_koalas(uf_visit): - uf_visit.spark.cache() - - return uf_visit - - -def get_pole_visit(uf_visit, care_site, care_site_relationship): - pole_visit = convert_table_to_pole( - table=uf_visit.drop(columns=["care_site_short_name", "care_site_level"]), - table_name="uf_visit", - care_site_relationship=care_site_relationship, - ) - - pole_visit = pole_visit.merge(care_site, on="care_site_id") - pole_visit = pole_visit[ - pole_visit["care_site_level"] == CARE_SITE_LEVEL_NAMES["Pole"] - ] - if is_koalas(pole_visit): - pole_visit.spark.cache() - - return pole_visit - - class ConditionProbe(BaseProbe): r""" - The ``ConditionProbe`` computes $c_(t)$ the availability of 
administrative data related to visits with at least one ICD-10 code recorded for each care site according to time: + The [``ConditionProbe``][edsteva.probes.condition.ConditionProbe] computes $c_{condition}(t)$ the availability of claim data in patients' administrative stay: $$ - c(t) = \frac{n_{condition}(t)}{n_{99}} + c_{condition}(t) = \frac{n_{with\,condition}(t)}{n_{visit}(t)} $$ - Where $n_{condition}(t)$ is the number of stays with at least one ICD-10 code recorded, $t$ is the month and $n_{99}$ is the $99^{th}$ percentile of $n_{condition}(t)$. + Where $n_{visit}(t)$ is the number of administrative stays, $n_{with\,condition}$ the number of stays having at least one claim code (e.g. ICD-10) recorded and $t$ is the month. Attributes ---------- @@ -190,16 +170,19 @@ def compute_process( check_tables(data=data, required_tables=["condition_occurrence"]) + if not hospital_only(care_site_levels=care_site_levels): + logger.warning("Claim data is only available at hospital level") + care_site_levels = None + visit_occurrence = prepare_visit_occurrence( - data, - start_date, - end_date, - stay_types, + data=data, + start_date=start_date, + end_date=end_date, + stay_types=stay_types, ) condition_occurrence = prepare_condition_occurrence( data=data, - visit_occurrence=visit_occurrence, extra_data=extra_data, source=source, diag_types=diag_types, @@ -207,42 +190,24 @@ def compute_process( ) care_site = prepare_care_site( - data, - care_site_ids, - care_site_short_names, - care_site_relationship, + data=data, + care_site_ids=care_site_ids, + care_site_short_names=care_site_short_names, + care_site_relationship=care_site_relationship, ) hospital_visit = get_hospital_visit( condition_occurrence, + visit_occurrence, care_site, + source, ) hospital_name = CARE_SITE_LEVEL_NAMES["Hospital"] - visit_predictor_by_level = {hospital_name: hospital_visit} + condition_predictor_by_level = {hospital_name: hospital_visit} - if not hospital_only(care_site_levels=care_site_levels): - 
visit_detail = prepare_visit_detail(data, start_date, end_date) - - uf_name = CARE_SITE_LEVEL_NAMES["UF"] - uf_visit = get_uf_visit( - condition_occurrence, - visit_detail, - care_site, - care_site_relationship, - ) - visit_predictor_by_level[uf_name] = uf_visit - - pole_name = CARE_SITE_LEVEL_NAMES["Pole"] - pole_visit = get_pole_visit( - uf_visit, - care_site, - care_site_relationship, - ) - visit_predictor_by_level[pole_name] = pole_visit - - visit_predictor = concatenate_predictor_by_level( - predictor_by_level=visit_predictor_by_level, + condition_predictor = concatenate_predictor_by_level( + predictor_by_level=condition_predictor_by_level, care_site_levels=care_site_levels, ) - return compute_completeness(visit_predictor) + return compute_completeness(condition_predictor) diff --git a/edsteva/probes/note.py b/edsteva/probes/note.py index 4bda746d..a0d5fdb3 100644 --- a/edsteva/probes/note.py +++ b/edsteva/probes/note.py @@ -33,6 +33,7 @@ def compute_completeness(note_predictor): note_predictor.groupby( partition_cols, as_index=False, + dropna=False, ) .agg({"has_note": "count"}) .rename(columns={"has_note": "n_visit_with_note"}) @@ -43,6 +44,7 @@ def compute_completeness(note_predictor): note_predictor.groupby( partition_cols, as_index=False, + dropna=False, ) .agg({"visit_id": "nunique"}) .rename(columns={"visit_id": "n_visit"}) @@ -172,7 +174,7 @@ def get_pole_visit(uf_visit, care_site, care_site_relationship): # pragma: no c class NoteProbe(BaseProbe): r""" - The ``NoteProbe`` computes $c(t)$ the availability of clinical documents linked to patients' visits: + The ``NoteProbe`` computes $c(t)$ the availability of clinical documents linked to patients' administrative visit: $$ c(t) = \frac{n_{with\,doc}(t)}{n_{visit}(t)} diff --git a/edsteva/probes/utils.py b/edsteva/probes/utils.py index f1361079..51dbafef 100644 --- a/edsteva/probes/utils.py +++ b/edsteva/probes/utils.py @@ -74,7 +74,6 @@ def prepare_visit_occurrence(data, start_date, end_date, 
stay_types): def prepare_condition_occurrence( data: Data, - visit_occurrence: DataFrame, extra_data: Data, source: str, diag_types: List[str], @@ -145,25 +144,6 @@ def prepare_condition_occurrence( name="condition_type", ) - # visit/condition linkage - if source == "AREM": - # Link with visit_occurrence_source_value - condition_occurrence = condition_occurrence.drop_duplicates( - ["visit_occurrence_source_value", "diag_type", "condition_type"] - ) - condition_occurrence = condition_occurrence.merge( - visit_occurrence, - on="visit_occurrence_source_value", - ).drop(columns="visit_occurrence_source_value") - else: - condition_occurrence = condition_occurrence.drop_duplicates( - ["visit_occurrence_id", "diag_type", "condition_type"] - ) - condition_occurrence = condition_occurrence.merge( - visit_occurrence, - on="visit_occurrence_id", - ) - return condition_occurrence diff --git a/edsteva/probes/visit.py b/edsteva/probes/visit.py index 0ca3ffbf..918f43d0 100644 --- a/edsteva/probes/visit.py +++ b/edsteva/probes/visit.py @@ -33,7 +33,7 @@ def compute_completeness(visit_predictor): as_index=False, dropna=False, ) - .agg({"visit_id": "count"}) + .agg({"visit_id": "nunique"}) .rename(columns={"visit_id": "n_visit"}) )