diff --git a/edsteva/io/synthetic/biology.py b/edsteva/io/synthetic/biology.py index cc50c8b..4e47739 100644 --- a/edsteva/io/synthetic/biology.py +++ b/edsteva/io/synthetic/biology.py @@ -2,23 +2,14 @@ import pandas as pd from loguru import logger -from edsteva.io.synthetic.utils import ( - generate_events_after_t0, - generate_events_after_t1, - generate_events_around_t0, - generate_events_around_t1, - generate_events_before_t0, -) - def generate_bio( generator: np.random.Generator, - t_start: int, - t_end: int, - n_events: int, - increase_time: int, - increase_ratio: float, + visit_care_site, + t0_visit, + date_col: str, bio_date_col: str, + id_visit_col, unit: str, concept_code: str, mode: str, @@ -26,24 +17,22 @@ def generate_bio( if mode == "step": return _generate_bio_step( generator=generator, - t_start=t_start, - t_end=t_end, - n_events=n_events, - increase_time=increase_time, - increase_ratio=increase_ratio, + visit_care_site=visit_care_site, + t0_visit=t0_visit, + date_col=date_col, bio_date_col=bio_date_col, + id_visit_col=id_visit_col, unit=unit, concept_code=concept_code, ) if mode == "rect": return _generate_bio_rect( generator=generator, - t_start=t_start, - t_end=t_end, - n_events=n_events, - increase_time=increase_time, - increase_ratio=increase_ratio, + visit_care_site=visit_care_site, + t0_visit=t0_visit, + date_col=date_col, bio_date_col=bio_date_col, + id_visit_col=id_visit_col, unit=unit, concept_code=concept_code, ) @@ -51,101 +40,104 @@ def generate_bio( def _generate_bio_step( generator: np.random.Generator, - t_start: int, - t_end: int, - n_events: int, - increase_time: int, - increase_ratio: float, + visit_care_site, + t0_visit, + date_col: str, bio_date_col: str, + id_visit_col, unit: str, concept_code: str, ): - t0 = generator.integers(t_start + increase_time, t_end - increase_time) - params = dict( - generator=generator, - t_start=t_start, - t_end=t_end, - n_events=n_events, - t0=t0, - increase_ratio=increase_ratio, - increase_time=increase_time, + t_end = visit_care_site[date_col].max() + t0 = generator.integers(t0_visit, t_end) + c_before = generator.uniform(0, 0.01) + c_after = generator.uniform(0.8, 1) + + measurement_before_t0_visit = ( + visit_care_site[visit_care_site[date_col] <= t0_visit][[id_visit_col, date_col]] + .sample(frac=c_before) + .rename(columns={date_col: bio_date_col}) ) - df = pd.concat( - [ - generate_events_before_t0(**params), - generate_events_after_t0(**params), - generate_events_around_t0(**params), - ] - ).to_frame() - df.columns = [bio_date_col] - df["unit_source_value"] = unit - df["measurement_source_concept_id"] = concept_code - df["t_0_min"] = t0 - increase_time / 2 - df["t_0_max"] = t0 + increase_time / 2 - logger.debug("Generate measurement deploying as step function") + # Stratify visit between t0_visit and t0 to + # ensure that these elements are represented + # in the final measurements dataset. + + measurement_before_t0 = ( + visit_care_site[ + (visit_care_site[date_col] <= t0) & (visit_care_site[date_col] > t0_visit) + ][[id_visit_col, date_col]] + .sample(frac=c_before) + .rename(columns={date_col: bio_date_col}) + ) + + measurement_after_t0 = ( + visit_care_site[visit_care_site[date_col] > t0][[id_visit_col, date_col]] + .sample(frac=c_after) + .rename(columns={date_col: bio_date_col}) + ) + + measurement = pd.concat( + [measurement_before_t0_visit, measurement_before_t0, measurement_after_t0] + ) + + measurement[bio_date_col] = pd.to_datetime(measurement[bio_date_col], unit="s") + measurement["unit_source_value"] = unit + measurement["measurement_source_concept_id"] = concept_code + measurement["t_0"] = t0 + + logger.debug("Generate synthetic measurement deploying as step function") - return df + return measurement def _generate_bio_rect( generator: np.random.Generator, - t_start: int, - t_end: int, - n_events: int, - increase_time: int, - increase_ratio: float, + visit_care_site, + t0_visit, + date_col: str, bio_date_col: str, + id_visit_col, unit: str, concept_code: str, ): - t0 = generator.integers( - t_start + increase_time, (t_end + t_start) / 2 - increase_time - ) - t1 = generator.integers( - (t_end + t_start) / 2 + increase_time, t_end - increase_time + t1_visit = visit_care_site["t_1_min"].max() + t0 = generator.integers(t0_visit, t0_visit + (t1_visit - t0_visit) / 3) + t1 = generator.integers(t0_visit + 2 * (t1_visit - t0_visit) / 3, t1_visit) + c_out = generator.uniform(0, 0.1) + c_in = generator.uniform(0.8, 1) + + measurement_before_t0 = ( + visit_care_site[visit_care_site[date_col] <= t0][[id_visit_col, date_col]] + .sample(frac=c_out) + .rename(columns={date_col: bio_date_col}) ) - t0_params = dict( - generator=generator, - t_start=t_start, - t_end=t1 - increase_time / 2, - n_events=n_events, - t0=t0, - increase_ratio=increase_ratio, - increase_time=increase_time, + measurement_between_t0_t1 = ( + visit_care_site[ + (visit_care_site[date_col] > t0) & (visit_care_site[date_col] <= t1) + ][[id_visit_col, date_col]] + .sample(frac=c_in) + .rename(columns={date_col: bio_date_col}) ) - before_t0 = generate_events_before_t0(**t0_params) - around_t0 = generate_events_around_t0(**t0_params) - # Raise n_visit to enforce a rectangle shape - between_t0_t1 = generate_events_after_t0(**t0_params) - t1_params = dict( - generator=generator, - t_start=t_start, - t_end=t_end, - n_events=n_events, - t1=t1, - increase_time=increase_time, - increase_ratio=increase_ratio, + + measurement_after_t1 = ( + visit_care_site[(visit_care_site[date_col] > t1)][[id_visit_col, date_col]] + .sample(frac=c_out) + .rename(columns={date_col: bio_date_col}) ) - around_t1 = generate_events_around_t1(**t1_params) - after_t1 = generate_events_after_t1(**t1_params) - df = pd.concat( + measurement = pd.concat( [ - before_t0, - around_t0, - between_t0_t1, - around_t1, - after_t1, + measurement_before_t0, + measurement_between_t0_t1, + measurement_after_t1, ] - ).to_frame() - - df.columns = [bio_date_col] - df["unit_source_value"] = unit - df["measurement_source_concept_id"] = concept_code - df["t_0_min"] = t0 - increase_time / 2 - df["t_0_max"] = t0 + increase_time / 2 - df["t_1_min"] = t1 - increase_time / 2 - df["t_1_max"] = t1 + increase_time / 2 - logger.debug("Generate measurement deploying as rectangle function") - - return df + ) + + measurement[bio_date_col] = pd.to_datetime(measurement[bio_date_col], unit="s") + measurement["unit_source_value"] = unit + measurement["measurement_source_concept_id"] = concept_code + measurement["t_0"] = t0 + measurement["t_1"] = t1 + logger.debug("Generate synthetic measurement deploying as rectangle function") + + return measurement diff --git a/edsteva/io/synthetic/synthetic.py b/edsteva/io/synthetic/synthetic.py index 2d2d847..d33b645 100644 --- a/edsteva/io/synthetic/synthetic.py +++ b/edsteva/io/synthetic/synthetic.py @@ -616,8 +616,8 @@ def _generate_measurement( mean_measurement: int = 1000, units: List[str] = ["g", "g/l", "mol", "s"], ): - t_min = self.t_min.timestamp() - t_max = self.t_max.timestamp() + self.t_min.timestamp() + self.t_max.timestamp() measurements = [] visit_occurrence = visit_occurrence.sample(frac=0.9) for concept_name in src_concept_name: @@ -626,50 +626,37 @@ def _generate_measurement( mean_value = (1 + units.index(unit)) * 2 std_value = 1 for care_site_id in hospital_ids: - t_start = t_min + self.generator.integers(0, (t_max - t_min) / 20) - t_end = t_max - self.generator.integers(0, (t_max - t_min) / 20) - valid_measurements = int( - self.generator.normal(mean_measurement, mean_measurement / 5) - ) - missing_value = int(self.generator.uniform(1, valid_measurements / 10)) - n_measurements = valid_measurements + missing_value - increase_time = self.generator.integers( - (t_end - t_start) / 100, (t_end - t_start) / 10 + visit_care_site = visit_occurrence[ + visit_occurrence.care_site_id == care_site_id + ].reset_index(drop=True) + visit_care_site[self.date_col] = ( + visit_care_site[self.date_col].view("int64") // 10**9 ) - increase_ratio = self.generator.uniform(150, 200) - concept_code = concept_name.split("_")[1] - unit = concept_name.split("_")[-1] - mean_value = (1 + units.index(unit)) * 2 - std_value = 1 + + t0_visit = visit_care_site["t_0_max"].max() params = dict( generator=self.generator, - t_start=t_start, - t_end=t_end, - n_events=n_measurements, - increase_ratio=increase_ratio, - increase_time=increase_time, + visit_care_site=visit_care_site, + date_col=self.date_col, bio_date_col=self.bio_date_col, + id_visit_col=self.id_visit_col, unit=unit, concept_code=concept_code, + t0_visit=t0_visit, mode=self.mode, ) + measurement = generate_bio(**params) - visit_care_site = visit_occurrence[ - visit_occurrence.care_site_id == care_site_id - ] - measurement[self.id_visit_col] = ( - visit_care_site[self.id_visit_col] - .sample( - n=measurement.shape[0], - replace=True, - ) - .reset_index(drop=True) + + measurement["value_as_number"] = self.generator.normal( + mean_value, std_value, measurement.shape[0] ) - measurement["value_as_number"] = [None] * missing_value + list( - self.generator.normal( - mean_value, std_value, measurement.shape[0] - missing_value - ) + + valid_measurements = ( + self.generator.uniform(0, 1, measurement.shape[0]) > 0.01 ) + measurement.loc[~valid_measurements, "value_as_number"] = None + measurements.append(measurement) measurements = pd.concat(measurements).reset_index(drop=True) diff --git a/edsteva/io/synthetic/utils.py b/edsteva/io/synthetic/utils.py index 05d4980..7e70d05 100644 --- a/edsteva/io/synthetic/utils.py +++ b/edsteva/io/synthetic/utils.py @@ -11,6 +11,30 @@ def generate_events_before_t0( increase_time: int, increase_ratio: float, ): + """Generate events before t0 - increase_time / 2 + + Parameters + ---------- + generator : np.random.Generator + t_start : int + Starting date in seconds + t_end : int + Ending date in seconds + n_events : int + Number of events to generate + t0 : int + Events deployment date + increase_time : int + Events deployment interval in seconds + increase_ratio : float + Ratio between events before t0 and events after t0 + + Returns + ------- + pd.Series + A series of datetime values representing generated events + """ + t0_before = t0 - increase_time / 2 n_before = int( (t0_before - t_start) @@ -33,6 +57,30 @@ def generate_events_after_t0( increase_time: int, increase_ratio: float, ): + """Generate events after t0 + increase_time / 2 + + Parameters + ---------- + generator : np.random.Generator + t_start : int + Starting date in seconds + t_end : int + Ending date in seconds + n_events : int + Number of events to generate + t0 : int + Events deployment date + increase_time : int + Events deployment interval in seconds + increase_ratio : float + Ratio between events before t0 and events after t0 + + Returns + ------- + pd.Series + A series of datetime values representing generated events + """ + t0_after = t0 + increase_time / 2 n_after = int( increase_ratio @@ -56,6 +104,29 @@ def generate_events_around_t0( increase_time: int, increase_ratio: float, ): + """Generate events between t0 - increase_time / 2 and t0 + increase_time / 2 + + Parameters + ---------- + generator : np.random.Generator + t_start : int + Starting date in seconds + t_end : int + Ending date in seconds + n_events : int + Number of events to generate + t0 : int + Events deployment date + increase_time : int + Events deployment interval in seconds + increase_ratio : float + Ratio between events before t0 and events after t0 + + Returns + ------- + pd.Series + A series of datetime values representing generated events + """ t0_before = t0 - increase_time / 2 t0_after = t0 + increase_time / 2 n_middle = int( @@ -84,6 +155,29 @@ def generate_events_around_t1( increase_time: int, increase_ratio: float, ): + """Generate events between t1 - increase_time / 2 and t1 + increase_time / 2 + + Parameters + ---------- + generator : np.random.Generator + t_start : int + Starting date in seconds + t_end : int + Ending date in seconds + n_events : int + Number of events to generate + t1 : int + End of events deployment date + increase_time : int + End of events deployment interval in seconds + increase_ratio : float + Ratio between events before t1 and events after t1 + + Returns + ------- + pd.Series + A series of datetime values representing generated events + """ t1_before = t1 - increase_time / 2 t1_after = t1 + increase_time / 2 n_middle = int( @@ -112,6 +206,30 @@ def generate_events_after_t1( increase_time: int, increase_ratio: float, ): + """Generate events after t1 + increase_time / 2 + + Parameters + ---------- + generator : np.random.Generator + t_start : int + Starting date in seconds + t_end : int + Ending date in seconds + n_events : int + Number of events to generate + t1 : int + End of events deployment date + increase_time : int + End of events deployment interval in seconds + increase_ratio : float + Ratio between events before t1 and events after t1 + + Returns + ------- + pd.Series + A series of datetime values representing generated events + """ + t1_after = t1 + increase_time / 2 n_after = int( (t_end - t1_after) * n_events / ((t1 - t_start) * increase_ratio + (t_end - t1)) diff --git a/tests/test_model.py b/tests/test_model.py index f277424..88cf348 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -7,7 +7,7 @@ from edsteva.io import SyntheticData from edsteva.models.rectangle_function import RectangleFunction from edsteva.models.step_function import StepFunction -from edsteva.probes import NoteProbe, VisitProbe +from edsteva.probes import BiologyProbe, NoteProbe, VisitProbe from edsteva.utils.loss_functions import l1_loss pytestmark = pytest.mark.filterwarnings("ignore") @@ -193,3 +193,56 @@ def test_step_function_note(): (prediction["t_0"] <= prediction["t_0_max"]) & (prediction["t_0_min"] <= prediction["t_0"]) ).all() + + +def test_step_function_biology(): + biology = BiologyProbe(completeness_predictor="per_visit_default") + biology.compute( + data=data_step, + start_date=data_step.t_min, + end_date=data_step.t_max, + stay_types={"ALL": ".*", "HC": "hospitalisés", "Urg": "urgences"}, + care_site_ids=["1", "2"], + care_site_short_names=["Hôpital-1", "Hôpital-2"], + concepts_sets=None, + concept_codes=True, + length_of_stays=None, + ) + + biology_model = StepFunction() + biology_model.fit( + probe=biology, + start_date=data_step.t_min, + end_date=data_step.t_max, + ) + + simulation = data_step.measurement.merge( + data_step.visit_occurrence, on="visit_occurrence_id" + ) + simulation["ANABIO_concept_code"] = ( + simulation["measurement_source_concept_id"] + .str.findall(r"\b[A-Z]\d{4}\b") + .str[0] + ) + + simulation = simulation.groupby( + ["ANABIO_concept_code", "care_site_id"], as_index=False + )[["t_0"]].min() + simulation.t_0 = pd.to_datetime(simulation.t_0, unit="s") + + biology_model = biology_model.estimates.merge( + simulation, + on=["ANABIO_concept_code", "care_site_id"], + suffixes=("_model", "_simulation"), + ) + + assert ( + ( + biology_model.t_0_model + <= biology_model.t_0_simulation + pd.DateOffset(months=2) + ) + & ( + biology_model.t_0_model + > biology_model.t_0_simulation - pd.DateOffset(months=2) + ) + ).all()