Skip to content

Commit

Permalink
Biology pipeline testing (#27)
Browse files Browse the repository at this point in the history
Biology pipeline testing

---------

Co-authored-by: Adam <adam.remaki@aphp.fr>
  • Loading branch information
svittoz and Aremaki authored Sep 14, 2023
1 parent ae4ab63 commit 0585222
Show file tree
Hide file tree
Showing 4 changed files with 287 additions and 137 deletions.
194 changes: 93 additions & 101 deletions edsteva/io/synthetic/biology.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,150 +2,142 @@
import pandas as pd
from loguru import logger

from edsteva.io.synthetic.utils import (
generate_events_after_t0,
generate_events_after_t1,
generate_events_around_t0,
generate_events_around_t1,
generate_events_before_t0,
)


def generate_bio(
    generator: np.random.Generator,
    visit_care_site,
    t0_visit,
    date_col: str,
    bio_date_col: str,
    id_visit_col,
    unit: str,
    concept_code: str,
    mode: str,
):
    """Dispatch synthetic measurement generation to the requested deployment shape.

    Parameters
    ----------
    generator : np.random.Generator
        Random generator forwarded to the shape-specific helper.
    visit_care_site : DataFrame
        Visits of a single care site; measurements are sampled from its rows.
    t0_visit
        Deployment date of the visits, forwarded unchanged to the helper.
    date_col : str
        Name of the visit date column in ``visit_care_site``.
    bio_date_col : str
        Name given to the measurement date column in the output.
    id_visit_col
        Name of the visit identifier column in ``visit_care_site``.
    unit : str
        Value written to the ``unit_source_value`` column.
    concept_code : str
        Value written to the ``measurement_source_concept_id`` column.
    mode : str
        ``"step"`` or ``"rect"``.

    Returns
    -------
    DataFrame
        Whatever the selected helper returns.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"step"`` nor ``"rect"``. Previously the
        function fell through and returned ``None`` without any signal.
    """
    # Validate first so that a bad mode fails before any data is touched.
    if mode not in ("step", "rect"):
        raise ValueError(
            f"Unknown mode {mode!r}, options are ('step', 'rect')"
        )

    params = dict(
        generator=generator,
        visit_care_site=visit_care_site,
        t0_visit=t0_visit,
        date_col=date_col,
        bio_date_col=bio_date_col,
        id_visit_col=id_visit_col,
        unit=unit,
        concept_code=concept_code,
    )
    if mode == "step":
        return _generate_bio_step(**params)
    return _generate_bio_rect(**params)


def _generate_bio_step(
    generator: np.random.Generator,
    visit_care_site,
    t0_visit,
    date_col: str,
    bio_date_col: str,
    id_visit_col,
    unit: str,
    concept_code: str,
):
    """Sample measurements from visits so that completeness follows a step.

    A deployment date ``t0`` is drawn between ``t0_visit`` and the latest
    visit date. Visits dated at or before ``t0`` are kept with a small
    probability (``c_before``, drawn in [0, 0.01)); visits dated after ``t0``
    are kept with a high probability (``c_after``, drawn in [0.8, 1)).

    Parameters
    ----------
    generator : np.random.Generator
        Source of randomness for ``t0``, ``c_before`` and ``c_after``.
    visit_care_site : DataFrame
        Visits to sample from; must contain ``id_visit_col`` and ``date_col``.
        ``date_col`` is assumed to hold epoch seconds (it is converted with
        ``unit="s"`` below) -- confirm against the caller.
    t0_visit
        Lower bound for ``t0``.
    date_col : str
        Visit date column name.
    bio_date_col : str
        Name of the measurement date column in the result.
    id_visit_col
        Visit identifier column name.
    unit : str
        Written to ``unit_source_value``.
    concept_code : str
        Written to ``measurement_source_concept_id``.

    Returns
    -------
    DataFrame
        Columns ``id_visit_col``, ``bio_date_col`` (datetime),
        ``unit_source_value``, ``measurement_source_concept_id`` and ``t_0``
        (the raw integer drawn for ``t0``, not converted to datetime).
    """
    t_end = visit_care_site[date_col].max()
    t0 = generator.integers(t0_visit, t_end)
    c_before = generator.uniform(0, 0.01)
    c_after = generator.uniform(0.8, 1)

    # NOTE(review): DataFrame.sample is called without random_state, so row
    # selection draws from the global NumPy RNG rather than from `generator`.
    measurement_before_t0_visit = (
        visit_care_site[visit_care_site[date_col] <= t0_visit][[id_visit_col, date_col]]
        .sample(frac=c_before)
        .rename(columns={date_col: bio_date_col})
    )
    # Stratify visit between t0_visit and t0 to
    # ensure that these elements are represented
    # in the final measurements dataset.

    measurement_before_t0 = (
        visit_care_site[
            (visit_care_site[date_col] <= t0) & (visit_care_site[date_col] > t0_visit)
        ][[id_visit_col, date_col]]
        .sample(frac=c_before)
        .rename(columns={date_col: bio_date_col})
    )

    measurement_after_t0 = (
        visit_care_site[visit_care_site[date_col] > t0][[id_visit_col, date_col]]
        .sample(frac=c_after)
        .rename(columns={date_col: bio_date_col})
    )

    measurement = pd.concat(
        [measurement_before_t0_visit, measurement_before_t0, measurement_after_t0]
    )

    measurement[bio_date_col] = pd.to_datetime(measurement[bio_date_col], unit="s")
    measurement["unit_source_value"] = unit
    measurement["measurement_source_concept_id"] = concept_code
    measurement["t_0"] = t0

    logger.debug("Generate synthetic measurement deploying as step function")

    return measurement


def _generate_bio_rect(
    generator: np.random.Generator,
    visit_care_site,
    t0_visit,
    date_col: str,
    bio_date_col: str,
    id_visit_col,
    unit: str,
    concept_code: str,
):
    """Sample measurements from visits so that completeness follows a rectangle.

    Two dates are drawn: ``t0`` in the first third and ``t1`` in the last
    third of the interval ``[t0_visit, t1_visit)``, where ``t1_visit`` is the
    maximum of the ``"t_1_min"`` column of ``visit_care_site``. Visits dated
    in ``(t0, t1]`` are kept with a high probability (``c_in``, drawn in
    [0.8, 1)); visits outside that window are kept with a low probability
    (``c_out``, drawn in [0, 0.1)).

    Parameters
    ----------
    generator : np.random.Generator
        Source of randomness for ``t0``, ``t1``, ``c_out`` and ``c_in``.
    visit_care_site : DataFrame
        Visits to sample from; must contain ``id_visit_col``, ``date_col``
        and a ``"t_1_min"`` column. ``date_col`` is assumed to hold epoch
        seconds (it is converted with ``unit="s"`` below) -- confirm against
        the caller.
    t0_visit
        Lower bound of the interval in which ``t0`` and ``t1`` are drawn.
    date_col : str
        Visit date column name.
    bio_date_col : str
        Name of the measurement date column in the result.
    id_visit_col
        Visit identifier column name.
    unit : str
        Written to ``unit_source_value``.
    concept_code : str
        Written to ``measurement_source_concept_id``.

    Returns
    -------
    DataFrame
        Columns ``id_visit_col``, ``bio_date_col`` (datetime),
        ``unit_source_value``, ``measurement_source_concept_id``, ``t_0`` and
        ``t_1`` (the raw integers drawn, not converted to datetime).
    """
    t1_visit = visit_care_site["t_1_min"].max()
    # The bounds are computed with true division and handed to
    # Generator.integers as-is.
    t0 = generator.integers(t0_visit, t0_visit + (t1_visit - t0_visit) / 3)
    t1 = generator.integers(t0_visit + 2 * (t1_visit - t0_visit) / 3, t1_visit)
    c_out = generator.uniform(0, 0.1)
    c_in = generator.uniform(0.8, 1)

    # NOTE(review): DataFrame.sample is called without random_state, so row
    # selection draws from the global NumPy RNG rather than from `generator`.
    measurement_before_t0 = (
        visit_care_site[visit_care_site[date_col] <= t0][[id_visit_col, date_col]]
        .sample(frac=c_out)
        .rename(columns={date_col: bio_date_col})
    )
    measurement_between_t0_t1 = (
        visit_care_site[
            (visit_care_site[date_col] > t0) & (visit_care_site[date_col] <= t1)
        ][[id_visit_col, date_col]]
        .sample(frac=c_in)
        .rename(columns={date_col: bio_date_col})
    )

    measurement_after_t1 = (
        visit_care_site[(visit_care_site[date_col] > t1)][[id_visit_col, date_col]]
        .sample(frac=c_out)
        .rename(columns={date_col: bio_date_col})
    )

    measurement = pd.concat(
        [
            measurement_before_t0,
            measurement_between_t0_t1,
            measurement_after_t1,
        ]
    )

    measurement[bio_date_col] = pd.to_datetime(measurement[bio_date_col], unit="s")
    measurement["unit_source_value"] = unit
    measurement["measurement_source_concept_id"] = concept_code
    measurement["t_0"] = t0
    measurement["t_1"] = t1
    logger.debug("Generate synthetic measurement deploying as rectangle function")

    return measurement
57 changes: 22 additions & 35 deletions edsteva/io/synthetic/synthetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,8 +616,8 @@ def _generate_measurement(
mean_measurement: int = 1000,
units: List[str] = ["g", "g/l", "mol", "s"],
):
t_min = self.t_min.timestamp()
t_max = self.t_max.timestamp()
self.t_min.timestamp()
self.t_max.timestamp()
measurements = []
visit_occurrence = visit_occurrence.sample(frac=0.9)
for concept_name in src_concept_name:
Expand All @@ -626,50 +626,37 @@ def _generate_measurement(
mean_value = (1 + units.index(unit)) * 2
std_value = 1
for care_site_id in hospital_ids:
t_start = t_min + self.generator.integers(0, (t_max - t_min) / 20)
t_end = t_max - self.generator.integers(0, (t_max - t_min) / 20)
valid_measurements = int(
self.generator.normal(mean_measurement, mean_measurement / 5)
)
missing_value = int(self.generator.uniform(1, valid_measurements / 10))
n_measurements = valid_measurements + missing_value
increase_time = self.generator.integers(
(t_end - t_start) / 100, (t_end - t_start) / 10
visit_care_site = visit_occurrence[
visit_occurrence.care_site_id == care_site_id
].reset_index(drop=True)
visit_care_site[self.date_col] = (
visit_care_site[self.date_col].view("int64") // 10**9
)
increase_ratio = self.generator.uniform(150, 200)
concept_code = concept_name.split("_")[1]
unit = concept_name.split("_")[-1]
mean_value = (1 + units.index(unit)) * 2
std_value = 1

t0_visit = visit_care_site["t_0_max"].max()
params = dict(
generator=self.generator,
t_start=t_start,
t_end=t_end,
n_events=n_measurements,
increase_ratio=increase_ratio,
increase_time=increase_time,
visit_care_site=visit_care_site,
date_col=self.date_col,
bio_date_col=self.bio_date_col,
id_visit_col=self.id_visit_col,
unit=unit,
concept_code=concept_code,
t0_visit=t0_visit,
mode=self.mode,
)

measurement = generate_bio(**params)
visit_care_site = visit_occurrence[
visit_occurrence.care_site_id == care_site_id
]
measurement[self.id_visit_col] = (
visit_care_site[self.id_visit_col]
.sample(
n=measurement.shape[0],
replace=True,
)
.reset_index(drop=True)

measurement["value_as_number"] = self.generator.normal(
mean_value, std_value, measurement.shape[0]
)
measurement["value_as_number"] = [None] * missing_value + list(
self.generator.normal(
mean_value, std_value, measurement.shape[0] - missing_value
)

valid_measurements = (
self.generator.uniform(0, 1, measurement.shape[0]) > 0.01
)
measurement.loc[~valid_measurements, "value_as_number"] = None

measurements.append(measurement)

measurements = pd.concat(measurements).reset_index(drop=True)
Expand Down
Loading

0 comments on commit 0585222

Please sign in to comment.