Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate chain of events for individuals #1468

Draft
wants to merge 25 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
dbff470
Investigate analysis of events at sim level
marghe-molaro Apr 3, 2024
bf64628
Merge branch 'master' into molaro/harvest-training-data
marghe-molaro Sep 17, 2024
05098f7
Final data-printing set-up
marghe-molaro Sep 30, 2024
16c071c
Print event chains
marghe-molaro Oct 2, 2024
ba81487
Add chains in mode 2 too and clean up in simuation
marghe-molaro Oct 2, 2024
0474624
Merged with master, and moved all logging into event module to keep t…
marghe-molaro Oct 2, 2024
b1c907c
Fix issue with tests by ensuring standard Polling and infection is ma…
marghe-molaro Oct 7, 2024
cfb4264
Switch iloc for loc
marghe-molaro Oct 7, 2024
e0327de
Change syntax of if statement
marghe-molaro Oct 7, 2024
fceee02
Change syntax of if statement and print string of event
marghe-molaro Oct 9, 2024
eaeae62
Focus on rti and print footprint
marghe-molaro Oct 10, 2024
c7bd9d0
Only store change in individual properties, not entire property row. …
marghe-molaro Oct 11, 2024
769aaec
Style fixes
marghe-molaro Oct 11, 2024
757cee3
Include printing of individual properties at the beginning and at bir…
marghe-molaro Oct 13, 2024
22a5e44
Log everything to simulation, as events logger doesn't seem to be vis…
marghe-molaro Oct 16, 2024
7faa817
Consider all modules included as of interest
marghe-molaro Oct 18, 2024
7232f97
Remove pop-wide HSI warning and make epi default even when printing c…
marghe-molaro Oct 18, 2024
98a8832
Merge branch 'master' into molaro/harvest-training-data
marghe-molaro Oct 18, 2024
a6def2d
Style fix
marghe-molaro Oct 18, 2024
ecea532
Remove data generation test, which wasn't really a test
marghe-molaro Oct 18, 2024
ae7a44c
Change dict of properties to string in logging, and add analysis files
marghe-molaro Oct 23, 2024
16299a2
Include debugging option, final set-up of scenario to print data, ana…
marghe-molaro Nov 25, 2024
0dd862f
Change label of person when iterating
marghe-molaro Nov 26, 2024
0e7dc99
Merge branch 'master' into molaro/harvest-training-data
marghe-molaro Dec 9, 2024
84f8263
Correctly retrieve event name
marghe-molaro Dec 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
513 changes: 513 additions & 0 deletions src/scripts/analysis_data_generation/analysis_extract_data.py

Large diffs are not rendered by default.

156 changes: 156 additions & 0 deletions src/scripts/analysis_data_generation/postprocess_events_chain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import pandas as pd
from dateutil.relativedelta import relativedelta

# Remove from every individual's event chain all events that were fired after death
def cut_off_events_after_death(df):
    """Drop, for every individual, all events recorded after their death.

    Rows are grouped by ``person_ID``. Within each group everything after the first row
    that carries a non-missing ``date_of_death`` is discarded (that row itself is kept).
    Groups with no recorded death are kept whole.

    :param df: DataFrame of event chains holding at least the columns ``person_ID`` and
        ``date_of_death``.
    :return: DataFrame with the retained rows, ordered by ``person_ID`` group and keeping
        the original index labels and columns. An input without any group yields an
        empty frame that still has the input's columns.
    """
    retained = []

    for _, person_events in df.groupby('person_ID'):
        death_recorded = person_events['date_of_death'].notna().to_numpy()

        if death_recorded.any():
            # argmax returns the position of the first True. Slicing by position (rather
            # than by index label) stays correct even when index labels are repeated.
            first_death_pos = int(death_recorded.argmax())
            retained.append(person_events.iloc[:first_death_pos + 1])
        else:
            # No death recorded for this individual: keep the full chain
            retained.append(person_events)

    if not retained:
        return df.iloc[0:0]

    # Concatenate once at the end: growing a frame inside the loop copies all previously
    # collected rows at every iteration.
    return pd.concat(retained)

# Load into DataFrame
def load_csv_to_dataframe(file_path):
    """Load the raw event chains stored in a CSV file.

    :param file_path: Path of the CSV file to read.
    :return: DataFrame with the contents of the file.
    :raises FileNotFoundError: If ``file_path`` does not exist.
    :raises Exception: Any other error raised while reading is reported and re-raised.
    """
    try:
        # Load raw chains into df
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        # Re-raise: silently returning None only postpones the failure to the first use
        # of the result, where the real cause is no longer visible.
        raise
    except Exception as e:
        print(f"An error occurred: {e}")
        raise

    print("Raw event chains loaded successfully!")
    return df

file_path = 'output.csv'  # Replace with the path to your CSV file

output = load_csv_to_dataframe(file_path)
if output is None:
    # Nothing was loaded: stop here rather than failing below with an opaque
    # "'NoneType' object is not subscriptable" error.
    raise SystemExit(-1)

# Some of the dates appeared not to be in datetime format. Correct here.
# Unparseable entries become NaT (errors='coerce').
for date_column in ('date_of_death', 'date_of_birth'):
    output[date_column] = pd.to_datetime(output[date_column], errors='coerce')
# The HIV infection date is only converted when the column is present
if 'hv_date_inf' in output.columns:
    output['hv_date_inf'] = pd.to_datetime(output['hv_date_inf'], errors='coerce')


date_start = pd.to_datetime('2010-01-01')
if 'Other' in output['cause_of_death'].values:
    print("ERROR: 'Other' was included in sim as possible cause of death")
    # SystemExit is raised directly: the `exit` helper is installed by the `site` module
    # for interactive use and is not guaranteed to exist in every interpreter set-up.
    raise SystemExit(-1)

# Individual-property columns shown when printing the selected transitions
columns_to_print = [
    'event', 'is_alive', 'hv_inf', 'hv_art', 'tb_inf', 'tb_date_active', 'event_date', 'when',
]
# Alternative selection, kept for reference:
#columns_to_print =['person_ID', 'date_of_birth', 'date_of_death', 'cause_of_death','hv_date_inf', 'hv_art','tb_inf', 'tb_date_active', 'event date', 'event']

# Columns ignored when looking for *any* change in an individual's properties between rows
columns_to_exclude_in_comparison = [
    'when', 'event', 'event_date',
    'age_exact_years', 'age_years', 'age_days', 'age_range',
    'level', 'appt_footprint',
]

# Optional epidemiological sanity check (switched off by default): report infections whose
# age at infection, in completed years, lies strictly between 1 and 15.
check_ages_of_those_HIV_inf = False
if check_ages_of_those_HIV_inf:
    for row_label, record in output.iterrows():
        if not pd.isna(record['hv_date_inf']):
            age_at_infection = relativedelta(
                output.loc[row_label, 'hv_date_inf'],
                output.loc[row_label, 'date_of_birth'],
            )
            if 1 < age_at_infection.years < 15:
                print("Person contracted HIV infection at age younger than 15", age_at_infection)

# Truncate every individual's chain at the first row that records a date of death
filtered_data = cut_off_events_after_death(output)

# Switches controlling what is printed for each individual
print_raw_events = True  # Print raw chain of events for each individual
print_selected_changes = False
print_all_changes = True
person_ID_of_interest = 494

# Never abbreviate printed frames, however many rows they have
pd.options.display.max_rows = None

# Transitions reported when `print_selected_changes` is on, as (column, value before, value after)
SELECTED_TRANSITIONS = [
    ('tb_inf', 'uninfected', 'active'),
    ('tb_inf', 'latent', 'active'),
    ('tb_inf', 'active', 'latent'),
    ('hv_inf', False, True),
    ('hv_art', 'not', 'on_not_VL_suppressed'),
    ('hv_art', 'not', 'on_VL_suppressed'),
    ('hv_art', 'on_VL_suppressed', 'on_not_VL_suppressed'),
    ('hv_art', 'on_VL_suppressed', 'not'),
    ('hv_art', 'on_not_VL_suppressed', 'on_VL_suppressed'),
    ('hv_art', 'on_not_VL_suppressed', 'not'),
    ('is_alive', True, False),
]

# Walk through the chain of every individual and print the views selected by the switches.
# NOTE(review): the nesting below was reconstructed from the statement order: raw events and
# "all changes" are printed for individuals born before 2010 only, while the
# "selected changes" section handles both cohorts itself. Please confirm.
for name, group in filtered_data.groupby('person_ID'):
    list_of_dob = group['date_of_birth']

    # Select individuals based on when they were born
    if list_of_dob.iloc[0].year < 2010:

        # Check that immutable properties are fixed for this individual, i.e. that events
        # were collated properly
        all_identical_dob = group['date_of_birth'].nunique() == 1
        all_identical_sex = group['sex'].nunique() == 1
        if not (all_identical_dob and all_identical_sex):
            print("Immutable properties are changing! This is not chain for single individual")
            print(group)
            raise SystemExit(-1)

        # Each event contributes a 'Before' and an 'After' row, hence the division by 2
        number_of_events = len(group) / 2
        number_of_changes = 0

        print("----------------------------------------------------------------------")
        print("person_ID ", group['person_ID'].iloc[0], "d.o.b ", group['date_of_birth'].iloc[0])
        print("Number of events for this individual ", group['person_ID'].iloc[0], "is :", number_of_events)
        if print_raw_events:
            print(group)

        if print_all_changes:
            # Compare every row with the previous one. Missing values are replaced by a
            # sentinel first, because NaN never compares equal to NaN.
            tracked = group.drop(columns=columns_to_exclude_in_comparison)
            comparison = tracked.fillna(-99999).ne(tracked.shift().fillna(-99999))

            # Iterate over rows where any column has changed (the first row has no predecessor)
            for idx, row_changed in comparison.iloc[1:].iterrows():
                if row_changed.any():
                    number_of_changes += 1
                    changed_columns = row_changed[row_changed].index.tolist()
                    print(f"Row {idx} - Changes detected in columns: {changed_columns}")
                    columns_output = ['event', 'event_date', 'appt_footprint', 'level'] + changed_columns
                    print(group.loc[idx, columns_output])  # Print only the changed columns
                    if group.loc[idx, 'when'] == 'Before':
                        print('-----> THIS CHANGE OCCURRED BEFORE EVENT!')
                    print()  # For better readability
            print("Number of changes is ", number_of_changes, "out of ", number_of_events, " events")

    if print_selected_changes:
        # Flag rows where a property moved from `before` (previous row) to `after` (this row).
        # Values are compared element-wise with ==. An identity test such as
        # `series is True` is always False, so it would never flag a transition.
        transition_condition = pd.Series(False, index=group.index)
        for column, before, after in SELECTED_TRANSITIONS:
            transition_condition |= (group[column].shift(1) == before) & (group[column] == after)

        if list_of_dob.iloc[0].year >= 2010:
            print("DETECTED OF INTEREST")
            print(group[group['event'] == 'Birth'][columns_to_print])

        # Filter the DataFrame based on the condition
        filtered_transitions = group[transition_condition]
        if not filtered_transitions.empty:
            if list_of_dob.iloc[0].year < 2010:
                print("DETECTED OF INTEREST")
                print(filtered_transitions[columns_to_print])


print("Number of individuals simulated ", filtered_data.groupby('person_ID').ngroups)



152 changes: 152 additions & 0 deletions src/scripts/analysis_data_generation/scenario_generate_chains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
"""This Scenario file runs the model to generate event chains

Run on the batch system using:
```
tlo batch-submit
src/scripts/analysis_data_generation/scenario_generate_chains.py
```

or locally using:
```
tlo scenario-run src/scripts/analysis_data_generation/scenario_generate_chains.py
```

"""
from pathlib import Path
from typing import Dict

import pandas as pd

from tlo import Date, logging
from tlo.analysis.utils import get_parameters_for_status_quo, mix_scenarios
from tlo.methods.fullmodel import fullmodel
from tlo.methods.scenario_switcher import ImprovedHealthSystemAndCareSeekingScenarioSwitcher
from tlo.scenario import BaseScenario
from tlo.methods import (
alri,
cardio_metabolic_disorders,
care_of_women_during_pregnancy,
contraception,
demography,
depression,
diarrhoea,
enhanced_lifestyle,
epi,
healthburden,
healthseekingbehaviour,
healthsystem,
hiv,
rti,
labour,
malaria,
newborn_outcomes,
postnatal_supervisor,
pregnancy_supervisor,
stunting,
symptommanager,
tb,
wasting,
)

class GenerateDataChains(BaseScenario):
    """Scenario that runs a reduced set of modules with event-chain generation switched on.

    A single draw ("Baseline") is defined. It is run 50 times on a population of 1000,
    from 1 January 2010 for 13 months.
    """

    def __init__(self):
        super().__init__()
        self.seed = 0
        self.start_date = Date(2010, 1, 1)
        self.end_date = self.start_date + pd.DateOffset(months=13)
        self.pop_size = 1000
        # One draw per named scenario
        self._scenarios = self._get_scenarios()
        self.number_of_draws = len(self._scenarios)
        self.runs_per_draw = 50
        self.generate_event_chains = True

    def log_configuration(self):
        """Return the logging set-up: output location and per-logger levels."""
        custom_levels = {
            '*': logging.WARNING,
            'tlo.methods.demography': logging.INFO,
            'tlo.methods.events': logging.INFO,
            'tlo.methods.demography.detail': logging.WARNING,
            'tlo.methods.healthburden': logging.INFO,
            'tlo.methods.healthsystem.summary': logging.INFO,
        }
        return {
            'filename': 'generate_event_chains',
            'directory': Path('./outputs'),  # <- (specified only for local running)
            'custom_levels': custom_levels,
        }

    def modules(self):
        """Return the list of modules to register: a subset rather than the full model."""
        resources = self.resources
        # Alternative set-up running the full model, kept for reference:
        # return (
        #     fullmodel(resourcefilepath=self.resources)
        #     + [ImprovedHealthSystemAndCareSeekingScenarioSwitcher(resourcefilepath=self.resources)]
        # )
        return [
            demography.Demography(resourcefilepath=resources),
            enhanced_lifestyle.Lifestyle(resourcefilepath=resources),
            healthburden.HealthBurden(resourcefilepath=resources),
            symptommanager.SymptomManager(resourcefilepath=resources, spurious_symptoms=False),
            rti.RTI(resourcefilepath=resources),
            healthseekingbehaviour.HealthSeekingBehaviour(resourcefilepath=resources),
            # simplified_births.SimplifiedBirths(resourcefilepath=resourcefilepath),
            healthsystem.HealthSystem(
                resourcefilepath=resources,
                mode_appt_constraints=1,
                cons_availability='all',
            ),
        ]

    def draw_parameters(self, draw_number, rng):
        """Return the parameter overrides of draw `draw_number`, or None if there is no such draw."""
        if not draw_number < self.number_of_draws:
            return None
        return list(self._scenarios.values())[draw_number]

    # case 1: gfHE = -0.030, factor = 1.01074
    # case 2: gfHE = -0.020, factor = 1.02116
    # case 3: gfHE = -0.015, factor = 1.02637
    # case 4: gfHE = 0.015, factor = 1.05763
    # case 5: gfHE = 0.020, factor = 1.06284
    # case 6: gfHE = 0.030, factor = 1.07326

    def _get_scenarios(self) -> Dict[str, Dict]:
        """Return the parameter changes of every scenario, keyed by the scenario's name.

        Also sets `self.YEAR_OF_CHANGE`, which `_baseline` reads.
        """
        self.YEAR_OF_CHANGE = 2019

        # =========== STATUS QUO ============
        no_hr_scaling = {
            "HealthSystem": {
                "yearly_HR_scaling_mode": "no_scaling",
            },
        }
        baseline = mix_scenarios(self._baseline(), no_hr_scaling)

        return {"Baseline": baseline}

    def _baseline(self) -> Dict:
        """Return the parameter changes that define the baseline scenario."""
        health_system_settings = {
            "mode_appt_constraints": 1,  # <-- Mode 1 prior to change to preserve calibration
            "mode_appt_constraints_postSwitch": 2,  # <-- Mode 2 post-change to show effects of HRH
            "year_mode_switch": self.YEAR_OF_CHANGE,
            "scale_to_effective_capabilities": True,
            "policy_name": "Naive",
            "tclose_overwrite": 1,
            "tclose_days_offset_overwrite": 7,
            "use_funded_or_actual_staffing": "actual",
            "cons_availability": "default",
        }
        return mix_scenarios(
            get_parameters_for_status_quo(),
            {"HealthSystem": health_system_settings},
        )

if __name__ == '__main__':
    # Executed as a script: hand this very file to the scenario runner of the `tlo` CLI
    from tlo import cli

    cli.scenario_run([__file__])
Loading
Loading