From dd256ca9563077a26bb312c02bd8f7e105e18a34 Mon Sep 17 00:00:00 2001 From: Tim Hallett Date: Thu, 9 Dec 2021 11:22:44 +0000 Subject: [PATCH 001/131] establish files and folder structure --- .../analysis_healthsystem_usage.py} | 0 ...en_year_run_tracking_healthsystem_usage.py | 124 ++++++++++++++++++ 2 files changed, 124 insertions(+) rename src/scripts/{calibration_analyses/analysis_scripts/analysis_hsi_descriptions.py => healthsystem/showing_hsi_and_appt_types_that_run/analysis_healthsystem_usage.py} (100%) create mode 100644 src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/ten_year_run_tracking_healthsystem_usage.py diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_hsi_descriptions.py b/src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/analysis_healthsystem_usage.py similarity index 100% rename from src/scripts/calibration_analyses/analysis_scripts/analysis_hsi_descriptions.py rename to src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/analysis_healthsystem_usage.py diff --git a/src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/ten_year_run_tracking_healthsystem_usage.py b/src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/ten_year_run_tracking_healthsystem_usage.py new file mode 100644 index 0000000000..efd33378af --- /dev/null +++ b/src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/ten_year_run_tracking_healthsystem_usage.py @@ -0,0 +1,124 @@ +""" +This file is used to capture the HSI that are run during a typical ten year simulation 2010-2019. + +It defines a large population and an (mode=0) HealthSystem. 
+ +Run on the batch system using: +```tlo batch-submit src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/ten_year_run_tracking_healthsystem_usage.py``` + +or locally using: + ```tlo scenario-run src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/ten_year_run_tracking_healthsystem_usage.py``` + +""" + +from tlo import Date, logging +from tlo.methods import ( + alri, + bladder_cancer, + breast_cancer, + cardio_metabolic_disorders, + care_of_women_during_pregnancy, + contraception, + demography, + depression, + diarrhoea, + enhanced_lifestyle, + epi, + epilepsy, + healthburden, + healthseekingbehaviour, + healthsystem, + hiv, + labour, + malaria, + measles, + newborn_outcomes, + oesophagealcancer, + other_adult_cancers, + postnatal_supervisor, + pregnancy_supervisor, + prostate_cancer, + stunting, + symptommanager, + wasting, +) +from tlo.scenario import BaseScenario + + +class LongRun(BaseScenario): + def __init__(self): + super().__init__() + self.seed = 0 + self.start_date = Date(2010, 1, 1) + self.end_date = Date(2015, 12, 31) + self.pop_size = 5_000 + self.number_of_draws = 1 + self.runs_per_draw = 1 + + def log_configuration(self): + return { + 'filename': 'long_run', + 'directory': './outputs', + 'custom_levels': { + '*': logging.INFO, + } + } + + def modules(self): + return [ + # Core Modules + demography.Demography(resourcefilepath=self.resources), + enhanced_lifestyle.Lifestyle(resourcefilepath=self.resources), + symptommanager.SymptomManager(resourcefilepath=self.resources, spurious_symptoms=False), + healthseekingbehaviour.HealthSeekingBehaviour(resourcefilepath=self.resources), + healthburden.HealthBurden(resourcefilepath=self.resources), + + # Representations of the Healthcare System + healthsystem.HealthSystem(resourcefilepath=self.resources, mode_appt_constraints=0), + epi.Epi(resourcefilepath=self.resources), + + # - Contraception, Pregnancy and Labour + contraception.Contraception(resourcefilepath=self.resources, 
use_healthsystem=True), + pregnancy_supervisor.PregnancySupervisor(resourcefilepath=self.resources), + care_of_women_during_pregnancy.CareOfWomenDuringPregnancy(resourcefilepath=self.resources), + labour.Labour(resourcefilepath=self.resources), + newborn_outcomes.NewbornOutcomes(resourcefilepath=self.resources), + postnatal_supervisor.PostnatalSupervisor(resourcefilepath=self.resources), + + # - Conditions of Early Childhood + diarrhoea.Diarrhoea(resourcefilepath=self.resources), + alri.Alri(resourcefilepath=self.resources), + stunting.Stunting(resourcefilepath=self.resources), + wasting.Wasting(resourcefilepath=self.resources), + + # - Communicable Diseases + hiv.Hiv(resourcefilepath=self.resources), + malaria.Malaria(resourcefilepath=self.resources), + measles.Measles(resourcefilepath=self.resources), + + # - Non-Communicable Conditions + # -- Cancers + bladder_cancer.BladderCancer(resourcefilepath=self.resources), + breast_cancer.BreastCancer(resourcefilepath=self.resources), + oesophagealcancer.OesophagealCancer(resourcefilepath=self.resources), + other_adult_cancers.OtherAdultCancer(resourcefilepath=self.resources), + prostate_cancer.ProstateCancer(resourcefilepath=self.resources), + + # -- Cardiometabolic Disorders + cardio_metabolic_disorders.CardioMetabolicDisorders(resourcefilepath=self.resources), + + # -- Injuries (Forthcoming) + + # -- Other Non-Communicable Conditions + depression.Depression(resourcefilepath=self.resources), + epilepsy.Epilepsy(resourcefilepath=self.resources), + ] + + def draw_parameters(self, draw_number, rng): + pass + + +if __name__ == '__main__': + from tlo.cli import scenario_run + + scenario_run([__file__]) From db526b454a39e070b305a10fe87a601b4e949cd7 Mon Sep 17 00:00:00 2001 From: Tim Hallett Date: Thu, 9 Dec 2021 11:27:10 +0000 Subject: [PATCH 002/131] renaming for brevity --- .../analysis_hsi_in_typical_run.py} | 0 .../scenario_hsi_in_typical_run.py} | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename 
src/scripts/healthsystem/{showing_hsi_and_appt_types_that_run/analysis_healthsystem_usage.py => hsi_in_typical_run/analysis_hsi_in_typical_run.py} (100%) rename src/scripts/healthsystem/{showing_hsi_and_appt_types_that_run/ten_year_run_tracking_healthsystem_usage.py => hsi_in_typical_run/scenario_hsi_in_typical_run.py} (94%) diff --git a/src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/analysis_healthsystem_usage.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py similarity index 100% rename from src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/analysis_healthsystem_usage.py rename to src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py diff --git a/src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/ten_year_run_tracking_healthsystem_usage.py b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py similarity index 94% rename from src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/ten_year_run_tracking_healthsystem_usage.py rename to src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py index efd33378af..eaa8261f46 100644 --- a/src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/ten_year_run_tracking_healthsystem_usage.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py @@ -4,10 +4,10 @@ It defines a large population and an (mode=0) HealthSystem. 
Run on the batch system using: -```tlo batch-submit src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/ten_year_run_tracking_healthsystem_usage.py``` + ```tlo batch-submit src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py``` or locally using: - ```tlo scenario-run src/scripts/healthsystem/showing_hsi_and_appt_types_that_run/ten_year_run_tracking_healthsystem_usage.py``` + ```tlo scenario-run src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py``` """ From 39813df9387884a53e3ae01895139522b327f92f Mon Sep 17 00:00:00 2001 From: Tim Hallett Date: Thu, 9 Dec 2021 11:28:50 +0000 Subject: [PATCH 003/131] renaming for brevity --- .../hsi_in_typical_run/scenario_hsi_in_typical_run.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py index eaa8261f46..a13a0e2cf8 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py @@ -1,5 +1,5 @@ """ -This file is used to capture the HSI that are run during a typical ten year simulation 2010-2019. +This file is used to capture the HSI that are run during a typical simulation 2010-2014. It defines a large population and an (mode=0) HealthSystem. 
@@ -50,8 +50,8 @@ def __init__(self): super().__init__() self.seed = 0 self.start_date = Date(2010, 1, 1) - self.end_date = Date(2015, 12, 31) - self.pop_size = 5_000 + self.end_date = Date(2014, 12, 31) + self.pop_size = 20_000 self.number_of_draws = 1 self.runs_per_draw = 1 From 18ab263e71a8fdbe0e210bfe1d8f72273f2d7003 Mon Sep 17 00:00:00 2001 From: Tim Hallett Date: Thu, 9 Dec 2021 11:29:14 +0000 Subject: [PATCH 004/131] small pop size --- .../hsi_in_typical_run/scenario_hsi_in_typical_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py index a13a0e2cf8..1078ac1317 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py @@ -51,7 +51,7 @@ def __init__(self): self.seed = 0 self.start_date = Date(2010, 1, 1) self.end_date = Date(2014, 12, 31) - self.pop_size = 20_000 + self.pop_size = 5_000 self.number_of_draws = 1 self.runs_per_draw = 1 From 0ad24724dc156ff9b52dbc92babf16703994f818 Mon Sep 17 00:00:00 2001 From: Tim Hallett Date: Thu, 9 Dec 2021 12:34:09 +0000 Subject: [PATCH 005/131] change file name for testing --- .../analysis_hsi_in_typical_run.py | 37 +++++++++++++++++++ .../scenario_hsi_in_typical_run.py | 19 +++++----- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 5e9a035ce0..872c8d9d0f 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -1,3 +1,40 @@ +"""This file uses the run generated by `scenario_hsi_in_typical_run.py` to generate descriptions of the HSI that occur +in a typical 
run.""" + + + +# %% Declare the name of the file that specified the scenarios used in this run. +from pathlib import Path + +from tlo.analysis.utils import get_scenario_outputs, load_pickled_dataframes + +scenario_filename = 'long_run_no_diseases.py' + +# %% Declare usual paths: +outputspath = Path('./outputs/tbh03@ic.ac.uk') +rfp = Path('./resources') + +# Find results folder (most recent run generated using that scenario_filename) +results_folder = get_scenario_outputs(scenario_filename, outputspath)[-1] +print(f"Results folder is: {results_folder}") + +# Declare path for output graphs from this script +make_graph_file_name = lambda stub: results_folder / f"{stub}.png" # noqa: E731 + +# %% Extract results + +log = load_pickled_dataframes(results_folder) # (There was only one draw and one run) + + + + + + + + + + + # todo - describe the HSI that are actually being done (desreiptions and frequency) # # """Produce plots to show the usage of the healthcare system when 'Everything' is in service_availability. diff --git a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py index 1078ac1317..e5e371e1fa 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py @@ -1,14 +1,12 @@ """ -This file is used to capture the HSI that are run during a typical simulation 2010-2014. +This file is used to capture the HSI that are run during a typical simulation, 2010-2014. It defines a large population + with all disease modules registered and an unconstrained (mode_appt_constraints=0) HealthSystem. -It defines a large population and an (mode=0) HealthSystem. - -Run on the batch system using: +Run on the remote batch system using: ```tlo batch-submit src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py``` -or locally using: +... 
or locally using: ```tlo scenario-run src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py``` - """ from tlo import Date, logging @@ -50,17 +48,18 @@ def __init__(self): super().__init__() self.seed = 0 self.start_date = Date(2010, 1, 1) - self.end_date = Date(2014, 12, 31) - self.pop_size = 5_000 + self.end_date = Date(2010, 1, 2) + self.pop_size = 100 # todo increase this self.number_of_draws = 1 self.runs_per_draw = 1 def log_configuration(self): return { - 'filename': 'long_run', + 'filename': 'local_file_name', 'directory': './outputs', 'custom_levels': { - '*': logging.INFO, + '*': logging.WARNING, + 'tlo.methods.healthsystem': logging.INFO, } } From 5c2d723db65fa7dad0d2b646f45c80af1da8fd0f Mon Sep 17 00:00:00 2001 From: Tim Hallett Date: Thu, 9 Dec 2021 12:45:15 +0000 Subject: [PATCH 006/131] restore definition of scenario_hsi_in_typical_run.py --- .../hsi_in_typical_run/scenario_hsi_in_typical_run.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py index e5e371e1fa..841ce652b8 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py @@ -48,14 +48,14 @@ def __init__(self): super().__init__() self.seed = 0 self.start_date = Date(2010, 1, 1) - self.end_date = Date(2010, 1, 2) - self.pop_size = 100 # todo increase this + self.end_date = Date(2014, 12, 31) + self.pop_size = 20_000 self.number_of_draws = 1 self.runs_per_draw = 1 def log_configuration(self): return { - 'filename': 'local_file_name', + 'filename': 'scenario_hsi_in_typical_run', 'directory': './outputs', 'custom_levels': { '*': logging.WARNING, From 397fa3f2cf8e216dec557e0b213830f67296fb34 Mon Sep 17 00:00:00 2001 From: Tim Hallett Date: Thu, 9 Dec 2021 13:34:00 +0000 Subject: [PATCH 007/131] 
produce skeleton script --- .../analysis_hsi_in_typical_run.py | 148 ++++++++---------- 1 file changed, 61 insertions(+), 87 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 872c8d9d0f..32e9d37c13 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -5,10 +5,12 @@ # %% Declare the name of the file that specified the scenarios used in this run. from pathlib import Path - +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd from tlo.analysis.utils import get_scenario_outputs, load_pickled_dataframes -scenario_filename = 'long_run_no_diseases.py' +scenario_filename = 'scenario_hsi_in_typical_run.py' # %% Declare usual paths: outputspath = Path('./outputs/tbh03@ic.ac.uk') @@ -22,88 +24,60 @@ make_graph_file_name = lambda stub: results_folder / f"{stub}.png" # noqa: E731 # %% Extract results - -log = load_pickled_dataframes(results_folder) # (There was only one draw and one run) - - - - - - - - - - - -# todo - describe the HSI that are actually being done (desreiptions and frequency) -# -# """Produce plots to show the usage of the healthcare system when 'Everything' is in service_availability. 
-# This uses the file that is created by: run_healthsystem_analysis_and_pickle_log -# """ -# -# import pickle -# from datetime import datetime -# from pathlib import Path -# -# import matplotlib.pyplot as plt -# import numpy as np -# import pandas as pd -# -# from tlo.methods.demography import get_scaling_factor -# -# # Define paths and filenames -# rfp = Path("./resources") -# outputpath = Path("./outputs") # folder for convenience of storing outputs -# results_filename = outputpath / '2020_11_23_health_system_systematic_run.pickle' -# make_file_name = lambda stub: outputpath / f"{datetime.today().strftime('%Y_%m_%d''')}_{stub}.png" -# -# with open(results_filename, 'rb') as f: -# output = pickle.load(f)['results']['Everything'] -# -# # %% Scaling Factor -# scaling_factor = get_scaling_factor(output, rfp) -# -# # %% Show overall usage of the healthsystem: -# -# cap = output['tlo.methods.healthsystem']['Capacity'].copy() -# cap["date"] = pd.to_datetime(cap["date"]) -# cap = cap.set_index('date') -# -# frac_time_used = cap['Frac_Time_Used_Overall'] -# cap = cap.drop(columns=['Frac_Time_Used_Overall']) -# -# # Plot Fraction of total time of health-care-workers being used -# frac_time_used.plot() -# plt.title("Fraction of total health-care worker time being used") -# plt.xlabel("Date") -# plt.savefig(make_file_name('HSI_Frac_time_used')) -# plt.show() -# -# # %% Breakdowns by HSI: -# hsi = output['tlo.methods.healthsystem']['HSI_Event'].copy() -# hsi["date"] = pd.to_datetime(hsi["date"]) -# hsi["month"] = hsi["date"].dt.month -# # Reduce TREATMENT_ID to the originating module -# hsi["Module"] = hsi["TREATMENT_ID"].str.split('_').apply(lambda x: x[0]) -# -# # Plot the HSI that are taking place, by month, in a a particular year -# year = 2012 -# evs = hsi.loc[hsi.date.dt.year == year]\ -# .groupby(by=['month', 'Module'])\ -# .size().reset_index().rename(columns={0: 'count'})\ -# .pivot_table(index='month', columns='Module', values='count', fill_value=0) -# evs *= 
scaling_factor -# -# evs.plot.bar(stacked=True) -# plt.title(f"HSI by Module, per Month (year {year})") -# plt.ylabel('Total per month') -# plt.savefig(make_file_name('HSI_per_module_per_month')) -# plt.show() -# -# # Plot the breakdown of all HSI, over all the years -# evs = hsi.groupby(by=['Module'])\ -# .size().rename(columns={0: 'count'}) * scaling_factor -# evs.plot.pie() -# plt.title(f"HSI by Module") -# plt.savefig(make_file_name('HSI_per_module')) -# plt.show() +log = load_pickled_dataframes(results_folder)['tlo.methods.healthsystem'] # (There was only one draw and one run) + +# %% Plot: Fraction of Total Healthcare Worker Time Used + +cap = log['Capacity'] +cap["date"] = pd.to_datetime(cap["date"]) +cap = cap.set_index('date') + +frac_time_used = cap['Frac_Time_Used_Overall'] + +# Plot: +frac_time_used.plot() +plt.title("Fraction of Total Healthcare Worker Time Used") +plt.xlabel("Date") +plt.tight_layout() +plt.savefig(make_graph_file_name ('HSI_Frac_time_used')) +plt.show() + +# %% Number of HSI: + +hsi = log['HSI_Event'] +hsi["date"] = pd.to_datetime(hsi["date"]) +hsi["month"] = hsi["date"].dt.month + +# Number of HSI that are taking place by originating module, by month +year = 2012 +hsi["Module"] = hsi["TREATMENT_ID"].str.split('_').apply(lambda x: x[0]) +evs = hsi.loc[hsi.date.dt.year == year]\ + .groupby(by=['month', 'Module'])\ + .size().reset_index().rename(columns={0: 'count'})\ + .pivot_table(index='month', columns='Module', values='count', fill_value=0) + +# Plot: +evs.plot.bar(stacked=True) +plt.title(f"HSI by Module, per Month (year {year})") +plt.ylabel('Total per month') +plt.tight_layout() +plt.savefig(make_graph_file_name('HSI_per_module_per_month')) +plt.show() + +# Plot the breakdown of all HSI, over all the years +evs = hsi.groupby(by=['Module']).size() +evs.plot.pie() +plt.title(f"HSI by Module") +plt.tight_layout() +plt.savefig(make_graph_file_name('HSI_per_module')) +plt.show() + +# %% Demand for appointments + 
+num_hsi_by_treatment_id = hsi.groupby(hsi.TREATMENT_ID)['Number_By_Appt_Type_Code'].size() + +# find the appt footprint for each treatment_id +appts_by_treatment_id = \ + hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0) + +# Plot... From 281235880a80fd2cf823483ed2ac38f64f8ecf47 Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 9 Dec 2021 15:48:21 +0000 Subject: [PATCH 008/131] Create the .py that generate daily capabilities and staff allocation histograms --- ...ribe_healthsystem_capabilities_Bingling.py | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities_Bingling.py diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities_Bingling.py b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities_Bingling.py new file mode 100644 index 0000000000..0f52a5e9f4 --- /dev/null +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities_Bingling.py @@ -0,0 +1,158 @@ +""" +This file produces histograms of the healthsystem capabilities \ +in terms of staff allocation and daily capabilities in minutes per cadre per facility level. 
+""" + +from pathlib import Path +import pandas as pd +import numpy as np +from matplotlib import pyplot as plt +from matplotlib.ticker import ScalarFormatter + +# Get the path of the folder that stores the data - three scenarios: actual, funded, funded_plus +workingpath = Path('./resources/healthsystem/human_resources/funded_plus') + +# Define the path of output histograms - three scenarios: actual, funded, funded_plus +outputpath = Path('./outputs/healthsystem/human_resources/funded_plus') + +# Read data +data = pd.read_csv(workingpath / 'ResourceFile_Daily_Capabilities.csv') + + +# MINUTES PER HEALTH OFFICER CATEGORY BY DISTRICT +data_districts = data.dropna(inplace=False) +dat = pd.DataFrame(data_districts.groupby(['District', 'Officer_Category'], as_index=False)['Total_Mins_Per_Day'].sum()) +tab = dat.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Minutes per day') +plt.xlabel('District') + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'health_officer_minutes_per_district.pdf', bbox_inches='tight') + +# STAFF COUNTS PER HEALTH OFFICER CATEGORY BY DISTRICT +data_districts = data.dropna(inplace=False) +dat = pd.DataFrame(data_districts.groupby(['District', 'Officer_Category'], as_index=False)['Staff_Count'].sum()) +tab = dat.pivot(index='District', columns='Officer_Category', values='Staff_Count') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Staff counts') +plt.xlabel('District') + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'staff_allocation_per_district.pdf', bbox_inches='tight') + + +# MINUTES PER HEALTH OFFICER CATEGORY BY LEVEL +dat = pd.DataFrame(data.groupby(['Facility_Level', 'Officer_Category'], as_index=False)['Total_Mins_Per_Day'].sum()) +tab = dat.pivot(index='Facility_Level', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = 
tab.plot.bar(stacked=True) +# ax = tab.plot.bar(stacked=True, log=True) +plt.ylabel('Minutes per day') +plt.xlabel('Facility level') + +ax.tick_params(axis='x', rotation=0) + +formatter = ScalarFormatter() +formatter.set_scientific(False) +ax.yaxis.set_major_formatter(formatter) + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'health_officer_minutes_per_level.pdf', bbox_inches='tight') + +# STAFF COUNTS PER HEALTH OFFICER CATEGORY BY LEVEL +dat = pd.DataFrame(data.groupby(['Facility_Level', 'Officer_Category'], as_index=False)['Staff_Count'].sum()) +tab = dat.pivot(index='Facility_Level', columns='Officer_Category', values='Staff_Count') +ax = tab.plot.bar(stacked=True) +# ax = tab.plot.bar(stacked=True, log=True) +plt.ylabel('Staff counts') +plt.xlabel('Facility level') + +ax.tick_params(axis='x', rotation=0) + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'staff_allocation_per_level.pdf', bbox_inches='tight') + + +# MINUTES PER HEALTH OFFICER CATEGORY BY LEVEL + +# Level 0 +data_level = data.loc[data['Facility_Level'] == '0',:] +tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Minutes per day at level 0') +plt.xlabel('District') + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'health_officer_minutes_per_district_level_0.pdf', bbox_inches='tight') + +# Level 1a +data_level = data.loc[data['Facility_Level'] == '1a',:] +tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Minutes per day at level 1a') +plt.xlabel('District') + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'health_officer_minutes_per_district_level_1a.pdf', 
bbox_inches='tight') + +# Level 1b +data_level = data.loc[data['Facility_Level'] == '1b',:] +tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Minutes per day at level 1b') +plt.xlabel('District') + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'health_officer_minutes_per_district_level_1b.pdf', bbox_inches='tight') + +# Level 2 +data_level = data.loc[data['Facility_Level'] == '2',:] +tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Minutes per day at level 2') +plt.xlabel('District') + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'health_officer_minutes_per_district_level_2.pdf', bbox_inches='tight') + +# Level 3 +data_level = data.loc[data['Facility_Level'] == '3',:] +tab = data_level.pivot(index='Region', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Minutes per day at level 3') +plt.xlabel('Regional Referral Hospital') +ax.tick_params(axis='x', rotation=0) + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'health_officer_minutes_per_district_level_3.pdf', bbox_inches='tight') + +# Level 4 +data_level = data.loc[data['Facility_Level'] == '4',:] +tab = data_level.pivot(index='Facility_Name', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True, width=0.1) +plt.ylabel('Minutes per day at level 4') +plt.xlabel('National resource hospital') +ax.tick_params(axis='x', rotation=0) + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'health_officer_minutes_per_district_level_4.pdf', bbox_inches='tight') From 62b9047715b05620730289cde397fec701e4bc32 Mon 
Sep 17 00:00:00 2001 From: Bingling Date: Thu, 9 Dec 2021 16:15:41 +0000 Subject: [PATCH 009/131] Refactor --- ...s_describe_healthsystem_capabilities_Bingling.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities_Bingling.py b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities_Bingling.py index 0f52a5e9f4..e54c20327b 100644 --- a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities_Bingling.py +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities_Bingling.py @@ -5,7 +5,6 @@ from pathlib import Path import pandas as pd -import numpy as np from matplotlib import pyplot as plt from matplotlib.ticker import ScalarFormatter @@ -84,7 +83,7 @@ # MINUTES PER HEALTH OFFICER CATEGORY BY LEVEL # Level 0 -data_level = data.loc[data['Facility_Level'] == '0',:] +data_level = data.loc[data['Facility_Level'] == '0', :] tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') ax = tab.plot.bar(stacked=True) plt.ylabel('Minutes per day at level 0') @@ -96,7 +95,7 @@ plt.savefig(outputpath / 'health_officer_minutes_per_district_level_0.pdf', bbox_inches='tight') # Level 1a -data_level = data.loc[data['Facility_Level'] == '1a',:] +data_level = data.loc[data['Facility_Level'] == '1a', :] tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') ax = tab.plot.bar(stacked=True) plt.ylabel('Minutes per day at level 1a') @@ -108,7 +107,7 @@ plt.savefig(outputpath / 'health_officer_minutes_per_district_level_1a.pdf', bbox_inches='tight') # Level 1b -data_level = data.loc[data['Facility_Level'] == '1b',:] +data_level = data.loc[data['Facility_Level'] == '1b', :] tab = data_level.pivot(index='District', columns='Officer_Category', 
values='Total_Mins_Per_Day') ax = tab.plot.bar(stacked=True) plt.ylabel('Minutes per day at level 1b') @@ -120,7 +119,7 @@ plt.savefig(outputpath / 'health_officer_minutes_per_district_level_1b.pdf', bbox_inches='tight') # Level 2 -data_level = data.loc[data['Facility_Level'] == '2',:] +data_level = data.loc[data['Facility_Level'] == '2', :] tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') ax = tab.plot.bar(stacked=True) plt.ylabel('Minutes per day at level 2') @@ -132,7 +131,7 @@ plt.savefig(outputpath / 'health_officer_minutes_per_district_level_2.pdf', bbox_inches='tight') # Level 3 -data_level = data.loc[data['Facility_Level'] == '3',:] +data_level = data.loc[data['Facility_Level'] == '3', :] tab = data_level.pivot(index='Region', columns='Officer_Category', values='Total_Mins_Per_Day') ax = tab.plot.bar(stacked=True) plt.ylabel('Minutes per day at level 3') @@ -145,7 +144,7 @@ plt.savefig(outputpath / 'health_officer_minutes_per_district_level_3.pdf', bbox_inches='tight') # Level 4 -data_level = data.loc[data['Facility_Level'] == '4',:] +data_level = data.loc[data['Facility_Level'] == '4', :] tab = data_level.pivot(index='Facility_Name', columns='Officer_Category', values='Total_Mins_Per_Day') ax = tab.plot.bar(stacked=True, width=0.1) plt.ylabel('Minutes per day at level 4') From daa0e516b5f5c7171064ad323ad656824fc75a8d Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 9 Dec 2021 16:45:27 +0000 Subject: [PATCH 010/131] Refactor as much as I could --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 32e9d37c13..19d6b99bde 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ 
b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -1,12 +1,10 @@ """This file uses the run generated by `scenario_hsi_in_typical_run.py` to generate descriptions of the HSI that occur in a typical run.""" - - # %% Declare the name of the file that specified the scenarios used in this run. from pathlib import Path import matplotlib.pyplot as plt -import numpy as np +# import numpy as np import pandas as pd from tlo.analysis.utils import get_scenario_outputs, load_pickled_dataframes @@ -39,7 +37,7 @@ plt.title("Fraction of Total Healthcare Worker Time Used") plt.xlabel("Date") plt.tight_layout() -plt.savefig(make_graph_file_name ('HSI_Frac_time_used')) +plt.savefig(make_graph_file_name('HSI_Frac_time_used')) plt.show() # %% Number of HSI: From 6e21721f02fcff2a03a7f37a0c6113de8e2a2837 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 10 Dec 2021 15:53:32 +0000 Subject: [PATCH 011/131] reset date and pop --- .../hsi_in_typical_run/scenario_hsi_in_typical_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py index 841ce652b8..1525af97f8 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py @@ -48,8 +48,8 @@ def __init__(self): super().__init__() self.seed = 0 self.start_date = Date(2010, 1, 1) - self.end_date = Date(2014, 12, 31) - self.pop_size = 20_000 + self.end_date = Date(2010, 1, 31) + self.pop_size = 1000 # 20_000 self.number_of_draws = 1 self.runs_per_draw = 1 From 327042351ad443d1086b3dbc0a36e86ad86a7091 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 10 Dec 2021 16:24:13 +0000 Subject: [PATCH 012/131] refactor --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 19d6b99bde..2da10ae350 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -11,7 +11,7 @@ scenario_filename = 'scenario_hsi_in_typical_run.py' # %% Declare usual paths: -outputspath = Path('./outputs/tbh03@ic.ac.uk') +outputspath = Path('./outputs/bshe@ic.ac.uk') rfp = Path('./resources') # Find results folder (most recent run generated using that scenario_filename) @@ -47,7 +47,7 @@ hsi["month"] = hsi["date"].dt.month # Number of HSI that are taking place by originating module, by month -year = 2012 +year = 2010 hsi["Module"] = hsi["TREATMENT_ID"].str.split('_').apply(lambda x: x[0]) evs = hsi.loc[hsi.date.dt.year == year]\ .groupby(by=['month', 'Module'])\ From 7dc912746bf78a3abd3ee86f43db69131763cc64 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 10 Dec 2021 16:40:47 +0000 Subject: [PATCH 013/131] Update the file plotting health system daily capabilities --- ...ysis_describe_healthsystem_capabilities.py | 164 ++++++++++++++---- ...ribe_healthsystem_capabilities_Bingling.py | 157 ----------------- 2 files changed, 134 insertions(+), 187 deletions(-) delete mode 100644 src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities_Bingling.py diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py index 8cfd59e745..e54c20327b 100644 --- a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py @@ -1,53 +1,157 @@ """ -This file produces a nice plot of the 
capabilities of the healthsystem in terms of the hours available for -different cadres of healthcare workers. +This file produces histograms of the healthsystem capabilities \ +in terms of staff allocation and daily capabilities in minutes per cadre per facility level. """ -# %% - from pathlib import Path - import pandas as pd from matplotlib import pyplot as plt +from matplotlib.ticker import ScalarFormatter + +# Get the path of the folder that stores the data - three scenarios: actual, funded, funded_plus +workingpath = Path('./resources/healthsystem/human_resources/funded_plus') + +# Define the path of output histograms - three scenarios: actual, funded, funded_plus +outputpath = Path('./outputs/healthsystem/human_resources/funded_plus') -resourcefilepath = Path("./resources") +# Read data +data = pd.read_csv(workingpath / 'ResourceFile_Daily_Capabilities.csv') -# %% -outputpath = Path("./outputs") # folder for convenience of storing outputs +# MINUTES PER HEALTH OFFICER CATEGORY BY DISTRICT +data_districts = data.dropna(inplace=False) +dat = pd.DataFrame(data_districts.groupby(['District', 'Officer_Category'], as_index=False)['Total_Mins_Per_Day'].sum()) +tab = dat.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Minutes per day') +plt.xlabel('District') + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') +plt.savefig(outputpath / 'health_officer_minutes_per_district.pdf', bbox_inches='tight') -data = pd.read_csv( - Path(resourcefilepath) / "ResourceFile_Daily_Capabilities.csv" -) +# STAFF COUNTS PER HEALTH OFFICER CATEGORY BY DISTRICT +data_districts = data.dropna(inplace=False) +dat = pd.DataFrame(data_districts.groupby(['District', 'Officer_Category'], as_index=False)['Staff_Count'].sum()) +tab = dat.pivot(index='District', columns='Officer_Category', values='Staff_Count') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Staff counts') 
+plt.xlabel('District') -# [['Total_Minutes_Per_Day','Officer_Type','District']] +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') -data = data.dropna() -# data['District'] = data['District'].fillna('National') +plt.savefig(outputpath / 'staff_allocation_per_district.pdf', bbox_inches='tight') -# do some re-grouping to make a more manageable number of health cadres: -data['Officer_Type'] = data['Officer_Type'].replace('DCSA', 'CHW') -data['Officer_Type'] = data['Officer_Type'].replace(['Lab Officer', 'Lab Technician', 'Lab Assistant'], 'Lab Support') -data['Officer_Type'] = data['Officer_Type'].replace(['Radiographer', 'Radiography Technician'], 'Radiography') -data['Officer_Type'] = data['Officer_Type'].replace(['Nurse Officer', 'Nutrition Staff', 'Med. Assistant'], 'Nurse') -data['Officer_Type'] = data['Officer_Type'].replace('Nurse Midwife Technician', 'MidWife') -data['Officer_Type'] = data['Officer_Type'].replace(['Pharmacist', 'Pharm Technician', 'Pharm Assistant'], 'Pharmacy') -data['Officer_Type'] = data['Officer_Type'].replace(['Medical Officer / Specialist', 'Clinical Officer / Technician'], - 'Clinician') -data['Officer_Type'] = data['Officer_Type'].replace(['Dental Therapist'], 'Dentist') -# MINUTES PER HEALTH OFFICER TYPE BY DISTRICT: -dat = pd.DataFrame(data.groupby(['District', 'Officer_Type'], as_index=False)['Total_Minutes_Per_Day'].sum()) -tab = dat.pivot(index='District', columns='Officer_Type', values='Total_Minutes_Per_Day') +# MINUTES PER HEALTH OFFICER CATEGORY BY LEVEL +dat = pd.DataFrame(data.groupby(['Facility_Level', 'Officer_Category'], as_index=False)['Total_Mins_Per_Day'].sum()) +tab = dat.pivot(index='Facility_Level', columns='Officer_Category', values='Total_Mins_Per_Day') ax = tab.plot.bar(stacked=True) +# ax = tab.plot.bar(stacked=True, log=True) plt.ylabel('Minutes per day') +plt.xlabel('Facility level') + +ax.tick_params(axis='x', rotation=0) + +formatter = ScalarFormatter() 
+formatter.set_scientific(False) +ax.yaxis.set_major_formatter(formatter) + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'health_officer_minutes_per_level.pdf', bbox_inches='tight') + +# STAFF COUNTS PER HEALTH OFFICER CATEGORY BY LEVEL +dat = pd.DataFrame(data.groupby(['Facility_Level', 'Officer_Category'], as_index=False)['Staff_Count'].sum()) +tab = dat.pivot(index='Facility_Level', columns='Officer_Category', values='Staff_Count') +ax = tab.plot.bar(stacked=True) +# ax = tab.plot.bar(stacked=True, log=True) +plt.ylabel('Staff counts') +plt.xlabel('Facility level') + +ax.tick_params(axis='x', rotation=0) + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'staff_allocation_per_level.pdf', bbox_inches='tight') + + +# MINUTES PER HEALTH OFFICER CATEGORY BY LEVEL + +# Level 0 +data_level = data.loc[data['Facility_Level'] == '0', :] +tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Minutes per day at level 0') plt.xlabel('District') ax.legend(ncol=3, bbox_to_anchor=(0, 1), loc='lower left', fontsize='small') -plt.savefig(outputpath / 'health_officer_minutes_per_district.pdf', bbox_inches='tight') -plt.show() +plt.savefig(outputpath / 'health_officer_minutes_per_district_level_0.pdf', bbox_inches='tight') + +# Level 1a +data_level = data.loc[data['Facility_Level'] == '1a', :] +tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Minutes per day at level 1a') +plt.xlabel('District') + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'health_officer_minutes_per_district_level_1a.pdf', bbox_inches='tight') + +# Level 1b +data_level = data.loc[data['Facility_Level'] == '1b', :] +tab = 
data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Minutes per day at level 1b') +plt.xlabel('District') + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'health_officer_minutes_per_district_level_1b.pdf', bbox_inches='tight') + +# Level 2 +data_level = data.loc[data['Facility_Level'] == '2', :] +tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Minutes per day at level 2') +plt.xlabel('District') + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'health_officer_minutes_per_district_level_2.pdf', bbox_inches='tight') + +# Level 3 +data_level = data.loc[data['Facility_Level'] == '3', :] +tab = data_level.pivot(index='Region', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True) +plt.ylabel('Minutes per day at level 3') +plt.xlabel('Regional Referral Hospital') +ax.tick_params(axis='x', rotation=0) + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') + +plt.savefig(outputpath / 'health_officer_minutes_per_district_level_3.pdf', bbox_inches='tight') + +# Level 4 +data_level = data.loc[data['Facility_Level'] == '4', :] +tab = data_level.pivot(index='Facility_Name', columns='Officer_Category', values='Total_Mins_Per_Day') +ax = tab.plot.bar(stacked=True, width=0.1) +plt.ylabel('Minutes per day at level 4') +plt.xlabel('National resource hospital') +ax.tick_params(axis='x', rotation=0) + +ax.legend(ncol=3, bbox_to_anchor=(0, 1), + loc='lower left', fontsize='small') -# %% +plt.savefig(outputpath / 'health_officer_minutes_per_district_level_4.pdf', bbox_inches='tight') diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities_Bingling.py 
b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities_Bingling.py deleted file mode 100644 index e54c20327b..0000000000 --- a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities_Bingling.py +++ /dev/null @@ -1,157 +0,0 @@ -""" -This file produces histograms of the healthsystem capabilities \ -in terms of staff allocation and daily capabilities in minutes per cadre per facility level. -""" - -from pathlib import Path -import pandas as pd -from matplotlib import pyplot as plt -from matplotlib.ticker import ScalarFormatter - -# Get the path of the folder that stores the data - three scenarios: actual, funded, funded_plus -workingpath = Path('./resources/healthsystem/human_resources/funded_plus') - -# Define the path of output histograms - three scenarios: actual, funded, funded_plus -outputpath = Path('./outputs/healthsystem/human_resources/funded_plus') - -# Read data -data = pd.read_csv(workingpath / 'ResourceFile_Daily_Capabilities.csv') - - -# MINUTES PER HEALTH OFFICER CATEGORY BY DISTRICT -data_districts = data.dropna(inplace=False) -dat = pd.DataFrame(data_districts.groupby(['District', 'Officer_Category'], as_index=False)['Total_Mins_Per_Day'].sum()) -tab = dat.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Minutes per day') -plt.xlabel('District') - -ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') - -plt.savefig(outputpath / 'health_officer_minutes_per_district.pdf', bbox_inches='tight') - -# STAFF COUNTS PER HEALTH OFFICER CATEGORY BY DISTRICT -data_districts = data.dropna(inplace=False) -dat = pd.DataFrame(data_districts.groupby(['District', 'Officer_Category'], as_index=False)['Staff_Count'].sum()) -tab = dat.pivot(index='District', columns='Officer_Category', values='Staff_Count') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Staff counts') 
-plt.xlabel('District') - -ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') - -plt.savefig(outputpath / 'staff_allocation_per_district.pdf', bbox_inches='tight') - - -# MINUTES PER HEALTH OFFICER CATEGORY BY LEVEL -dat = pd.DataFrame(data.groupby(['Facility_Level', 'Officer_Category'], as_index=False)['Total_Mins_Per_Day'].sum()) -tab = dat.pivot(index='Facility_Level', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) -# ax = tab.plot.bar(stacked=True, log=True) -plt.ylabel('Minutes per day') -plt.xlabel('Facility level') - -ax.tick_params(axis='x', rotation=0) - -formatter = ScalarFormatter() -formatter.set_scientific(False) -ax.yaxis.set_major_formatter(formatter) - -ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') - -plt.savefig(outputpath / 'health_officer_minutes_per_level.pdf', bbox_inches='tight') - -# STAFF COUNTS PER HEALTH OFFICER CATEGORY BY LEVEL -dat = pd.DataFrame(data.groupby(['Facility_Level', 'Officer_Category'], as_index=False)['Staff_Count'].sum()) -tab = dat.pivot(index='Facility_Level', columns='Officer_Category', values='Staff_Count') -ax = tab.plot.bar(stacked=True) -# ax = tab.plot.bar(stacked=True, log=True) -plt.ylabel('Staff counts') -plt.xlabel('Facility level') - -ax.tick_params(axis='x', rotation=0) - -ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') - -plt.savefig(outputpath / 'staff_allocation_per_level.pdf', bbox_inches='tight') - - -# MINUTES PER HEALTH OFFICER CATEGORY BY LEVEL - -# Level 0 -data_level = data.loc[data['Facility_Level'] == '0', :] -tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Minutes per day at level 0') -plt.xlabel('District') - -ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') - -plt.savefig(outputpath / 'health_officer_minutes_per_district_level_0.pdf', 
bbox_inches='tight') - -# Level 1a -data_level = data.loc[data['Facility_Level'] == '1a', :] -tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Minutes per day at level 1a') -plt.xlabel('District') - -ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') - -plt.savefig(outputpath / 'health_officer_minutes_per_district_level_1a.pdf', bbox_inches='tight') - -# Level 1b -data_level = data.loc[data['Facility_Level'] == '1b', :] -tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Minutes per day at level 1b') -plt.xlabel('District') - -ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') - -plt.savefig(outputpath / 'health_officer_minutes_per_district_level_1b.pdf', bbox_inches='tight') - -# Level 2 -data_level = data.loc[data['Facility_Level'] == '2', :] -tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Minutes per day at level 2') -plt.xlabel('District') - -ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') - -plt.savefig(outputpath / 'health_officer_minutes_per_district_level_2.pdf', bbox_inches='tight') - -# Level 3 -data_level = data.loc[data['Facility_Level'] == '3', :] -tab = data_level.pivot(index='Region', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Minutes per day at level 3') -plt.xlabel('Regional Referral Hospital') -ax.tick_params(axis='x', rotation=0) - -ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') - -plt.savefig(outputpath / 'health_officer_minutes_per_district_level_3.pdf', bbox_inches='tight') - -# Level 4 -data_level = data.loc[data['Facility_Level'] == '4', :] -tab = data_level.pivot(index='Facility_Name', 
columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True, width=0.1) -plt.ylabel('Minutes per day at level 4') -plt.xlabel('National resource hospital') -ax.tick_params(axis='x', rotation=0) - -ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') - -plt.savefig(outputpath / 'health_officer_minutes_per_district_level_4.pdf', bbox_inches='tight') From dfc77da10b073cbeb3016cac7ff4d9e69adc8f27 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 10 Dec 2021 17:54:23 +0000 Subject: [PATCH 014/131] Update the file plotting health system daily capabilities --- .../hsi_in_typical_run/scenario_hsi_in_typical_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py index 1525af97f8..385cb63422 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py @@ -48,8 +48,8 @@ def __init__(self): super().__init__() self.seed = 0 self.start_date = Date(2010, 1, 1) - self.end_date = Date(2010, 1, 31) - self.pop_size = 1000 # 20_000 + self.end_date = Date(2019, 1, 31) + self.pop_size = 20_000 self.number_of_draws = 1 self.runs_per_draw = 1 From de5cf14f467fae7b036582fd28330ae200e4216e Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 10 Dec 2021 17:54:23 +0000 Subject: [PATCH 015/131] Reset date and pop --- .../hsi_in_typical_run/scenario_hsi_in_typical_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py index 1525af97f8..385cb63422 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py +++ 
b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py @@ -48,8 +48,8 @@ def __init__(self): super().__init__() self.seed = 0 self.start_date = Date(2010, 1, 1) - self.end_date = Date(2010, 1, 31) - self.pop_size = 1000 # 20_000 + self.end_date = Date(2019, 1, 31) + self.pop_size = 20_000 self.number_of_draws = 1 self.runs_per_draw = 1 From 9d718cb86457b64f635a4f1b4c594dc523a735f8 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 13 Dec 2021 11:20:09 +0000 Subject: [PATCH 016/131] The jupyter notebook that generate sankey diagrams mapping coarse officers and appointments --- ...lysis_sankey_coarse_officer_and_appt.ipynb | 286 ++++++++++++++++++ 1 file changed, 286 insertions(+) create mode 100644 src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb new file mode 100644 index 0000000000..42659ffde7 --- /dev/null +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb @@ -0,0 +1,286 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4103c24d", + "metadata": {}, + "source": [ + "### Install floweaver in Anaconda Prompt / PyCharm Terminal:\n", + "\n", + "pip install floweaver\n", + "\n", + "pip install ipysankeywidget\n", + "\n", + "jupyter nbextension enable --py --sys-prefix ipysankeywidget\n", + "\n", + "jupyter notebook (to open jupyter notebook)\n", + "\n", + "### To display and save the output figures:\n", + "Run the cells -> Open Event Log -> Open in Browser -> Find the script and run all cells" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "913300ab", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import tlo\n", + "\n", + "import pandas as pd\n", + "import numpy as 
np\n", + "from ipysankeywidget import SankeyWidget\n", + "from matplotlib import pyplot as plt\n", + "from floweaver import *\n", + "from pathlib import Path\n", + "from ipywidgets import HBox, VBox\n", + "\n", + "# get the tlo path\n", + "tlopath = Path(tlo.__file__).parent.parent.parent\n", + "\n", + "# Get the path of current folder that stores the data\n", + "workingpath = tlopath / Path('resources/healthsystem/human_resources/definitions')\n", + "\n", + "# Define the path of output Sankeys\n", + "outputpath = tlopath / Path('outputs/healthsystem/human_resources/sankey_diagrams')\n", + "\n", + "# Read the data of appointment time table\n", + "appointment = pd.read_csv(workingpath / 'ResourceFile_Appt_Time_Table.csv')\n", + "\n", + "# Rename\n", + "appointment.loc[appointment['Officer_Category'] == 'Nursing_and_Midwifery',\n", + " 'Officer_Category'] = 'Nursing and Midwifery'\n", + "\n", + "# Read the data of appointment types table\n", + "appt_types = pd.read_csv(workingpath / 'ResourceFile_Appt_Types_Table.csv')\n", + "# Rename\n", + "appt_types.loc[appt_types['Appt_Cat'] == 'GENERAL_INPATIENT_AND_OUTPATIENT_CARE',\n", + " 'Appt_Cat'] = 'IPOP'\n", + "appt_types.loc[appt_types['Appt_Cat'] == 'Nutrition',\n", + " 'Appt_Cat'] = 'NUTRITION'\n", + "appt_types.loc[appt_types['Appt_Cat'] == 'Misc',\n", + " 'Appt_Cat'] = 'MISC'\n", + "appt_types.loc[appt_types['Appt_Cat'] == 'Mental_Health',\n", + " 'Appt_Cat'] = 'MENTAL'\n", + "\n", + "# Merge appt category to the time table\n", + "appointment = appointment.merge(appt_types[['Appt_Type_Code', 'Appt_Cat']],\n", + " on='Appt_Type_Code', how='left')\n", + "\n", + "# Add prefix 'Facility_Level'\n", + "appointment['Facility_Level'] = 'Facility_Level_' + appointment['Facility_Level'].astype(str)\n", + "\n", + "# Draw a diagram using appointment time table itself.\n", + "# Currentlt, we do not know how many appointments of a type at a level happened or \\\n", + "# how many minutes of an officer category at a level go to 
that appointment.\n", + "# We consider the mapping between officer categories and appointment types, \\\n", + "# which can be derived from the appointment time table." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b3e06de5", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8b6451ea2a6345079a8905078efed1ae", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "SankeyWidget(groups=[{'id': 'Officer', 'type': 'process', 'title': '', 'nodes': ['Officer^Clinical', 'Officer^…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# The flow maps 9 officer categories and 11 appt cateogories at all levels\n", + "flow_coarse_officer_appt = pd.DataFrame(\n", + " appointment.groupby(['Officer_Category', 'Appt_Cat', 'Facility_Level'],\n", + " dropna=False, sort=False).sum()\n", + ").reset_index()\n", + "# Drop column of minutes and add column 'value'\n", + "flow_coarse_officer_appt.drop(columns = ['Time_Taken_Mins'], inplace=True)\n", + "flow_coarse_officer_appt['value'] = 1\n", + "\n", + "# Add 'source' and 'target' columns\n", + "flow_coarse_officer_appt['source'] = 'Officer_Category'\n", + "flow_coarse_officer_appt['target'] = 'Appt_Cat'\n", + "\n", + "size = dict(width=800, height=800, margins=dict(left=180, right=180))\n", + "\n", + "partition_officer_cat = Partition.Simple('Officer_Category',\n", + " np.unique(flow_coarse_officer_appt['Officer_Category']))\n", + "\n", + "partition_appt_cat = Partition.Simple('Appt_Cat',\n", + " np.unique(flow_coarse_officer_appt['Appt_Cat']))\n", + "\n", + "partition_facility_level = Partition.Simple('Facility_Level',\n", + " np.unique(flow_coarse_officer_appt['Facility_Level']))\n", + "\n", + "nodes = {\n", + " 'Officer': ProcessGroup(['Officer_Category'], partition_officer_cat),\n", + " 'Appt': ProcessGroup(['Appt_Cat'], partition_appt_cat),\n", + "}\n", + "\n", + 
"# Add nodes Waypoint\n", + "nodes['waypoint'] = Waypoint(partition_facility_level)\n", + "\n", + "bundles = [\n", + " Bundle('Officer', 'Appt', waypoints=['waypoint']),\n", + "]\n", + "\n", + "ordering = [\n", + " ['Officer'], # left\n", + " ['waypoint'], # middle\n", + " ['Appt'], # right\n", + " ]\n", + "\n", + "# Set the color for each officer category\n", + "palette = {'Clinical': 'skyblue', 'Nursing and Midwifery': 'lightpink',\n", + " 'Pharmacy': 'khaki', 'Laboratory': 'cadetblue',\n", + " 'Radiography': 'yellowgreen', 'Dental': 'salmon',\n", + " 'Mental': 'mediumorchid', 'DCSA': 'royalblue'\n", + " }\n", + "\n", + "\n", + "# Sankey diagram definition (SDD)\n", + "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_officer_cat)\n", + "\n", + "sankey_coarse_officer_and_coarse_appt = weave(sdd, flow_coarse_officer_appt,\n", + " palette=palette, measures='value').to_widget(**size)\n", + "\n", + "sankey_coarse_officer_and_coarse_appt.auto_save_png(outputpath /'Sankey_coarse_officer_and_coarse_appt.png')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "fe0021cb", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1e700fc889c34d19ac176aff4077908d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HBox(children=(SankeyWidget(groups=[{'id': 'Officer', 'type': 'process', 'title': '', 'nodes': …" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# The flow maps 9 officer categories and 51 appt types at an individaul level\n", + "flow_coarse_officer = pd.DataFrame(\n", + " appointment.groupby(['Officer_Category', 'Appt_Type_Code', 'Facility_Level'],\n", + " dropna=False, sort=False).sum()\n", + ").reset_index()\n", + "# As we do not care about the flow proportions, we add a 'value' columns with constant value 1.\n", + "flow_coarse_officer['value'] = 1\n", + "# Add 'source' 
and 'target' columns\n", + "flow_coarse_officer['source'] = 'Officer_Category'\n", + "flow_coarse_officer['target'] = 'Appt_Type_Code'\n", + "\n", + "def sankey_level_coarse_officer(level, h):\n", + " flow_coarse_officer_level = flow_coarse_officer.loc[flow_coarse_officer['Facility_Level'] == level, :].copy()\n", + " flow_coarse_officer_level.drop(columns = 'Facility_Level', inplace=True)\n", + " flow_coarse_officer_level.reset_index(drop=True, inplace=True)\n", + "\n", + " size = dict(width=800, height=h, margins=dict(left=180, right=180))\n", + "\n", + " partition_officer_cat = Partition.Simple('Officer_Category', np.unique(flow_coarse_officer_level['Officer_Category']))\n", + " partition_appt_type = Partition.Simple('Appt_Type_Code', np.unique(flow_coarse_officer_level['Appt_Type_Code']))\n", + "\n", + " nodes = {\n", + " 'Officer': ProcessGroup(['Officer_Category'], partition_officer_cat),\n", + " 'Appt': ProcessGroup(['Appt_Type_Code'], partition_appt_type),\n", + " }\n", + "\n", + " bundles = [\n", + " Bundle('Officer', 'Appt'),\n", + " ]\n", + "\n", + " ordering = [\n", + " ['Officer'],\n", + " ['Appt'],\n", + " ]\n", + "\n", + " # Set the color for each officer category\n", + " palette = {'Clinical': 'skyblue', 'Nursing and Midwifery': 'lightpink',\n", + " 'Pharmacy': 'khaki', 'Laboratory': 'cadetblue',\n", + " 'Radiography': 'yellowgreen', 'Dental': 'salmon',\n", + " 'Mental': 'mediumorchid', 'DCSA': 'royalblue'\n", + " }\n", + "\n", + " sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_officer_cat) # color by officer cat\n", + "\n", + " return weave(sdd, flow_coarse_officer_level, palette=palette, measures='value').to_widget(**size)\n", + "\n", + "sankey_coarse_officer_level_0 = sankey_level_coarse_officer('Facility_Level_0', 100)\n", + "sankey_coarse_officer_level_0.auto_save_png(outputpath /'Sankey_coarse_officer_and_fine_appt_level_0.png')\n", + "\n", + "sankey_coarse_officer_level_1a = 
sankey_level_coarse_officer('Facility_Level_1a', 1200)\n", + "sankey_coarse_officer_level_1a.auto_save_png(outputpath /'Sankey_coarse_officer_and_fine_appt_level_1a.png')\n", + "\n", + "sankey_coarse_officer_level_1b = sankey_level_coarse_officer('Facility_Level_1b', 1200)\n", + "sankey_coarse_officer_level_1b.auto_save_png(outputpath /'Sankey_coarse_officer_and_fine_appt_level_1b.png')\n", + "\n", + "sankey_coarse_officer_level_2 = sankey_level_coarse_officer('Facility_Level_2', 1200)\n", + "sankey_coarse_officer_level_2.auto_save_png(outputpath /'Sankey_coarse_officer_and_fine_appt_level_2.png')\n", + "\n", + "sankey_coarse_officer_level_3 = sankey_level_coarse_officer('Facility_Level_3', 1200)\n", + "sankey_coarse_officer_level_3.auto_save_png(outputpath /'Sankey_coarse_officer_and_fine_appt_level_3.png')\n", + "\n", + "sankey_coarse_officer_level_4 = sankey_level_coarse_officer('Facility_Level_4', 200)\n", + "sankey_coarse_officer_level_4.auto_save_png(outputpath /'Sankey_coarse_officer_and_fine_appt_level_4.png')\n", + "\n", + "top_box = HBox([sankey_coarse_officer_level_0, sankey_coarse_officer_level_4])\n", + "mid_box = HBox([sankey_coarse_officer_level_1a, sankey_coarse_officer_level_1b])\n", + "bottom_box = HBox([sankey_coarse_officer_level_2, sankey_coarse_officer_level_3])\n", + "VBox([top_box, mid_box, bottom_box])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "PyCharm (TLOmodel)", + "language": "python", + "name": "pycharm-551d1069" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file From 66879cb22ae46d8766c889544ab534188fdf544e Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 13 Dec 2021 11:32:50 +0000 Subject: [PATCH 017/131] Update annotation --- 
.../analysis_sankey_coarse_officer_and_appt.ipynb | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb index 42659ffde7..3f6c1e6dbc 100644 --- a/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb @@ -5,7 +5,7 @@ "id": "4103c24d", "metadata": {}, "source": [ - "### Install floweaver in Anaconda Prompt / PyCharm Terminal:\n", + "### Install floweaver in Anaconda Prompt (if use Jupyter Notebook) / PyCharm Terminal:\n", "\n", "pip install floweaver\n", "\n", @@ -16,7 +16,13 @@ "jupyter notebook (to open jupyter notebook)\n", "\n", "### To display and save the output figures:\n", - "Run the cells -> Open Event Log -> Open in Browser -> Find the script and run all cells" + "Select Start Jupyter Server from the Jupyter Actions Menu (lightbulb icon next to Run All cells icon)\n", + "\n", + "Open Event Log\n", + "\n", + "Open in Browser\n", + "\n", + "Find the script and run all cells" ] }, { From 1c688531722508ae30497676d5c0bf97e75f4847 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 13 Dec 2021 12:06:28 +0000 Subject: [PATCH 018/131] Update annotation --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 2da10ae350..a502fe5ff3 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -47,7 +47,7 @@ hsi["month"] = hsi["date"].dt.month # Number of HSI that are 
taking place by originating module, by month -year = 2010 +year = 2018 hsi["Module"] = hsi["TREATMENT_ID"].str.split('_').apply(lambda x: x[0]) evs = hsi.loc[hsi.date.dt.year == year]\ .groupby(by=['month', 'Module'])\ @@ -65,7 +65,7 @@ # Plot the breakdown of all HSI, over all the years evs = hsi.groupby(by=['Module']).size() evs.plot.pie() -plt.title(f"HSI by Module") +plt.title("HSI by Module") plt.tight_layout() plt.savefig(make_graph_file_name('HSI_per_module')) plt.show() From 1db10579ce4b57a1b0746b2cfd309f2f652a3400 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 13 Dec 2021 12:08:24 +0000 Subject: [PATCH 019/131] Revert "Update annotation" This reverts commit 1c688531 --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index a502fe5ff3..2da10ae350 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -47,7 +47,7 @@ hsi["month"] = hsi["date"].dt.month # Number of HSI that are taking place by originating module, by month -year = 2018 +year = 2010 hsi["Module"] = hsi["TREATMENT_ID"].str.split('_').apply(lambda x: x[0]) evs = hsi.loc[hsi.date.dt.year == year]\ .groupby(by=['month', 'Module'])\ @@ -65,7 +65,7 @@ # Plot the breakdown of all HSI, over all the years evs = hsi.groupby(by=['Module']).size() evs.plot.pie() -plt.title("HSI by Module") +plt.title(f"HSI by Module") plt.tight_layout() plt.savefig(make_graph_file_name('HSI_per_module')) plt.show() From a65bec1da8f622931db06e5faabb09f6ed5c7351 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 13 Dec 2021 12:09:06 +0000 Subject: [PATCH 020/131] Refactor --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 2da10ae350..a502fe5ff3 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -47,7 +47,7 @@ hsi["month"] = hsi["date"].dt.month # Number of HSI that are taking place by originating module, by month -year = 2010 +year = 2018 hsi["Module"] = hsi["TREATMENT_ID"].str.split('_').apply(lambda x: x[0]) evs = hsi.loc[hsi.date.dt.year == year]\ .groupby(by=['month', 'Module'])\ @@ -65,7 +65,7 @@ # Plot the breakdown of all HSI, over all the years evs = hsi.groupby(by=['Module']).size() evs.plot.pie() -plt.title(f"HSI by Module") +plt.title("HSI by Module") plt.tight_layout() plt.savefig(make_graph_file_name('HSI_per_module')) plt.show() From f3373a95d398d776eab7894ef146f3f4d2ceb113 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 13 Dec 2021 15:07:49 +0000 Subject: [PATCH 021/131] Refactor --- ...alysis_describe_healthsystem_capabilities.py | 1 + .../analysis_hsi_in_typical_run.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py index e54c20327b..b3b050a338 100644 --- a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py @@ -4,6 +4,7 @@ """ from pathlib import Path + import pandas as pd from matplotlib import pyplot as plt from matplotlib.ticker import ScalarFormatter diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py 
b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index a502fe5ff3..65a83d4ae6 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -3,9 +3,11 @@ # %% Declare the name of the file that specified the scenarios used in this run. from pathlib import Path + import matplotlib.pyplot as plt # import numpy as np import pandas as pd + from tlo.analysis.utils import get_scenario_outputs, load_pickled_dataframes scenario_filename = 'scenario_hsi_in_typical_run.py' @@ -31,13 +33,22 @@ cap = cap.set_index('date') frac_time_used = cap['Frac_Time_Used_Overall'] +frac_time_used_2014_2018 = frac_time_used.loc['2013-12-31':'2019-01-01'] +frac_time_used_2016 = frac_time_used.loc['2015-12-31':'2017-01-01'] # Plot: -frac_time_used.plot() -plt.title("Fraction of Total Healthcare Worker Time Used") +frac_time_used_2014_2018.plot() +plt.title("Fraction of Total Healthcare Worker Time Used 2014-2018") +plt.xlabel("Date") +plt.tight_layout() +plt.savefig(make_graph_file_name('HSI_Frac_time_used_2014_2018')) +plt.show() + +frac_time_used_2016.plot() +plt.title("Fraction of Total Healthcare Worker Time Used 2016") plt.xlabel("Date") plt.tight_layout() -plt.savefig(make_graph_file_name('HSI_Frac_time_used')) +plt.savefig(make_graph_file_name('HSI_Frac_time_used_2016')) plt.show() # %% Number of HSI: From 483d130d456f5da75865d3895e13dc8279f53c04 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 13 Dec 2021 19:59:48 +0000 Subject: [PATCH 022/131] Updating existing plots --- .../analysis_hsi_in_typical_run.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 65a83d4ae6..709e6544cb 100644 --- 
a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -7,6 +7,7 @@ import matplotlib.pyplot as plt # import numpy as np import pandas as pd +from matplotlib.cm import get_cmap from tlo.analysis.utils import get_scenario_outputs, load_pickled_dataframes @@ -38,14 +39,14 @@ # Plot: frac_time_used_2014_2018.plot() -plt.title("Fraction of Total Healthcare Worker Time Used 2014-2018") +plt.title("Fraction of Total Healthcare Worker Time Used (year 2014-2018)") plt.xlabel("Date") plt.tight_layout() plt.savefig(make_graph_file_name('HSI_Frac_time_used_2014_2018')) plt.show() frac_time_used_2016.plot() -plt.title("Fraction of Total Healthcare Worker Time Used 2016") +plt.title("Fraction of Total Healthcare Worker Time Used (year 2016)") plt.xlabel("Date") plt.tight_layout() plt.savefig(make_graph_file_name('HSI_Frac_time_used_2016')) @@ -58,7 +59,7 @@ hsi["month"] = hsi["date"].dt.month # Number of HSI that are taking place by originating module, by month -year = 2018 +year = 2016 hsi["Module"] = hsi["TREATMENT_ID"].str.split('_').apply(lambda x: x[0]) evs = hsi.loc[hsi.date.dt.year == year]\ .groupby(by=['month', 'Module'])\ @@ -66,17 +67,30 @@ .pivot_table(index='month', columns='Module', values='count', fill_value=0) # Plot: -evs.plot.bar(stacked=True) +# Use colormap tab20 so that each module has a unique color +color_tab20 = get_cmap('tab20_r') +evs.plot.bar(stacked=True, color=color_tab20.colors) plt.title(f"HSI by Module, per Month (year {year})") plt.ylabel('Total per month') plt.tight_layout() +plt.legend(ncol=3, loc='center', fontsize='xx-small') plt.savefig(make_graph_file_name('HSI_per_module_per_month')) plt.show() -# Plot the breakdown of all HSI, over all the years -evs = hsi.groupby(by=['Module']).size() -evs.plot.pie() -plt.title("HSI by Module") +# Plot the breakdown of all HSI, over all the years 2010-2018 +evs = 
pd.DataFrame(hsi.groupby(by=['Module']).size()) +# Calculate the fraction +evs[1] = 100*evs[0]/evs[0].sum() +patches, texts = plt.pie(evs[0], colors=color_tab20.colors) +labels = ['{0} - {1:1.2f} %'.format(i, j) for i, j in zip(evs.index, evs[1])] +# Sort legend +sort_legend = True +if sort_legend: + patches, labels, dummy = zip(*sorted(zip(patches, labels, evs[0]), + key=lambda x: x[2], + reverse=True)) +plt.legend(patches, labels, ncol=3, loc='lower center', fontsize='xx-small') +plt.title("HSI by Module (year 2010-2018)") plt.tight_layout() plt.savefig(make_graph_file_name('HSI_per_module')) plt.show() From 19c8b0833b623fa7edc3e87147fed068ec13283b Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 14 Dec 2021 13:53:01 +0000 Subject: [PATCH 023/131] Update annotation --- .../analysis_sankey_coarse_officer_and_appt.ipynb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb index 3f6c1e6dbc..0b3e866efa 100644 --- a/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb @@ -5,6 +5,9 @@ "id": "4103c24d", "metadata": {}, "source": [ + "This file uses floweaver to generate Sankey diagrams that map coarse officers to appointments.\n", + "Below is the instruction to run the file.\n", + "\n", "### Install floweaver in Anaconda Prompt (if use Jupyter Notebook) / PyCharm Terminal:\n", "\n", "pip install floweaver\n", From dab2f57eec24fb112ed38e0e7690a87d39df7b0f Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 14 Dec 2021 13:57:40 +0000 Subject: [PATCH 024/131] Update annotation --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 709e6544cb..181fc0cc08 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -104,3 +104,4 @@ hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0) # Plot... +# See the Sankey plot in analysis_sankey_appt_and_hsi.ipynb (in the same folder) From 1bd78e10331c6b1e35d8d5f501538c5363e24b1e Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 14 Dec 2021 16:56:12 +0000 Subject: [PATCH 025/131] Refactor --- .../analysis_sankey_coarse_officer_and_appt.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb index 0b3e866efa..c56694c88a 100644 --- a/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb @@ -6,6 +6,7 @@ "metadata": {}, "source": [ "This file uses floweaver to generate Sankey diagrams that map coarse officers to appointments.\n", + "\n", "Below is the instruction to run the file.\n", "\n", "### Install floweaver in Anaconda Prompt (if use Jupyter Notebook) / PyCharm Terminal:\n", From 25e74406b2097bedc052cdfe27529c79ee87be20 Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 14 Dec 2021 16:58:26 +0000 Subject: [PATCH 026/131] Great a file to plot the sankey diagram that maps appointment type to hsi events --- .../analysis_sankey_appt_and_hsi.ipynb | 207 ++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb 
diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb new file mode 100644 index 0000000000..77bd091512 --- /dev/null +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "This file uses the run generated by `scenario_hsi_in_typical_run.py` and floweaver\n", + "to produce a Sankey diagram that maps appointments with HSI events.\n", + "\n", + "Below is the instruction to run the file.\n", + "\n", + "### Install floweaver in Anaconda Prompt (if use Jupyter Notebook) / PyCharm Terminal:\n", + "\n", + "pip install floweaver\n", + "\n", + "pip install ipysankeywidget\n", + "\n", + "jupyter nbextension enable --py --sys-prefix ipysankeywidget\n", + "\n", + "jupyter notebook (to open jupyter notebook)\n", + "\n", + "### To display and save the output figures:\n", + "Select Start Jupyter Server from the Jupyter Actions Menu (lightbulb icon next to Run All cells icon)\n", + "\n", + "Open Event Log\n", + "\n", + "Open in Browser\n", + "\n", + "Find the script and run all cells\n" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import tlo\n", + "\n", + "from pathlib import Path\n", + "\n", + "from tlo.analysis.utils import get_scenario_outputs, load_pickled_dataframes\n", + "\n", + "import pandas as pd\n", + "\n", + "import numpy as np\n", + "\n", + "from matplotlib import pyplot as plt\n", + "\n", + "from ipysankeywidget import SankeyWidget\n", + "\n", + "from floweaver import *" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", 
+ "text": [ + "Results folder is: C:\\Users\\jdbb1\\Desktop\\TLOmodel\\outputs\\bshe@ic.ac.uk\\scenario_hsi_in_typical_run-2021-12-10T180225Z\n" + ] + } + ], + "source": [ + "# Declare the name of the file that specified the scenarios used in this run.\n", + "scenario_filename = 'scenario_hsi_in_typical_run.py'\n", + "\n", + "# Declare usual paths:\n", + "# Get the tlo path\n", + "tlopath = Path(tlo.__file__).parent.parent.parent\n", + "outputspath = tlopath / Path('outputs/bshe@ic.ac.uk')\n", + "\n", + "# Find results folder (most recent run generated using that scenario_filename)\n", + "results_folder = get_scenario_outputs(scenario_filename, outputspath)[-1]\n", + "print(f\"Results folder is: {results_folder}\")\n", + "\n", + "# Declare path for output graphs from this script\n", + "make_graph_file_name = lambda stub: results_folder / f\"{stub}.png\" # noqa: E731\n", + "\n", + "# Extract results\n", + "log = load_pickled_dataframes(results_folder)['tlo.methods.healthsystem'] # (There was only one draw and one run)\n", + "\n", + "# Number of HSI:\n", + "hsi = log['HSI_Event']\n", + "hsi[\"date\"] = pd.to_datetime(hsi[\"date\"])\n", + "hsi[\"month\"] = hsi[\"date\"].dt.month\n", + "hsi[\"Module\"] = hsi[\"TREATMENT_ID\"].str.split('_').apply(lambda x: x[0])\n", + "\n", + "# Demand for appointments\n", + "num_hsi_by_treatment_id = hsi.groupby(hsi.TREATMENT_ID)['Number_By_Appt_Type_Code'].size()\n", + "\n", + "# Find the appt footprint for each treatment_id\n", + "appts_by_treatment_id = \\\n", + " hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ANCSubsequent', 'Appt^Ante…", + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + 
"version_minor": 0, + "model_id": "f83c5d9371f04805b40570af1bfaf842" + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# The Sankey\n", + "appt_and_hsi = appts_by_treatment_id.reset_index().copy()\n", + "appt_and_hsi = pd.melt(appt_and_hsi, id_vars=['TREATMENT_ID'], value_vars=appt_and_hsi.columns[1:],\n", + " var_name='Appt_Type')\n", + "appt_and_hsi = appt_and_hsi[appt_and_hsi['value'] > 0].copy()\n", + "\n", + "# Define the flow\n", + "appt_and_hsi['source'] = 'Appt_Type'\n", + "appt_and_hsi['target'] = 'TREATMENT_ID'\n", + "\n", + "size = dict(width=1000, height=800, margins=dict(left=120, right=500))\n", + "\n", + "partition_appt_type = Partition.Simple('Appt_Type', np.unique(appt_and_hsi['Appt_Type']))\n", + "\n", + "partition_treatment_id = Partition.Simple('TREATMENT_ID', np.unique(appt_and_hsi['TREATMENT_ID']))\n", + "\n", + "nodes = {\n", + " 'Appt': ProcessGroup(['Appt_Type'], partition_appt_type),\n", + " 'HSI': ProcessGroup(['TREATMENT_ID'], partition_treatment_id),\n", + "}\n", + "\n", + "bundles = [\n", + " Bundle('Appt', 'HSI'),\n", + "]\n", + "\n", + "ordering = [\n", + " ['Appt'], # left\n", + " ['HSI'], # right\n", + " ]\n", + "\n", + "# Set the color for each appt type\n", + "palette = {'IPAdmission': 'lightsteelblue', 'InpatientDays': 'skyblue',\n", + " 'Over5OPD': 'cornflowerblue', 'Under5OPD': 'steelblue',\n", + " 'AntenatalFirst': 'plum', 'ANCSubsequent': 'hotpink',\n", + " 'CompDelivery': 'tomato', 'NormalDelivery': 'darksalmon',\n", + " 'FamPlan': 'gold', 'MajorSurg': 'orange', 'ConWithDCSA': 'mediumpurple',\n", + " 'MaleCirc': 'lightgreen', 'NewAdult': 'mediumseagreen',\n", + " 'VCTNegative': 'greenyellow', 'VCTPositive': 'olivedrab',\n", + " }\n", + "\n", + "# Sankey diagram definition (SDD)\n", + "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_appt_type)\n", + "\n", + "sankey_appt_and_hsi = weave(sdd, appt_and_hsi, palette=palette, measures='value').to_widget(**size)\n", + 
"\n", + "sankey_appt_and_hsi.auto_save_png(outputspath /'Sankey_appt_and_hsi.png')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "PyCharm (TLOmodel)", + "language": "python", + "name": "pycharm-551d1069" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} \ No newline at end of file From 0902f0a91182c477cdeddfcf0317bd271b273198 Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 14 Dec 2021 17:40:08 +0000 Subject: [PATCH 027/131] Refactor --- .../analysis_sankey_appt_and_hsi.ipynb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb index 77bd091512..7f1df7cf4c 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 113, "metadata": { "pycharm": { "name": "#%%\n" @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 114, "metadata": { "pycharm": { "name": "#%%\n" @@ -113,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 115, "metadata": { "pycharm": { "name": "#%%\n" @@ -126,7 +126,7 @@ "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, - "model_id": "f83c5d9371f04805b40570af1bfaf842" + "model_id": "c0071ac5754b488481475850b2abdf01" } }, "metadata": {}, @@ -144,7 +144,7 @@ "appt_and_hsi['source'] = 'Appt_Type'\n", "appt_and_hsi['target'] = 'TREATMENT_ID'\n", "\n", - "size = dict(width=1000, 
height=800, margins=dict(left=120, right=500))\n", + "size = dict(width=1000, height=800, margins=dict(left=120, right=520))\n", "\n", "partition_appt_type = Partition.Simple('Appt_Type', np.unique(appt_and_hsi['Appt_Type']))\n", "\n", @@ -179,7 +179,7 @@ "\n", "sankey_appt_and_hsi = weave(sdd, appt_and_hsi, palette=palette, measures='value').to_widget(**size)\n", "\n", - "sankey_appt_and_hsi.auto_save_png(outputspath /'Sankey_appt_and_hsi.png')" + "sankey_appt_and_hsi.auto_save_png(make_graph_file_name('Sankey_appt_and_hsi'))" ] } ], From 4319a79e1d04a241b899938a65baa10f108731c5 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 15 Dec 2021 11:08:40 +0000 Subject: [PATCH 028/131] Questions and todo --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 181fc0cc08..24e5c258c1 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -61,6 +61,9 @@ # Number of HSI that are taking place by originating module, by month year = 2016 hsi["Module"] = hsi["TREATMENT_ID"].str.split('_').apply(lambda x: x[0]) +# todo: Rename Module HSI to Generic First Appointment? \ +# (Q: The treatment_id for module HSI seems not related to Generic First Appt; \ +# Besides, there are similar modules called GenericEmergency... and GenericFirstAppt...) 
evs = hsi.loc[hsi.date.dt.year == year]\ .groupby(by=['month', 'Module'])\ .size().reset_index().rename(columns={0: 'count'})\ @@ -102,6 +105,8 @@ # find the appt footprint for each treatment_id appts_by_treatment_id = \ hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0) +# todo: the drop_duplicates() and fillna(0.0) functions seem delete a lot of rows \ +# wherein an hsi has called an appt type. Need to regenerate the correct appts_by_treatment_id # Plot... # See the Sankey plot in analysis_sankey_appt_and_hsi.ipynb (in the same folder) From 258421b76cb476adea243bba10099f87e7e4559b Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 15 Dec 2021 11:13:06 +0000 Subject: [PATCH 029/131] Questions and todo --- .../analysis_sankey_appt_and_hsi.ipynb | 40 ++++++++++--------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb index 7f1df7cf4c..b3e1100a66 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 1, "metadata": { "pycharm": { "name": "#%%\n" @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 2, "metadata": { "pycharm": { "name": "#%%\n" @@ -104,42 +104,43 @@ "hsi[\"Module\"] = hsi[\"TREATMENT_ID\"].str.split('_').apply(lambda x: x[0])\n", "\n", "# Demand for appointments\n", - "num_hsi_by_treatment_id = hsi.groupby(hsi.TREATMENT_ID)['Number_By_Appt_Type_Code'].size()\n", + "num_hsi_by_treatment_id = pd.DataFrame(hsi.groupby(hsi.TREATMENT_ID)['Number_By_Appt_Type_Code'].size())\n", + "num_hsi_by_treatment_id.rename(columns={'Number_By_Appt_Type_Code': 'Number_of_HSI'}, inplace=True)\n", 
"\n", "# Find the appt footprint for each treatment_id\n", "appts_by_treatment_id = \\\n", - " hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0)\n" + " hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0)\n", + "# todo: follow the change in analysis_hsi_in_typical_run" ] }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 4, "metadata": { "pycharm": { "name": "#%%\n" } }, - "outputs": [ - { - "data": { - "text/plain": "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ANCSubsequent', 'Appt^Ante…", - "application/vnd.jupyter.widget-view+json": { - "version_major": 2, - "version_minor": 0, - "model_id": "c0071ac5754b488481475850b2abdf01" - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# The Sankey\n", + "\n", + "# Prepare the data: appt type and number per hsi\n", "appt_and_hsi = appts_by_treatment_id.reset_index().copy()\n", "appt_and_hsi = pd.melt(appt_and_hsi, id_vars=['TREATMENT_ID'], value_vars=appt_and_hsi.columns[1:],\n", " var_name='Appt_Type')\n", "appt_and_hsi = appt_and_hsi[appt_and_hsi['value'] > 0].copy()\n", "\n", + "# todo:\n", + "# Prepare the data plus: total number of appts per hsi for year 2010-2018\n", + "# num_appt_by_hsi = appts_by_treatment_id.copy()\n", + "# for event in num_appt_by_hsi.index:\n", + "# num_appt_by_hsi.loc[event,:] = appts_by_treatment_id.loc[event,:] * num_hsi_by_treatment_id.loc[event, 'Number_of_HSI']\n", + "# num_appt_by_hsi = num_appt_by_hsi.reset_index().copy()\n", + "# num_appt_by_hsi = pd.melt(num_appt_by_hsi, id_vars=['TREATMENT_ID'], value_vars=num_appt_by_hsi.columns[1:],\n", + "# var_name='Appt_Type')\n", + "# num_appt_by_hsi = num_appt_by_hsi[num_appt_by_hsi['value'] > 0].copy()\n", + "\n", "# Define the flow\n", "appt_and_hsi['source'] = 'Appt_Type'\n", "appt_and_hsi['target'] = 'TREATMENT_ID'\n", @@ 
-177,6 +178,7 @@ "# Sankey diagram definition (SDD)\n", "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_appt_type)\n", "\n", + "# Generate and save Sankey\n", "sankey_appt_and_hsi = weave(sdd, appt_and_hsi, palette=palette, measures='value').to_widget(**size)\n", "\n", "sankey_appt_and_hsi.auto_save_png(make_graph_file_name('Sankey_appt_and_hsi'))" From efe9e75bf24b48342df02058592229b7d06e9e3e Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 15 Dec 2021 13:16:48 +0000 Subject: [PATCH 030/131] Questions and todo deleted --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 24e5c258c1..a4597bbd44 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -105,8 +105,6 @@ # find the appt footprint for each treatment_id appts_by_treatment_id = \ hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0) -# todo: the drop_duplicates() and fillna(0.0) functions seem delete a lot of rows \ -# wherein an hsi has called an appt type. Need to regenerate the correct appts_by_treatment_id # Plot... 
# See the Sankey plot in analysis_sankey_appt_and_hsi.ipynb (in the same folder) From 8f5a3eee67cc5e7438b5150e54252837b8cdbc65 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 15 Dec 2021 14:11:59 +0000 Subject: [PATCH 031/131] Two sankeys mapping appt and hsi --- .../analysis_sankey_appt_and_hsi.ipynb | 118 +++++++++++++++--- 1 file changed, 100 insertions(+), 18 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb index b3e1100a66..a8df49e85e 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 23, "metadata": { "pycharm": { "name": "#%%\n" @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 24, "metadata": { "pycharm": { "name": "#%%\n" @@ -109,37 +109,40 @@ "\n", "# Find the appt footprint for each treatment_id\n", "appts_by_treatment_id = \\\n", - " hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0)\n", - "# todo: follow the change in analysis_hsi_in_typical_run" + " hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 25, "metadata": { "pycharm": { "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ANCSubsequent', 'Appt^Ante…", + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "589ac656dff4489a9dc70869184214d9" + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# The Sankey\n", + "# Sankey 0 that map 
appt to hsi considering only appt footprint for each hsi\n", "\n", "# Prepare the data: appt type and number per hsi\n", "appt_and_hsi = appts_by_treatment_id.reset_index().copy()\n", "appt_and_hsi = pd.melt(appt_and_hsi, id_vars=['TREATMENT_ID'], value_vars=appt_and_hsi.columns[1:],\n", " var_name='Appt_Type')\n", - "appt_and_hsi = appt_and_hsi[appt_and_hsi['value'] > 0].copy()\n", - "\n", - "# todo:\n", - "# Prepare the data plus: total number of appts per hsi for year 2010-2018\n", - "# num_appt_by_hsi = appts_by_treatment_id.copy()\n", - "# for event in num_appt_by_hsi.index:\n", - "# num_appt_by_hsi.loc[event,:] = appts_by_treatment_id.loc[event,:] * num_hsi_by_treatment_id.loc[event, 'Number_of_HSI']\n", - "# num_appt_by_hsi = num_appt_by_hsi.reset_index().copy()\n", - "# num_appt_by_hsi = pd.melt(num_appt_by_hsi, id_vars=['TREATMENT_ID'], value_vars=num_appt_by_hsi.columns[1:],\n", - "# var_name='Appt_Type')\n", - "# num_appt_by_hsi = num_appt_by_hsi[num_appt_by_hsi['value'] > 0].copy()\n", + "# Only consider non-zero entries\n", + "# appt_and_hsi = appt_and_hsi[appt_and_hsi['value'] > 0].copy()\n", "\n", "# Define the flow\n", "appt_and_hsi['source'] = 'Appt_Type'\n", @@ -183,6 +186,85 @@ "\n", "sankey_appt_and_hsi.auto_save_png(make_graph_file_name('Sankey_appt_and_hsi'))" ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ANCSubsequent', 'Appt^Ante…", + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "271213232ff14b2ba3199c1390d8c044" + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Sankey 1 that maps appt to hsi considering appt footprint for each hsi and number of each hsi\n", + "\n", + "# Prepare the data plus: total number of appts per hsi for year 2010-2018\n", + 
"num_appt_by_hsi = appts_by_treatment_id.copy()\n", + "for event in num_appt_by_hsi.index:\n", + " num_appt_by_hsi.loc[event,:] = appts_by_treatment_id.loc[event,:] * num_hsi_by_treatment_id.loc[event, 'Number_of_HSI']\n", + "num_appt_by_hsi = num_appt_by_hsi.reset_index().copy()\n", + "num_appt_by_hsi = pd.melt(num_appt_by_hsi, id_vars=['TREATMENT_ID'], value_vars=num_appt_by_hsi.columns[1:],\n", + " var_name='Appt_Type')\n", + "# Only consider non-zero entries\n", + "# num_appt_by_hsi = num_appt_by_hsi[num_appt_by_hsi['value'] > 0].copy()\n", + "\n", + "# Define the flow\n", + "num_appt_by_hsi['source'] = 'Appt_Type'\n", + "num_appt_by_hsi['target'] = 'TREATMENT_ID'\n", + "\n", + "size = dict(width=1000, height=800, margins=dict(left=120, right=520))\n", + "\n", + "partition_appt_type = Partition.Simple('Appt_Type', np.unique(num_appt_by_hsi['Appt_Type']))\n", + "\n", + "partition_treatment_id = Partition.Simple('TREATMENT_ID', np.unique(num_appt_by_hsi['TREATMENT_ID']))\n", + "\n", + "nodes = {\n", + " 'Appt': ProcessGroup(['Appt_Type'], partition_appt_type),\n", + " 'HSI': ProcessGroup(['TREATMENT_ID'], partition_treatment_id),\n", + "}\n", + "\n", + "bundles = [\n", + " Bundle('Appt', 'HSI'),\n", + "]\n", + "\n", + "ordering = [\n", + " ['Appt'], # left\n", + " ['HSI'], # right\n", + " ]\n", + "\n", + "# Set the color for each appt type\n", + "palette = {'IPAdmission': 'lightsteelblue', 'InpatientDays': 'skyblue',\n", + " 'Over5OPD': 'cornflowerblue', 'Under5OPD': 'steelblue',\n", + " 'AntenatalFirst': 'plum', 'ANCSubsequent': 'hotpink',\n", + " 'CompDelivery': 'tomato', 'NormalDelivery': 'darksalmon',\n", + " 'FamPlan': 'gold', 'MajorSurg': 'orange', 'ConWithDCSA': 'mediumpurple',\n", + " 'MaleCirc': 'lightgreen', 'NewAdult': 'mediumseagreen',\n", + " 'VCTNegative': 'greenyellow', 'VCTPositive': 'olivedrab',\n", + " }\n", + "\n", + "# Sankey diagram definition (SDD)\n", + "sdd = SankeyDefinition(nodes, bundles, ordering, 
flow_partition=partition_appt_type)\n", + "\n", + "# Generate and save Sankey\n", + "sankey_num_appt_by_hsi = weave(sdd, num_appt_by_hsi, palette=palette, measures='value').to_widget(**size)\n", + "\n", + "sankey_num_appt_by_hsi.auto_save_png(make_graph_file_name('Sankey_num_appt_by_hsi'))\n", + "\n" + ] } ], "metadata": { From b59d3804b7cff242cf9584ba719e3323f1a413e9 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 15 Dec 2021 14:21:47 +0000 Subject: [PATCH 032/131] Raise an issue and todo --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index a4597bbd44..6a1f6e661e 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -106,5 +106,10 @@ appts_by_treatment_id = \ hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0) +# Todo: Since the resulted appts_by_treatment_id deleted many hsi events \ +# (i.e., the hsi list is much shorter than in num_hsi_by_treatment_id), \ +# will try delete empty entries first then apply drop_duplicates(). + + # Plot... 
# See the Sankey plot in analysis_sankey_appt_and_hsi.ipynb (in the same folder) From 19aaf5bdc60522ed679db626df7532d1d837e42c Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 15 Dec 2021 19:05:24 +0000 Subject: [PATCH 033/131] Rename weird modules --- .../analysis_hsi_in_typical_run.py | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 6a1f6e661e..1a452667d4 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -61,9 +61,12 @@ # Number of HSI that are taking place by originating module, by month year = 2016 hsi["Module"] = hsi["TREATMENT_ID"].str.split('_').apply(lambda x: x[0]) -# todo: Rename Module HSI to Generic First Appointment? \ -# (Q: The treatment_id for module HSI seems not related to Generic First Appt; \ -# Besides, there are similar modules called GenericEmergency... and GenericFirstAppt...) +# Rename Module 'HSI' to 'CareOfWomenDuringPregnancy' +hsi["Module"] = hsi["Module"].replace("HSI", "CareOfWomenDuringPregnancy") +# Rename Module 'GenericEmergency...' and 'GenericFirstAppt...' 
to 'HealthSeekingBehaviour' (or 'GenericFirstAppt') +hsi["Module"] = hsi["Module"].replace("GenericFirstApptAtFacilityLevel0", "HealthSeekingBehaviour") +hsi["Module"] = hsi["Module"].replace("GenericEmergencyFirstApptAtFacilityLevel1", "HealthSeekingBehaviour") + evs = hsi.loc[hsi.date.dt.year == year]\ .groupby(by=['month', 'Module'])\ .size().reset_index().rename(columns={0: 'count'})\ @@ -71,12 +74,12 @@ # Plot: # Use colormap tab20 so that each module has a unique color -color_tab20 = get_cmap('tab20_r') +color_tab20 = get_cmap('tab20') evs.plot.bar(stacked=True, color=color_tab20.colors) plt.title(f"HSI by Module, per Month (year {year})") plt.ylabel('Total per month') plt.tight_layout() -plt.legend(ncol=3, loc='center', fontsize='xx-small') +plt.legend(ncol=3, loc='center', fontsize='x-small') plt.savefig(make_graph_file_name('HSI_per_module_per_month')) plt.show() @@ -84,6 +87,7 @@ evs = pd.DataFrame(hsi.groupby(by=['Module']).size()) # Calculate the fraction evs[1] = 100*evs[0]/evs[0].sum() +color_tab20 = get_cmap('tab20_r') patches, texts = plt.pie(evs[0], colors=color_tab20.colors) labels = ['{0} - {1:1.2f} %'.format(i, j) for i, j in zip(evs.index, evs[1])] # Sort legend @@ -92,7 +96,7 @@ patches, labels, dummy = zip(*sorted(zip(patches, labels, evs[0]), key=lambda x: x[2], reverse=True)) -plt.legend(patches, labels, ncol=3, loc='lower center', fontsize='xx-small') +plt.legend(patches, labels, ncol=3, loc='lower center', fontsize='x-small') plt.title("HSI by Module (year 2010-2018)") plt.tight_layout() plt.savefig(make_graph_file_name('HSI_per_module')) @@ -108,7 +112,17 @@ # Todo: Since the resulted appts_by_treatment_id deleted many hsi events \ # (i.e., the hsi list is much shorter than in num_hsi_by_treatment_id), \ -# will try delete empty entries first then apply drop_duplicates(). +# need to regenerate this table. 
+# appts_by_treatment_id_alt = hsi[["TREATMENT_ID", 'Number_By_Appt_Type_Code']] +# # drop rows that have empty entries +# null_appt_idx = appts_by_treatment_id_alt[appts_by_treatment_id_alt['Number_By_Appt_Type_Code'] == {}].index +# appts_by_treatment_id_alt = appts_by_treatment_id_alt.drop(index=null_appt_idx).copy() +# # set index +# appts_by_treatment_id_alt.set_index('TREATMENT_ID', inplace=True) +# # turn to series +# appts_by_treatment_id_alt = appts_by_treatment_id_alt.squeeze('columns') +# # drop repetitive rows that have same index and value +# # appts_by_treatment_id_alt = appts_by_treatment_id_alt.drop_duplicates() # Plot... From 649375fbc96bcd653a9b773e499d28875c6c38a7 Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 16 Dec 2021 11:42:18 +0000 Subject: [PATCH 034/131] Reorder the nodes to make clear sankey --- ...lysis_sankey_coarse_officer_and_appt.ipynb | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb index c56694c88a..94890a6ad6 100644 --- a/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 30, "id": "913300ab", "metadata": { "pycharm": { @@ -94,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 31, "id": "b3e06de5", "metadata": { "pycharm": { @@ -105,12 +105,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8b6451ea2a6345079a8905078efed1ae", + "model_id": "7264a742fef54db29db4bcc7a1b84194", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "SankeyWidget(groups=[{'id': 'Officer', 'type': 'process', 'title': 
'', 'nodes': ['Officer^Clinical', 'Officer^…" + "SankeyWidget(groups=[{'id': 'Officer', 'type': 'process', 'title': '', 'nodes': ['Officer^DCSA', 'Officer^Clin…" ] }, "metadata": {}, @@ -133,11 +133,24 @@ "\n", "size = dict(width=800, height=800, margins=dict(left=180, right=180))\n", "\n", + "# Different ways to order nodes\n", + "# Sorted\n", + "# partition_officer_cat = Partition.Simple('Officer_Category',\n", + "# np.unique(flow_coarse_officer_appt['Officer_Category']))\n", + "# Unsorted\n", + "# partition_officer_cat = Partition.Simple('Officer_Category',\n", + "# pd.unique(pd.Series(flow_coarse_officer_appt['Officer_Category'])))\n", + "# Fixed\n", "partition_officer_cat = Partition.Simple('Officer_Category',\n", - " np.unique(flow_coarse_officer_appt['Officer_Category']))\n", + " pd.array(['DCSA', 'Clinical', 'Nursing and Midwifery', 'Pharmacy',\n", + " 'Laboratory', 'Dental', 'Radiography', 'Mental']))\n", "\n", + "# partition_appt_cat = Partition.Simple('Appt_Cat',\n", + "# np.unique(flow_coarse_officer_appt['Appt_Cat']))\n", "partition_appt_cat = Partition.Simple('Appt_Cat',\n", - " np.unique(flow_coarse_officer_appt['Appt_Cat']))\n", + " pd.array(['ConWithDCSA', 'IPOP', 'RMNCH', 'MISC',\n", + " 'HIV', 'TB', 'NUTRITION', 'LABORATORY',\n", + " 'DENTAL', 'RADIOGRAPHY', 'MENTAL']))\n", "\n", "partition_facility_level = Partition.Simple('Facility_Level',\n", " np.unique(flow_coarse_officer_appt['Facility_Level']))\n", @@ -179,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 32, "id": "fe0021cb", "metadata": { "scrolled": true @@ -188,7 +201,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1e700fc889c34d19ac176aff4077908d", + "model_id": "bb749fcbe713456bbcfdeaeb940c8251", "version_major": 2, "version_minor": 0 }, @@ -219,8 +232,11 @@ "\n", " size = dict(width=800, height=h, margins=dict(left=180, right=180))\n", "\n", - " partition_officer_cat = Partition.Simple('Officer_Category', 
np.unique(flow_coarse_officer_level['Officer_Category']))\n", - " partition_appt_type = Partition.Simple('Appt_Type_Code', np.unique(flow_coarse_officer_level['Appt_Type_Code']))\n", + " partition_officer_cat = Partition.Simple('Officer_Category',\n", + " pd.array(['DCSA', 'Clinical', 'Nursing and Midwifery', 'Pharmacy',\n", + " 'Laboratory', 'Radiography', 'Dental', 'Mental']))\n", + " partition_appt_type = Partition.Simple('Appt_Type_Code', pd.unique(pd.Series(appt_types['Appt_Type_Code'])))\n", + "\n", "\n", " nodes = {\n", " 'Officer': ProcessGroup(['Officer_Category'], partition_officer_cat),\n", From 48272f5e242fe4637043398446135696df8465fe Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 16 Dec 2021 12:05:55 +0000 Subject: [PATCH 035/131] Todo: update input data and reset node order --- .../analysis_sankey_appt_and_hsi.ipynb | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb index a8df49e85e..b54a1fd748 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 46, "metadata": { "pycharm": { "name": "#%%\n" @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 47, "metadata": { "pycharm": { "name": "#%%\n" @@ -102,6 +102,12 @@ "hsi[\"date\"] = pd.to_datetime(hsi[\"date\"])\n", "hsi[\"month\"] = hsi[\"date\"].dt.month\n", "hsi[\"Module\"] = hsi[\"TREATMENT_ID\"].str.split('_').apply(lambda x: x[0])\n", + "# # Rename some modules\n", + "# # Rename Module 'HSI' to 'CareOfWomenDuringPregnancy'\n", + "# hsi[\"Module\"] = hsi[\"Module\"].replace(\"HSI\", \"CareOfWomenDuringPregnancy\")\n", + "# # Rename Module 
'GenericEmergency...' and 'GenericFirstAppt...' to 'HealthSeekingBehaviour' (or 'GenericFirstAppt')\n", + "# hsi[\"Module\"] = hsi[\"Module\"].replace(\"GenericFirstApptAtFacilityLevel0\", \"HealthSeekingBehaviour\")\n", + "# hsi[\"Module\"] = hsi[\"Module\"].replace(\"GenericEmergencyFirstApptAtFacilityLevel1\", \"HealthSeekingBehaviour\")\n", "\n", "# Demand for appointments\n", "num_hsi_by_treatment_id = pd.DataFrame(hsi.groupby(hsi.TREATMENT_ID)['Number_By_Appt_Type_Code'].size())\n", @@ -114,7 +120,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 48, "metadata": { "pycharm": { "name": "#%%\n" @@ -123,11 +129,11 @@ "outputs": [ { "data": { - "text/plain": "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ANCSubsequent', 'Appt^Ante…", + "text/plain": "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^Over5OPD', 'Appt^Under5OPD…", "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, - "model_id": "589ac656dff4489a9dc70869184214d9" + "model_id": "9c99b64fb36c43f899a8ce6e2eeee64f" } }, "metadata": {}, @@ -150,9 +156,12 @@ "\n", "size = dict(width=1000, height=800, margins=dict(left=120, right=520))\n", "\n", - "partition_appt_type = Partition.Simple('Appt_Type', np.unique(appt_and_hsi['Appt_Type']))\n", - "\n", - "partition_treatment_id = Partition.Simple('TREATMENT_ID', np.unique(appt_and_hsi['TREATMENT_ID']))\n", + "# partition_appt_type = Partition.Simple('Appt_Type', np.unique(appt_and_hsi['Appt_Type']))\n", + "# partition_treatment_id = Partition.Simple('TREATMENT_ID', np.unique(appt_and_hsi['TREATMENT_ID']))\n", + "# if to keep the order\n", + "partition_appt_type = Partition.Simple('Appt_Type', pd.unique(pd.Series(appt_and_hsi['Appt_Type'])))\n", + "partition_treatment_id = Partition.Simple('TREATMENT_ID', pd.unique(pd.Series(appt_and_hsi['TREATMENT_ID'])))\n", + "# todo: once the input data updated, can fix the oder of the nodes 
in the way we want\n", "\n", "nodes = {\n", " 'Appt': ProcessGroup(['Appt_Type'], partition_appt_type),\n", @@ -189,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 49, "metadata": { "pycharm": { "name": "#%%\n" @@ -202,7 +211,7 @@ "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, - "model_id": "271213232ff14b2ba3199c1390d8c044" + "model_id": "18543c73a4714391ba867cd8b92d8d53" } }, "metadata": {}, @@ -262,8 +271,7 @@ "# Generate and save Sankey\n", "sankey_num_appt_by_hsi = weave(sdd, num_appt_by_hsi, palette=palette, measures='value').to_widget(**size)\n", "\n", - "sankey_num_appt_by_hsi.auto_save_png(make_graph_file_name('Sankey_num_appt_by_hsi'))\n", - "\n" + "sankey_num_appt_by_hsi.auto_save_png(make_graph_file_name('Sankey_num_appt_by_hsi'))" ] } ], From 50134d0e1b8f6e7cbab637d8cf4bf2c7e496503d Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 16 Dec 2021 16:17:29 +0000 Subject: [PATCH 036/131] Fix the issue of missing HSI events --- .../analysis_hsi_in_typical_run.py | 28 ++++++++----------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 1a452667d4..a676ce7ad9 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -107,23 +107,17 @@ num_hsi_by_treatment_id = hsi.groupby(hsi.TREATMENT_ID)['Number_By_Appt_Type_Code'].size() # find the appt footprint for each treatment_id -appts_by_treatment_id = \ - hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0) - -# Todo: Since the resulted appts_by_treatment_id deleted many hsi events \ -# (i.e., the hsi list is much shorter than in num_hsi_by_treatment_id), \ -# need to regenerate this table. 
-# appts_by_treatment_id_alt = hsi[["TREATMENT_ID", 'Number_By_Appt_Type_Code']] -# # drop rows that have empty entries -# null_appt_idx = appts_by_treatment_id_alt[appts_by_treatment_id_alt['Number_By_Appt_Type_Code'] == {}].index -# appts_by_treatment_id_alt = appts_by_treatment_id_alt.drop(index=null_appt_idx).copy() -# # set index -# appts_by_treatment_id_alt.set_index('TREATMENT_ID', inplace=True) -# # turn to series -# appts_by_treatment_id_alt = appts_by_treatment_id_alt.squeeze('columns') -# # drop repetitive rows that have same index and value -# # appts_by_treatment_id_alt = appts_by_treatment_id_alt.drop_duplicates() - +# in hsi, drop rows with empty 'Number_By_Appt_Type_Code' to avoid warnings of empty series +null_hsi_idx = hsi[hsi['Number_By_Appt_Type_Code'] == {}].index +hsi.drop(index=null_hsi_idx, inplace=True) +# generate the table +appts_by_treatment_id =pd.DataFrame({ + _treatment_id: pd.Series(hsi.loc[hsi.TREATMENT_ID == _treatment_id, 'Number_By_Appt_Type_Code'].values[0]) for _treatment_id in num_hsi_by_treatment_id.index +}).fillna(0.0).T + +# the tricky one that omits many hsi events +# appts_by_treatment_id = \ +# hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0) # Plot... 
# See the Sankey plot in analysis_sankey_appt_and_hsi.ipynb (in the same folder) From 921336b62ca30ad157387ca00e1a2acfc4b80b8e Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 16 Dec 2021 17:14:47 +0000 Subject: [PATCH 037/131] Correct a typo for a treatment_id in Joe's module --- src/tlo/methods/care_of_women_during_pregnancy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tlo/methods/care_of_women_during_pregnancy.py b/src/tlo/methods/care_of_women_during_pregnancy.py index a8b6e5cd9d..00c99973b0 100644 --- a/src/tlo/methods/care_of_women_during_pregnancy.py +++ b/src/tlo/methods/care_of_women_during_pregnancy.py @@ -2578,7 +2578,7 @@ def __init__(self, module, person_id): super().__init__(module, person_id=person_id) assert isinstance(module, CareOfWomenDuringPregnancy) - self.TREATMENT_ID = 'HSI_CareOfWomenDuringPregnancy_PresentsForInductionOfLabour' + self.TREATMENT_ID = 'CareOfWomenDuringPregnancy_PresentsForInductionOfLabour' the_appt_footprint = self.sim.modules['HealthSystem'].get_blank_appt_footprint() the_appt_footprint['Over5OPD'] = 1 From ab8923c16e8731010245922290d34a7931386d29 Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 16 Dec 2021 17:48:48 +0000 Subject: [PATCH 038/131] Do nor rename modules --- .../analysis_hsi_in_typical_run.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index a676ce7ad9..7ef5b0ae53 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -62,10 +62,10 @@ year = 2016 hsi["Module"] = hsi["TREATMENT_ID"].str.split('_').apply(lambda x: x[0]) # Rename Module 'HSI' to 'CareOfWomenDuringPregnancy' -hsi["Module"] = hsi["Module"].replace("HSI", "CareOfWomenDuringPregnancy") +# hsi["Module"] = 
hsi["Module"].replace("HSI", "CareOfWomenDuringPregnancy") # Rename Module 'GenericEmergency...' and 'GenericFirstAppt...' to 'HealthSeekingBehaviour' (or 'GenericFirstAppt') -hsi["Module"] = hsi["Module"].replace("GenericFirstApptAtFacilityLevel0", "HealthSeekingBehaviour") -hsi["Module"] = hsi["Module"].replace("GenericEmergencyFirstApptAtFacilityLevel1", "HealthSeekingBehaviour") +# hsi["Module"] = hsi["Module"].replace("GenericFirstApptAtFacilityLevel0", "HealthSeekingBehaviour") +# hsi["Module"] = hsi["Module"].replace("GenericEmergencyFirstApptAtFacilityLevel1", "HealthSeekingBehaviour") evs = hsi.loc[hsi.date.dt.year == year]\ .groupby(by=['month', 'Module'])\ @@ -74,12 +74,12 @@ # Plot: # Use colormap tab20 so that each module has a unique color -color_tab20 = get_cmap('tab20') +color_tab20 = get_cmap('tab20_r') evs.plot.bar(stacked=True, color=color_tab20.colors) plt.title(f"HSI by Module, per Month (year {year})") plt.ylabel('Total per month') plt.tight_layout() -plt.legend(ncol=3, loc='center', fontsize='x-small') +plt.legend(ncol=3, loc='center', fontsize='xx-small') plt.savefig(make_graph_file_name('HSI_per_module_per_month')) plt.show() @@ -96,7 +96,7 @@ patches, labels, dummy = zip(*sorted(zip(patches, labels, evs[0]), key=lambda x: x[2], reverse=True)) -plt.legend(patches, labels, ncol=3, loc='lower center', fontsize='x-small') +plt.legend(patches, labels, ncol=3, loc='lower center', fontsize='xx-small') plt.title("HSI by Module (year 2010-2018)") plt.tight_layout() plt.savefig(make_graph_file_name('HSI_per_module')) From eed1444ca9e818d903a184130f59e66176883cf6 Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 16 Dec 2021 18:18:42 +0000 Subject: [PATCH 039/131] Refactor --- .../analysis_sankey_appt_and_hsi.ipynb | 75 ++++++++++--------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb 
b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb index b54a1fd748..c2776e892d 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 73, "metadata": { "pycharm": { "name": "#%%\n" @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 74, "metadata": { "pycharm": { "name": "#%%\n" @@ -102,25 +102,30 @@ "hsi[\"date\"] = pd.to_datetime(hsi[\"date\"])\n", "hsi[\"month\"] = hsi[\"date\"].dt.month\n", "hsi[\"Module\"] = hsi[\"TREATMENT_ID\"].str.split('_').apply(lambda x: x[0])\n", - "# # Rename some modules\n", - "# # Rename Module 'HSI' to 'CareOfWomenDuringPregnancy'\n", - "# hsi[\"Module\"] = hsi[\"Module\"].replace(\"HSI\", \"CareOfWomenDuringPregnancy\")\n", - "# # Rename Module 'GenericEmergency...' and 'GenericFirstAppt...' to 'HealthSeekingBehaviour' (or 'GenericFirstAppt')\n", - "# hsi[\"Module\"] = hsi[\"Module\"].replace(\"GenericFirstApptAtFacilityLevel0\", \"HealthSeekingBehaviour\")\n", - "# hsi[\"Module\"] = hsi[\"Module\"].replace(\"GenericEmergencyFirstApptAtFacilityLevel1\", \"HealthSeekingBehaviour\")\n", "\n", "# Demand for appointments\n", "num_hsi_by_treatment_id = pd.DataFrame(hsi.groupby(hsi.TREATMENT_ID)['Number_By_Appt_Type_Code'].size())\n", "num_hsi_by_treatment_id.rename(columns={'Number_By_Appt_Type_Code': 'Number_of_HSI'}, inplace=True)\n", + "# Note that some hsi events, e.g. 
VCTPositive, have zero number/frequency.\n", "\n", - "# Find the appt footprint for each treatment_id\n", - "appts_by_treatment_id = \\\n", + "# find the appt footprint for each treatment_id\n", + "# in hsi, drop rows with empty 'Number_By_Appt_Type_Code' to avoid warnings of empty series\n", + "null_hsi_idx = hsi[hsi['Number_By_Appt_Type_Code'] == {}].index\n", + "hsi.drop(index=null_hsi_idx, inplace=True)\n", + "\n", + "# generate the full table\n", + "appts_by_treatment_id_full =pd.DataFrame({\n", + " _treatment_id: pd.Series(hsi.loc[hsi.TREATMENT_ID == _treatment_id, 'Number_By_Appt_Type_Code'].values[0]) for _treatment_id in num_hsi_by_treatment_id.index\n", + "}).fillna(0.0).T\n", + "\n", + "# generate the short table (the tricky one that omits many hsi events)\n", + "appts_by_treatment_id_short = \\\n", " hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0)" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 75, "metadata": { "pycharm": { "name": "#%%\n" @@ -129,12 +134,14 @@ "outputs": [ { "data": { - "text/plain": "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^Over5OPD', 'Appt^Under5OPD…", "application/vnd.jupyter.widget-view+json": { + "model_id": "2e4285d845124aae928d7bbf409157a9", "version_major": 2, - "version_minor": 0, - "model_id": "9c99b64fb36c43f899a8ce6e2eeee64f" - } + "version_minor": 0 + }, + "text/plain": [ + "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^Over5OPD', 'Appt^Under5OPD…" + ] }, "metadata": {}, "output_type": "display_data" @@ -144,11 +151,10 @@ "# Sankey 0 that map appt to hsi considering only appt footprint for each hsi\n", "\n", "# Prepare the data: appt type and number per hsi\n", - "appt_and_hsi = appts_by_treatment_id.reset_index().copy()\n", + "appt_and_hsi = appts_by_treatment_id_short.reset_index().copy()\n", + "appt_and_hsi.rename(columns={'index': 'TREATMENT_ID'}, 
inplace=True)\n", "appt_and_hsi = pd.melt(appt_and_hsi, id_vars=['TREATMENT_ID'], value_vars=appt_and_hsi.columns[1:],\n", " var_name='Appt_Type')\n", - "# Only consider non-zero entries\n", - "# appt_and_hsi = appt_and_hsi[appt_and_hsi['value'] > 0].copy()\n", "\n", "# Define the flow\n", "appt_and_hsi['source'] = 'Appt_Type'\n", @@ -156,12 +162,12 @@ "\n", "size = dict(width=1000, height=800, margins=dict(left=120, right=520))\n", "\n", - "# partition_appt_type = Partition.Simple('Appt_Type', np.unique(appt_and_hsi['Appt_Type']))\n", - "# partition_treatment_id = Partition.Simple('TREATMENT_ID', np.unique(appt_and_hsi['TREATMENT_ID']))\n", + "partition_appt_type = Partition.Simple('Appt_Type', np.unique(appt_and_hsi['Appt_Type']))\n", + "partition_treatment_id = Partition.Simple('TREATMENT_ID', np.unique(appt_and_hsi['TREATMENT_ID']))\n", "# if to keep the order\n", - "partition_appt_type = Partition.Simple('Appt_Type', pd.unique(pd.Series(appt_and_hsi['Appt_Type'])))\n", - "partition_treatment_id = Partition.Simple('TREATMENT_ID', pd.unique(pd.Series(appt_and_hsi['TREATMENT_ID'])))\n", - "# todo: once the input data updated, can fix the oder of the nodes in the way we want\n", + "# partition_appt_type = Partition.Simple('Appt_Type', pd.unique(pd.Series(appt_and_hsi['Appt_Type'])))\n", + "# partition_treatment_id = Partition.Simple('TREATMENT_ID', pd.unique(pd.Series(appt_and_hsi['TREATMENT_ID'])))\n", + "# Can fix the oder of the nodes in the way we want\n", "\n", "nodes = {\n", " 'Appt': ProcessGroup(['Appt_Type'], partition_appt_type),\n", @@ -198,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 76, "metadata": { "pycharm": { "name": "#%%\n" @@ -207,12 +213,14 @@ "outputs": [ { "data": { - "text/plain": "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ANCSubsequent', 'Appt^Ante…", "application/vnd.jupyter.widget-view+json": { + "model_id": "80aaa0f35ab641308c4c7293f5ca0f08", 
"version_major": 2, - "version_minor": 0, - "model_id": "18543c73a4714391ba867cd8b92d8d53" - } + "version_minor": 0 + }, + "text/plain": [ + "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ANCSubsequent', 'Appt^Ante…" + ] }, "metadata": {}, "output_type": "display_data" @@ -221,15 +229,14 @@ "source": [ "# Sankey 1 that maps appt to hsi considering appt footprint for each hsi and number of each hsi\n", "\n", - "# Prepare the data plus: total number of appts per hsi for year 2010-2018\n", - "num_appt_by_hsi = appts_by_treatment_id.copy()\n", + "# Prepare the data: total number of appts per hsi for year 2010-2018\n", + "num_appt_by_hsi = appts_by_treatment_id_full.copy()\n", "for event in num_appt_by_hsi.index:\n", - " num_appt_by_hsi.loc[event,:] = appts_by_treatment_id.loc[event,:] * num_hsi_by_treatment_id.loc[event, 'Number_of_HSI']\n", + " num_appt_by_hsi.loc[event,:] = appts_by_treatment_id_full.loc[event,:] * num_hsi_by_treatment_id.loc[event, 'Number_of_HSI']\n", "num_appt_by_hsi = num_appt_by_hsi.reset_index().copy()\n", + "num_appt_by_hsi.rename(columns={'index': 'TREATMENT_ID'}, inplace=True)\n", "num_appt_by_hsi = pd.melt(num_appt_by_hsi, id_vars=['TREATMENT_ID'], value_vars=num_appt_by_hsi.columns[1:],\n", " var_name='Appt_Type')\n", - "# Only consider non-zero entries\n", - "# num_appt_by_hsi = num_appt_by_hsi[num_appt_by_hsi['value'] > 0].copy()\n", "\n", "# Define the flow\n", "num_appt_by_hsi['source'] = 'Appt_Type'\n", @@ -271,7 +278,7 @@ "# Generate and save Sankey\n", "sankey_num_appt_by_hsi = weave(sdd, num_appt_by_hsi, palette=palette, measures='value').to_widget(**size)\n", "\n", - "sankey_num_appt_by_hsi.auto_save_png(make_graph_file_name('Sankey_num_appt_by_hsi'))" + "sankey_num_appt_by_hsi.auto_save_png(make_graph_file_name('Sankey_num_appt_by_hsi'))\n" ] } ], From bc100b3df81f8982332ea540d3b7718dacd5dc54 Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 16 Dec 2021 18:26:22 +0000 Subject: [PATCH 
040/131] Refactor --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 7ef5b0ae53..714d454e7d 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -111,8 +111,10 @@ null_hsi_idx = hsi[hsi['Number_By_Appt_Type_Code'] == {}].index hsi.drop(index=null_hsi_idx, inplace=True) # generate the table -appts_by_treatment_id =pd.DataFrame({ - _treatment_id: pd.Series(hsi.loc[hsi.TREATMENT_ID == _treatment_id, 'Number_By_Appt_Type_Code'].values[0]) for _treatment_id in num_hsi_by_treatment_id.index +appts_by_treatment_id = pd.DataFrame({ + _treatment_id: pd.Series( + hsi.loc[hsi.TREATMENT_ID == _treatment_id, 'Number_By_Appt_Type_Code'].values[0] + ) for _treatment_id in num_hsi_by_treatment_id.index }).fillna(0.0).T # the tricky one that omits many hsi events From 81def5828ae942952563f38b307feb2e9ac73826 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 17 Dec 2021 11:07:42 +0000 Subject: [PATCH 041/131] Note down a possible issue --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 714d454e7d..c1f047576d 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -118,8 +118,13 @@ }).fillna(0.0).T # the tricky one that omits many hsi events -# appts_by_treatment_id = \ -# hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0) 
+appts_by_treatment_id_short = \ + hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0) + +# A possible issue: +# set(appts_by_treatment_id_short.columns)-set(appts_by_treatment_id.columns) +# the output is: {'NormalDelivery', 'VCTPositive'} +# not clear yet why the two appts are not in the table appts_by_treatment_id # Plot... # See the Sankey plot in analysis_sankey_appt_and_hsi.ipynb (in the same folder) From 6a787031d1800059a554847e96bdf6ca5aa411ff Mon Sep 17 00:00:00 2001 From: Bingling Date: Sat, 18 Dec 2021 09:57:05 +0000 Subject: [PATCH 042/131] Refactor --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index c1f047576d..aa218f81ac 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -61,11 +61,6 @@ # Number of HSI that are taking place by originating module, by month year = 2016 hsi["Module"] = hsi["TREATMENT_ID"].str.split('_').apply(lambda x: x[0]) -# Rename Module 'HSI' to 'CareOfWomenDuringPregnancy' -# hsi["Module"] = hsi["Module"].replace("HSI", "CareOfWomenDuringPregnancy") -# Rename Module 'GenericEmergency...' and 'GenericFirstAppt...' 
to 'HealthSeekingBehaviour' (or 'GenericFirstAppt') -# hsi["Module"] = hsi["Module"].replace("GenericFirstApptAtFacilityLevel0", "HealthSeekingBehaviour") -# hsi["Module"] = hsi["Module"].replace("GenericEmergencyFirstApptAtFacilityLevel1", "HealthSeekingBehaviour") evs = hsi.loc[hsi.date.dt.year == year]\ .groupby(by=['month', 'Module'])\ From 57f61195caf1c1ac9f07fc0a3c1ef32be51c7efb Mon Sep 17 00:00:00 2001 From: Bingling Date: Sun, 19 Dec 2021 16:25:01 +0000 Subject: [PATCH 043/131] Fix nodes order in the sankey --- .../analysis_sankey_appt_and_hsi.ipynb | 53 +++++++++++-------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb index c2776e892d..c3c933a6b1 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 35, "metadata": { "pycharm": { "name": "#%%\n" @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 36, "metadata": { "pycharm": { "name": "#%%\n" @@ -74,7 +74,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Results folder is: C:\\Users\\jdbb1\\Desktop\\TLOmodel\\outputs\\bshe@ic.ac.uk\\scenario_hsi_in_typical_run-2021-12-10T180225Z\n" + "Results folder is: C:\\Users\\jdbb1\\Desktop\\TLOmodel\\outputs\\bshe@ic.ac.uk\\scenario_hsi_in_typical_run-2021-12-17T212823Z\n" ] } ], @@ -125,7 +125,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 37, "metadata": { "pycharm": { "name": "#%%\n" @@ -134,14 +134,12 @@ "outputs": [ { "data": { + "text/plain": "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^IPAdmission', 'Appt^Inpati…", "application/vnd.jupyter.widget-view+json": { 
- "model_id": "2e4285d845124aae928d7bbf409157a9", "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^Over5OPD', 'Appt^Under5OPD…" - ] + "version_minor": 0, + "model_id": "7e191fcd3cd24c41a1f3476f35698a4c" + } }, "metadata": {}, "output_type": "display_data" @@ -162,12 +160,27 @@ "\n", "size = dict(width=1000, height=800, margins=dict(left=120, right=520))\n", "\n", - "partition_appt_type = Partition.Simple('Appt_Type', np.unique(appt_and_hsi['Appt_Type']))\n", - "partition_treatment_id = Partition.Simple('TREATMENT_ID', np.unique(appt_and_hsi['TREATMENT_ID']))\n", - "# if to keep the order\n", + "# Nodes in alphabetic order\n", + "# partition_appt_type = Partition.Simple('Appt_Type', np.unique(appt_and_hsi['Appt_Type']))\n", + "# partition_treatment_id = Partition.Simple('TREATMENT_ID', np.unique(appt_and_hsi['TREATMENT_ID']))\n", + "# if to keep the order in the dataframe\n", "# partition_appt_type = Partition.Simple('Appt_Type', pd.unique(pd.Series(appt_and_hsi['Appt_Type'])))\n", "# partition_treatment_id = Partition.Simple('TREATMENT_ID', pd.unique(pd.Series(appt_and_hsi['TREATMENT_ID'])))\n", - "# Can fix the oder of the nodes in the way we want\n", + "# if to fix the oder of the nodes in the way we want\n", + "partition_appt_type = Partition.Simple('Appt_Type', pd.array([\n", + " 'IPAdmission', 'InpatientDays', 'Over5OPD', 'Under5OPD',\n", + " 'AntenatalFirst', 'ANCSubsequent', 'CompDelivery', 'NormalDelivery',\n", + " 'FamPlan', 'MajorSurg', 'ConWithDCSA',\n", + " 'MaleCirc', 'NewAdult', 'VCTNegative', 'VCTPositive']))\n", + "partition_treatment_id = Partition.Simple('TREATMENT_ID', pd.array([\n", + " 'Malaria_treatment_complicated_child', 'Malaria_IPTp',\n", + " 'Diarrhoea_Treatment_Inpatient', 'Depression_Antidepressant_Refill',\n", + " 'PostnatalSupervisor_NeonatalWardInpatientCare', 'CareOfWomenDuringPregnancy_FirstAntenatalCareContact',\n", + " 
'CareOfWomenDuringPregnancy_AntenatalOutpatientManagementOfAnaemia',\n", + " 'CareOfWomenDuringPregnancy_PostAbortionCaseManagement', 'Labour_ReceivesSkilledBirthAttendanceDuringLabour',\n", + " 'Contraception_FamilyPlanningAppt', 'GenericEmergencyFirstApptAtFacilityLevel1',\n", + " 'GenericFirstApptAtFacilityLevel0', 'OesophagealCancer_StartTreatment', 'OtherAdultCancer_StartTreatment',\n", + " 'Hiv_Circumcision', 'Hiv_Treatment_InitiationOrContinuation', 'Hiv_TestAndRefer']))\n", "\n", "nodes = {\n", " 'Appt': ProcessGroup(['Appt_Type'], partition_appt_type),\n", @@ -204,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 38, "metadata": { "pycharm": { "name": "#%%\n" @@ -213,14 +226,12 @@ "outputs": [ { "data": { + "text/plain": "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ANCSubsequent', 'Appt^Ante…", "application/vnd.jupyter.widget-view+json": { - "model_id": "80aaa0f35ab641308c4c7293f5ca0f08", "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ANCSubsequent', 'Appt^Ante…" - ] + "version_minor": 0, + "model_id": "f62cf5beaf3e4bc096cf5e3181722cbd" + } }, "metadata": {}, "output_type": "display_data" From cf51d930774cb6241fb529bf7724706eb8ce3f2f Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 10 Feb 2022 18:13:30 +0000 Subject: [PATCH 044/131] Embark the update of healthsystem data --- src/scripts/data_file_processing/formatting_healthsystem_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data.py b/src/scripts/data_file_processing/formatting_healthsystem_data.py index f9950d8dd6..228605fa97 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data.py @@ -36,6 +36,7 @@ Scenario 'funded' -> appt_have_or_miss_capability Scenario 
'funded_plus' -> appt_have_or_miss_capability """ +# Task: incorporate the new version of human-resources input data from pathlib import Path From b19ea34f785ecee3b7e0e70955da479501b4a7dc Mon Sep 17 00:00:00 2001 From: BinglingICL Date: Mon, 14 Feb 2022 14:20:02 +0000 Subject: [PATCH 045/131] Generate a new file to process the new CHAI data --- .../formatting_healthsystem_data_update.py | 1814 +++++++++++++++++ 1 file changed, 1814 insertions(+) create mode 100644 src/scripts/data_file_processing/formatting_healthsystem_data_update.py diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py new file mode 100644 index 0000000000..b2d3091bcc --- /dev/null +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -0,0 +1,1814 @@ +""" +This file sets up the health system resources for each district, each region, and also national level. + +It defines 7 levels for facility types, i.e., Facility_Levels = [0,1a,1b,2,3,4,5]. + +It creates one facility of each level for each district. + +It allocates health care workers ('officers') to one of the seven Facility Levels. 
+ +The tables generated is listed below: +- capability tables to repo + Scenario 'actual' -> ResourceFile_Daily_Capabilities (./resources/healthsystem/human_resources/actual/) + Scenario 'funded' -> ResourceFile_Daily_Capabilities (./resources/healthsystem/human_resources/funded/) + Scenario 'funded_plus' -> ResourceFile_Daily_Capabilities (./resources/healthsystem/human_resources/funded_plus/) + +- definition tables to repo + ResourceFile_Appt_Time_Table (./resources/healthsystem/human_resources/definitions/) + ResourceFile_Appt_Types_Table (./resources/healthsystem/human_resources/definitions/) + ResourceFile_ApptType_By_FacLevel_Table (./resources/healthsystem/human_resources/definitions/) + ResourceFile_Officers_Types_Table (./resources/healthsystem/human_resources/definitions/) + +- organisation tables to repo + ResourceFile_Master_Facilities_List_Table (./resources/healthsystem/human_resources/organisation/) + +- other tables that can be generated by this file + Scenario 'actual' -> ResourceFile_Staff_Table + Scenario 'funded' -> ResourceFile_Staff_Table + Scenario 'funded_plus' -> ResourceFile_Staff_Table + Scenario 'actual' -> ResourceFile_Staff_Distribution_Assumption + Scenario 'funded' -> ResourceFile_Staff_Distribution_Assumption + ResourceFile_Staff_Distribution_Compare + ResourceFile_Patient_Facing_Time + ResourceFile_District_Population_Data + ResourceFile_Facilities_For_Each_District + Scenario 'actual' -> appt_have_or_miss_capability + Scenario 'funded' -> appt_have_or_miss_capability + Scenario 'funded_plus' -> appt_have_or_miss_capability +""" +# Task: incorporate the new version of human-resources input data +# Since the two data versions have quite a few differences, will first explore the new version +# and then decide how to use all the available at hand. 
+ +from pathlib import Path + +import numpy as np +import pandas as pd + +resourcefilepath = Path('./resources') + +path_to_dropbox = Path( + '/Users/jdbb1/Dropbox/Thanzi La Onse') # <-- point to the TLO dropbox locally + +workingfile = Path( + '/Users/jdbb1/OneDrive/Desktop/healthsystem data update/Malawi optimization model import_2022-02-11.xlsx' +) # <-- point to the new data locally + +path_to_auxiliaryfiles = (path_to_dropbox / + '05 - Resources' / + 'Module-healthsystem' / + 'chai ehp resource use data' / + 'Auxiliary CHAI Data from CHAI HR Team 12 Sep 2021') + +outputlocation = Path('/Users/jdbb1/OneDrive/Desktop/healthsystem data update/output') # <-- output locally + +# --------------------------------------------------------------------------------------------------------------------- +# *** create and save population_by_district data +population = pd.read_csv( + resourcefilepath/'demography'/'ResourceFile_PopulationSize_2018Census.csv' +) + +pop_by_district = pd.DataFrame(population.groupby('District')['Count'].sum()) + +# Add the column of Region +for d in pop_by_district.index: + pop_by_district.loc[d, 'Region'] = population.loc[population['District'] == d, 'Region'].values[0] + +# Save +# pop_by_district.to_csv(outputlocation / 'organisation' / 'ResourceFile_District_Population_Data.csv', index=True) + +# --------------------------------------------------------------------------------------------------------------------- +# *** Below we generate staffing tables: fund_staffing_table for funded/established staff, and\ +# curr_staffing_table for current staff +# Before generating the tables, we need to prepare wb_import, officer_types_table, and\ +# make assumptions of curr_staff_return distribution and fund_staff_return distribution using Auxiliary CHAI Data + +# --- wb_import for staff information + +# Import all of the 'CurrentStaff' sheet, including both data of current and funded staff +wb_import = pd.read_excel(workingfile, sheet_name='Staff', 
header=None) + +# --- officer_types_table +# Make dataframe summarising the officer types and the officer codes: +officer_types_table = wb_import.loc[2:3, 64:84].transpose().reset_index(drop=True).copy() +officer_types_table.columns = ['Officer_Type', 'Officer_Type_Code'] + +# Add the categories of officers +officer_types_table.loc[0:2, 'Officer_Category'] = 'Clinical' +officer_types_table.loc[3:4, 'Officer_Category'] = 'Nursing_and_Midwifery' +officer_types_table.loc[5:7, 'Officer_Category'] = 'Pharmacy' +officer_types_table.loc[8:10, 'Officer_Category'] = 'Laboratory' +officer_types_table.loc[11, 'Officer_Category'] = 'DCSA' +officer_types_table.loc[12:14, 'Officer_Category'] = 'Dental' +officer_types_table.loc[15, 'Officer_Category'] = 'Mental' +officer_types_table.loc[16, 'Officer_Category'] = 'Nutrition' +officer_types_table.loc[17:20, 'Officer_Category'] = 'Radiography' + +# Save +officer_types_table.to_csv(outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Officer_Types_Table.csv', + index=False) + +# --- Generate assumptions of current staff distribution at facility levels 0&1a&1b&2 +# Read compiled staff return data from CHAI auxiliary datasets +compiled_staff_return = pd.read_excel(path_to_auxiliaryfiles / 'Compiled Staff Returns.xlsx', + sheet_name='Compiled Staff Returns', skiprows=range(5)) + +# Get relevant columns +curr_staff_return = compiled_staff_return[['District / Central Hospital', 'MOH/ CHAM', 'Name of Incumbent', 'Cadre', + 'Health Facility', 'Health Facility Type']].copy() + +# Drop rows with missing elements +curr_staff_return.dropna(inplace=True) + +# Drop rows that associate to '_NOT INCLUDED' and '_MISSING' +curr_staff_return.drop(curr_staff_return[curr_staff_return['Cadre'] == '_NOT INCLUDED'].index, inplace=True) +curr_staff_return.drop(curr_staff_return[curr_staff_return['Cadre'] == '_MISSING'].index, inplace=True) + +# Drop rows that associate to 'Home Craft Worker' and 'Educ/Environ Health Officer', +# as these 
cadres are not included in 'Time_Base' and 'PFT'. +curr_staff_return.drop(curr_staff_return[curr_staff_return['Cadre'] == 'Home Craft Worker'].index, inplace=True) +curr_staff_return.drop(curr_staff_return[curr_staff_return['Cadre'] == 'Educ/Environ Health Officer'].index, + inplace=True) + +# Replace 'HSA' by 'DCSA', 'Nutrition Officer' by 'Nutrition Staff', +# 'Pharmacy Technician' by 'Pharm Technician', 'Pharmacy Assistant' by 'Pharm Assistant', +# to be consistent with officer_types_table +idx_hsa = curr_staff_return[curr_staff_return['Cadre'] == 'HSA'].index +curr_staff_return.loc[idx_hsa, 'Cadre'] = 'DCSA' + +idx_nutri = curr_staff_return[curr_staff_return['Cadre'] == 'Nutrition Officer'].index +curr_staff_return.loc[idx_nutri, 'Cadre'] = 'Nutrition Staff' + +idx_pt = curr_staff_return[curr_staff_return['Cadre'] == 'Pharmacy Technician'].index +curr_staff_return.loc[idx_pt, 'Cadre'] = 'Pharm Technician' + +idx_pa = curr_staff_return[curr_staff_return['Cadre'] == 'Pharmacy Assistant'].index +curr_staff_return.loc[idx_pa, 'Cadre'] = 'Pharm Assistant' + +# Replace health facility type "Karonga Hospital" to "District Hospital" +idx_Karonga = curr_staff_return[curr_staff_return['Health Facility Type'] == 'Karonga Hospital'].index +curr_staff_return.loc[idx_Karonga, 'Health Facility Type'] = 'District Hospital' + +# Reassign the facility type of Zomba Mental Hospital as 'Zomba Mental Hospital', instead of 'Central Hospital', +# to differentiate it with other central hospitals +idx_ZMH = curr_staff_return[curr_staff_return['Health Facility'] == 'Zomba Mental Hospital'].index +curr_staff_return.loc[idx_ZMH, 'Health Facility Type'] = 'Zomba Mental Hospital' + +# Add a column 'Staff_Count' to denote the no. 
of staff +curr_staff_return['Staff_Count'] = 1 + +# Reset index +curr_staff_return.reset_index(drop=True, inplace=True) + +# Important definition: Facility_Levels = [0, 1a, 1b, 2, 3, 4, 5] +# 0: Community/Local level - HP, Village Health Committee, Community initiatives +# 1a: Primary level - Dispensary, HC, Clinic, Maternity facility +# 1b: Primary level - Community/Rural Hospital, CHAM (Community) Hospitals +# 2: Second level - District hospital, DHO +# 3: Tertiary/Referral level - KCH, MCH, ZCH + QECH as referral hospitals +# 4: Zomba Mental Hospital, which has very limited data in CHAI dataset +# 5: Headquarter, which has staff data (but no Time_Base or Incidence_Curr data) + +# Get the Health Facility Type list and Cadre list +# Note three cadres of 'R04 Radiotherapy Technician', 'R03 Sonographer', 'D03 Dental Assistant' have no data +# in CHAI current and funded staff sheet and complied staff return dataset. +fac_types_list = pd.unique(curr_staff_return['Health Facility Type']) # Level_0 Facs and Headquarter not included +cadre_list = pd.unique(curr_staff_return['Cadre']) # Radiotherapy Technician/Sonographer/Dental Assistant not included + +# Add column 'Facility_Level'; HQ not listed in compiled staff return table +idx_urbhc = curr_staff_return[curr_staff_return['Health Facility Type'] == 'Urban Health Center'].index +curr_staff_return.loc[idx_urbhc, 'Facility_Level'] = 'Facility_Level_1a' # Including CHAM HCs + +idx_rurhc = curr_staff_return[curr_staff_return['Health Facility Type'] == 'Rural Health Center'].index +curr_staff_return.loc[idx_rurhc, 'Facility_Level'] = 'Facility_Level_1a' # Including CHAM HCs + +idx_comhos = curr_staff_return[curr_staff_return['Health Facility Type'] == 'Community Hospital'].index +curr_staff_return.loc[idx_comhos, 'Facility_Level'] = 'Facility_Level_1b' # Including CHAM community hospitals + +idx_dishos = curr_staff_return[curr_staff_return['Health Facility Type'] == 'District Hospital'].index 
+curr_staff_return.loc[idx_dishos, 'Facility_Level'] = 'Facility_Level_2' + +idx_cenhos = curr_staff_return[curr_staff_return['Health Facility Type'] == 'Central Hospital'].index +curr_staff_return.loc[idx_cenhos, 'Facility_Level'] = 'Facility_Level_3' + +idx_zmhfac = curr_staff_return[curr_staff_return['Health Facility Type'] == 'Zomba Mental Hospital'].index +curr_staff_return.loc[idx_zmhfac, 'Facility_Level'] = 'Facility_Level_4' + +# Add column 'Cadre_Code' +for c in cadre_list: + curr_staff_return.loc[curr_staff_return['Cadre'] == c, 'Cadre_Code'] = officer_types_table.loc[ + officer_types_table['Officer_Type'] == c, 'Officer_Type_Code'].copy().values[0] + +# Check no blanks in this table +assert not pd.isnull(curr_staff_return).any().any() + +# curr_staff_return ready! + +# Get curr_staff_return distribution among levels 0, 1a, 1b and 2, i.e., staff distribution within a district +# Specifically, only and all DCSAs/HSAs are to be allocated at level 0; +# Other cadres are to be allocated at level 1a and above. 
+ +curr_staff_district = curr_staff_return[['Facility_Level', 'Cadre_Code', 'Staff_Count']].copy() + +# Group staff by facility level +curr_staff_distribution = pd.DataFrame( + curr_staff_district.groupby(by=['Cadre_Code', 'Facility_Level'], sort=False).sum()) +curr_staff_distribution.sort_index(level=[0, 1], inplace=True) +curr_staff_distribution.reset_index(drop=False, inplace=True) + +# Make the curr_staff_distribution includes all cadres and facility levels (0,1a,1b,2,3,4) as index and columns +cadre_faclevel = pd.DataFrame(columns=['Cadre_Code', 'Facility_Level_0', 'Facility_Level_1a', + 'Facility_Level_1b', 'Facility_Level_2', 'Facility_Level_3', + 'Facility_Level_4']) +cadre_faclevel['Cadre_Code'] = officer_types_table['Officer_Type_Code'] +cadre_faclevel = pd.melt(cadre_faclevel, id_vars='Cadre_Code', value_vars=cadre_faclevel.columns[1:], + var_name='Facility_Level') +# Merge +curr_staff_distribution = curr_staff_distribution.merge(cadre_faclevel, how='right') +# Fill null with 0 +curr_staff_distribution.fillna(0, inplace=True) +# Sort +curr_staff_distribution.set_index(['Cadre_Code', 'Facility_Level'], inplace=True) +curr_staff_distribution.sort_index(level=[0, 1], inplace=True) +curr_staff_distribution.reset_index(drop=False, inplace=True) +curr_staff_distribution.drop(['value'], axis=1, inplace=True) + +# Save the the complete current staff distribution table +# curr_staff_distribution_complete = curr_staff_distribution.copy() + +# Keep and focus on rows of levels 0, 1a, 1b, and 2 +idx_keep = curr_staff_distribution[(curr_staff_distribution['Facility_Level'] == 'Facility_Level_0') | + (curr_staff_distribution['Facility_Level'] == 'Facility_Level_1a') | + (curr_staff_distribution['Facility_Level'] == 'Facility_Level_1b') | + (curr_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index +curr_staff_distribution = curr_staff_distribution.loc[idx_keep, :].copy() +curr_staff_distribution.reset_index(drop=True, inplace=True) + +# Add column 
'Proportion', denoting the percents of staff per cadre between level 0, level_1a, level_1b, and level_2 +for i in range(21): + # Proportion; Cadres except DCSA are allocated at level 1a and above + if curr_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Staff_Count'].sum() > 0: # sum of 4i+1,4i+2,4i+3 + + curr_staff_distribution.loc[4 * i + 1, 'Proportion'] = ( + curr_staff_distribution.loc[4 * i + 1, 'Staff_Count'] / + curr_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Staff_Count'].sum() + ) + + curr_staff_distribution.loc[4 * i + 2, 'Proportion'] = ( + curr_staff_distribution.loc[4 * i + 2, 'Staff_Count'] / + curr_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Staff_Count'].sum() + ) + + curr_staff_distribution.loc[4 * i + 3, 'Proportion'] = ( + curr_staff_distribution.loc[4 * i + 3, 'Staff_Count'] / + curr_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Staff_Count'].sum() + ) + +# fillna +curr_staff_distribution.fillna(0, inplace=True) + +# For DCSA individually, reassign their proportions since we assume all DCSAs are located at level 0 +idx_dcsa = curr_staff_distribution[curr_staff_distribution['Cadre_Code'] == 'E01'].index +curr_staff_distribution.loc[idx_dcsa[0], 'Proportion'] = 1.00 +curr_staff_distribution.loc[idx_dcsa[1:4], 'Proportion'] = 0.00 +# Alternatively, DCSAs 50% at level 0 and 50% at level 1a? + +# curr_staff_distribution ready! 
+ +# Save +# curr_staff_distribution.to_csv( +# outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Staff_Distribution_Assumption.csv', +# index=False) + +# --- Generate assumptions of established/funded staff distribution at facility levels 0&1a&1b&2 +# Read 2018-03-09 Facility-level establishment MOH & CHAM from CHAI auxiliary datasets +fund_staff_2018_raw = pd.read_excel(path_to_auxiliaryfiles / '2018-03-09 Facility-level establishment MOH & CHAM.xlsx', + sheet_name='Establishment listing') + +# Get relevant columns +fund_staff_2018 = fund_staff_2018_raw[['Number of positions', 'Facility', 'Facility Type', 'WFOM Cadre']].copy() + +# Drop rows with missing/blank elements +fund_staff_2018.dropna(inplace=True) +# Drop rows that associate to '_NOT INCLUDED' +fund_staff_2018.drop(fund_staff_2018[fund_staff_2018['WFOM Cadre'] == '_NOT INCLUDED'].index, inplace=True) +# Drop rows for 'Training Institution' +fund_staff_2018.drop(fund_staff_2018[fund_staff_2018['Facility Type'] == 'Training Institution'].index, inplace=True) +# Reset index after drop +fund_staff_2018.reset_index(drop=True, inplace=True) + +# Reform column 'WFOM Cadre' +# Note 'Cadre_Extra' records 'Clinical ' or 'Nursing ' for C01 and C02. +# We combine C01 and C02 into C01 denoting mental health staff cadre to be consistent with 'curr_staff_return'. +fund_staff_2018[['Cadre_No.', 'Cadre_Code', 'Cadre', 'Cadre_Extra']] = \ + fund_staff_2018['WFOM Cadre'].str.split(pat='-| - ', expand=True).copy() +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre_Code'] == 'C02'].index, 'Cadre_Code'] = 'C01' +# Drop columns ['WFOM Cadre','Cadre_No.','Cadre_Extra'] +fund_staff_2018.drop(columns=['WFOM Cadre', 'Cadre_No.', 'Cadre_Extra'], inplace=True) + +# Drop rows that associate to 'Home Craft Worker', 'Educ/Environ Health Officer', and 'Community Midwife Assistant' +# as these cadres are not included in 'Time_Base' and 'PFT'. 
+fund_staff_2018.drop(fund_staff_2018[fund_staff_2018['Cadre'] == 'Home Craft Worker'].index, inplace=True) +fund_staff_2018.drop(fund_staff_2018[fund_staff_2018['Cadre'] == 'Educ/Environ Health Officer'].index, inplace=True) +fund_staff_2018.drop(fund_staff_2018[fund_staff_2018['Cadre'] == 'Community Midwife Assistant'].index, inplace=True) +# Reset index +fund_staff_2018.reset_index(drop=True, inplace=True) + +# Replace { +# 'HSA' by 'DCSA' (and 'E02' by 'E01') , 'Medical Assistant' by 'Med. Assistant', 'Laboratory Officer' by 'Lab Officer', +# 'Laboratory Technician' by 'Lab Technician', 'Laboratory Assistant' by 'Lab Assistant' +# 'Nursing Officer/Registered Nurse' by 'Nurse Officer', 'Dentist' by 'Dental Officer', +# 'Nutrition Officer' by 'Nutrition Staff', 'Pharmacy Technician' by 'Pharm Technician', +# 'Pharmacy Assistant' by 'Pharm Assistant', 'Pharmacy Officer' by 'Pharmacist' } +# to be consistent with officer_types_table +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'HSA'].index, 'Cadre'] = 'DCSA' +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre_Code'] == 'E02'].index, 'Cadre_Code'] = 'E01' +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'Medical Assistant'].index, 'Cadre'] = 'Med. 
Assistant' +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'Laboratory Officer'].index, 'Cadre'] = 'Lab Officer' +fund_staff_2018.loc[ + fund_staff_2018[fund_staff_2018['Cadre'] == 'Laboratory Technician'].index, 'Cadre'] = 'Lab Technician' +fund_staff_2018.loc[ + fund_staff_2018[fund_staff_2018['Cadre'] == 'Laboratory Assistant'].index, 'Cadre'] = 'Lab Assistant' +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'Nursing Officer/Registered Nurse'].index, + 'Cadre'] = 'Nurse Officer' +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'Dentist'].index, 'Cadre'] = 'Dental Officer' +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'Nutrition Officer'].index, 'Cadre'] = 'Nutrition Staff' +fund_staff_2018.loc[ + fund_staff_2018[fund_staff_2018['Cadre'] == 'Pharmacy Technician'].index, 'Cadre'] = 'Pharm Technician' +fund_staff_2018.loc[ + fund_staff_2018[fund_staff_2018['Cadre'] == 'Pharmacy Assistant'].index, 'Cadre'] = 'Pharm Assistant' +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'Pharmacy Officer'].index, 'Cadre'] = 'Pharmacist' + +# Note that {D03 'Dental Assistant', R03 'Radiotherapy Technician', R04 'Sonographer'} are not included in this dataset. +# This is OK because CHAI current and funded staff sheet has no data regarding the three cadres. + +# Reassign the facility type of Zomba Mental Hospital as 'Zomba Mental Hospital'. 
+fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility'] == 'Zomba Mental Hospital'].index, + 'Facility Type'] = 'Zomba Mental Hospital' + +# Important definition: Facility_Levels = [0, 1a, 1b, 2, 3, 4, 5] +# 0: Community/Local level - HP, Village Health Committee, Community initiatives +# 1a: Primary level - Dispensary, HC, Clinic, Maternity facility +# 1b: Primary level - Community/Rural Hospital, CHAM (Community) Hospitals +# 2: Second level - District hospital, DHO +# 3: Tertiary/Referral level - KCH, MCH, ZCH + QECH as referral hospitals +# 4: Zomba Mental Hospital, which has very limited data in CHAI dataset +# 5: Headquarter, which has staff data (but no Time_Base or Incidence_Curr data) + +# Get the Health Facility Type list +# fac_types_list = pd.unique(fund_staff_2018['Facility Type']) # Level_0 Facs not included + +# Add column 'Facility_Level' +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Urban Health Center'].index, + 'Facility_Level'] = 'Facility_Level_1a' +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Rural Health Center'].index, + 'Facility_Level'] = 'Facility_Level_1a' +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Health Center (with maternity)'].index, + 'Facility_Level'] = 'Facility_Level_1a' +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Health Center (without maternity)'].index, + 'Facility_Level'] = 'Facility_Level_1a' + +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Rural/Community Hospital'].index, + 'Facility_Level'] = 'Facility_Level_1b' + +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'District Hospital'].index, + 'Facility_Level'] = 'Facility_Level_2' +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'DHO'].index, + 'Facility_Level'] = 'Facility_Level_2' + +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Central 
Hospital'].index, + 'Facility_Level'] = 'Facility_Level_3' + +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Zomba Mental Hospital'].index, + 'Facility_Level'] = 'Facility_Level_4' + +fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Headquarters'].index, + 'Facility_Level'] = 'Facility_Level_5' + +# Check no blanks in this table +assert not pd.isnull(fund_staff_2018).any().any() + +# fund_staff_2018 ready! + +# Get fund_staff_return distribution among levels 0, 1a, 1b and 2, i.e., staff distribution within a district +# Specifically, only and all DCSAs/HSAs are to be allocated at level 0; +# Other cadres are to be allocated at level 1a and above. + +fund_staff_district = fund_staff_2018[['Facility_Level', 'Cadre_Code', 'Number of positions']].copy() + +# Group staff by facility level +fund_staff_distribution = pd.DataFrame( + fund_staff_district.groupby(by=['Cadre_Code', 'Facility_Level'], sort=False).sum()) +fund_staff_distribution.sort_index(level=[0, 1], inplace=True) +fund_staff_distribution.reset_index(drop=False, inplace=True) + +# Make the fund_staff_distribution includes all cadres and facility levels (0,1a,1b,2,3,4,5) as index and columns +fund_cadre_faclevel = pd.DataFrame(columns=['Cadre_Code', 'Facility_Level_0', 'Facility_Level_1a', + 'Facility_Level_1b', 'Facility_Level_2', 'Facility_Level_3', + 'Facility_Level_4', 'Facility_Level_5']) +fund_cadre_faclevel['Cadre_Code'] = officer_types_table['Officer_Type_Code'] +fund_cadre_faclevel = pd.melt(fund_cadre_faclevel, id_vars='Cadre_Code', value_vars=fund_cadre_faclevel.columns[1:], + var_name='Facility_Level') +# Merge +fund_staff_distribution = fund_staff_distribution.merge(fund_cadre_faclevel, how='right') +# Fill null with 0 +fund_staff_distribution.fillna(0, inplace=True) +# Sort +fund_staff_distribution.set_index(['Cadre_Code', 'Facility_Level'], inplace=True) +fund_staff_distribution.sort_index(level=[0, 1], inplace=True) 
+fund_staff_distribution.reset_index(drop=False, inplace=True) +fund_staff_distribution.drop(['value'], axis=1, inplace=True) + +# Save the the complete funded staff distribution table +# fund_staff_distribution_complete = fund_staff_distribution.copy() + +# Keep and focus on rows of levels 0, 1a, 1b, and 2 +fund_idx_keep = fund_staff_distribution[(fund_staff_distribution['Facility_Level'] == 'Facility_Level_0') | + (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1a') | + (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1b') | + (fund_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index +fund_staff_distribution = fund_staff_distribution.loc[fund_idx_keep, :].copy() +fund_staff_distribution.reset_index(drop=True, inplace=True) + +# Add column 'Proportion', denoting the percents of staff per cadre between level 0, level_1a, level_1b, and level_2 +for i in range(21): + # Proportion; Cadres except DCSA are allocated at level 1a and above + if fund_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Number of positions'].sum() > 0: # sum of 4i+1,4i+2,4i+3 + + fund_staff_distribution.loc[4 * i + 1, 'Proportion_Fund'] = ( + fund_staff_distribution.loc[4 * i + 1, 'Number of positions'] / + fund_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Number of positions'].sum() + ) + + fund_staff_distribution.loc[4 * i + 2, 'Proportion_Fund'] = ( + fund_staff_distribution.loc[4 * i + 2, 'Number of positions'] / + fund_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Number of positions'].sum() + ) + + fund_staff_distribution.loc[4 * i + 3, 'Proportion_Fund'] = ( + fund_staff_distribution.loc[4 * i + 3, 'Number of positions'] / + fund_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Number of positions'].sum() + ) + +# fillna +fund_staff_distribution.fillna(0, inplace=True) + +# For DCSA individually, reassign their proportions since we assume all DCSAs are located at level 0 +fund_idx_dcsa = fund_staff_distribution[fund_staff_distribution['Cadre_Code'] == 
'E01'].index +fund_staff_distribution.loc[fund_idx_dcsa[0], 'Proportion_Fund'] = 1.00 +fund_staff_distribution.loc[fund_idx_dcsa[1:4], 'Proportion_Fund'] = 0.00 +# Alternatively, DCSAs 50% at level 0 and 50% at level 1a? + +# fund_staff_distribution ready! + +# Save +# fund_staff_distribution.to_csv( +# outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Staff_Distribution_Assumption.csv', +# index=False) + +# We read info from CHAI estimates of optimal and immediately needed workforce for comparison wherever possible +# --- CHAI WFOM optimal workforce and immediately needed staff distribution + +# Preparing optimal workforce from CHAI auxiliary datasets +opt_workforce = pd.read_excel(path_to_auxiliaryfiles / 'MalawiOptimization_OUTPUT2022 SH 2019-10-19.xlsx', + sheet_name='Sums by facility type') +# Drop redundant row +opt_workforce.drop(0, inplace=True) +opt_workforce.reset_index(drop=True, inplace=True) + +# Add column 'Facility_level' +opt_workforce.insert(2, 'Facility_Level', ['Facility_Level_3', + 'Facility_Level_1b', + 'Facility_Level_2', + 'Facility_Level_1a', + 'Facility_Level_1a']) + +# Get staff distribution between level_1a, level_1b and level_2 per cadre +cols_matter = opt_workforce.columns[2:24] +opt_workforce_distribution = opt_workforce.loc[1:4, cols_matter].copy() # drop row Facility_Level_3 +opt_workforce_distribution = pd.DataFrame(opt_workforce_distribution.groupby(by=['Facility_Level'], sort=False).sum()) +opt_workforce_distribution.sort_index(inplace=True) +# Reset index +opt_workforce_distribution.reset_index(drop=False, inplace=True) + +# Transform to long format +opt_workforce_distribution = pd.melt(opt_workforce_distribution, id_vars='Facility_Level', value_vars=cols_matter[1:], + var_name='Cadre_Opt', value_name='Staff_Count_Opt') + +# Add column 'Cadre_Code' +for i in range(63): + opt_workforce_distribution.loc[i, 'Cadre_Code'] = str(opt_workforce_distribution.loc[i, 'Cadre_Opt'])[7:10] + +# Sort to be consistent with 
curr_staff_distribution +# Drop unnecessary column +opt_workforce_distribution.set_index(['Cadre_Code', 'Facility_Level'], inplace=True) +opt_workforce_distribution.sort_index(level=[0, 1], inplace=True) +opt_workforce_distribution.reset_index(drop=False, inplace=True) +opt_workforce_distribution.drop(columns=['Cadre_Opt'], inplace=True) + +# Add column 'Proportion', denoting the percents of staff per cadre between level_1a, level_1b and level_2 +for i in range(21): + if opt_workforce_distribution.loc[3 * i:3 * i + 2, 'Staff_Count_Opt'].sum() > 0: # sum of 3i,3i+1,3i+2 + opt_workforce_distribution.loc[3 * i, 'Proportion_Opt'] = ( + opt_workforce_distribution.loc[3 * i, 'Staff_Count_Opt'] / + opt_workforce_distribution.loc[3 * i:3 * i + 2, 'Staff_Count_Opt'].sum() + ) + + opt_workforce_distribution.loc[3 * i + 1, 'Proportion_Opt'] = ( + opt_workforce_distribution.loc[3 * i + 1, 'Staff_Count_Opt'] / + opt_workforce_distribution.loc[3 * i:3 * i + 2, 'Staff_Count_Opt'].sum() + ) + + opt_workforce_distribution.loc[3 * i + 2, 'Proportion_Opt'] = ( + opt_workforce_distribution.loc[3 * i + 2, 'Staff_Count_Opt'] / + opt_workforce_distribution.loc[3 * i:3 * i + 2, 'Staff_Count_Opt'].sum() + ) + +# fillna +opt_workforce_distribution.fillna(0, inplace=True) + +# opt_workforce_distribution ready! 
# Preparing immediately needed estimates from CHAI auxiliary datasets
immed_need = pd.read_excel(path_to_auxiliaryfiles / 'MalawiOptimization_OUTPUT_ALLYEARS_Curr.xlsx',
                           sheet_name='CurrBase Output')

# Select relevant data: year 2016 rows only; columns 1 and 3 plus the staff-count columns 49-69
idx_year = immed_need[immed_need['OutputYear'] == 2016].index
immed_need_distribution = immed_need.loc[idx_year, immed_need.columns[np.r_[1, 3, 49:70]]]
immed_need_distribution.dropna(inplace=True)

# Add column 'Facility_Level' by mapping the CHAI facility type onto the TLO facility level.
# (Replaces five near-identical `.loc` assignments. Facility types not in the mapping are
# left NaN — exactly as before — and are therefore excluded by the groupby below.)
immed_need_distribution['Facility_Level'] = immed_need_distribution['FacilityType'].map({
    'UrbHC': 'Facility_Level_1a',
    'RurHC': 'Facility_Level_1a',
    'ComHos': 'Facility_Level_1b',
    'DisHos': 'Facility_Level_2',
    'CenHos': 'Facility_Level_3',
})

# Group staff by levels
immed_need_distribution = pd.DataFrame(immed_need_distribution.groupby(by=['Facility_Level'], sort=False).sum())
# Drop level 3
immed_need_distribution.drop(index='Facility_Level_3', inplace=True)
# Reset index
immed_need_distribution.reset_index(inplace=True)

# Transform to long format
assert set(immed_need_distribution.columns[1:]) == set(cols_matter[1:])
immed_need_distribution = pd.melt(immed_need_distribution, id_vars='Facility_Level', value_vars=cols_matter[1:],
                                  var_name='Cadre_ImmedNeed', value_name='Staff_Count_ImmedNeed')

# Add column 'Cadre_Code': characters [7:10] of the melted column name hold the cadre code
# (vectorised; equivalent to the previous per-row `str(...)[7:10]` loop over range(63))
immed_need_distribution['Cadre_Code'] = immed_need_distribution['Cadre_ImmedNeed'].astype(str).str[7:10]

# Sort to be consistent with curr_staff_distribution
# Drop unnecessary column
immed_need_distribution.set_index(['Cadre_Code', 'Facility_Level'], inplace=True)
immed_need_distribution.sort_index(level=[0, 1], inplace=True)
immed_need_distribution.reset_index(drop=False, inplace=True)
immed_need_distribution.drop(columns=['Cadre_ImmedNeed'], inplace=True)

# Add column 'Proportion_ImmedNeed': each cadre's staff split among level_1a, level_1b and
# level_2. After the sort, rows come in blocks of three (one per level) per cadre; the block
# total is hoisted and the three previously-triplicated assignments collapsed into a loop.
for i in range(21):
    block_total = immed_need_distribution.loc[3 * i:3 * i + 2, 'Staff_Count_ImmedNeed'].sum()  # rows 3i, 3i+1, 3i+2
    if block_total > 0:
        for j in range(3):
            immed_need_distribution.loc[3 * i + j, 'Proportion_ImmedNeed'] = (
                immed_need_distribution.loc[3 * i + j, 'Staff_Count_ImmedNeed'] / block_total
            )

# Cadres absent from all three levels keep proportion 0 rather than NaN
immed_need_distribution.fillna(0, inplace=True)

# immed_need_distribution ready!
# --- Combine curr_staff_distribution, fund_staff_distribution, opt_workforce_distribution, and immed_need_distribution
# Compare if possible

# Merge curr and opt data
# First, drop rows of level_0 of curr_staff_distribution, for compare_staff_distribution
idx_level0 = curr_staff_distribution[curr_staff_distribution['Facility_Level'] == 'Facility_Level_0'].index
compare_staff_distribution = curr_staff_distribution.drop(idx_level0, axis=0, inplace=False).copy()
# Merge with the optimal-workforce table.
# BUGFIX: this previously merged `curr_staff_distribution` itself, making the level-0 drop
# above a dead store. The output happened to be identical only because how='right' keeps
# the key rows of `opt_workforce_distribution`, which contains no level-0 rows; merging
# the intended level-0-free frame makes the drop meaningful.
compare_staff_distribution = compare_staff_distribution.merge(opt_workforce_distribution, how='right')

# Check row alignment before adding ImmedNeed data (it is added column-wise, not merged)
assert (compare_staff_distribution['Cadre_Code'] == immed_need_distribution['Cadre_Code']).all()
assert (compare_staff_distribution['Facility_Level'] == immed_need_distribution['Facility_Level']).all()
# Add Staff_Count_ImmedNeed and Proportion_ImmedNeed to the merged table
compare_staff_distribution['Staff_Count_ImmedNeed'] = immed_need_distribution['Staff_Count_ImmedNeed'].copy()
compare_staff_distribution['Proportion_ImmedNeed'] = immed_need_distribution['Proportion_ImmedNeed'].copy()

# Add fund data
# First, drop rows of level_0 of fund_staff_distribution
fund_idx_level0 = fund_staff_distribution[fund_staff_distribution['Facility_Level'] == 'Facility_Level_0'].index
fund_staff_distribution_nolevel0 = fund_staff_distribution.drop(fund_idx_level0, axis=0, inplace=False).copy()
fund_staff_distribution_nolevel0.reset_index(drop=True, inplace=True)
# Check before combination
assert (compare_staff_distribution['Cadre_Code'] == fund_staff_distribution_nolevel0['Cadre_Code']).all()
assert (compare_staff_distribution['Facility_Level'] == fund_staff_distribution_nolevel0['Facility_Level']).all()
# Add Number of positions and Proportion_Fund to the merged table
compare_staff_distribution.insert(4, 'Staff_Count_Fund', fund_staff_distribution_nolevel0['Number of positions'].values)
compare_staff_distribution.insert(5, 'Proportion_Fund',
                                  fund_staff_distribution_nolevel0['Proportion_Fund'].values)

# Calculate the relative differences between the proportion columns.
# Each spec is (base column, reference column, result column); the result is
# (base - reference) / reference, computed only where the reference proportion is
# positive (rows failing the guard keep NaN, exactly as before). This replaces
# five copy-pasted if-blocks; the spec order preserves the column-creation order.
diff_specs = (
    ('Proportion', 'Proportion_Fund', 'Curr_vs_Fund'),
    ('Proportion', 'Proportion_Opt', 'Curr_vs_Opt'),
    ('Proportion', 'Proportion_ImmedNeed', 'Curr_vs_ImmedNeed'),
    ('Proportion_Fund', 'Proportion_Opt', 'Fund_vs_Opt'),
    ('Proportion_Fund', 'Proportion_ImmedNeed', 'Fund_vs_ImmedNeed'),
)
for i in range(63):
    for base_col, ref_col, out_col in diff_specs:
        if compare_staff_distribution.loc[i, ref_col] > 0:
            compare_staff_distribution.loc[i, out_col] = (
                (compare_staff_distribution.loc[i, base_col] -
                 compare_staff_distribution.loc[i, ref_col]) /
                compare_staff_distribution.loc[i, ref_col]
            )

# Save
# compare_staff_distribution.to_csv(outputlocation / 'ResourceFile_Staff_Distribution_Compare.csv', index=False)

# ***
# --- fund_staffing_table for funded/established staff
# Extract just the section about "Funded TOTAL Staff"
wb_extract = wb_import.loc[3:37, 64:84]
# Tidy the 'Funded TOTAL Staff' slice into a proper table: the first kept row (excel row 3)
# holds the column headers; rows 4-5 are dropped (presumably spacer rows — confirm against the workbook).
wb_extract = wb_extract.drop([4, 5])
wb_extract.columns = wb_extract.iloc[0]
wb_extract = wb_extract.drop([3])
wb_extract = wb_extract.reset_index(drop=True)
wb_extract.fillna(0, inplace=True)  # replace all null values with zero values

# Add in the column to the dataframe for the labels that distinguish whether
# these officers are allocated to the district-or-lower levels or one of the key hospitals.
labels = wb_import.loc[6:37, 0].reset_index(drop=True)
is_distlevel = labels.copy()
is_distlevel[0:27] = True  # for district-or-lower levels
is_distlevel[27:] = False  # for CenHos-or-above levels

wb_extract.loc[:, 'District_Or_Hospital'] = labels
wb_extract.loc[:, 'Is_DistrictLevel'] = is_distlevel

# Finished import from the CHAI excel:
fund_staffing_table = wb_extract.copy()

# There are a large number of officer_types E01 (DCSA/Comm Health Workers) at HQ level, which is non-sensible
# Therefore, re-distribute these evenly to the districts.
extra_CHW = fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing',
                                    fund_staffing_table.columns[fund_staffing_table.columns == 'E01']].values[0][0]
fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing',
                        fund_staffing_table.columns[fund_staffing_table.columns == 'E01']] = 0
# NOTE(review): the floor here drops the remainder, so up to (n_districts - 1) E01 posts
# vanish from the national total — presumably an acceptable rounding; confirm.
extra_CHW_per_district = int(np.floor(extra_CHW / fund_staffing_table['Is_DistrictLevel'].sum()))
fund_staffing_table.loc[fund_staffing_table['Is_DistrictLevel'], 'E01'] = \
    fund_staffing_table.loc[fund_staffing_table['Is_DistrictLevel'], 'E01'] + \
    extra_CHW_per_district

# The imported staffing table suggests that there is 1 Dental officer (D01) in each district,
# but the TimeBase data (below) suggest that no appointment occurring at a district-level Facility can incur
# the time of such an officer. Therefore reallocate the D01 officers to the Referral Hospitals.
extra_D01 = fund_staffing_table.loc[
    ~fund_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']),
    fund_staffing_table.columns[fund_staffing_table.columns == 'D01']].sum().values[0]
fund_staffing_table.loc[~fund_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']),
                        fund_staffing_table.columns[fund_staffing_table.columns == 'D01']] = 0
extra_D01_per_referralhosp = extra_D01 / 4  # divided by 4 CenHos
fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), 'D01'] = \
    fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), 'D01'] + \
    extra_D01_per_referralhosp

# *** Only for funded_plus ********************************************************************************************
# Since districts Balaka, Machinga, Mwanza, Neno, Ntchisi, Salima and central hospitals have 0 C01, while C01 is
# required by Mental appts at level 1b, level 2 and level 3, we move some C01 from 'HQ or missing' to them.
# To achieve this, we evenly distribute 30 C01 at HQ to all districts and central hospitals (27 DisHos, 4 CenHos)
C01_at_HQ = fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'C01'].values
extra_C01_per_district_CenHos = C01_at_HQ / 31
fund_staffing_table.loc[~fund_staffing_table['District_Or_Hospital'].isin(['HQ or missing']), 'C01'] = (
    fund_staffing_table.loc[~fund_staffing_table['District_Or_Hospital'].isin(['HQ or missing']), 'C01'] +
    extra_C01_per_district_CenHos)
fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'C01'] = 0
# *********************************************************************************************************************

# Sort out which are district allocations and which are central hospitals and above

# We assign HQ to HQ; KCH as RefHos in Central region; MCH as RefHos in Northern region;
# QECH and ZCH as RefHos in Southern region (QECH is in Southwest and ZCH is in Southeast).
# Rename the CHAI hospital codes to the TLO facility names
fund_staffing_table.loc[
    fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'District_Or_Hospital'] = 'Headquarter'
fund_staffing_table.loc[
    fund_staffing_table['District_Or_Hospital'] == 'KCH', 'District_Or_Hospital'] = 'Referral Hospital_Central'
fund_staffing_table.loc[
    fund_staffing_table['District_Or_Hospital'] == 'MCH', 'District_Or_Hospital'] = 'Referral Hospital_Northern'
fund_staffing_table.loc[
    fund_staffing_table['District_Or_Hospital'] == 'QECH', 'District_Or_Hospital'] = 'Referral Hospital_Southern'
# fund_staffing_table.loc[
#     fund_staffing_table['District_Or_Hospital'] == 'QECH', 'District_Or_Hospital'] = 'Referral Hospital_Southwest'
fund_staffing_table.loc[
    fund_staffing_table['District_Or_Hospital'] == 'ZCH', 'District_Or_Hospital'] = 'Referral Hospital_Southern'
# fund_staffing_table.loc[
#     fund_staffing_table['District_Or_Hospital'] == 'ZCH', 'District_Or_Hospital'] = 'Referral Hospital_Southeast'

# Group the referral hospitals QECH and ZCH as Referral Hospital_Southern
Is_DistrictLevel = fund_staffing_table['Is_DistrictLevel'].values  # Save the column 'Is_DistrictLevel' first
fund_staffing_table = pd.DataFrame(
    fund_staffing_table.groupby(by=['District_Or_Hospital'], sort=False).sum()).reset_index()
# NOTE(review): re-attaching the saved flags with [:-1] assumes the grouping removed exactly one
# row AND that the removed row was the last one (ZCH), so the remaining flags still line up
# with the grouped rows (sort=False preserves first-appearance order) — confirm.
fund_staffing_table.insert(1, 'Is_DistrictLevel', Is_DistrictLevel[:-1])  # Add the column 'Is_DistrictLevel'

# Add a row for Zomba Mental Hospital with 3 C01 mental health staff
# (according to data in 2018-03-09 Facility-level establishment MOH & CHAM)
# (This is much less than the current 12 C01.)
fund_ZMH = pd.DataFrame(columns=fund_staffing_table.columns.copy())
fund_ZMH.loc[0, 'District_Or_Hospital'] = 'Zomba Mental Hospital'
fund_ZMH.loc[0, 'Is_DistrictLevel'] = False
fund_ZMH.loc[0, 'C01'] = 3
# Alternatively, if consider all potential cadres from compiled staff return
# fund_cadres_ZMH = pd.DataFrame(index = [0], columns = ['M01','M02','M03','N01','N02','C01','P02','L02'],
#                                data = np.array([[2,13,14,8,30,3,1,1]]))
# for col in fund_cadres_ZMH.columns:
#     fund_ZMH.loc[0,col] = fund_cadres_ZMH.loc[0,col].copy()

# Concat
fund_staffing_table = pd.concat([fund_staffing_table, fund_ZMH])
fund_staffing_table.reset_index(drop=True, inplace=True)
fund_staffing_table.fillna(0, inplace=True)

# File 2018-03-09 Facility-level establishment MOH & CHAM indicates that ZMH is assigned to Zomba District,
# We therefore subtract the 3 C01 staff from Zomba District.
fund_idx_ZombaDist = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Zomba'].index
fund_staffing_table.loc[fund_idx_ZombaDist, 'C01'] = \
    fund_staffing_table.loc[fund_idx_ZombaDist, 'C01'] - fund_ZMH.loc[0, 'C01']
# Alternatively, if consider all potential cadres from compiled staff return
# fund_staffing_table.loc[fund_idx_ZombaDist, :] =\
#     fund_staffing_table.loc[fund_idx_ZombaDist, :] - fund_ZMH.loc[0,:]

# Check that fund_staffing_table.loc[fund_idx_ZombaDist, :] >= 0 (the subtraction must not go negative)
assert (fund_staffing_table.loc[fund_idx_ZombaDist, 'M01':'R04'].values >= 0).all()

# The following districts are not in the CHAI data because they are included within other districts.
# For now, we will say that the division of staff between these cities and the wide district (where they are included)
# is consistent with the population recorded for them.
# i.e., to use population-based weights to reallocate staff

# Add in Likoma (part of Nkhata Bay)
# Add in Lilongwe City (part of Lilongwe)
# Add in Mzuzu City (part of Mzimba) ASSUMED
# Add in Zomba City (part of Zomba)
# Add in Blantyre City (part of Blantyre)

# create mapping: the new districts : super_district
split_districts = (
    ('Likoma', 'Nkhata Bay'),
    ('Lilongwe City', 'Lilongwe'),
    ('Mzuzu City', 'Mzimba'),
    ('Zomba City', 'Zomba'),
    ('Blantyre City', 'Blantyre')
)

# reallocating staff to the new districts
for i in np.arange(0, len(split_districts)):
    new_district = split_districts[i][0]
    super_district = split_districts[i][1]

    record = fund_staffing_table.iloc[0].copy()  # get a row of the staffing table as a template

    # make the record for the new district
    record['District_Or_Hospital'] = new_district
    record['Is_DistrictLevel'] = True

    # get total staff level from the super districts
    cols = set(fund_staffing_table.columns).intersection(set(officer_types_table.Officer_Type_Code))

    total_staff = fund_staffing_table.loc[
        fund_staffing_table['District_Or_Hospital'] == super_district, cols].values.squeeze()

    # get the weight; the original weights w0 for the 5 new districts in order are 0.05,0.60,0.24,0.14,1.77(> 1)
    w0 = pop_by_district.loc[new_district, 'Count'] / pop_by_district.loc[super_district, 'Count']
    # cap the weight at 0.5 (Blantyre City's population exceeds Blantyre's, so w0 > 1 there)
    if w0 < 1:
        w = w0
    else:
        w = 0.5

    # assign w * 100% staff to the new district
    # NOTE(review): DataFrame.append is deprecated and removed in pandas 2.0;
    # pd.concat([...]) is the forward-compatible spelling.
    record.loc[cols] = w * total_staff
    fund_staffing_table = fund_staffing_table.append(record).reset_index(drop=True)

    # take staff away from the super district (totals are conserved)
    fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == super_district, cols] = \
        fund_staffing_table.loc[
            fund_staffing_table[
                'District_Or_Hospital'] == super_district, cols] - record.loc[cols]

# Confirm the merging will be perfect:
pop = pop_by_district.reset_index(drop=False, inplace=False)
assert set(pop['District'].values) == set(
    fund_staffing_table.loc[fund_staffing_table['Is_DistrictLevel'], 'District_Or_Hospital'])
assert len(pop['District'].values) == len(
    fund_staffing_table.loc[fund_staffing_table['Is_DistrictLevel'], 'District_Or_Hospital'])

# ... double check by doing the merge explicitly
pop_districts = pd.DataFrame({'District': pd.unique(pop['District'])})  # data frame
chai_districts = pd.DataFrame(
    {'District': fund_staffing_table.loc[fund_staffing_table['Is_DistrictLevel'], 'District_Or_Hospital']})

merge_result = pop_districts.merge(chai_districts, how='inner', indicator=True)
assert all(merge_result['_merge'] == 'both')
assert len(merge_result) == len(pop_districts)

# Split staff within each district to level 0 (All DCSAs at HP), level 1a (Disp, HC, etc.),
# level 1b (ComHos, CHAM ComHos), and level 2 (DisHos, etc.), according to fund_staff_distribution.

# First, generate a df with all districts and facility levels 0 - 2 per district
district_faclevel = pd.DataFrame(columns=['District_Or_Hospital', 'Facility_Level_0', 'Facility_Level_1a',
                                          'Facility_Level_1b', 'Facility_Level_2'])
district_faclevel['District_Or_Hospital'] = pop['District'].values.copy()
district_faclevel = pd.melt(district_faclevel, id_vars='District_Or_Hospital', value_vars=district_faclevel.columns[1:],
                            var_name='Facility_Level')
district_faclevel.set_index(['District_Or_Hospital', 'Facility_Level'], inplace=True)
district_faclevel.sort_index(level=[0, 1], inplace=True)
district_faclevel.reset_index(drop=False, inplace=True)
district_faclevel.drop(columns=['value'], axis=1, inplace=True)
# Merge (outer: district rows gain one row per facility level; non-district rows keep NaN level for now)
fund_staffing_table = district_faclevel.merge(fund_staffing_table, how='outer')

# Split staff among levels

# *** Only for funded_plus ********************************************************************************************
# Before split, update the funded C01 distributions at levels 1a, 1b and 2 using CHAI Optimal Workforce estimates.
# This is because funded C01 are all at level 1b (100%), meanwhile appt time base requires C01 at level 2.
# CHAI Optimal Workforce locates C01 47.92% at level 1b and 52.08% at level 2, which seems more sensible.
idx_c01_level_1b = fund_staff_distribution[
    (fund_staff_distribution['Cadre_Code'] == 'C01') &
    (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1b')].index
fund_staff_distribution.loc[idx_c01_level_1b, 'Proportion_Fund'] = 0.4792

idx_c01_level_2 = fund_staff_distribution[
    (fund_staff_distribution['Cadre_Code'] == 'C01') &
    (fund_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index
fund_staff_distribution.loc[idx_c01_level_2, 'Proportion_Fund'] = 0.5208
# *********************************************************************************************************************

# Split: scale each district/cadre staff count by the per-level proportions in fund_staff_distribution
for district in pop['District']:
    for cadre in set(fund_staffing_table.columns[3:]):
        # The proportions for this cadre across facility levels
        weight = fund_staff_distribution.loc[fund_staff_distribution['Cadre_Code'] == cadre,
                                             ['Facility_Level', 'Proportion_Fund']].copy()
        # The staff count before splitting
        old_count = fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == district,
                                            ['Facility_Level', cadre]].copy()

        # Check that Facility levels of weight and old_count are consistent
        assert (weight['Facility_Level'].values == old_count['Facility_Level'].values).all()

        # Check that if old_count is not 0, then weight is not 0, guaranteeing that staff are split
        if (old_count[cadre] > 0).any():
            assert (weight['Proportion_Fund'] > 0).any()

        # Split
        fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == district, cadre] = (
            old_count[cadre].values * weight['Proportion_Fund'].values)

# Add facility levels for HQ, CenHos and ZMH
# NOTE(review): row positions 128-132 are hard-coded (32 districts * 4 levels = 128 rows precede
# the HQ / 3 referral hospitals / ZMH rows) — would break if the district count changes; confirm.
fund_staffing_table.loc[128:132, 'Facility_Level'] = ['Facility_Level_5', 'Facility_Level_3',
                                                      'Facility_Level_3', 'Facility_Level_3',
                                                      'Facility_Level_4']

# fund_staffing_table ready!

# Save the table without column 'Is_DistrictLevel'; staff counts in floats
fund_staffing_table_to_save = fund_staffing_table.drop(columns='Is_DistrictLevel', inplace=False)
# fund_staffing_table_to_save.to_csv(
#     outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Staff_Table.csv', index=False)
# fund_staffing_table_to_save.to_csv(
#     outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Staff_Table.csv', index=False)

# ***
# --- Creating curr_staffing_table and curr_staff_list for current staff
# Extract the section about "Current TOTAL Staff"
hcw_curr_extract = wb_import.loc[3:37, 1:21]
hcw_curr_extract = hcw_curr_extract.drop([4, 5])
hcw_curr_extract.columns = hcw_curr_extract.iloc[0]
hcw_curr_extract = hcw_curr_extract.drop([3])
hcw_curr_extract = hcw_curr_extract.reset_index(drop=True)
hcw_curr_extract.fillna(0, inplace=True)

# Add in the columns to the dataframe for the labels that distinguish whether
# these officers are allocated to the district-or-lower levels or one of the key hospitals.
hcw_curr_extract.loc[:, 'District_Or_Hospital'] = labels
hcw_curr_extract.loc[:, 'Is_DistrictLevel'] = is_distlevel

# Finished import from the CHAI excel
curr_staffing_table = hcw_curr_extract.copy()

# Check the cadre columns of curr_staffing_table are identical to fund_staffing_table
assert set(curr_staffing_table.columns[0:21]) == set(fund_staffing_table.columns[-21:])

# For curr_staffing_table, reallocate D01 from districts to referral hospitals
# Treat KCH, MCH, QECH, ZCH as referral hospitals
# The operation of reallocating E01 in HQ to districts is not needed for curr_staffing_table,
# as the no. of E01 in curr_staffing_table at HQ is zero.
# Sum of D01 outside the referral hospitals; these posts are zeroed and moved to the 4 CenHos
curr_extra_D01 = curr_staffing_table.loc[
    ~curr_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), curr_staffing_table.columns[
        curr_staffing_table.columns == 'D01']].sum().values[0]
curr_staffing_table.loc[
    ~curr_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), curr_staffing_table.columns[
        curr_staffing_table.columns == 'D01']] = 0
curr_extra_D01_per_referralhosp = curr_extra_D01 / 4
curr_staffing_table.loc[curr_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), 'D01'] = \
    curr_staffing_table.loc[curr_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), 'D01'] + \
    curr_extra_D01_per_referralhosp

# For curr_staffing_table, sort out the districts and central hospitals
curr_staffing_table.loc[
    curr_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'District_Or_Hospital'] = 'Headquarter'
curr_staffing_table.loc[
    curr_staffing_table['District_Or_Hospital'] == 'KCH', 'District_Or_Hospital'] = 'Referral Hospital_Central'
curr_staffing_table.loc[
    curr_staffing_table['District_Or_Hospital'] == 'MCH', 'District_Or_Hospital'] = 'Referral Hospital_Northern'
curr_staffing_table.loc[
    curr_staffing_table['District_Or_Hospital'] == 'QECH', 'District_Or_Hospital'] = 'Referral Hospital_Southern'
curr_staffing_table.loc[
    curr_staffing_table['District_Or_Hospital'] == 'ZCH', 'District_Or_Hospital'] = 'Referral Hospital_Southern'

# Group the referral hospitals QECH and ZCH as Referral Hospital_Southern
Is_DistrictLevel = curr_staffing_table['Is_DistrictLevel'].values  # Save the column 'Is_DistrictLevel' first
curr_staffing_table = pd.DataFrame(
    curr_staffing_table.groupby(by=['District_Or_Hospital'], sort=False).sum()).reset_index()
# NOTE(review): like the funded table, [:-1] assumes the merged (ZCH) row was the last one — confirm
curr_staffing_table.insert(1, 'Is_DistrictLevel', Is_DistrictLevel[:-1])  # Add the column 'Is_DistrictLevel'

# Add a row for Zomba Mental Hospital, which has 12 mental health staff according to the compiled staff return
curr_ZMH = pd.DataFrame(columns=curr_staffing_table.columns.copy())
curr_ZMH.loc[0, 'District_Or_Hospital'] = 'Zomba Mental Hospital'
curr_ZMH.loc[0, 'Is_DistrictLevel'] = False
curr_ZMH.loc[0, 'C01'] = 12
# Alternatively, if consider all potential cadres from compiled staff return
# curr_cadres_ZMH = pd.DataFrame(index = [0], columns = ['M01','M02','N01','N02','C01','P02','P03'],
#                                data = np.array([[2,5,19,27,12,1,1]]))
# for col in curr_cadres_ZMH.columns:
#     curr_ZMH.loc[0,col] = curr_cadres_ZMH.loc[0,col].copy()

curr_staffing_table = pd.concat([curr_staffing_table, curr_ZMH])
curr_staffing_table.reset_index(drop=True, inplace=True)
curr_staffing_table.fillna(0, inplace=True)

# For Zomba district, there are 12 mental health staff C01;
# However, the compiled staff return does not record any C01 in Zomba district;
# We therefore assume that its 12 C01 are from Zomba Mental Hospital.
curr_idx_ZombaDist = curr_staffing_table[curr_staffing_table['District_Or_Hospital'] == 'Zomba'].index
curr_staffing_table.loc[curr_idx_ZombaDist, 'C01'] = \
    curr_staffing_table.loc[curr_idx_ZombaDist, 'C01'] - curr_ZMH.loc[0, 'C01']
# Alternatively, if consider all potential cadres from compiled staff return
# curr_staffing_table.loc[curr_idx_ZombaDist, :] = curr_staffing_table.loc[curr_idx_ZombaDist, :] - curr_ZMH.loc[0,:]

# Check that curr_staffing_table.loc[curr_idx_ZombaDist, :] >= 0 (the subtraction must not go negative)
assert (curr_staffing_table.loc[curr_idx_ZombaDist, 'M01':'R04'].values >= 0).all()

# Similarly split staff to 5 special districts as done for funded staff
# split_districts = (
#     ('Likoma', 'Nkhata Bay'),
#     ('Lilongwe City', 'Lilongwe'),
#     ('Mzuzu City', 'Mzimba'),
#     ('Zomba City', 'Zomba'),
#     ('Blantyre City', 'Blantyre')
# )

for i in np.arange(0, len(split_districts)):
    new_district = split_districts[i][0]
    super_district = split_districts[i][1]

    record = curr_staffing_table.iloc[0].copy()  # get a row of the staffing table as a template

    # make the record for the new district
    # build the record for the new district
    record['District_Or_Hospital'] = new_district
    record['Is_DistrictLevel'] = True

    # get total staff level from the super districts
    cols = set(curr_staffing_table.columns).intersection(set(officer_types_table.Officer_Type_Code))

    total_staff = curr_staffing_table.loc[
        curr_staffing_table['District_Or_Hospital'] == super_district, cols].values.squeeze()

    # get the weight (new district's population as a share of its super district's)
    w0 = pop_by_district.loc[new_district, 'Count'] / pop_by_district.loc[
        super_district, 'Count']  # The values in order are 0.05,0.60,0.24,0.14,1.77
    # cap the weight at 0.5 when w0 >= 1 (Blantyre City)
    if w0 < 1:
        w = w0
    else:
        w = 0.5

    # assign w * 100% staff to the new district
    # NOTE(review): DataFrame.append is deprecated and removed in pandas 2.0; pd.concat is the replacement
    record.loc[cols] = w * total_staff
    curr_staffing_table = curr_staffing_table.append(record).reset_index(drop=True)

    # take staff away from the super district (totals are conserved)
    curr_staffing_table.loc[curr_staffing_table['District_Or_Hospital'] == super_district, cols] = \
        curr_staffing_table.loc[
            curr_staffing_table[
                'District_Or_Hospital'] == super_district, cols] - record.loc[cols]

# Confirm the merging will be perfect:
# pop = pop_by_district.reset_index(drop = False, inplace = False)
assert set(pop['District'].values) == set(
    curr_staffing_table.loc[curr_staffing_table['Is_DistrictLevel'], 'District_Or_Hospital'])
assert len(pop['District'].values) == len(
    curr_staffing_table.loc[curr_staffing_table['Is_DistrictLevel'], 'District_Or_Hospital'])
# ... double check by doing the merge explicitly
# pop_districts = pd.DataFrame({'District': pd.unique(pop['District'])})
chai_districts = pd.DataFrame(
    {'District': curr_staffing_table.loc[curr_staffing_table['Is_DistrictLevel'], 'District_Or_Hospital']})

merge_result = pop_districts.merge(chai_districts, how='inner', indicator=True)
assert all(merge_result['_merge'] == 'both')
assert len(merge_result) == len(pop_districts)

# Split staff within each district to level 0 (All DCSAs at HP), level 1a (Disp, HC, etc.),
# level 1b (ComHos, CHAM ComHos), and level 2 (DisHos, etc.), according to curr_staff_distribution.

# First, make the table including all districts and facility levels 0 - 2 per district,
# by merging with district_faclevel defined previously.
curr_staffing_table = district_faclevel.merge(curr_staffing_table, how='outer')

# Split staff among levels (same procedure as for the funded table, using 'Proportion')
for district in pop['District']:
    for cadre in set(curr_staffing_table.columns[3:]):
        # The proportions for this cadre across facility levels
        weight = curr_staff_distribution.loc[curr_staff_distribution['Cadre_Code'] == cadre,
                                             ['Facility_Level', 'Proportion']].copy()
        # The staff count before splitting
        old_count = curr_staffing_table.loc[curr_staffing_table['District_Or_Hospital'] == district,
                                            ['Facility_Level', cadre]].copy()

        # Check that Facility levels of weight and old_count are consistent
        assert (weight['Facility_Level'].values == old_count['Facility_Level'].values).all()

        # Check that if old_count is not 0, then weight is not 0, guaranteeing that staff are split
        if (old_count[cadre] > 0).any():
            assert (weight['Proportion'] > 0).any()

        # Split
        curr_staffing_table.loc[curr_staffing_table['District_Or_Hospital'] == district, cadre] = (
            old_count[cadre].values * weight['Proportion'].values)

# Add facility levels for HQ, CenHos and ZMH
# NOTE(review): hard-coded row positions, as for the funded table; `.loc` label slicing is
# inclusive and clips at the last existing label, so 128:133 selects the same 5 rows
curr_staffing_table.loc[128:133, 'Facility_Level'] = ['Facility_Level_5', 'Facility_Level_3',
                                                      'Facility_Level_3', 'Facility_Level_3',
                                                      'Facility_Level_4']  # 128:132 also OK

# Save the table without column 'Is_DistrictLevel'; staff counts in floats
curr_staffing_table_to_save = curr_staffing_table.drop(columns='Is_DistrictLevel', inplace=False)
# curr_staffing_table_to_save.to_csv(
#     outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Staff_Table.csv', index=False)

# ---------------------------------------------------------------------------------------------------------------------
# *** Create the Master Facilities List
# This will be a listing of each facility and the district(s) to which they attach
# The different Facility Types are notional at this stage
# The Facility Level is the important variable for the staffing: staff are assumed to be allocated
# to a particular level within a district, or a referral hospital, or others
# They do not associate with a particular type of Facility

Facility_Levels = [0, '1a', '1b', 2, 3, 4, 5]
# 0: Community/Local level - HP, Village Health Committee, Community initiatives
# 1a: Primary level - Dispensary, HC, Clinic, Maternity facility
# 1b: Primary level - Community/Rural Hospital, CHAM (Community) Hospitals
# 2: Second level - District hospital, DHO
# 3: Tertiary/Referral level - KCH, MCH, ZCH + QECH as referral hospitals
# 4: Zomba Mental Hospital, which has very limited data in CHAI dataset
# 5: Headquarter, which has staff data (but no Time_Base or Incidence_Curr data)

# declare the Facility_Type variable
# Facility_Types = ['Health Post', 'Dispensary', 'Health Centre', 'Community or Rural Hospital', 'CHAM Hospital',
#                   'District Hospital', 'DHO', 'Referral Hospital', 'Zomba Mental Hospital']
# Facility_Types_Levels = dict(zip(Facility_Types, Facility_Levels))


# Create empty dataframe that will be the Master Facilities List (mfl)
mfl = pd.DataFrame(columns=['Facility_Level', 'District', 'Region'])

pop_districts = pop['District'].values  # array; the 'pop_districts' used in previous lines is a DataFrame
pop_regions = pd.unique(pop['Region'])

# Each district is assigned with a set of community level facs, a set of primary level facs,
# and a set of second level facs.
# Therefore, the total sets of facs is 4 * no. of districts + 3 (RefHos per Region) + 1 (HQ) + 1 (ZMH)
# = 4 * 32 + 5 = 133
# NOTE(review): DataFrame.append is deprecated and removed in pandas 2.0; pd.concat is the replacement
for d in pop_districts:
    df = pd.DataFrame({'Facility_Level': Facility_Levels[0:4], 'District': d,
                       'Region': pop.loc[pop['District'] == d, 'Region'].values[0]})
    mfl = mfl.append(df, ignore_index=True, sort=True)

# Add in the Referral Hospitals, one for each region
for r in pop_regions:
    mfl = mfl.append(pd.DataFrame({
        'Facility_Level': Facility_Levels[4], 'District': None, 'Region': r
    }, index=[0]), ignore_index=True, sort=True)

# Add the ZMH
mfl = mfl.append(pd.DataFrame({
    'Facility_Level': Facility_Levels[5], 'District': None, 'Region': None
}, index=[0]), ignore_index=True, sort=True)

# Add the HQ
mfl = mfl.append(pd.DataFrame({
    'Facility_Level': Facility_Levels[6], 'District': None, 'Region': None
}, index=[0]), ignore_index=True, sort=True)

# Create the Facility_ID (simply the positional index of each row)
mfl.loc[:, 'Facility_ID'] = mfl.index

# Create a unique name for each Facility
# (district facilities get 'Facility_Level_<level>_<district>'; national ones get fixed names)
name = 'Facility_Level_' + mfl['Facility_Level'].astype(str) + '_' + mfl['District']
name.loc[mfl['Facility_Level'] == 3] = 'Referral Hospital' + '_' + mfl.loc[
    mfl['Facility_Level'] == 3, 'Region']
name.loc[mfl['Facility_Level'] == 4] = 'Zomba Mental Hospital'
name.loc[mfl['Facility_Level'] == 5] = 'Headquarter'

mfl.loc[:, 'Facility_Name'] = name

# Save
mfl.to_csv(outputlocation / 'organisation' / 'ResourceFile_Master_Facilities_List.csv', index=False)

# ---------------------------------------------------------------------------------------------------------------------
# *** Create a simple mapping of all the facilities that persons in a district can access
facilities_by_district = pd.DataFrame(columns=mfl.columns)

# Each district in pop_districts has access to seven facility levels (0, 1a, 1b, 2, 3, 4, 5).
# For every district, collect the rows of the Master Facilities List that its residents can
# attend: the district's own level 0-2 facilities, the referral hospital of its region
# (level 3), Zomba Mental Hospital (level 4) and the Headquarter (level 5). The national
# facilities carry no District in the mfl, so a copy is taken and stamped with the
# district's name before being appended.
no_district = pd.isnull(mfl['District'])
no_region = pd.isnull(mfl['Region'])

for district_name in pop_districts:
    region_name = pop.loc[pop['District'] == district_name, 'Region'].copy().values[0]

    # The district's own facilities (levels 0, 1a, 1b and 2)
    own_facs = mfl.loc[mfl['District'] == district_name]

    # Level 3: the regional referral hospital, attributed to this district
    referral_fac = mfl.loc[no_district & (mfl['Region'] == region_name)].copy().reset_index(drop=True)
    referral_fac.loc[0, 'District'] = district_name

    # Level 4: Zomba Mental Hospital (national)
    zomba_mental_fac = mfl.loc[no_district & no_region &
                               (mfl['Facility_Name'] == 'Zomba Mental Hospital')].copy().reset_index(drop=True)
    zomba_mental_fac.loc[0, 'District'] = district_name

    # Level 5: Headquarter (national)
    hq_fac = mfl.loc[no_district & no_region &
                     (mfl['Facility_Name'] == 'Headquarter')].copy().reset_index(drop=True)
    hq_fac.loc[0, 'District'] = district_name

    facilities_by_district = pd.concat(
        [facilities_by_district, own_facs, referral_fac, zomba_mental_fac, hq_fac],
        ignore_index=True)

# check that the no. of facs is no. of districts times the no.
of fac levels = 32 * 7 = 224 +assert len(facilities_by_district) == len(pop_districts) * len(Facility_Levels) + +# Save +# facilities_by_district.to_csv(outputlocation / 'organisation' / 'ResourceFile_Facilities_For_Each_District.csv', +# index=False) + +# --------------------------------------------------------------------------------------------------------------------- +# *** Now look at the types of appointments +sheet = pd.read_excel(workingfile, sheet_name='Time_Base', header=None) + +# get rid of the junk rows +trimmed = sheet.loc[[7, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 24, 26, 27]] +data_import = pd.DataFrame(data=trimmed.iloc[1:, 2:].values, columns=trimmed.iloc[0, 2:], index=trimmed.iloc[1:, 1]) + +data_import = data_import.dropna(axis='columns', how='all') # get rid of the 'spacer' columns +data_import = data_import.fillna(0) + +# get rid of records for which there is no call on time of any type of officer +data_import = data_import.drop(columns=data_import.columns[data_import.sum() == 0]) + +# We note that the DCSA (CHW) never has a time requirement and that no appointments can be serviced at the HealthPost. +# We remedy this by inserting a new type of appointment, which only the DCSA can service, \ +# and the time taken is 10 minutes. 
+new_appt_for_CHW = pd.Series(index=data_import.index, + name='E01_ConWithDCSA', + # New appointment type is a consultation with the DCSA (community health worker) + data=[ + 0, # Central Hosp - Time + 0, # Central Hosp - Percent + 0, # District Hosp - Time + 0, # District Hosp - Percent + 0, # Comm Hosp - Time + 0, # Comm Hosp - Percent + 0, # Urban Health Centre - Time #10 mins + 0, # Urban Health Centre - Percent #100% + 0, # Rural Health Centre - Time #10 mins + 0, # Rural Health Centre - Percent #100% + 10.0, # Health Post - Time + 1.0, # Health Post - Percent + 0, # Dispensary - Time #10 mins + 0, # Dispensary - Percent #100% + ]) + +data_import = pd.concat([data_import, new_appt_for_CHW], axis=1) + +# Add service times for DHOs, which has quite a few data in 'Incidence_Curr', by copying the data of DisHos +new_rows_for_DHO = pd.DataFrame(index=['DHO', 'DHO_Per'], columns=data_import.columns.copy(), + data=data_import.loc[['DisHos', 'DisHos_Per'], :].copy().values) + +# Add service times (Mental OPD and Mental Clinic Visit) for Zomba Mental Hospital, by copying data of CenHos +new_rows_for_ZMH = pd.DataFrame(index=['ZMH', 'ZMH_Per'], columns=data_import.columns.copy(), + data=0) +new_rows_for_ZMH.loc[:, ['C01_MentOPD', 'C01_MentClinic']] = data_import.loc[ + ['CenHos', 'CenHos_Per'], ['C01_MentOPD', 'C01_MentClinic']].copy().values +# If consider all potential cadres from compiled staff return and all associated services +# new_rows_for_ZMH = pd.DataFrame(index=['ZMH','ZMH_Per'],columns=data_import.columns.copy(), +# data=data_import.loc[['CenHos','CenHos_Per'],:].copy().values) + +data_import = pd.concat([data_import, new_rows_for_DHO, new_rows_for_ZMH]) + +# data_import ready! + +# Break apart composite to give the appt_type and the officer_type +# This is used to know which column to read below... 
+chai_composite_code = pd.Series(data_import.columns) +chai_code = chai_composite_code.str.split(pat='_', expand=True).reset_index(drop=True) +chai_code = chai_code.rename(columns={0: 'Officer_Type_Code', 1: 'Appt_Type_Code'}) + +# check that officer codes line up with the officer codes already imported +assert set(chai_code['Officer_Type_Code']).issubset(set(officer_types_table['Officer_Type_Code'])) + +# Make dataframe summarising the types of appointments + +retained_appt_type_code = pd.unique(chai_code['Appt_Type_Code']) + +appt_types_table_import = sheet.loc[(1, 2, 6), 2:].transpose().reset_index(drop=True).copy() +appt_types_table_import = appt_types_table_import.rename(columns={1: 'Appt_Cat', 2: 'Appt_Type', 6: 'Appt_Type_Code'}) +appt_types_table_import['Appt_Cat'] = pd.Series(appt_types_table_import['Appt_Cat']).fillna(method='ffill') +appt_types_table_import['Appt_Type'] = pd.Series(appt_types_table_import['Appt_Type']).fillna(method='ffill') +appt_types_table_import['Appt_Type_Code'] = pd.Series(appt_types_table_import['Appt_Type_Code']).fillna(method='ffill') +appt_types_table_import = appt_types_table_import.drop_duplicates().reset_index(drop=True) + +# starting with the retained appt codes, merge in these descriptions +appt_types_table = pd.DataFrame(data={'Appt_Type_Code': retained_appt_type_code}).merge(appt_types_table_import, + on='Appt_Type_Code', how='left', + indicator=True) + +# Fill in the missing information about the appointment type that was added above +appt_types_table.loc[appt_types_table['Appt_Type_Code'] == new_appt_for_CHW.name.split('_')[1], 'Appt_Cat'] = \ + new_appt_for_CHW.name.split('_')[1] +appt_types_table.loc[appt_types_table['Appt_Type_Code'] == new_appt_for_CHW.name.split('_')[1], 'Appt_Type'] = \ + new_appt_for_CHW.name.split('_')[1] + +# drop the merge check column +appt_types_table.drop(columns='_merge', inplace=True) + +# Replace space with underscore in the Appt_Cat +appt_types_table['Appt_Cat'].replace(to_replace=' ', 
value='_', regex=True, inplace=True) +appt_types_table['Appt_Cat'].replace(to_replace=' ', value='_', regex=True, inplace=True) + +# Check no holes +assert not pd.isnull(appt_types_table).any().any() + +# Save +appt_types_table.to_csv(outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Types_Table.csv', + index=False) + +# --------------------------------------------------------------------------------------------------------------------- +# *** Now, make the ApptTimeTable +# (Table that gives for each appointment, when occurring in each appt_type at each facility type, the time of each \ +# type of officer required + +# The sheet gives the % of appointments that require a particular type of officer and the time taken if it does +# So, turn that into an Expectation of the time taken for each type of officer (multiplying together) + +# This sheet distinguished between different types of facility in terms of the time taken by appointments occurring \ +# at each. +# But the CHAI data do not distinguish how many officers work at each different level of facility +# (Available staff counts for only districts (level = 0,1a,1b,2), CenHos (level = 3), and HQ (level = 5)) +# Therefore, we will map these to the facility level that have been defined. +# NB. 
In doing this, we: +# - assume that the time taken for all appointments at each level is modelled by that for the average of \ +# facility types at that level + +# CHAI: Headquarter ---> our "headquarter" (level = 5) +# CHAI: Zomba Mental Hospital ---> our 'Zomba Mental Hospital' / 'ZMH' (level = 4) +# CHAI: Central_Hospital ---> our "Referral Hospital" (level = 3) +# CHAI: District_Hospital ---> averaged into our "second level" facilities (level = 2) +# CHAI: DHO ---> averaged into our "second level" facilities (level = 2) +# CHAI: Community_Hospital ---> averaged into our "primary level" facilities (level = 1b) +# CHAI: Urban_HealthCentre ---> averaged into our "primary level" facilities (level = 1a) +# CHAI: Rural_HealthCentre ---> averaged into our "primary level" facilities (level = 1a) +# CHAI: Dispensary ---> averaged into our "primary level" facilities (level = 1a) +# CHAI: HealthPost ---> averaged into our "community level" facilities (level = 0) + +# level 4 +ZMH_ExpectTime = data_import.loc['ZMH'] * data_import.loc['ZMH_Per'] + +# Level 3 +Central_Hospital_ExpecTime = data_import.loc['CenHos'] * data_import.loc['CenHos_Per'] + +# level 5; No data available for Headquarter; we assign NAN to it +HQ_ExpecTime = Central_Hospital_ExpecTime.copy() +HQ_ExpecTime.loc[:] = np.nan + +# level 2 +District_Hospital_ExpecTime = data_import.loc['DisHos'] * data_import.loc['DisHos_Per'] +DHO_ExpecTime = data_import.loc['DHO'] * data_import.loc['DHO_Per'] + +# level 1b +Community_Hospital_ExpecTime = data_import.loc['ComHos'] * data_import.loc['ComHos_Per'] + +# level 1a +Urban_HealthCentre_ExpecTime = data_import.loc['UrbHC'] * data_import.loc['UrbHC_Per'] +Rural_HealthCentre_ExpecTime = data_import.loc['RurHC'] * data_import.loc['RurHC_Per'] +Disp_ExpecTime = data_import.loc['Disp'] * data_import.loc['Disp_Per'] + +# level 0 +HealthPost_ExpecTime = data_import.loc['HP'] * data_import.loc['HP_Per'] + +# Average time for levels 2 and 1a, which have data for more than 1 
facility types +Avg_Level2_ExpectTime = (District_Hospital_ExpecTime + DHO_ExpecTime) / 2 # Identical to DisHos Expected Time +Avg_Level1a_ExpectTime = (Disp_ExpecTime + Urban_HealthCentre_ExpecTime + Rural_HealthCentre_ExpecTime) / 3 + +# Assemble +X = pd.DataFrame({ + 5: HQ_ExpecTime, # (Headquarter) + 4: ZMH_ExpectTime, # (Zomba Mental Hospital) + 3: Central_Hospital_ExpecTime, # (our "Referral Hospital" at region level) + 2: Avg_Level2_ExpectTime, # (DHO and DisHos at second level ) + '1b': Community_Hospital_ExpecTime, # (ComHos at primary level) + '1a': Avg_Level1a_ExpectTime, # (UrbHC,RurHC and Disp at primary level) + 0: HealthPost_ExpecTime # (HP at community level) +}) + +assert set(X.columns) == set(Facility_Levels) + +# Split out the index into appointment type and officer type +labels = pd.Series(X.index, index=X.index).str.split(pat='_', expand=True) +labels = labels.rename(columns={0: 'Officer_Type_Code', 1: 'Appt_Type_Code'}) +Y = pd.concat([X, labels], axis=1) +ApptTimeTable = pd.melt(Y, id_vars=['Officer_Type_Code', 'Appt_Type_Code'], + var_name='Facility_Level', value_name='Time_Taken_Mins') + +# Confirm that Facility_Level is an int ---> No longer needed, as level 1a and 1b are not integers +# ApptTimeTable['Facility_Level'] = ApptTimeTable['Facility_Level'].astype(int) + +# Merge in Officer_Type +ApptTimeTable = ApptTimeTable.merge(officer_types_table, on='Officer_Type_Code') + +# confirm that we have the same number of entries as we were expecting +assert len(ApptTimeTable) == len(Facility_Levels) * len(data_import.columns) + +# drop the rows that contain no call on resources, including NAN values +ApptTimeTable = ApptTimeTable.drop(ApptTimeTable[ApptTimeTable['Time_Taken_Mins'] == 0].index) +ApptTimeTable = ApptTimeTable.drop(ApptTimeTable[pd.isnull(ApptTimeTable['Time_Taken_Mins'])].index) +# reset index +ApptTimeTable.reset_index(drop=True, inplace=True) + +# Generate appt_time_table_coarse with officer_category, instead of officer_type 
+appt_time_table_coarse = pd.DataFrame( + ApptTimeTable.groupby(['Appt_Type_Code', 'Facility_Level', 'Officer_Category']).sum() +).reset_index() + +# Save +appt_time_table_coarse.to_csv(outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table.csv', + index=False) + +# --------------------------------------------------------------------------------------------------------------------- +# *** Create a table that determines what kind of appointment can be serviced in each Facility Level +ApptType_By_FacLevel = pd.DataFrame(index=appt_types_table['Appt_Type_Code'], + columns=Facility_Levels, + data=False, + dtype=bool) + +for appt_type in ApptType_By_FacLevel.index: + for fac_level in ApptType_By_FacLevel.columns: + # Can this appt_type happen at this facility_level? + # Check to see if ApptTimeTable has any time requirement + + ApptType_By_FacLevel.at[appt_type, fac_level] = \ + ((ApptTimeTable['Facility_Level'] == fac_level) & (ApptTimeTable['Appt_Type_Code'] == appt_type)).any() + +ApptType_By_FacLevel = ApptType_By_FacLevel.add_prefix('Facility_Level_') + +# Generate appt_type_by_level_coarse consider officer_category, instead of officer_type +appt_type_by_level_coarse = pd.DataFrame(index=appt_types_table['Appt_Type_Code'], + columns=Facility_Levels, + data=False, + dtype=bool) + +for appt_type in appt_type_by_level_coarse.index: + for fac_level in appt_type_by_level_coarse.columns: + # Can this appt_type happen at this facility_level? 
+ # Check to see if appt_time_table_coarse has any time requirement + + appt_type_by_level_coarse.at[appt_type, fac_level] = \ + ((appt_time_table_coarse['Facility_Level'] == fac_level) & ( + appt_time_table_coarse['Appt_Type_Code'] == appt_type)).any() + +appt_type_by_level_coarse = appt_type_by_level_coarse.add_prefix('Facility_Level_') + +# Check; The two tables should be equal +assert (appt_type_by_level_coarse == ApptType_By_FacLevel).all().all() + +# Save +ApptType_By_FacLevel.to_csv( + outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_ApptType_By_FacLevel.csv', index=True) + +# --- check +# Look to see where different types of staff member need to be located: +# This is just a reverse reading of where there are non-zero requests for time of particular officer-types + +Officers_Need_For_Appt = pd.DataFrame(columns=['Facility_Level', 'Appt_Type_Code', 'Officer_Type_Codes']) + +for a in appt_types_table['Appt_Type_Code'].values: + for f in Facility_Levels: + + # get the staff types required for this appt + + block = ApptTimeTable.loc[(ApptTimeTable['Appt_Type_Code'] == a) & (ApptTimeTable['Facility_Level'] == f)] + + if len(block) == 0: + # no requirement expressed => The appt is not possible at this location + Officers_Need_For_Appt = Officers_Need_For_Appt.append( + {'Facility_Level': f, + 'Appt_Type_Code': a, + 'Officer_Type_Codes': False + }, ignore_index=True) + + else: + need_officer_types = list(block['Officer_Type_Code']) + Officers_Need_For_Appt = Officers_Need_For_Appt.append( + {'Facility_Level': f, + 'Appt_Type_Code': a, + 'Officer_Type_Codes': need_officer_types + }, ignore_index=True) + +# Turn this into the the set of staff that are required for each type of appointment +FacLevel_By_Officer = pd.DataFrame(columns=Facility_Levels, + index=officer_types_table['Officer_Type_Code'].values) +FacLevel_By_Officer = FacLevel_By_Officer.fillna(False) + +for o in officer_types_table['Officer_Type_Code'].values: + + for i in 
Officers_Need_For_Appt.index: + + fac_level = Officers_Need_For_Appt.loc[i].Facility_Level + officer_types = Officers_Need_For_Appt.loc[i].Officer_Type_Codes + + if officer_types is not False: # (i.e. such an appointment at such a a facility is possible) + + if o in officer_types: + FacLevel_By_Officer.loc[(FacLevel_By_Officer.index == o), fac_level] = True + +# We note that three officer_types ("T01: Nutrition Staff", "R03: Sonographer" and "RO4: Radiotherapy technician") are\ +# apparently not called by any appointment type + +# Assign that the Nutrition Staff will go to the Referral Hospitals (level = 3) +FacLevel_By_Officer.loc['T01', 3] = True + +# Assign that the Sonographer will go to the Referral Hospitals (level = 3) +FacLevel_By_Officer.loc['R03', 3] = True + +# Assign that the Radiotherapist will go to the Referral Hospitals (level = 3) +FacLevel_By_Officer.loc['R04', 3] = True + +# As an option, we could assign staff at HQ to level 5 according to the info of staff +# Get the sets of officers of funded and current staff +fund_staff_HQ = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Headquarter'].copy() +curr_staff_HQ = curr_staffing_table[curr_staffing_table['District_Or_Hospital'] == 'Headquarter'].copy() +fund_staff_HQ.drop(columns=['District_Or_Hospital', 'Facility_Level', 'Is_DistrictLevel'], inplace=True) +curr_staff_HQ.drop(columns=['District_Or_Hospital', 'Facility_Level', 'Is_DistrictLevel'], inplace=True) +fund_staff_HQ_Positive = fund_staff_HQ.loc[:, (fund_staff_HQ > 0).any(axis=0)] +curr_staff_HQ_Positive = curr_staff_HQ.loc[:, (curr_staff_HQ > 0).any(axis=0)] +# The union of the two sets +staff_call_at_HQ = fund_staff_HQ_Positive.columns.union(curr_staff_HQ_Positive.columns) +# Assign true value to staff_call_at_HQ +for s in staff_call_at_HQ: + FacLevel_By_Officer.loc[s, 5] = True + +# Check that all types of officer are allocated to at least one type of facility excl. 
HQ/Level_5 +assert (FacLevel_By_Officer.iloc[:, 0:6].sum(axis=1) > 0).all() + +# Change columns names: 0 -> Facility_Level_0 +FacLevel_By_Officer = FacLevel_By_Officer.add_prefix('Facility_Level_') + +# --------------------------------------------------------------------------------------------------------------------- +# *** Get Hours and Minutes Worked Per Staff Member, i.e., the daily capabilities +# First, read-in the number of working hours and days for each type of officer + +pft_sheet = pd.read_excel(workingfile, sheet_name='PFT', header=None) +officer_types_import = pft_sheet.iloc[2, np.arange(2, 23)] + +assert set(officer_types_import) == set(officer_types_table['Officer_Type_Code']) +assert len(officer_types_import) == len(officer_types_table['Officer_Type_Code']) + +# patient facing hours daily at hospitals +hours_hospital = pft_sheet.iloc[38, np.arange(2, 23)] + +# patient facing hours daily at health centres +work_mins_hc = pft_sheet.iloc[26, np.arange(2, 23)] +admin_mins_hc = pft_sheet.iloc[34, np.arange(2, 23)] +hours_hc = (work_mins_hc - admin_mins_hc) / 60 + +# Total working days per year +days_per_year_men = pft_sheet.iloc[15, np.arange(2, 23)] +days_per_year_women = pft_sheet.iloc[16, np.arange(2, 23)] +days_per_year_pregwomen = pft_sheet.iloc[17, np.arange(2, 23)] + +# Percents of men, nonpregnant women, and pregnant women +fr_men = pft_sheet.iloc[53, np.arange(2, 23)] +fr_pregwomen = pft_sheet.iloc[55, np.arange(2, 23)] * pft_sheet.iloc[57, np.arange(2, 23)] +fr_nonpregwomen = pft_sheet.iloc[55, np.arange(2, 23)] * (1 - pft_sheet.iloc[57, np.arange(2, 23)]) + +# Total average working days +workingdays = (fr_men * days_per_year_men) + (fr_nonpregwomen * days_per_year_women) + ( + fr_pregwomen * days_per_year_pregwomen) + +# --- patient facing time +# Average mins per year, Average hours per day, Average number of mins per day in Malawi + +mins_per_day_hospital = hours_hospital * 60 +mins_per_day_hc = hours_hc * 60 + +mins_per_year_hospital = 
mins_per_day_hospital * workingdays +mins_per_year_hc = mins_per_day_hc * workingdays + +av_mins_per_day_hospital = mins_per_year_hospital / 365.25 +av_mins_per_day_hc = mins_per_year_hc / 365.25 + +# PFT - hospital and health centre individually +HosHC_patient_facing_time = pd.DataFrame( + {'Officer_Type_Code': officer_types_import, 'Working_Days_Per_Year': workingdays, + 'Hospital_Hours_Per_Day': hours_hospital, 'HC_Hours_Per_Day': hours_hc, + 'Hospital_Av_Mins_Per_Day': av_mins_per_day_hospital, + 'HC_Av_Mins_Per_Day': av_mins_per_day_hc} +).reset_index(drop=True) + +# PFT table ready! + +# Create final tables of daily time available at each facility by officer type: Facility_ID, Facility_Type, +# Facility_Level, Officer_Type, Officer_Type_Code, Total Average Minutes Per Day, Staff_Count + +# --- Daily capability for funded staff; staff counts in floats +# For float staff counts, calculate total minutes per day +funded_staff_floats = fund_staffing_table_to_save.copy() # staff counts +funded_daily_minutes = funded_staff_floats.copy() # total minutes per day + +for i in funded_daily_minutes.index: + the_level = funded_daily_minutes.loc[i, 'Facility_Level'] + for officer in officer_types_table['Officer_Type_Code']: + if the_level in ['Facility_Level_0', 'Facility_Level_1a']: # Levels 0, 1a; HC minutes + t = (funded_staff_floats.loc[i, officer] * + HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, + 'HC_Av_Mins_Per_Day']) + funded_daily_minutes.loc[i, officer] = t.values + else: # Levels 1b, 2, and above; Hospital minutes + t = (funded_staff_floats.loc[i, officer] * + HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, + 'Hospital_Av_Mins_Per_Day']) + funded_daily_minutes.loc[i, officer] = t.values + +# Long format +funded_staff_floats = pd.melt(funded_staff_floats, id_vars=['District_Or_Hospital', 'Facility_Level'], + var_name='Officer_Type_Code', value_name='Staff_Count') 
+funded_daily_minutes = pd.melt(funded_daily_minutes, id_vars=['District_Or_Hospital', 'Facility_Level'], + var_name='Officer_Type_Code', value_name='Total_Mins_Per_Day') +# Merge into daily capability table +funded_daily_capability = funded_daily_minutes.merge(funded_staff_floats, how='left') + +# Reset facility level column to exclude 'Facility_Level_' +funded_daily_capability['Facility_Level'] = \ + funded_daily_capability['Facility_Level'].str.split(pat='_', expand=True).iloc[:, 2] +# Drop row with zero minutes (also zero staff counts) +funded_daily_capability.drop( + index=funded_daily_capability[funded_daily_capability['Total_Mins_Per_Day'] == 0].index, inplace=True) +# Reset index +funded_daily_capability.reset_index(drop=True, inplace=True) + +# Add 'District' and 'Facility_Name' columns +for i in funded_daily_capability.index: + the_level = funded_daily_capability.loc[i, 'Facility_Level'] + if the_level in ['0', '1a', '1b', '2']: + the_district = funded_daily_capability.loc[i, 'District_Or_Hospital'] + funded_daily_capability.loc[i, 'District'] = the_district + funded_daily_capability.loc[i, 'Facility_Name'] = 'Facility_Level_' + str(the_level) + '_' + the_district + else: + funded_daily_capability.loc[i, 'Facility_Name'] = funded_daily_capability.loc[i, 'District_Or_Hospital'] +# Drop column 'District_Or_Hospital' +funded_daily_capability.drop(columns='District_Or_Hospital', inplace=True) + +# Add info from mfl: Region and Facility ID +for i in funded_daily_capability.index: + the_facility_name = funded_daily_capability.loc[i, 'Facility_Name'] + the_ID = mfl.loc[mfl['Facility_Name'] == the_facility_name, 'Facility_ID'] + the_region = mfl.loc[mfl['Facility_Name'] == the_facility_name, 'Region'] + + funded_daily_capability.loc[i, 'Facility_ID'] = the_ID.values + funded_daily_capability.loc[i, 'Region'] = the_region.values + +# Add 'officer_category' info +funded_daily_capability = funded_daily_capability.merge(officer_types_table, on='Officer_Type_Code', 
how='left') + +# Group by officer categories; consider coarse officers +funded_daily_capability_coarse = pd.DataFrame( + funded_daily_capability.groupby( + ['Facility_ID', 'Facility_Name', 'Facility_Level', 'District', 'Region', 'Officer_Category'], + dropna=False).sum() +).reset_index() +# Drop columns of officer types +funded_daily_capability_coarse.drop(columns=['Officer_Type_Code', 'Officer_Type'], inplace=True) +funded_daily_capability_coarse.reset_index(drop=True, inplace=True) + +# --- Daily capability for current staff; staff counts in floats +# For float staff counts, calculate total minutes per day +curr_staff_floats = curr_staffing_table_to_save.copy() # staff counts +curr_daily_minutes = curr_staff_floats.copy() # total minutes per day + +for i in curr_daily_minutes.index: + the_level = curr_daily_minutes.loc[i, 'Facility_Level'] + for officer in officer_types_table['Officer_Type_Code']: + if the_level in ['Facility_Level_0', 'Facility_Level_1a']: # Levels 0, 1a; HC minutes + t = (curr_staff_floats.loc[i, officer] * + HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, + 'HC_Av_Mins_Per_Day']) + curr_daily_minutes.loc[i, officer] = t.values + else: # Levels 1b, 2, and above; Hospital minutes + t = (curr_staff_floats.loc[i, officer] * + HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, + 'Hospital_Av_Mins_Per_Day']) + curr_daily_minutes.loc[i, officer] = t.values + +# Long format +curr_staff_floats = pd.melt(curr_staff_floats, id_vars=['District_Or_Hospital', 'Facility_Level'], + var_name='Officer_Type_Code', value_name='Staff_Count') +curr_daily_minutes = pd.melt(curr_daily_minutes, id_vars=['District_Or_Hospital', 'Facility_Level'], + var_name='Officer_Type_Code', value_name='Total_Mins_Per_Day') +# Merge into daily capability table +curr_daily_capability = curr_daily_minutes.merge(curr_staff_floats, how='left') + +# Reset facility level column to exclude 'Facility_Level_' 
+curr_daily_capability['Facility_Level'] = \ + curr_daily_capability['Facility_Level'].str.split(pat='_', expand=True).iloc[:, 2] +# Drop row with zero minutes (also zero staff counts) +curr_daily_capability.drop( + index=curr_daily_capability[curr_daily_capability['Total_Mins_Per_Day'] == 0].index, inplace=True) +# Reset index +curr_daily_capability.reset_index(drop=True, inplace=True) + +# Add 'District' and 'Facility_Name' columns +for i in curr_daily_capability.index: + the_level = curr_daily_capability.loc[i, 'Facility_Level'] + if the_level in ['0', '1a', '1b', '2']: + the_district = curr_daily_capability.loc[i, 'District_Or_Hospital'] + curr_daily_capability.loc[i, 'District'] = the_district + curr_daily_capability.loc[i, 'Facility_Name'] = 'Facility_Level_' + str(the_level) + '_' + the_district + else: + curr_daily_capability.loc[i, 'Facility_Name'] = curr_daily_capability.loc[i, 'District_Or_Hospital'] +# Drop column 'District_Or_Hospital' +curr_daily_capability.drop(columns='District_Or_Hospital', inplace=True) + +# Add info from mfl: Region and Facility ID +for i in curr_daily_capability.index: + the_facility_name = curr_daily_capability.loc[i, 'Facility_Name'] + the_ID = mfl.loc[mfl['Facility_Name'] == the_facility_name, 'Facility_ID'] + the_region = mfl.loc[mfl['Facility_Name'] == the_facility_name, 'Region'] + + curr_daily_capability.loc[i, 'Facility_ID'] = the_ID.values + curr_daily_capability.loc[i, 'Region'] = the_region.values + +# Add 'officer_category' info +curr_daily_capability = curr_daily_capability.merge(officer_types_table, on='Officer_Type_Code', how='left') + +# Group by officer categories; consider coarse officers +curr_daily_capability_coarse = pd.DataFrame( + curr_daily_capability.groupby( + ['Facility_ID', 'Facility_Name', 'Facility_Level', 'District', 'Region', 'Officer_Category'], + dropna=False).sum() +).reset_index() +# Drop columns of officer types +curr_daily_capability_coarse.drop(columns=['Officer_Type_Code', 'Officer_Type'], 
inplace=True) +curr_daily_capability_coarse.reset_index(drop=True, inplace=True) + +# Save +# HosHC_patient_facing_time.to_csv( +# outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Patient_Facing_Time.csv', index=False) + +# Need to # two lines below when generate funded_plus capability +# funded_daily_capability_coarse.to_csv( +# outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Daily_Capabilities.csv', index=False) + +# *** Only for funded_plus ******************************************************************************************** +funded_daily_capability_coarse.to_csv( + outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Daily_Capabilities.csv', index=False) +# ********************************************************************************************************************* + +curr_daily_capability_coarse.to_csv( + outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Daily_Capabilities.csv', index=False) + + +# --------------------------------------------------------------------------------------------------------------------- +# final check that for an appointment required at a particular level (in Appt_Time_Table), \ +# then indeed, the staff capabilities are available to satisfy that, for a person in any district \ +# (including the regional and national facilities) + +# Define the check function +def all_appts_can_run(capability): + # Creat a table storing whether the appts have consistent requirements/demand and capabilities/supply + appt_have_or_miss_capability = appt_time_table_coarse.copy() + # Delete the column of minutes + appt_have_or_miss_capability.drop(columns=['Time_Taken_Mins'], inplace=True) + # Store the info of district (including central hospital, ZMH) that fails + appt_have_or_miss_capability.loc[:, 'fail_district'] = '' + + for _I in appt_have_or_miss_capability.index: # Loop through all appts + # Get the info of app, level and officer_category + # the_appt = 
appt_have_or_miss_capability.loc[I, 'Appt_Type_Code'] + L = appt_have_or_miss_capability.loc[_I, 'Facility_Level'] + the_officer_category = appt_have_or_miss_capability.loc[_I, 'Officer_Category'] + + # Check in daily_capabilities that the required officer_category at a level is there or not, for every district + # Store the info of district (including central hospital, ZMH) that fails + if L in Facility_Levels[0:4]: # Levels 0, 1a, 1b, 2 + k = 0 # Record how many districts fail + for D in pop_districts: + idx = capability[ + (capability['District'] == D) & + (capability['Facility_Level'] == str(L)) & + (capability['Officer_Category'] == the_officer_category)].index + if idx.size == 0: + # Store the district that fails to provide required officer_category + appt_have_or_miss_capability.loc[_I, 'fail_district'] = \ + appt_have_or_miss_capability.loc[_I, 'fail_district'] + D + ',' + k += 1 + if k == 0: + appt_have_or_miss_capability.loc[_I, 'fail_district'] = 'All districts pass' + elif L == 3: # Level 3 central hospital + m = 0 # Record how many regions fail + for region in pop_regions: + idx1 = capability[ + (capability['Region'] == region) & + (capability['Facility_Level'] == str(L)) & + (capability['Officer_Category'] == the_officer_category)].index + if idx1.size == 0: + # Store the regional hospital that fails + appt_have_or_miss_capability.loc[_I, 'fail_district'] = \ + appt_have_or_miss_capability.loc[_I, 'fail_district'] + 'Referral Hospital_' + region + ',' + m += 1 + if m == 0: + appt_have_or_miss_capability.loc[_I, 'fail_district'] = 'All districts pass' + elif L == 4: # Zomba Mental Hospital + n = 0 # Record is ZMH failed + idx2 = capability[ + (capability['Facility_Level'] == str(L)) & + (capability['Officer_Category'] == the_officer_category)].index + if idx2.size == 0: + appt_have_or_miss_capability.loc[_I, 'fail_district'] = \ + appt_have_or_miss_capability.loc[_I, 'fail_district'] + 'Zomba Mental Hospital,' + n += 1 + if n == 0: + 
appt_have_or_miss_capability.loc[_I, 'fail_district'] = 'All districts pass' + else: + assert 0 == 1 # There should be no 'else'; otherwise, the generated tables above is incorrect + + return appt_have_or_miss_capability + + +# Save results for funded +# appt_have_or_miss_capability_funded = all_appts_can_run(funded_daily_capability_coarse) +# appt_have_or_miss_capability_funded.to_csv( +# outputlocation / 'human_resources' / 'funded' / 'appt_have_or_miss_capability.csv', index=False) +# appt_have_or_miss_capability_funded.to_csv( +# outputlocation / 'human_resources' / 'funded_plus' / 'appt_have_or_miss_capability.csv', index=False) + +# Save results for actual +# appt_have_or_miss_capability_actual = all_appts_can_run(curr_daily_capability_coarse) +# appt_have_or_miss_capability_actual.to_csv( +# outputlocation / 'human_resources' / 'actual' / 'appt_have_or_miss_capability.csv', index=False) From a8b80e9449a368e4d2c9f549c86db771d26aa2b5 Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 15 Feb 2022 15:10:02 +0000 Subject: [PATCH 046/131] Update to line 878: same 21 cadres; CHAI establishment staff has data for Likoma, so no need to create a row for Likoma; but note here that Likoma has 0 DCSA staff --- .../formatting_healthsystem_data_update.py | 120 ++++++------------ 1 file changed, 38 insertions(+), 82 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index b2d3091bcc..503673d1db 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -65,7 +65,7 @@ # --------------------------------------------------------------------------------------------------------------------- # *** create and save population_by_district data population = pd.read_csv( - resourcefilepath/'demography'/'ResourceFile_PopulationSize_2018Census.csv' + 
resourcefilepath / 'demography' / 'ResourceFile_PopulationSize_2018Census.csv' ) pop_by_district = pd.DataFrame(population.groupby('District')['Count'].sum()) @@ -78,14 +78,14 @@ # pop_by_district.to_csv(outputlocation / 'organisation' / 'ResourceFile_District_Population_Data.csv', index=True) # --------------------------------------------------------------------------------------------------------------------- -# *** Below we generate staffing tables: fund_staffing_table for funded/established staff, and\ +# *** Below we generate staffing tables: fund_staffing_table for established staff, and\ # curr_staffing_table for current staff # Before generating the tables, we need to prepare wb_import, officer_types_table, and\ # make assumptions of curr_staff_return distribution and fund_staff_return distribution using Auxiliary CHAI Data # --- wb_import for staff information -# Import all of the 'CurrentStaff' sheet, including both data of current and funded staff +# Import all of the 'Staff' sheet, including both data of current and funded staff wb_import = pd.read_excel(workingfile, sheet_name='Staff', header=None) # --- officer_types_table @@ -669,9 +669,9 @@ # compare_staff_distribution.to_csv(outputlocation / 'ResourceFile_Staff_Distribution_Compare.csv', index=False) # *** -# --- fund_staffing_table for funded/established staff +# --- fund_staffing_table for established staff # Extract just the section about "Funded TOTAl Staff' -wb_extract = wb_import.loc[3:37, 64:84] +wb_extract = wb_import.loc[3:39, 64:84] wb_extract = wb_extract.drop([4, 5]) wb_extract.columns = wb_extract.iloc[0] wb_extract = wb_extract.drop([3]) @@ -680,10 +680,10 @@ # Add in the column to the dataframe for the labels that distinguishes whether # these officers are allocated to the district-or-lower levels or one of the key hospitals. 
-labels = wb_import.loc[6:37, 0].reset_index(drop=True) +labels = wb_import.loc[6:39, 0].reset_index(drop=True) is_distlevel = labels.copy() -is_distlevel[0:27] = True # for district-or-lower levels -is_distlevel[27:] = False # for CenHos-or-above levels +is_distlevel[0:28] = True # for district-or-lower levels +is_distlevel[28:] = False # for CenHos-or-above levels wb_extract.loc[:, 'District_Or_Hospital'] = labels wb_extract.loc[:, 'Is_DistrictLevel'] = is_distlevel @@ -691,40 +691,21 @@ # Finished import from the CHAI excel: fund_staffing_table = wb_extract.copy() -# There are a large number of officer_types EO1 (DCSA/Comm Health Workers) at HQ level, which is non-sensible -# Therefore, re-distribute these evenly to the districts. -extra_CHW = fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', - fund_staffing_table.columns[fund_staffing_table.columns == 'E01']].values[0][0] -fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', - fund_staffing_table.columns[fund_staffing_table.columns == 'E01']] = 0 -extra_CHW_per_district = int(np.floor(extra_CHW / fund_staffing_table['Is_DistrictLevel'].sum())) -fund_staffing_table.loc[fund_staffing_table['Is_DistrictLevel'], 'E01'] = \ - fund_staffing_table.loc[fund_staffing_table['Is_DistrictLevel'], 'E01'] + \ - extra_CHW_per_district - -# The imported staffing table suggest that there is 1 Dental officer (D01) in each district, -# but the TimeBase data (below) suggest that no appointment occurring at a district-level Facility can incur -# the time such an officer. 
Therefore reallocate the D01 officers to the Referral Hospitals -extra_D01 = fund_staffing_table.loc[ - ~fund_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), - fund_staffing_table.columns[fund_staffing_table.columns == 'D01']].sum().values[0] -fund_staffing_table.loc[~fund_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), - fund_staffing_table.columns[fund_staffing_table.columns == 'D01']] = 0 -extra_D01_per_referralhosp = extra_D01 / 4 # divided by 4 CenHos -fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), 'D01'] = \ - fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), 'D01'] + \ - extra_D01_per_referralhosp +# The imported staffing table suggest that there is some Dental officer (D01) in most districts, +# but the TimeBase data (below) suggest that D01 is only needed at central hospitals. +# This potential inconsistency can be solved by re-allocating D01 from districts to central hospitals, but +# currently we keep the source data as it is the establishment and CHAI team does not recommend such re-allocation. # *** Only for funded_plus ******************************************************************************************** # Since districts Balaka,Machinga,Mwanza,Neno,Ntchisi,Salima and central hospitals have 0 C01, while C01 is \ # required by Mental appts at level 1b, level 2 and level 3, we move some C01 from 'HQ or missing' to them. 
\ # To achieve this, we evenly distribute 30 C01 at HQ to all districts and central hospitals (27 DisHos, 4 CenHos) -C01_at_HQ = fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'C01'].values -extra_C01_per_district_CenHos = C01_at_HQ / 31 -fund_staffing_table.loc[~fund_staffing_table['District_Or_Hospital'].isin(['HQ or missing']), 'C01'] = ( - fund_staffing_table.loc[~fund_staffing_table['District_Or_Hospital'].isin(['HQ or missing']), 'C01'] + - extra_C01_per_district_CenHos) -fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'C01'] = 0 +# C01_at_HQ = fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'C01'].values +# extra_C01_per_district_CenHos = C01_at_HQ / 31 +# fund_staffing_table.loc[~fund_staffing_table['District_Or_Hospital'].isin(['HQ or missing']), 'C01'] = ( +# fund_staffing_table.loc[~fund_staffing_table['District_Or_Hospital'].isin(['HQ or missing']), 'C01'] + +# extra_C01_per_district_CenHos) +# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'C01'] = 0 # ********************************************************************************************************************* # Sort out which are district allocations and which are central hospitals and above @@ -732,7 +713,7 @@ # We assign HQ to HQ; KCH as RefHos in Central region; MCH as RefHos in Northern region; # QECH and ZCH as RefHos in Southern region (QECH is in Southwest and ZCH is in Southeast). 
fund_staffing_table.loc[ - fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'District_Or_Hospital'] = 'Headquarter' + fund_staffing_table['District_Or_Hospital'] == 'HQ', 'District_Or_Hospital'] = 'Headquarter' fund_staffing_table.loc[ fund_staffing_table['District_Or_Hospital'] == 'KCH', 'District_Or_Hospital'] = 'Referral Hospital_Central' fund_staffing_table.loc[ @@ -745,6 +726,8 @@ fund_staffing_table['District_Or_Hospital'] == 'ZCH', 'District_Or_Hospital'] = 'Referral Hospital_Southern' # fund_staffing_table.loc[ # fund_staffing_table['District_Or_Hospital'] == 'ZCH', 'District_Or_Hospital'] = 'Referral Hospital_Southeast' +fund_staffing_table.loc[ + fund_staffing_table['District_Or_Hospital'] == 'ZMH', 'District_Or_Hospital'] = 'Zomba Mental Hospital' # Group the referral hospitals QECH and ZCH as Referral Hospital_Southern Is_DistrictLevel = fund_staffing_table['Is_DistrictLevel'].values # Save the column 'Is_DistrictLevel' first @@ -752,42 +735,14 @@ fund_staffing_table.groupby(by=['District_Or_Hospital'], sort=False).sum()).reset_index() fund_staffing_table.insert(1, 'Is_DistrictLevel', Is_DistrictLevel[:-1]) # Add the column 'Is_DistrictLevel' -# Add a row for Zomba Mental Hospital with 3 C01 mental health staff -# (according to data in 2018-03-09 Facility-level establishment MOH & CHAM) -# (This is much less than the current 12 C01.) 
-fund_ZMH = pd.DataFrame(columns=fund_staffing_table.columns.copy()) -fund_ZMH.loc[0, 'District_Or_Hospital'] = 'Zomba Mental Hospital' -fund_ZMH.loc[0, 'Is_DistrictLevel'] = False -fund_ZMH.loc[0, 'C01'] = 3 -# Alternatively, if consider all potential cadres from compiled staff return -# fund_cadres_ZMH = pd.DataFrame(index = [0], columns = ['M01','M02','M03','N01','N02','C01','P02','L02'], -# data = np.array([[2,13,14,8,30,3,1,1]])) -# for col in fund_cadres_ZMH.columns: -# fund_ZMH.loc[0,col] = fund_cadres_ZMH.loc[0,col].copy() - -# Concat -fund_staffing_table = pd.concat([fund_staffing_table, fund_ZMH]) -fund_staffing_table.reset_index(drop=True, inplace=True) -fund_staffing_table.fillna(0, inplace=True) - -# File 2018-03-09 Facility-level establishment MOH & CHAM indicates that ZMH is assigned to Zomba District, -# We therefore subtract the 3 C01 staff from Zomba District. -fund_idx_ZombaDist = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Zomba'].index -fund_staffing_table.loc[fund_idx_ZombaDist, 'C01'] = \ - fund_staffing_table.loc[fund_idx_ZombaDist, 'C01'] - fund_ZMH.loc[0, 'C01'] -# Alternatively, if consider all potential cadres from compiled staff return -# fund_staffing_table.loc[fund_idx_ZombaDist, :] =\ -# fund_staffing_table.loc[fund_idx_ZombaDist, :] - fund_ZMH.loc[0,:] - -# Check that fund_staffing_table.loc[fund_idx_ZombaDist, :] >=0 -assert (fund_staffing_table.loc[fund_idx_ZombaDist, 'M01':'R04'].values >= 0).all() +# Check that in fund_staffing_table every staff count entry >= 0 +assert (fund_staffing_table.loc[:, 'M01':'R04'].values >= 0).all() # The following districts are not in the CHAI data because they are included within other districts. # For now, we will say that the division of staff between these cities and the wide district (where they are included) -# is consistent with the population recorded for them. 
+# is consistent with the population recorded for them (Malawi 2018 census), # i.e., to use population-based weights to reallocate staff -# Add in Likoma (part Nkhata Bay) # Add in Lilongwe City (part of Lilongwe) # Add in Mzuzu City (part of Mziba) ASSUMED # Add in Zomba City (part of Zomba) @@ -795,7 +750,6 @@ # create mapping: the new districts : super_district split_districts = ( - ('Likoma', 'Nkhata Bay'), ('Lilongwe City', 'Lilongwe'), ('Mzuzu City', 'Mzimba'), ('Zomba City', 'Zomba'), @@ -819,7 +773,7 @@ total_staff = fund_staffing_table.loc[ fund_staffing_table['District_Or_Hospital'] == super_district, cols].values.squeeze() - # get the weight; The original weights w0 for the 5 new districts in order are 0.05,0.60,0.24,0.14,1.77(> 1) + # get the weight; The original weights w0 for the 4 new districts in order are 0.60,0.24,0.14,1.77(> 1) w0 = pop_by_district.loc[new_district, 'Count'] / pop_by_district.loc[super_district, 'Count'] if w0 < 1: w = w0 @@ -874,15 +828,15 @@ # Before split, update the funded C01 distributions at levels 1a, 1b and 2 using CHAI Optimal Workforce estimates. \ # This is because funded C01 are all at level 1b (100%), meanwhile appt time base requires C01 at level 2. \ # CHAI Optimal Workforce locates C01 47.92% at level 1b and 52.08% at level 2, which seems more sensible. 
-idx_c01_level_1b = fund_staff_distribution[ - (fund_staff_distribution['Cadre_Code'] == 'C01') & - (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1b')].index -fund_staff_distribution.loc[idx_c01_level_1b, 'Proportion_Fund'] = 0.4792 - -idx_c01_level_2 = fund_staff_distribution[ - (fund_staff_distribution['Cadre_Code'] == 'C01') & - (fund_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index -fund_staff_distribution.loc[idx_c01_level_2, 'Proportion_Fund'] = 0.5208 +# idx_c01_level_1b = fund_staff_distribution[ +# (fund_staff_distribution['Cadre_Code'] == 'C01') & +# (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1b')].index +# fund_staff_distribution.loc[idx_c01_level_1b, 'Proportion_Fund'] = 0.4792 +# +# idx_c01_level_2 = fund_staff_distribution[ +# (fund_staff_distribution['Cadre_Code'] == 'C01') & +# (fund_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index +# fund_staff_distribution.loc[idx_c01_level_2, 'Proportion_Fund'] = 0.5208 # ********************************************************************************************************************* # Split @@ -911,12 +865,14 @@ 'Facility_Level_3', 'Facility_Level_3', 'Facility_Level_4'] +# Check that in fund_staffing_table every staff count entry >= 0 +assert (fund_staffing_table.loc[:, 'M01':'R04'].values >= 0).all() # fund_staffing_table ready! 
# Save the table without column 'Is_DistrictLevel'; staff counts in floats fund_staffing_table_to_save = fund_staffing_table.drop(columns='Is_DistrictLevel', inplace=False) -# fund_staffing_table_to_save.to_csv( -# outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Staff_Table.csv', index=False) +fund_staffing_table_to_save.to_csv( + outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Staff_Table.csv', index=False) # fund_staffing_table_to_save.to_csv( # outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Staff_Table.csv', index=False) From d6a7461b07d36f7fe06dfeedb48c2c506a2cfc54 Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 15 Feb 2022 16:38:40 +0000 Subject: [PATCH 047/131] Update to line 1035: CHAI current staff has no data for Likoma, thereby created a row for Likoma; again no reallocation of dental officer and DCSA --- .../formatting_healthsystem_data_update.py | 80 +++++++------------ 1 file changed, 28 insertions(+), 52 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index 503673d1db..6152443036 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -879,7 +879,7 @@ # *** # --- Creating curr_staffing_table and curr_staff_list for current staff # Extract the section about "Current TOTAl Staff' -hcw_curr_extract = wb_import.loc[3:37, 1:21] +hcw_curr_extract = wb_import.loc[3:39, 1:21] hcw_curr_extract = hcw_curr_extract.drop([4, 5]) hcw_curr_extract.columns = hcw_curr_extract.iloc[0] hcw_curr_extract = hcw_curr_extract.drop([3]) @@ -897,25 +897,16 @@ # Check the cadre columns of curr_staffing_table is identical to fund_staffing_table assert set(curr_staffing_table.columns[0:21]) == set(fund_staffing_table.columns[-21:]) -# For curr_staffing_table, reallocating D01 from districts to 
referral hospitals -# Treat KCH, MCH, QECH, ZCH as referral hospitals +# For curr_staffing_table, do not re-allocate Dental officer with the same reason above for established staff; +# Also, the central/referral hospitals have Dental officer allocated to meet dental service demand, +# thus no risk of not able to meet such demand at level 3. + # The operation of reallocating E01 in HQ to districts is not needed for curr_staffing_table, # as no. of E01 in curr_staffing_table at HQ is zero. -curr_extra_D01 = curr_staffing_table.loc[ - ~curr_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), curr_staffing_table.columns[ - curr_staffing_table.columns == 'D01']].sum().values[0] -curr_staffing_table.loc[ - ~curr_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), curr_staffing_table.columns[ - curr_staffing_table.columns == 'D01']] = 0 -curr_extra_D01_per_referralhosp = curr_extra_D01 / 4 -curr_staffing_table.loc[curr_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), 'D01'] = \ - curr_staffing_table.loc[curr_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), 'D01'] + \ - curr_extra_D01_per_referralhosp - # For curr_staffing_table, sort out the districts and central hospitals curr_staffing_table.loc[ - curr_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'District_Or_Hospital'] = 'Headquarter' + curr_staffing_table['District_Or_Hospital'] == 'HQ', 'District_Or_Hospital'] = 'Headquarter' curr_staffing_table.loc[ curr_staffing_table['District_Or_Hospital'] == 'KCH', 'District_Or_Hospital'] = 'Referral Hospital_Central' curr_staffing_table.loc[ @@ -924,6 +915,8 @@ curr_staffing_table['District_Or_Hospital'] == 'QECH', 'District_Or_Hospital'] = 'Referral Hospital_Southern' curr_staffing_table.loc[ curr_staffing_table['District_Or_Hospital'] == 'ZCH', 'District_Or_Hospital'] = 'Referral Hospital_Southern' +curr_staffing_table.loc[ + 
curr_staffing_table['District_Or_Hospital'] == 'ZMH', 'District_Or_Hospital'] = 'Zomba Mental Hospital' # Group the referral hospitals QECH and ZCH as Referral Hospital_Southern Is_DistrictLevel = curr_staffing_table['Is_DistrictLevel'].values # Save the column 'Is_DistrictLevel' first @@ -931,41 +924,24 @@ curr_staffing_table.groupby(by=['District_Or_Hospital'], sort=False).sum()).reset_index() curr_staffing_table.insert(1, 'Is_DistrictLevel', Is_DistrictLevel[:-1]) # Add the column 'Is_DistrictLevel' -# Add a row for Zomba Mental Hospital, which has 12 mental health staff according to compiled staff return -curr_ZMH = pd.DataFrame(columns=curr_staffing_table.columns.copy()) -curr_ZMH.loc[0, 'District_Or_Hospital'] = 'Zomba Mental Hospital' -curr_ZMH.loc[0, 'Is_DistrictLevel'] = False -curr_ZMH.loc[0, 'C01'] = 12 -# Alternatively, if consider all potential cadres from compiled staff return -# curr_cadres_ZMH = pd.DataFrame(index = [0], columns = ['M01','M02','N01','N02','C01','P02','P03'], -# data = np.array([[2,5,19,27,12,1,1]])) -# for col in curr_cadres_ZMH.columns: -# curr_ZMH.loc[0,col] = curr_cadres_ZMH.loc[0,col].copy() - -curr_staffing_table = pd.concat([curr_staffing_table, curr_ZMH]) -curr_staffing_table.reset_index(drop=True, inplace=True) -curr_staffing_table.fillna(0, inplace=True) - -# For Zomba district, there are 12 mental health staff C01; -# However, compiled staff return does not record any C01 in Zomba district; -# We therefore assume that its 12 C01 are from Zomba Mental Hospital. 
-curr_idx_ZombaDist = curr_staffing_table[curr_staffing_table['District_Or_Hospital'] == 'Zomba'].index -curr_staffing_table.loc[curr_idx_ZombaDist, 'C01'] = \ - curr_staffing_table.loc[curr_idx_ZombaDist, 'C01'] - curr_ZMH.loc[0, 'C01'] -# Alternatively, if consider all potential cadres from compiled staff return -# curr_staffing_table.loc[curr_idx_ZombaDist, :] = curr_staffing_table.loc[curr_idx_ZombaDist, :] - curr_ZMH.loc[0,:] - -# Check that curr_staffing_table.loc[curr_idx_ZombaDist, :] >=0 -assert (curr_staffing_table.loc[curr_idx_ZombaDist, 'M01':'R04'].values >= 0).all() - -# Similarly split staff to 5 special districts as done for funded staff -# split_districts = ( -# ('Likoma', 'Nkhata Bay'), -# ('Lilongwe City', 'Lilongwe'), -# ('Mzuzu City', 'Mzimba'), -# ('Zomba City', 'Zomba'), -# ('Blantyre City', 'Blantyre') -# ) +# No need to add a row for Zomba Mental Hospital, as the updated CHAI data has this row for ZMH. +# Check that in curr_staffing_table each staff count entry >=0 +assert (curr_staffing_table.loc[:, 'M01':'R04'].values >= 0).all() + +# Split staff to 5 special districts; +# for current staff, we include Likoma here because CHAI has no current staff allocated in Likoma +# (CHAI team they will allocate some staff to Likoma but not yet done) +split_districts = ( + ('Likoma', 'Nkhata Bay'), + ('Lilongwe City', 'Lilongwe'), + ('Mzuzu City', 'Mzimba'), + ('Zomba City', 'Zomba'), + ('Blantyre City', 'Blantyre') +) + +# drop the original placeholder row for Likoma +curr_staffing_table.drop([9], inplace=True) +curr_staffing_table.reset_index(inplace=True, drop=True) for i in np.arange(0, len(split_districts)): new_district = split_districts[i][0] @@ -1052,8 +1028,8 @@ # Save the table without column 'Is_DistrictLevel'; staff counts in floats curr_staffing_table_to_save = curr_staffing_table.drop(columns='Is_DistrictLevel', inplace=False) -# curr_staffing_table_to_save.to_csv( -# outputlocation / 'human_resources' / 'actual' / 
'ResourceFile_Staff_Table.csv', index=False) +curr_staffing_table_to_save.to_csv( + outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Staff_Table.csv', index=False) # --------------------------------------------------------------------------------------------------------------------- # *** Create the Master Facilities List From 5dc1cca0c5cab02b045bad2240c8bbb4a843bf27 Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 15 Feb 2022 18:04:14 +0000 Subject: [PATCH 048/131] Update to line 1376: mfl is the same; not consider DHO service time anymore; service time at level 1a does not consider Disp as Disp has no service time requirement; Growth Monitoring appt is excluded in the Nutrition category --- .../formatting_healthsystem_data_update.py | 53 +++++++++---------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index 6152443036..dcdce45d8b 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -1053,15 +1053,14 @@ # 'District Hospital', 'DHO', 'Referral Hospital', 'Zomba Mental Hospital'] # Facility_Types_Levels = dict(zip(Facility_Types, Facility_Levels)) - # Create empty dataframe that will be the Master Facilities List (mfl) mfl = pd.DataFrame(columns=['Facility_Level', 'District', 'Region']) pop_districts = pop['District'].values # array; the 'pop_districts' used in previous lines is a DataFrame pop_regions = pd.unique(pop['Region']) -# Each district is assigned with a set of community level facs, a set of primary level facs, -# and a set of second level facs. +# Each district is assigned with a set of community level facs (0), a set of primary level facs (1a, 1b), +# and a set of second level facs (2). # Therefore, the total sets of facs is 4 * no. 
of districts + 3 (RefHos per Region) + 1 (HQ) + 1 (ZMH) \ # = 4 * 32 + 5 = 133 for d in pop_districts: @@ -1132,8 +1131,8 @@ # index=False) # --------------------------------------------------------------------------------------------------------------------- -# *** Now look at the types of appointments -sheet = pd.read_excel(workingfile, sheet_name='Time_Base', header=None) +# *** Now look at the types of appointments from the sheet 'Time_Curr' +sheet = pd.read_excel(workingfile, sheet_name='Time_Curr', header=None) # get rid of the junk rows trimmed = sheet.loc[[7, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 24, 26, 27]] @@ -1142,10 +1141,14 @@ data_import = data_import.dropna(axis='columns', how='all') # get rid of the 'spacer' columns data_import = data_import.fillna(0) -# get rid of records for which there is no call on time of any type of officer +# get rid of records for which there is no call on time of any type of officer at any fac type data_import = data_import.drop(columns=data_import.columns[data_import.sum() == 0]) -# We note that the DCSA (CHW) never has a time requirement and that no appointments can be serviced at the HealthPost. +# Note that in the updated 'Time_Curr', Disp has no time requirements at all for medical assistant M03, +# which is different from the previous version +assert data_import.loc['Disp', :].sum() == 0 + +# Note that the DCSA (CHW) never has a time requirement and that no appointments can be serviced at the HealthPost. # We remedy this by inserting a new type of appointment, which only the DCSA can service, \ # and the time taken is 10 minutes. 
new_appt_for_CHW = pd.Series(index=data_import.index, @@ -1169,21 +1172,16 @@ ]) data_import = pd.concat([data_import, new_appt_for_CHW], axis=1) +assert data_import.loc['HP', :].sum() == 10.0 -# Add service times for DHOs, which has quite a few data in 'Incidence_Curr', by copying the data of DisHos -new_rows_for_DHO = pd.DataFrame(index=['DHO', 'DHO_Per'], columns=data_import.columns.copy(), - data=data_import.loc[['DisHos', 'DisHos_Per'], :].copy().values) +# We now do not add service time for DHO as we think DHO does not deliver services directly +# Also, DHO itself in both DHIS2 and CHAI updated data does not have service record -# Add service times (Mental OPD and Mental Clinic Visit) for Zomba Mental Hospital, by copying data of CenHos +# Add service times for Zomba Mental Hospital, by copying data of CenHos new_rows_for_ZMH = pd.DataFrame(index=['ZMH', 'ZMH_Per'], columns=data_import.columns.copy(), - data=0) -new_rows_for_ZMH.loc[:, ['C01_MentOPD', 'C01_MentClinic']] = data_import.loc[ - ['CenHos', 'CenHos_Per'], ['C01_MentOPD', 'C01_MentClinic']].copy().values -# If consider all potential cadres from compiled staff return and all associated services -# new_rows_for_ZMH = pd.DataFrame(index=['ZMH','ZMH_Per'],columns=data_import.columns.copy(), -# data=data_import.loc[['CenHos','CenHos_Per'],:].copy().values) + data=data_import.loc[['CenHos', 'CenHos_Per'], :].copy().values) -data_import = pd.concat([data_import, new_rows_for_DHO, new_rows_for_ZMH]) +data_import = pd.concat([data_import, new_rows_for_ZMH]) # data_import ready! 
@@ -1272,7 +1270,6 @@ # level 2 District_Hospital_ExpecTime = data_import.loc['DisHos'] * data_import.loc['DisHos_Per'] -DHO_ExpecTime = data_import.loc['DHO'] * data_import.loc['DHO_Per'] # level 1b Community_Hospital_ExpecTime = data_import.loc['ComHos'] * data_import.loc['ComHos_Per'] @@ -1280,23 +1277,21 @@ # level 1a Urban_HealthCentre_ExpecTime = data_import.loc['UrbHC'] * data_import.loc['UrbHC_Per'] Rural_HealthCentre_ExpecTime = data_import.loc['RurHC'] * data_import.loc['RurHC_Per'] -Disp_ExpecTime = data_import.loc['Disp'] * data_import.loc['Disp_Per'] # level 0 HealthPost_ExpecTime = data_import.loc['HP'] * data_import.loc['HP_Per'] -# Average time for levels 2 and 1a, which have data for more than 1 facility types -Avg_Level2_ExpectTime = (District_Hospital_ExpecTime + DHO_ExpecTime) / 2 # Identical to DisHos Expected Time -Avg_Level1a_ExpectTime = (Disp_ExpecTime + Urban_HealthCentre_ExpecTime + Rural_HealthCentre_ExpecTime) / 3 +# Average time for levels 1a, which have data for more than 1 facility types +Avg_Level1a_ExpectTime = (Urban_HealthCentre_ExpecTime + Rural_HealthCentre_ExpecTime) / 2 # Assemble X = pd.DataFrame({ 5: HQ_ExpecTime, # (Headquarter) 4: ZMH_ExpectTime, # (Zomba Mental Hospital) 3: Central_Hospital_ExpecTime, # (our "Referral Hospital" at region level) - 2: Avg_Level2_ExpectTime, # (DHO and DisHos at second level ) + 2: District_Hospital_ExpecTime, # (DisHos at second level ) '1b': Community_Hospital_ExpecTime, # (ComHos at primary level) - '1a': Avg_Level1a_ExpectTime, # (UrbHC,RurHC and Disp at primary level) + '1a': Avg_Level1a_ExpectTime, # (UrbHC,RurHC at primary level) 0: HealthPost_ExpecTime # (HP at community level) }) @@ -1330,8 +1325,12 @@ ).reset_index() # Save -appt_time_table_coarse.to_csv(outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table.csv', - index=False) +ApptTimeTable.to_csv( + outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table.csv', + 
index=False) +appt_time_table_coarse.to_csv( + outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table_Coarse.csv', + index=False) # --------------------------------------------------------------------------------------------------------------------- # *** Create a table that determines what kind of appointment can be serviced in each Facility Level From f5fdd0dce2921d948ffa84d7af5fa7f896b9165a Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 16 Feb 2022 11:42:06 +0000 Subject: [PATCH 049/131] Update to line 1511: PFT table is different -- facility types include DisHos, ComHos, HC; DCSA, dental, mental, nutrition, radiography cadres have no data; working hours and minutes have changed --- .../formatting_healthsystem_data_update.py | 57 ++++++++++--------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index dcdce45d8b..0289ea3887 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -1456,51 +1456,56 @@ # First, read-in the number of working hours and days for each type of officer pft_sheet = pd.read_excel(workingfile, sheet_name='PFT', header=None) -officer_types_import = pft_sheet.iloc[2, np.arange(2, 23)] +officer_types_import = pft_sheet.iloc[3, np.arange(2, 23)] assert set(officer_types_import) == set(officer_types_table['Officer_Type_Code']) assert len(officer_types_import) == len(officer_types_table['Officer_Type_Code']) -# patient facing hours daily at hospitals -hours_hospital = pft_sheet.iloc[38, np.arange(2, 23)] +# fill nan with 0 -# patient facing hours daily at health centres -work_mins_hc = pft_sheet.iloc[26, np.arange(2, 23)] -admin_mins_hc = pft_sheet.iloc[34, np.arange(2, 23)] -hours_hc = (work_mins_hc - admin_mins_hc) / 60 # Total working days per 
year -days_per_year_men = pft_sheet.iloc[15, np.arange(2, 23)] -days_per_year_women = pft_sheet.iloc[16, np.arange(2, 23)] -days_per_year_pregwomen = pft_sheet.iloc[17, np.arange(2, 23)] +days_per_year_men = pft_sheet.iloc[16, np.arange(2, 23)] +days_per_year_women = pft_sheet.iloc[17, np.arange(2, 23)] +days_per_year_pregwomen = pft_sheet.iloc[18, np.arange(2, 23)] # Percents of men, nonpregnant women, and pregnant women -fr_men = pft_sheet.iloc[53, np.arange(2, 23)] -fr_pregwomen = pft_sheet.iloc[55, np.arange(2, 23)] * pft_sheet.iloc[57, np.arange(2, 23)] -fr_nonpregwomen = pft_sheet.iloc[55, np.arange(2, 23)] * (1 - pft_sheet.iloc[57, np.arange(2, 23)]) +fr_men = pft_sheet.iloc[66, np.arange(2, 23)] +fr_pregwomen = pft_sheet.iloc[71, np.arange(2, 23)] +fr_nonpregwomen = pft_sheet.iloc[68, np.arange(2, 23)] - pft_sheet.iloc[71, np.arange(2, 23)] # Total average working days workingdays = (fr_men * days_per_year_men) + (fr_nonpregwomen * days_per_year_women) + ( fr_pregwomen * days_per_year_pregwomen) -# --- patient facing time -# Average mins per year, Average hours per day, Average number of mins per day in Malawi +# patient facing (i.e. 
non-admin working) minutes and hours daily at +# district hospitals, community hospitals, health centres +mins_daily_dishos = pft_sheet.iloc[37, np.arange(2, 23)] +hrs_daily_dishos = mins_daily_dishos / 60 -mins_per_day_hospital = hours_hospital * 60 -mins_per_day_hc = hours_hc * 60 +mins_daily_comhos = pft_sheet.iloc[42, np.arange(2, 23)] +hrs_daily_comhos = mins_daily_comhos / 60 -mins_per_year_hospital = mins_per_day_hospital * workingdays -mins_per_year_hc = mins_per_day_hc * workingdays +mins_daily_hc = pft_sheet.iloc[46, np.arange(2, 23)] +hrs_daily_hc = mins_daily_hc / 60 -av_mins_per_day_hospital = mins_per_year_hospital / 365.25 -av_mins_per_day_hc = mins_per_year_hc / 365.25 +# Total mins per year, Average number of mins per day at +# district hospitals, community hospitals, health centres +mins_yearly_dishos = mins_daily_dishos * workingdays +mins_yearly_comhos = mins_daily_comhos * workingdays +mins_yearly_hc = mins_daily_hc * workingdays -# PFT - hospital and health centre individually +av_mins_daily_dishos = mins_yearly_dishos / 365.25 +av_mins_daily_comhos = mins_yearly_comhos / 365.25 +av_mins_daily_hc = mins_yearly_hc / 365.25 + +# PFT - dishos, comhos, hc individual columns +# note that the average is calculated on 365.25 days (not the working days) per year HosHC_patient_facing_time = pd.DataFrame( - {'Officer_Type_Code': officer_types_import, 'Working_Days_Per_Year': workingdays, - 'Hospital_Hours_Per_Day': hours_hospital, 'HC_Hours_Per_Day': hours_hc, - 'Hospital_Av_Mins_Per_Day': av_mins_per_day_hospital, - 'HC_Av_Mins_Per_Day': av_mins_per_day_hc} + {'Officer_Type_Code': officer_types_import, + 'DisHos_Av_Mins_Per_Day': av_mins_daily_dishos, + 'ComHos_Av_Mins_Per_Day': av_mins_daily_comhos, + 'HC_Av_Mins_Per_Day': av_mins_daily_hc} ).reset_index(drop=True) # PFT table ready! 
From 98657605b9e82c5e3e4b3b8c189682afcc22558b Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 16 Feb 2022 13:06:49 +0000 Subject: [PATCH 050/131] Update to line 1696: because of the change of PFT and staff counts, the daily capability tables chang accordingly --- .../formatting_healthsystem_data_update.py | 59 +++++++++++++------ 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index 0289ea3887..f33386a312 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -1508,6 +1508,18 @@ 'HC_Av_Mins_Per_Day': av_mins_daily_hc} ).reset_index(drop=True) +# The new PFT has no minutes for M01 at health centres, +# but in Time_Curr, IPAdmissions/RMNCH/... appointments at Urban HCs all need time from M01. +# We therefore assume the minutes for M01 at HCs are the average of those at DisHos and CenHos, +# to solve inconsistency between PFT and Time_Curr +# HosHC_patient_facing_time.loc[0, 'HC_Av_Mins_Per_Day'] = ( +# HosHC_patient_facing_time.loc[0, 'DisHos_Av_Mins_Per_Day'] + +# HosHC_patient_facing_time.loc[0, 'ComHos_Av_Mins_Per_Day'] +# ) / 2 + +# How to do with cadres that do not have minutes at all in PFT, +# whereas they have time requirements in Time_Curr? + # PFT table ready! 
# Create final tables of daily time available at each facility by officer type: Facility_ID, Facility_Type, @@ -1525,12 +1537,17 @@ t = (funded_staff_floats.loc[i, officer] * HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, 'HC_Av_Mins_Per_Day']) - funded_daily_minutes.loc[i, officer] = t.values - else: # Levels 1b, 2, and above; Hospital minutes + funded_daily_minutes.loc[i, officer] = t.values[0] + elif the_level == 'Facility_Level_1b': # Level 1b; ComHos minutes + t = (funded_staff_floats.loc[i, officer] * + HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, + 'ComHos_Av_Mins_Per_Day']) + funded_daily_minutes.loc[i, officer] = t.values[0] + else: # Levels 2 and above; DisHos and CenHos minutes t = (funded_staff_floats.loc[i, officer] * HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, - 'Hospital_Av_Mins_Per_Day']) - funded_daily_minutes.loc[i, officer] = t.values + 'DisHos_Av_Mins_Per_Day']) + funded_daily_minutes.loc[i, officer] = t.values[0] # Long format funded_staff_floats = pd.melt(funded_staff_floats, id_vars=['District_Or_Hospital', 'Facility_Level'], @@ -1543,7 +1560,7 @@ # Reset facility level column to exclude 'Facility_Level_' funded_daily_capability['Facility_Level'] = \ funded_daily_capability['Facility_Level'].str.split(pat='_', expand=True).iloc[:, 2] -# Drop row with zero minutes (also zero staff counts) +# Drop row with zero minutes (due to either zero staff counts or zero daily minutes) funded_daily_capability.drop( index=funded_daily_capability[funded_daily_capability['Total_Mins_Per_Day'] == 0].index, inplace=True) # Reset index @@ -1595,12 +1612,17 @@ t = (curr_staff_floats.loc[i, officer] * HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, 'HC_Av_Mins_Per_Day']) - curr_daily_minutes.loc[i, officer] = t.values - else: # Levels 1b, 2, and above; Hospital minutes + 
curr_daily_minutes.loc[i, officer] = t.values[0] + elif the_level == 'Facility_Level_1b': # Level 1b; ComHos minutes t = (curr_staff_floats.loc[i, officer] * HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, - 'Hospital_Av_Mins_Per_Day']) - curr_daily_minutes.loc[i, officer] = t.values + 'ComHos_Av_Mins_Per_Day']) + curr_daily_minutes.loc[i, officer] = t.values[0] + else: # Levels 2 and above; DisHos and CenHos minutes + t = (curr_staff_floats.loc[i, officer] * + HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, + 'DisHos_Av_Mins_Per_Day']) + curr_daily_minutes.loc[i, officer] = t.values[0] # Long format curr_staff_floats = pd.melt(curr_staff_floats, id_vars=['District_Or_Hospital', 'Facility_Level'], @@ -1654,20 +1676,21 @@ curr_daily_capability_coarse.reset_index(drop=True, inplace=True) # Save -# HosHC_patient_facing_time.to_csv( -# outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Patient_Facing_Time.csv', index=False) +HosHC_patient_facing_time.to_csv( + outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Patient_Facing_Time.csv', index=False) -# Need to # two lines below when generate funded_plus capability -# funded_daily_capability_coarse.to_csv( -# outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Daily_Capabilities.csv', index=False) +curr_daily_capability_coarse.to_csv( + outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Daily_Capabilities.csv', index=False) -# *** Only for funded_plus ******************************************************************************************** +# Need to # following lines below when generate funded_plus capability funded_daily_capability_coarse.to_csv( - outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Daily_Capabilities.csv', index=False) + outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Daily_Capabilities.csv', index=False) + +# *** Only for funded_plus 
******************************************************************************************** +# funded_daily_capability_coarse.to_csv( +# outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Daily_Capabilities.csv', index=False) # ********************************************************************************************************************* -curr_daily_capability_coarse.to_csv( - outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Daily_Capabilities.csv', index=False) # --------------------------------------------------------------------------------------------------------------------- From 51eda1a1845ac128739fd8c5c0e3fe1b8abd5517 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 16 Feb 2022 14:06:32 +0000 Subject: [PATCH 051/131] Update to the end: because of missing PFT data for some cadres, the funded capability cannot meet the service demand for those cadres (if assign the previous pft time to them, this inconsistency should be solved) --- .../formatting_healthsystem_data_update.py | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index f33386a312..b3433561e3 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -1560,7 +1560,8 @@ # Reset facility level column to exclude 'Facility_Level_' funded_daily_capability['Facility_Level'] = \ funded_daily_capability['Facility_Level'].str.split(pat='_', expand=True).iloc[:, 2] -# Drop row with zero minutes (due to either zero staff counts or zero daily minutes) +# Drop row with zero or nan minutes (due to either zero staff counts or nan daily minutes) +funded_daily_capability.fillna(0, inplace=True) funded_daily_capability.drop( 
index=funded_daily_capability[funded_daily_capability['Total_Mins_Per_Day'] == 0].index, inplace=True) # Reset index @@ -1596,9 +1597,6 @@ ['Facility_ID', 'Facility_Name', 'Facility_Level', 'District', 'Region', 'Officer_Category'], dropna=False).sum() ).reset_index() -# Drop columns of officer types -funded_daily_capability_coarse.drop(columns=['Officer_Type_Code', 'Officer_Type'], inplace=True) -funded_daily_capability_coarse.reset_index(drop=True, inplace=True) # --- Daily capability for current staff; staff counts in floats # For float staff counts, calculate total minutes per day @@ -1636,6 +1634,7 @@ curr_daily_capability['Facility_Level'] = \ curr_daily_capability['Facility_Level'].str.split(pat='_', expand=True).iloc[:, 2] # Drop row with zero minutes (also zero staff counts) +curr_daily_capability.fillna(0, inplace=True) curr_daily_capability.drop( index=curr_daily_capability[curr_daily_capability['Total_Mins_Per_Day'] == 0].index, inplace=True) # Reset index @@ -1671,9 +1670,6 @@ ['Facility_ID', 'Facility_Name', 'Facility_Level', 'District', 'Region', 'Officer_Category'], dropna=False).sum() ).reset_index() -# Drop columns of officer types -curr_daily_capability_coarse.drop(columns=['Officer_Type_Code', 'Officer_Type'], inplace=True) -curr_daily_capability_coarse.reset_index(drop=True, inplace=True) # Save HosHC_patient_facing_time.to_csv( @@ -1761,13 +1757,13 @@ def all_appts_can_run(capability): # Save results for funded -# appt_have_or_miss_capability_funded = all_appts_can_run(funded_daily_capability_coarse) -# appt_have_or_miss_capability_funded.to_csv( -# outputlocation / 'human_resources' / 'funded' / 'appt_have_or_miss_capability.csv', index=False) +appt_have_or_miss_capability_funded = all_appts_can_run(funded_daily_capability_coarse) +appt_have_or_miss_capability_funded.to_csv( + outputlocation / 'human_resources' / 'funded' / 'appt_have_or_miss_capability.csv', index=False) # appt_have_or_miss_capability_funded.to_csv( # outputlocation / 
'human_resources' / 'funded_plus' / 'appt_have_or_miss_capability.csv', index=False) # Save results for actual -# appt_have_or_miss_capability_actual = all_appts_can_run(curr_daily_capability_coarse) -# appt_have_or_miss_capability_actual.to_csv( -# outputlocation / 'human_resources' / 'actual' / 'appt_have_or_miss_capability.csv', index=False) +appt_have_or_miss_capability_actual = all_appts_can_run(curr_daily_capability_coarse) +appt_have_or_miss_capability_actual.to_csv( + outputlocation / 'human_resources' / 'actual' / 'appt_have_or_miss_capability.csv', index=False) From 0815e12c82f85c5fb5894bab4a9b9f4a973a4b02 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 16 Feb 2022 14:21:05 +0000 Subject: [PATCH 052/131] Minor modification --- .../formatting_healthsystem_data_update.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index b3433561e3..98bb9feeeb 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -932,11 +932,11 @@ # for current staff, we include Likoma here because CHAI has no current staff allocated in Likoma # (CHAI team they will allocate some staff to Likoma but not yet done) split_districts = ( - ('Likoma', 'Nkhata Bay'), - ('Lilongwe City', 'Lilongwe'), - ('Mzuzu City', 'Mzimba'), - ('Zomba City', 'Zomba'), - ('Blantyre City', 'Blantyre') + ('Likoma', 'Nkhata Bay'), + ('Lilongwe City', 'Lilongwe'), + ('Mzuzu City', 'Mzimba'), + ('Zomba City', 'Zomba'), + ('Blantyre City', 'Blantyre') ) # drop the original placeholder row for Likoma @@ -1682,13 +1682,12 @@ funded_daily_capability_coarse.to_csv( outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Daily_Capabilities.csv', index=False) + # *** Only for funded_plus 
******************************************************************************************** # funded_daily_capability_coarse.to_csv( # outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Daily_Capabilities.csv', index=False) # ********************************************************************************************************************* - - # --------------------------------------------------------------------------------------------------------------------- # final check that for an appointment required at a particular level (in Appt_Time_Table), \ # then indeed, the staff capabilities are available to satisfy that, for a person in any district \ From 80d2e0daad799fb03c8cee9b8bb6c4a2569bb08a Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 16 Feb 2022 17:20:06 +0000 Subject: [PATCH 053/131] Correct typo and propose a fix to the issue of PFT --- .../data_file_processing/formatting_healthsystem_data.py | 2 +- .../formatting_healthsystem_data_update.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data.py b/src/scripts/data_file_processing/formatting_healthsystem_data.py index 228605fa97..88b606abc4 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data.py @@ -1,5 +1,5 @@ """ -This file set ups the health system resources for each district, each region, and also national level. +This file sets up the health system resources for each district, each region, and also national level. It defines 7 levels for facility types, i.e., Facility_Levels = [0,1a,1b,2,3,4,5]. 
diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index 98bb9feeeb..6ff91d72f4 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -1461,9 +1461,6 @@ assert set(officer_types_import) == set(officer_types_table['Officer_Type_Code']) assert len(officer_types_import) == len(officer_types_table['Officer_Type_Code']) -# fill nan with 0 - - # Total working days per year days_per_year_men = pft_sheet.iloc[16, np.arange(2, 23)] days_per_year_women = pft_sheet.iloc[17, np.arange(2, 23)] @@ -1517,8 +1514,12 @@ # HosHC_patient_facing_time.loc[0, 'ComHos_Av_Mins_Per_Day'] # ) / 2 -# How to do with cadres that do not have minutes at all in PFT, +# How to deal with cadres (DCSA, Dental, Mental, Radiography) that do not have minutes at all in PFT, # whereas they have time requirements in Time_Curr? +# (Compared to old PFT sheet, +# the new PFT has updated all info on available working days/non-admin daily minutes/portion of male/female/pregfemale) +# A quick fix is to use the average daily minutes of those cadres from old PFT table; +# The info required to calculate these minutes will be from the old PFT table. # PFT table ready! 
From 025c43e6096842571993585564d20a99c70903f7 Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 17 Feb 2022 11:30:56 +0000 Subject: [PATCH 054/131] Try fixing the missing entries in PFT table --- .../formatting_healthsystem_data_update.py | 62 +++++++++++++++++-- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index 6ff91d72f4..1e5cec9f89 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -1509,17 +1509,69 @@ # but in Time_Curr, IPAdmissions/RMNCH/... appointments at Urban HCs all need time from M01. # We therefore assume the minutes for M01 at HCs are the average of those at DisHos and CenHos, # to solve inconsistency between PFT and Time_Curr -# HosHC_patient_facing_time.loc[0, 'HC_Av_Mins_Per_Day'] = ( -# HosHC_patient_facing_time.loc[0, 'DisHos_Av_Mins_Per_Day'] + -# HosHC_patient_facing_time.loc[0, 'ComHos_Av_Mins_Per_Day'] -# ) / 2 - +HosHC_patient_facing_time.loc[0, 'HC_Av_Mins_Per_Day'] = ( + HosHC_patient_facing_time.loc[0, 'DisHos_Av_Mins_Per_Day'] + + HosHC_patient_facing_time.loc[0, 'ComHos_Av_Mins_Per_Day'] + ) / 2 # How to deal with cadres (DCSA, Dental, Mental, Radiography) that do not have minutes at all in PFT, # whereas they have time requirements in Time_Curr? # (Compared to old PFT sheet, # the new PFT has updated all info on available working days/non-admin daily minutes/portion of male/female/pregfemale) # A quick fix is to use the average daily minutes of those cadres from old PFT table; # The info required to calculate these minutes will be from the old PFT table. 
+working_file_old = (path_to_dropbox / + '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' / + 'Optimization model import_Malawi_20180315 v10.xlsx') + +pft_old = pd.read_excel(working_file_old, sheet_name='PFT', header=None) + +officer_types_old = pft_old.iloc[2, np.arange(2, 23)] +assert set(officer_types_old) == set(officer_types_table['Officer_Type_Code']) +assert len(officer_types_old) == len(officer_types_table['Officer_Type_Code']) + +# Total working days per year +days_men_old = pft_old.iloc[15, np.arange(2, 23)] +days_women_old = pft_old.iloc[16, np.arange(2, 23)] +days_pregwomen_old = pft_old.iloc[17, np.arange(2, 23)] + +# Percents of men, nonpregnant women, and pregnant women +fr_men_old = pft_old.iloc[53, np.arange(2, 23)] +fr_pregwomen_old = pft_old.iloc[55, np.arange(2, 23)] * pft_old.iloc[57, np.arange(2, 23)] +fr_nonpregwomen_old = pft_old.iloc[55, np.arange(2, 23)] * (1 - pft_old.iloc[57, np.arange(2, 23)]) + +# Total average working days +working_days_old = (fr_men_old * days_men_old) + (fr_nonpregwomen_old * days_women_old) + ( + fr_pregwomen_old * days_pregwomen_old) + +# patient facing (i.e. 
non-admin working) minutes and hours daily at +# hospitals and health centres +mins_daily_hos_old = pft_old.iloc[36, np.arange(2, 23)] +hrs_daily_hos_old = mins_daily_hos_old / 60 + +mins_daily_hc_old = pft_old.iloc[26, np.arange(2, 23)] - pft_old.iloc[34, np.arange(2, 23)] +hrs_daily_hc_old = mins_daily_hc_old / 60 + +# Total mins per year, Average number of mins per day at +# hospitals and health centres +mins_yearly_hos_old = mins_daily_hos_old * working_days_old +av_mins_daily_hos_old = mins_yearly_hos_old / 365.25 + +mins_yearly_hc_old = mins_daily_hc_old * working_days_old +av_mins_daily_hc_old = mins_yearly_hc_old / 365.25 + +# PFT - DisHos, ComHos, HC individually +# DisHos and ComHos both use hos data +HosHC_patient_facing_time_old = pd.DataFrame( + {'Officer_Type_Code': officer_types_old, + 'DisHos_Av_Mins_Per_Day': av_mins_daily_hos_old, + 'ComHos_Av_Mins_Per_Day': av_mins_daily_hos_old, + 'HC_Av_Mins_Per_Day': av_mins_daily_hc_old} +).reset_index(drop=True) + +# now add the old data of those missing cadres to the updated PFT table +assert (HosHC_patient_facing_time_old['Officer_Type_Code'] == HosHC_patient_facing_time['Officer_Type_Code']).all() +assert (HosHC_patient_facing_time_old.columns == HosHC_patient_facing_time.columns).all() +HosHC_patient_facing_time.iloc[11:, :] = HosHC_patient_facing_time_old.iloc[11:, :].copy() # PFT table ready! 
From fd177f81888c3ddb246d5d085a336937037df761 Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 17 Feb 2022 12:26:22 +0000 Subject: [PATCH 055/131] Try fixing the issue of zero DCSA staff in Likoma --- .../formatting_healthsystem_data_update.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index 1e5cec9f89..e6b918c121 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -708,6 +708,19 @@ # fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'C01'] = 0 # ********************************************************************************************************************* +# *** Only for funded_plus ******************************************************************************************** +# In the funded staff table, it does not make sense that Likoma has no DCSA staff, +# whereas all other district has at least 250 DCSA staff +# As CHAI indicates Likoma's data is mostly bounded into Nhkata Bay, +# we draw some DCSA from Nhkata Bay to Likoma using population as the weight +idx_likoma = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Likoma'].index +idx_nkhatabay = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Nkhata Bay'].index +fund_staffing_table.loc[idx_likoma, 'E01'] = fund_staffing_table.loc[idx_nkhatabay, 'E01'].values[0] * ( + pop_by_district.loc['Likoma', 'Count'] / pop_by_district.loc['Nkhata Bay', 'Count']) +fund_staffing_table.loc[idx_nkhatabay, 'E01'] = ( + fund_staffing_table.loc[idx_nkhatabay, 'E01'].values[0] - fund_staffing_table.loc[idx_likoma, 'E01'].values[0]) +# ********************************************************************************************************************* + # Sort out 
which are district allocations and which are central hospitals and above # We assign HQ to HQ; KCH as RefHos in Central region; MCH as RefHos in Northern region; From d79c7252ea16947c192fca34714f28a780d7414f Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 17 Feb 2022 12:52:26 +0000 Subject: [PATCH 056/131] Correcting ZMH service types and time requirements --- .../formatting_healthsystem_data_update.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index e6b918c121..d8736ffa4e 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -1190,9 +1190,12 @@ # We now do not add service time for DHO as we think DHO does not deliver services directly # Also, DHO itself in both DHIS2 and CHAI updated data does not have service record -# Add service times for Zomba Mental Hospital, by copying data of CenHos +# Add service times for Zomba Mental Hospital, by copying mental health appointment data of CenHos +# (Assuming ZMH only provide mental health services) new_rows_for_ZMH = pd.DataFrame(index=['ZMH', 'ZMH_Per'], columns=data_import.columns.copy(), - data=data_import.loc[['CenHos', 'CenHos_Per'], :].copy().values) + data=0) +new_rows_for_ZMH.loc[:, ['C01_MentOPD', 'C01_MentClinic']] = data_import.loc[ + ['CenHos', 'CenHos_Per'], ['C01_MentOPD', 'C01_MentClinic']].copy().values data_import = pd.concat([data_import, new_rows_for_ZMH]) From b349b2d442825aa846a341e23d29797b3cd3606f Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 17 Feb 2022 13:09:11 +0000 Subject: [PATCH 057/131] Trying to fix inconsistency of mental health service demand and staff capability supply in establishment scenario; still have issues that several districts have 0 mental health staff --- 
.../formatting_healthsystem_data_update.py | 30 ++++++++----------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index d8736ffa4e..f24c8a7de2 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -697,15 +697,9 @@ # currently we keep the source data as it is the establishment and CHAI team does not recommend such re-allocation. # *** Only for funded_plus ******************************************************************************************** -# Since districts Balaka,Machinga,Mwanza,Neno,Ntchisi,Salima and central hospitals have 0 C01, while C01 is \ -# required by Mental appts at level 1b, level 2 and level 3, we move some C01 from 'HQ or missing' to them. \ -# To achieve this, we evenly distribute 30 C01 at HQ to all districts and central hospitals (27 DisHos, 4 CenHos) -# C01_at_HQ = fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'C01'].values -# extra_C01_per_district_CenHos = C01_at_HQ / 31 -# fund_staffing_table.loc[~fund_staffing_table['District_Or_Hospital'].isin(['HQ or missing']), 'C01'] = ( -# fund_staffing_table.loc[~fund_staffing_table['District_Or_Hospital'].isin(['HQ or missing']), 'C01'] + -# extra_C01_per_district_CenHos) -# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'C01'] = 0 +# Districts Balaka, Machinga, Mwanza, Neno, Nkhata Bay, Ntchisi, Salima have 0 mental health staff C01 in establishment, +# while C01 is required by mental health appts at level 1b, level 2 and level 3. +# To fix this inconsistency, we have to assign at least 1 C01 to each of these districts, but from where? 
# ********************************************************************************************************************* # *** Only for funded_plus ******************************************************************************************** @@ -841,15 +835,15 @@ # Before split, update the funded C01 distributions at levels 1a, 1b and 2 using CHAI Optimal Workforce estimates. \ # This is because funded C01 are all at level 1b (100%), meanwhile appt time base requires C01 at level 2. \ # CHAI Optimal Workforce locates C01 47.92% at level 1b and 52.08% at level 2, which seems more sensible. -# idx_c01_level_1b = fund_staff_distribution[ -# (fund_staff_distribution['Cadre_Code'] == 'C01') & -# (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1b')].index -# fund_staff_distribution.loc[idx_c01_level_1b, 'Proportion_Fund'] = 0.4792 -# -# idx_c01_level_2 = fund_staff_distribution[ -# (fund_staff_distribution['Cadre_Code'] == 'C01') & -# (fund_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index -# fund_staff_distribution.loc[idx_c01_level_2, 'Proportion_Fund'] = 0.5208 +idx_c01_level_1b = fund_staff_distribution[ + (fund_staff_distribution['Cadre_Code'] == 'C01') & + (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1b')].index +fund_staff_distribution.loc[idx_c01_level_1b, 'Proportion_Fund'] = 0.4792 + +idx_c01_level_2 = fund_staff_distribution[ + (fund_staff_distribution['Cadre_Code'] == 'C01') & + (fund_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index +fund_staff_distribution.loc[idx_c01_level_2, 'Proportion_Fund'] = 0.5208 # ********************************************************************************************************************* # Split From a634a0cd33e32d585b0f569fa423a9d66b1e2971 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 18 Feb 2022 12:10:53 +0000 Subject: [PATCH 058/131] Move mental health staff to districts with no such staff from their referral hospitals --- 
.../formatting_healthsystem_data_update.py | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index f24c8a7de2..58a2b64902 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -694,12 +694,30 @@ # The imported staffing table suggest that there is some Dental officer (D01) in most districts, # but the TimeBase data (below) suggest that D01 is only needed at central hospitals. # This potential inconsistency can be solved by re-allocating D01 from districts to central hospitals, but -# currently we keep the source data as it is the establishment and CHAI team does not recommend such re-allocation. +# currently we keep the source data as it is the establishment and CHAI team does not recommend such re-allocation; +# Also, the central/referral hospitals have Dental officer allocated to meet dental service demand, +# thus no risk of not able to meet such demand at level 3. # *** Only for funded_plus ******************************************************************************************** -# Districts Balaka, Machinga, Mwanza, Neno, Nkhata Bay, Ntchisi, Salima have 0 mental health staff C01 in establishment, -# while C01 is required by mental health appts at level 1b, level 2 and level 3. -# To fix this inconsistency, we have to assign at least 1 C01 to each of these districts, but from where? +# Districts Balaka/Machinga/Mwanza/Neno (4 in South), Nkhata Bay (1 in North), Ntchisi/ Salima (2 in Central) +# have 0 mental health staff C01 in establishment, +# whereas C01 is required by mental health appts at level 1b, level 2 and level 3. +# To fix this inconsistency, we have to move at least 1 C01 to each of these districts from the referral hospitals. 
+# (QECH and ZCH in South, MCH in North, KCH in Central; ZCH has no C01) +non_c01_district_idx = fund_staffing_table[(fund_staffing_table['C01'] == 0) & + (fund_staffing_table['Is_DistrictLevel'])].index +non_c01_districts = pd.DataFrame(fund_staffing_table.loc[non_c01_district_idx, 'District_Or_Hospital']) +non_c01_districts['Region'] = pop_by_district.loc[non_c01_districts['District_Or_Hospital'], 'Region'].values +fund_staffing_table.loc[non_c01_district_idx, 'C01'] = 1 +fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'QECH', 'C01'] = ( + fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'QECH', 'C01'] - 4 +) +fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'MCH', 'C01'] = ( + fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'MCH', 'C01'] - 1 +) +fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'KCH', 'C01'] = ( + fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'KCH', 'C01'] - 2 +) # ********************************************************************************************************************* # *** Only for funded_plus ******************************************************************************************** @@ -904,9 +922,7 @@ # Check the cadre columns of curr_staffing_table is identical to fund_staffing_table assert set(curr_staffing_table.columns[0:21]) == set(fund_staffing_table.columns[-21:]) -# For curr_staffing_table, do not re-allocate Dental officer with the same reason above for established staff; -# Also, the central/referral hospitals have Dental officer allocated to meet dental service demand, -# thus no risk of not able to meet such demand at level 3. +# For curr_staffing_table, do not re-allocate Dental officer with the same reason above for established staff # The operation of reallocating E01 in HQ to districts is not needed for curr_staffing_table, # as no. of E01 in curr_staffing_table at HQ is zero. 
From 490a5982f3badb0248dc271f1743cb019877f680 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 18 Feb 2022 16:56:29 +0000 Subject: [PATCH 059/131] Check difference of new and previous PFT --- .../formatting_healthsystem_data_update.py | 32 +++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index 58a2b64902..29bddc481a 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -1528,7 +1528,12 @@ {'Officer_Type_Code': officer_types_import, 'DisHos_Av_Mins_Per_Day': av_mins_daily_dishos, 'ComHos_Av_Mins_Per_Day': av_mins_daily_comhos, - 'HC_Av_Mins_Per_Day': av_mins_daily_hc} + 'HC_Av_Mins_Per_Day': av_mins_daily_hc, + 'Total_Av_Working_Days': workingdays, + 'DisHos_Hrs_Per_Day': hrs_daily_dishos, + 'ComHos_Hrs_Per_Day': hrs_daily_comhos, + 'HC_Hrs_Per_Day': hrs_daily_hc + } ).reset_index(drop=True) # The new PFT has no minutes for M01 at health centres, @@ -1539,6 +1544,7 @@ HosHC_patient_facing_time.loc[0, 'DisHos_Av_Mins_Per_Day'] + HosHC_patient_facing_time.loc[0, 'ComHos_Av_Mins_Per_Day'] ) / 2 + # How to deal with cadres (DCSA, Dental, Mental, Radiography) that do not have minutes at all in PFT, # whereas they have time requirements in Time_Curr? 
# (Compared to old PFT sheet, @@ -1591,12 +1597,32 @@ {'Officer_Type_Code': officer_types_old, 'DisHos_Av_Mins_Per_Day': av_mins_daily_hos_old, 'ComHos_Av_Mins_Per_Day': av_mins_daily_hos_old, - 'HC_Av_Mins_Per_Day': av_mins_daily_hc_old} + 'HC_Av_Mins_Per_Day': av_mins_daily_hc_old, + 'Total_Av_Working_Days': working_days_old, + 'DisHos_Hrs_Per_Day': hrs_daily_hos_old, + 'ComHos_Hrs_Per_Day': hrs_daily_hos_old, + 'HC_Hrs_Per_Day': hrs_daily_hc_old + } ).reset_index(drop=True) -# now add the old data of those missing cadres to the updated PFT table +# check the new and old tables have same columns and officers (in the same order) assert (HosHC_patient_facing_time_old['Officer_Type_Code'] == HosHC_patient_facing_time['Officer_Type_Code']).all() assert (HosHC_patient_facing_time_old.columns == HosHC_patient_facing_time.columns).all() + +# check new and old pft difference +HosHC_pft_diff = pd.DataFrame(columns=HosHC_patient_facing_time.columns) +HosHC_pft_diff['Officer_Type_Code'] = HosHC_patient_facing_time['Officer_Type_Code'].values +HosHC_pft_diff.iloc[:, 1:] = ( + (HosHC_patient_facing_time.iloc[:, 1:].values - + HosHC_patient_facing_time_old.iloc[:, 1:].values) / + HosHC_patient_facing_time_old.iloc[:, 1:].values +) +HosHC_pft_diff = HosHC_pft_diff.append(HosHC_pft_diff.iloc[:, 1:].mean(axis=0), ignore_index=True) +# HosHC_pft_diff.to_csv( +# outputlocation / 'human_resources' / 'definitions' / 'New_Old_PFT_Difference.csv', +# index=False) + +# now add the old data of those blanks cadres to the updated PFT table HosHC_patient_facing_time.iloc[11:, :] = HosHC_patient_facing_time_old.iloc[11:, :].copy() # PFT table ready! 
From c79fa434fc76b08d6fd3bd09c0ea5c602e620823 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 18 Feb 2022 17:47:15 +0000 Subject: [PATCH 060/131] Minor format to be consistent with original formatting_healthsystem_data file --- .../formatting_healthsystem_data_update.py | 59 ++++++++++--------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index 29bddc481a..17d1351597 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -692,10 +692,10 @@ fund_staffing_table = wb_extract.copy() # The imported staffing table suggest that there is some Dental officer (D01) in most districts, -# but the TimeBase data (below) suggest that D01 is only needed at central hospitals. +# but the Time_Curr data (below) suggest that D01 is only needed at central hospitals (not yet validated by CHAI). # This potential inconsistency can be solved by re-allocating D01 from districts to central hospitals, but -# currently we keep the source data as it is the establishment and CHAI team does not recommend such re-allocation; -# Also, the central/referral hospitals have Dental officer allocated to meet dental service demand, +# currently we do not do such reallocation to reduce the assumptions we have to make; +# Also because the central/referral hospitals have Dental officer allocated to meet dental service demand, # thus no risk of not able to meet such demand at level 3. 
# *** Only for funded_plus ******************************************************************************************** @@ -896,8 +896,8 @@ # Save the table without column 'Is_DistrictLevel'; staff counts in floats fund_staffing_table_to_save = fund_staffing_table.drop(columns='Is_DistrictLevel', inplace=False) -fund_staffing_table_to_save.to_csv( - outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Staff_Table.csv', index=False) +# fund_staffing_table_to_save.to_csv( +# outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Staff_Table.csv', index=False) # fund_staffing_table_to_save.to_csv( # outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Staff_Table.csv', index=False) @@ -1051,8 +1051,8 @@ # Save the table without column 'Is_DistrictLevel'; staff counts in floats curr_staffing_table_to_save = curr_staffing_table.drop(columns='Is_DistrictLevel', inplace=False) -curr_staffing_table_to_save.to_csv( - outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Staff_Table.csv', index=False) +# curr_staffing_table_to_save.to_csv( +# outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Staff_Table.csv', index=False) # --------------------------------------------------------------------------------------------------------------------- # *** Create the Master Facilities List @@ -1351,11 +1351,11 @@ ).reset_index() # Save -ApptTimeTable.to_csv( - outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table.csv', - index=False) +# ApptTimeTable.to_csv( +# outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table.csv', +# index=False) appt_time_table_coarse.to_csv( - outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table_Coarse.csv', + outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table.csv', index=False) # 
--------------------------------------------------------------------------------------------------------------------- @@ -1541,8 +1541,9 @@ # We therefore assume the minutes for M01 at HCs are the average of those at DisHos and CenHos, # to solve inconsistency between PFT and Time_Curr HosHC_patient_facing_time.loc[0, 'HC_Av_Mins_Per_Day'] = ( - HosHC_patient_facing_time.loc[0, 'DisHos_Av_Mins_Per_Day'] + - HosHC_patient_facing_time.loc[0, 'ComHos_Av_Mins_Per_Day'] + HosHC_patient_facing_time.loc[ + 0, 'DisHos_Av_Mins_Per_Day'] + + HosHC_patient_facing_time.loc[0, 'ComHos_Av_Mins_Per_Day'] ) / 2 # How to deal with cadres (DCSA, Dental, Mental, Radiography) that do not have minutes at all in PFT, @@ -1618,6 +1619,8 @@ HosHC_patient_facing_time_old.iloc[:, 1:].values ) HosHC_pft_diff = HosHC_pft_diff.append(HosHC_pft_diff.iloc[:, 1:].mean(axis=0), ignore_index=True) + +# save # HosHC_pft_diff.to_csv( # outputlocation / 'human_resources' / 'definitions' / 'New_Old_PFT_Difference.csv', # index=False) @@ -1777,20 +1780,18 @@ ).reset_index() # Save -HosHC_patient_facing_time.to_csv( - outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Patient_Facing_Time.csv', index=False) - curr_daily_capability_coarse.to_csv( outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Daily_Capabilities.csv', index=False) # Need to # following lines below when generate funded_plus capability +# funded_daily_capability_coarse.to_csv( +# outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Daily_Capabilities.csv', index=False) + +# *** Only for funded_plus ******************************************************************************************** funded_daily_capability_coarse.to_csv( - outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Daily_Capabilities.csv', index=False) + outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Daily_Capabilities.csv', index=False) -# *** Only for funded_plus 
******************************************************************************************** -# funded_daily_capability_coarse.to_csv( -# outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Daily_Capabilities.csv', index=False) # ********************************************************************************************************************* # --------------------------------------------------------------------------------------------------------------------- @@ -1859,15 +1860,19 @@ def all_appts_can_run(capability): return appt_have_or_miss_capability - # Save results for funded -appt_have_or_miss_capability_funded = all_appts_can_run(funded_daily_capability_coarse) -appt_have_or_miss_capability_funded.to_csv( - outputlocation / 'human_resources' / 'funded' / 'appt_have_or_miss_capability.csv', index=False) +# Need to # following lines below when generate funded_plus capability +# appt_have_or_miss_capability_funded = all_appts_can_run(funded_daily_capability_coarse) +# appt_have_or_miss_capability_funded.to_csv( +# outputlocation / 'human_resources' / 'funded' / 'appt_have_or_miss_capability.csv', index=False) + +# *** Only for funded_plus ******************************************************************************************** +# appt_have_or_miss_capability_funded = all_appts_can_run(funded_daily_capability_coarse) # appt_have_or_miss_capability_funded.to_csv( # outputlocation / 'human_resources' / 'funded_plus' / 'appt_have_or_miss_capability.csv', index=False) +# ********************************************************************************************************************* # Save results for actual -appt_have_or_miss_capability_actual = all_appts_can_run(curr_daily_capability_coarse) -appt_have_or_miss_capability_actual.to_csv( - outputlocation / 'human_resources' / 'actual' / 'appt_have_or_miss_capability.csv', index=False) +# appt_have_or_miss_capability_actual = all_appts_can_run(curr_daily_capability_coarse) +# 
appt_have_or_miss_capability_actual.to_csv( +# outputlocation / 'human_resources' / 'actual' / 'appt_have_or_miss_capability.csv', index=False) From 9d5dcb99e4671fc2de0eefa51fe107e922d566cb Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 18 Feb 2022 18:25:50 +0000 Subject: [PATCH 061/131] Move old data path to the beginning and notice the need to upload the new data to dropbox and change of workingfile and output paths --- .../formatting_healthsystem_data_update.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index 17d1351597..b653f6284b 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -52,7 +52,12 @@ workingfile = Path( '/Users/jdbb1/OneDrive/Desktop/healthsystem data update/Malawi optimization model import_2022-02-11.xlsx' -) # <-- point to the new data locally +) # <-- point to the new data locally; need update + + +working_file_old = (path_to_dropbox / + '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' / + 'Optimization model import_Malawi_20180315 v10.xlsx') # <-- point to the old data locally path_to_auxiliaryfiles = (path_to_dropbox / '05 - Resources' / @@ -61,6 +66,8 @@ 'Auxiliary CHAI Data from CHAI HR Team 12 Sep 2021') outputlocation = Path('/Users/jdbb1/OneDrive/Desktop/healthsystem data update/output') # <-- output locally +# Need update to +# outputlocation = resourcefilepath / 'healthsystem' # --------------------------------------------------------------------------------------------------------------------- # *** create and save population_by_district data @@ -1552,10 +1559,6 @@ # the new PFT has updated all info on available working days/non-admin daily minutes/portion of male/female/pregfemale) # A quick fix is to 
use the average daily minutes of those cadres from old PFT table; # The info required to calculate these minutes will be from the old PFT table. -working_file_old = (path_to_dropbox / - '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' / - 'Optimization model import_Malawi_20180315 v10.xlsx') - pft_old = pd.read_excel(working_file_old, sheet_name='PFT', header=None) officer_types_old = pft_old.iloc[2, np.arange(2, 23)] From a64e796c1084c6a5f37f38d4e4898fa115ec69b8 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 21 Feb 2022 10:42:28 +0000 Subject: [PATCH 062/131] Update the input and output path (need upload the CHAI data to dropbox) --- .../ResourceFile_Daily_Capabilities.csv | 4 +- .../ResourceFile_ApptType_By_FacLevel.csv | 4 +- .../ResourceFile_Appt_Time_Table.csv | 4 +- .../ResourceFile_Appt_Types_Table.csv | 4 +- .../ResourceFile_Daily_Capabilities.csv | 4 +- .../ResourceFile_Daily_Capabilities.csv | 4 +- .../formatting_healthsystem_data_update.py | 78 +++++++++---------- 7 files changed, 50 insertions(+), 52 deletions(-) diff --git a/resources/healthsystem/human_resources/actual/ResourceFile_Daily_Capabilities.csv b/resources/healthsystem/human_resources/actual/ResourceFile_Daily_Capabilities.csv index e0dfb7799a..a969791987 100644 --- a/resources/healthsystem/human_resources/actual/ResourceFile_Daily_Capabilities.csv +++ b/resources/healthsystem/human_resources/actual/ResourceFile_Daily_Capabilities.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cadbd8c910c533e07136685b6ad5b4910f8da965c36d481d34eebc7e2402aadb -size 70832 +oid sha256:9bb4ee9a5a25722c31e2b400e5d58d108ec5c9ac1481b7915c98c58d5302384e +size 73045 diff --git a/resources/healthsystem/human_resources/definitions/ResourceFile_ApptType_By_FacLevel.csv b/resources/healthsystem/human_resources/definitions/ResourceFile_ApptType_By_FacLevel.csv index ee9f8f2266..5a04dbd0f2 100644 --- 
a/resources/healthsystem/human_resources/definitions/ResourceFile_ApptType_By_FacLevel.csv +++ b/resources/healthsystem/human_resources/definitions/ResourceFile_ApptType_By_FacLevel.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3dfd53751c3e2898f17e9b25ef47e48ffe15ce3ca1bf1dc5606845bd300535c8 -size 2672 +oid sha256:c44052405e28db0269af783e8fa64d713b09acbb5d9f1bb84c1a5b739774a70e +size 2564 diff --git a/resources/healthsystem/human_resources/definitions/ResourceFile_Appt_Time_Table.csv b/resources/healthsystem/human_resources/definitions/ResourceFile_Appt_Time_Table.csv index e955984be9..6f2627c08c 100644 --- a/resources/healthsystem/human_resources/definitions/ResourceFile_Appt_Time_Table.csv +++ b/resources/healthsystem/human_resources/definitions/ResourceFile_Appt_Time_Table.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fab28246dd98e02eb3962e1e50e925627f1bfee5c2452f5da6add55e9cbaa29 -size 11025 +oid sha256:253582e585add2fbbba439352c9f50fc969caecbc726dffb7e624f0117c1f61f +size 10855 diff --git a/resources/healthsystem/human_resources/definitions/ResourceFile_Appt_Types_Table.csv b/resources/healthsystem/human_resources/definitions/ResourceFile_Appt_Types_Table.csv index 745d58dbd2..10b52d7f75 100644 --- a/resources/healthsystem/human_resources/definitions/ResourceFile_Appt_Types_Table.csv +++ b/resources/healthsystem/human_resources/definitions/ResourceFile_Appt_Types_Table.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6dcef9919c5529ecdd1ce4c09bf33b88ce8609292707ea316da3e3aee9f760a9 -size 2471 +oid sha256:17dce96228386fac6cd2e057cf901b4e10b2e059174bb21c5b518c4b16e5375e +size 2388 diff --git a/resources/healthsystem/human_resources/funded/ResourceFile_Daily_Capabilities.csv b/resources/healthsystem/human_resources/funded/ResourceFile_Daily_Capabilities.csv index 26958a662f..53ea7ff45e 100644 --- a/resources/healthsystem/human_resources/funded/ResourceFile_Daily_Capabilities.csv 
+++ b/resources/healthsystem/human_resources/funded/ResourceFile_Daily_Capabilities.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fc14aa153fb4fa18570016364462007706fb3e43b8a41bfeca8055a194fbd84 -size 67958 +oid sha256:89fda62a8f1d39ac16f315c62f2dbf8988a5686d2996f30ebb6c4f4bb67155ea +size 67872 diff --git a/resources/healthsystem/human_resources/funded_plus/ResourceFile_Daily_Capabilities.csv b/resources/healthsystem/human_resources/funded_plus/ResourceFile_Daily_Capabilities.csv index d1336d6c05..e2421bb900 100644 --- a/resources/healthsystem/human_resources/funded_plus/ResourceFile_Daily_Capabilities.csv +++ b/resources/healthsystem/human_resources/funded_plus/ResourceFile_Daily_Capabilities.csv @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0b7077ba8ae80a4273b45ae6864b982b8dc04a86abe204b0983418e188d1bfd -size 71841 +oid sha256:8743976085d3c8d30314b28215254c2fe5297d921e67c453e9781baf0c6a46dc +size 71442 diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index b653f6284b..fdd46d46b6 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -50,10 +50,10 @@ path_to_dropbox = Path( '/Users/jdbb1/Dropbox/Thanzi La Onse') # <-- point to the TLO dropbox locally -workingfile = Path( - '/Users/jdbb1/OneDrive/Desktop/healthsystem data update/Malawi optimization model import_2022-02-11.xlsx' -) # <-- point to the new data locally; need update - +workingfile = (path_to_dropbox / + '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' / + 'Malawi optimization model import_2022-02-11.xlsx' +) # <-- point to the new data locally; need upload the excel file to shared dropbox working_file_old = (path_to_dropbox / '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' 
/ 'ORIGINAL' / @@ -65,9 +65,7 @@ 'chai ehp resource use data' / 'Auxiliary CHAI Data from CHAI HR Team 12 Sep 2021') -outputlocation = Path('/Users/jdbb1/OneDrive/Desktop/healthsystem data update/output') # <-- output locally -# Need update to -# outputlocation = resourcefilepath / 'healthsystem' +outputlocation = resourcefilepath / 'healthsystem' # --------------------------------------------------------------------------------------------------------------------- # *** create and save population_by_district data @@ -711,20 +709,20 @@ # whereas C01 is required by mental health appts at level 1b, level 2 and level 3. # To fix this inconsistency, we have to move at least 1 C01 to each of these districts from the referral hospitals. # (QECH and ZCH in South, MCH in North, KCH in Central; ZCH has no C01) -non_c01_district_idx = fund_staffing_table[(fund_staffing_table['C01'] == 0) & - (fund_staffing_table['Is_DistrictLevel'])].index -non_c01_districts = pd.DataFrame(fund_staffing_table.loc[non_c01_district_idx, 'District_Or_Hospital']) -non_c01_districts['Region'] = pop_by_district.loc[non_c01_districts['District_Or_Hospital'], 'Region'].values -fund_staffing_table.loc[non_c01_district_idx, 'C01'] = 1 -fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'QECH', 'C01'] = ( - fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'QECH', 'C01'] - 4 -) -fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'MCH', 'C01'] = ( - fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'MCH', 'C01'] - 1 -) -fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'KCH', 'C01'] = ( - fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'KCH', 'C01'] - 2 -) +# non_c01_district_idx = fund_staffing_table[(fund_staffing_table['C01'] == 0) & +# (fund_staffing_table['Is_DistrictLevel'])].index +# non_c01_districts = pd.DataFrame(fund_staffing_table.loc[non_c01_district_idx, 
'District_Or_Hospital']) +# non_c01_districts['Region'] = pop_by_district.loc[non_c01_districts['District_Or_Hospital'], 'Region'].values +# fund_staffing_table.loc[non_c01_district_idx, 'C01'] = 1 +# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'QECH', 'C01'] = ( +# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'QECH', 'C01'] - 4 +# ) +# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'MCH', 'C01'] = ( +# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'MCH', 'C01'] - 1 +# ) +# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'KCH', 'C01'] = ( +# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'KCH', 'C01'] - 2 +# ) # ********************************************************************************************************************* # *** Only for funded_plus ******************************************************************************************** @@ -732,12 +730,12 @@ # whereas all other district has at least 250 DCSA staff # As CHAI indicates Likoma's data is mostly bounded into Nhkata Bay, # we draw some DCSA from Nhkata Bay to Likoma using population as the weight -idx_likoma = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Likoma'].index -idx_nkhatabay = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Nkhata Bay'].index -fund_staffing_table.loc[idx_likoma, 'E01'] = fund_staffing_table.loc[idx_nkhatabay, 'E01'].values[0] * ( - pop_by_district.loc['Likoma', 'Count'] / pop_by_district.loc['Nkhata Bay', 'Count']) -fund_staffing_table.loc[idx_nkhatabay, 'E01'] = ( - fund_staffing_table.loc[idx_nkhatabay, 'E01'].values[0] - fund_staffing_table.loc[idx_likoma, 'E01'].values[0]) +# idx_likoma = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Likoma'].index +# idx_nkhatabay = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Nkhata Bay'].index +# 
fund_staffing_table.loc[idx_likoma, 'E01'] = fund_staffing_table.loc[idx_nkhatabay, 'E01'].values[0] * ( +# pop_by_district.loc['Likoma', 'Count'] / pop_by_district.loc['Nkhata Bay', 'Count']) +# fund_staffing_table.loc[idx_nkhatabay, 'E01'] = ( +# fund_staffing_table.loc[idx_nkhatabay, 'E01'].values[0] - fund_staffing_table.loc[idx_likoma, 'E01'].values[0]) # ********************************************************************************************************************* # Sort out which are district allocations and which are central hospitals and above @@ -860,15 +858,15 @@ # Before split, update the funded C01 distributions at levels 1a, 1b and 2 using CHAI Optimal Workforce estimates. \ # This is because funded C01 are all at level 1b (100%), meanwhile appt time base requires C01 at level 2. \ # CHAI Optimal Workforce locates C01 47.92% at level 1b and 52.08% at level 2, which seems more sensible. -idx_c01_level_1b = fund_staff_distribution[ - (fund_staff_distribution['Cadre_Code'] == 'C01') & - (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1b')].index -fund_staff_distribution.loc[idx_c01_level_1b, 'Proportion_Fund'] = 0.4792 - -idx_c01_level_2 = fund_staff_distribution[ - (fund_staff_distribution['Cadre_Code'] == 'C01') & - (fund_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index -fund_staff_distribution.loc[idx_c01_level_2, 'Proportion_Fund'] = 0.5208 +# idx_c01_level_1b = fund_staff_distribution[ +# (fund_staff_distribution['Cadre_Code'] == 'C01') & +# (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1b')].index +# fund_staff_distribution.loc[idx_c01_level_1b, 'Proportion_Fund'] = 0.4792 +# +# idx_c01_level_2 = fund_staff_distribution[ +# (fund_staff_distribution['Cadre_Code'] == 'C01') & +# (fund_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index +# fund_staff_distribution.loc[idx_c01_level_2, 'Proportion_Fund'] = 0.5208 # 
********************************************************************************************************************* # Split @@ -1787,12 +1785,12 @@ outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Daily_Capabilities.csv', index=False) # Need to # following lines below when generate funded_plus capability -# funded_daily_capability_coarse.to_csv( -# outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Daily_Capabilities.csv', index=False) +funded_daily_capability_coarse.to_csv( + outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Daily_Capabilities.csv', index=False) # *** Only for funded_plus ******************************************************************************************** -funded_daily_capability_coarse.to_csv( - outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Daily_Capabilities.csv', index=False) +# funded_daily_capability_coarse.to_csv( +# outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Daily_Capabilities.csv', index=False) # ********************************************************************************************************************* From 6acad5ef84eb0f14778c565d5522ba485d0cc8ea Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 21 Feb 2022 10:56:31 +0000 Subject: [PATCH 063/131] Refactor; ready for review --- .../formatting_healthsystem_data_update.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py index fdd46d46b6..43fb458394 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py @@ -52,8 +52,8 @@ workingfile = (path_to_dropbox / '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' / - 'Malawi optimization model import_2022-02-11.xlsx' -) # <-- point to the new data 
locally; need upload the excel file to shared dropbox + 'Malawi optimization model import_2022-02-11.xlsx') +# <-- point to the new data locally; need upload the excel file to shared dropbox working_file_old = (path_to_dropbox / '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' / From 90896f09b7b30124d1b29aa65708dbfc550fcbfd Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 21 Feb 2022 11:57:05 +0000 Subject: [PATCH 064/131] Overwrite the old formatting_healthsystem_data file --- .../formatting_healthsystem_data.py | 500 +++-- .../formatting_healthsystem_data_update.py | 1879 ----------------- 2 files changed, 280 insertions(+), 2099 deletions(-) delete mode 100644 src/scripts/data_file_processing/formatting_healthsystem_data_update.py diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data.py b/src/scripts/data_file_processing/formatting_healthsystem_data.py index 88b606abc4..ea9d57edb8 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data.py @@ -1,4 +1,4 @@ -""" +""" This file sets up the health system resources for each district, each region, and also national level. It defines 7 levels for facility types, i.e., Facility_Levels = [0,1a,1b,2,3,4,5]. 
@@ -36,7 +36,6 @@ Scenario 'funded' -> appt_have_or_miss_capability Scenario 'funded_plus' -> appt_have_or_miss_capability """ -# Task: incorporate the new version of human-resources input data from pathlib import Path @@ -48,20 +47,21 @@ path_to_dropbox = Path( '/Users/jdbb1/Dropbox/Thanzi La Onse') # <-- point to the TLO dropbox locally -# LOCATION OF INPUT FILES: workingfile = (path_to_dropbox / - '05 - Resources' / - 'Module-healthsystem' / - 'chai ehp resource use data' / - 'ORIGINAL' / - 'Optimization model import_Malawi_20180315 v10.xlsx') + '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' / + 'Malawi optimization model import_2022-02-11.xlsx') +# <-- point to the new data locally; need upload the excel file to shared dropbox + +working_file_old = (path_to_dropbox / + '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' / + 'Optimization model import_Malawi_20180315 v10.xlsx') # <-- point to the old data locally + path_to_auxiliaryfiles = (path_to_dropbox / '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'Auxiliary CHAI Data from CHAI HR Team 12 Sep 2021') -# OUTPUT RESOURCE_FILES TO: outputlocation = resourcefilepath / 'healthsystem' # --------------------------------------------------------------------------------------------------------------------- @@ -80,15 +80,15 @@ # pop_by_district.to_csv(outputlocation / 'organisation' / 'ResourceFile_District_Population_Data.csv', index=True) # --------------------------------------------------------------------------------------------------------------------- -# *** Below we generate staffing tables: fund_staffing_table for funded/established staff, and\ +# *** Below we generate staffing tables: fund_staffing_table for established staff, and\ # curr_staffing_table for current staff # Before generating the tables, we need to prepare wb_import, officer_types_table, and\ # make assumptions of curr_staff_return distribution 
and fund_staff_return distribution using Auxiliary CHAI Data # --- wb_import for staff information -# Import all of the 'CurrentStaff' sheet, including both data of current and funded staff -wb_import = pd.read_excel(workingfile, sheet_name='CurrentStaff', header=None) +# Import all of the 'Staff' sheet, including both data of current and funded staff +wb_import = pd.read_excel(workingfile, sheet_name='Staff', header=None) # --- officer_types_table # Make dataframe summarising the officer types and the officer codes: @@ -671,9 +671,9 @@ # compare_staff_distribution.to_csv(outputlocation / 'ResourceFile_Staff_Distribution_Compare.csv', index=False) # *** -# --- fund_staffing_table for funded/established staff +# --- fund_staffing_table for established staff # Extract just the section about "Funded TOTAl Staff' -wb_extract = wb_import.loc[3:37, 64:84] +wb_extract = wb_import.loc[3:39, 64:84] wb_extract = wb_extract.drop([4, 5]) wb_extract.columns = wb_extract.iloc[0] wb_extract = wb_extract.drop([3]) @@ -682,10 +682,10 @@ # Add in the column to the dataframe for the labels that distinguishes whether # these officers are allocated to the district-or-lower levels or one of the key hospitals. -labels = wb_import.loc[6:37, 0].reset_index(drop=True) +labels = wb_import.loc[6:39, 0].reset_index(drop=True) is_distlevel = labels.copy() -is_distlevel[0:27] = True # for district-or-lower levels -is_distlevel[27:] = False # for CenHos-or-above levels +is_distlevel[0:28] = True # for district-or-lower levels +is_distlevel[28:] = False # for CenHos-or-above levels wb_extract.loc[:, 'District_Or_Hospital'] = labels wb_extract.loc[:, 'Is_DistrictLevel'] = is_distlevel @@ -693,40 +693,46 @@ # Finished import from the CHAI excel: fund_staffing_table = wb_extract.copy() -# There are a large number of officer_types EO1 (DCSA/Comm Health Workers) at HQ level, which is non-sensible -# Therefore, re-distribute these evenly to the districts. 
-extra_CHW = fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', - fund_staffing_table.columns[fund_staffing_table.columns == 'E01']].values[0][0] -fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', - fund_staffing_table.columns[fund_staffing_table.columns == 'E01']] = 0 -extra_CHW_per_district = int(np.floor(extra_CHW / fund_staffing_table['Is_DistrictLevel'].sum())) -fund_staffing_table.loc[fund_staffing_table['Is_DistrictLevel'], 'E01'] = \ - fund_staffing_table.loc[fund_staffing_table['Is_DistrictLevel'], 'E01'] + \ - extra_CHW_per_district - -# The imported staffing table suggest that there is 1 Dental officer (D01) in each district, -# but the TimeBase data (below) suggest that no appointment occurring at a district-level Facility can incur -# the time such an officer. Therefore reallocate the D01 officers to the Referral Hospitals -extra_D01 = fund_staffing_table.loc[ - ~fund_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), - fund_staffing_table.columns[fund_staffing_table.columns == 'D01']].sum().values[0] -fund_staffing_table.loc[~fund_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), - fund_staffing_table.columns[fund_staffing_table.columns == 'D01']] = 0 -extra_D01_per_referralhosp = extra_D01 / 4 # divided by 4 CenHos -fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), 'D01'] = \ - fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), 'D01'] + \ - extra_D01_per_referralhosp +# The imported staffing table suggest that there is some Dental officer (D01) in most districts, +# but the Time_Curr data (below) suggest that D01 is only needed at central hospitals (not yet validated by CHAI). 
+# This potential inconsistency can be solved by re-allocating D01 from districts to central hospitals, but +# currently we do not do such reallocation to reduce the assumptions we have to make; +# Also because the central/referral hospitals have Dental officer allocated to meet dental service demand, +# thus no risk of not able to meet such demand at level 3. + +# *** Only for funded_plus ******************************************************************************************** +# Districts Balaka/Machinga/Mwanza/Neno (4 in South), Nkhata Bay (1 in North), Ntchisi/ Salima (2 in Central) +# have 0 mental health staff C01 in establishment, +# whereas C01 is required by mental health appts at level 1b, level 2 and level 3. +# To fix this inconsistency, we have to move at least 1 C01 to each of these districts from the referral hospitals. +# (QECH and ZCH in South, MCH in North, KCH in Central; ZCH has no C01) +# non_c01_district_idx = fund_staffing_table[(fund_staffing_table['C01'] == 0) & +# (fund_staffing_table['Is_DistrictLevel'])].index +# non_c01_districts = pd.DataFrame(fund_staffing_table.loc[non_c01_district_idx, 'District_Or_Hospital']) +# non_c01_districts['Region'] = pop_by_district.loc[non_c01_districts['District_Or_Hospital'], 'Region'].values +# fund_staffing_table.loc[non_c01_district_idx, 'C01'] = 1 +# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'QECH', 'C01'] = ( +# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'QECH', 'C01'] - 4 +# ) +# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'MCH', 'C01'] = ( +# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'MCH', 'C01'] - 1 +# ) +# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'KCH', 'C01'] = ( +# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'KCH', 'C01'] - 2 +# ) +# 
********************************************************************************************************************* # *** Only for funded_plus ******************************************************************************************** -# Since districts Balaka,Machinga,Mwanza,Neno,Ntchisi,Salima and central hospitals have 0 C01, while C01 is \ -# required by Mental appts at level 1b, level 2 and level 3, we move some C01 from 'HQ or missing' to them. \ -# To achieve this, we evenly distribute 30 C01 at HQ to all districts and central hospitals (27 DisHos, 4 CenHos) -C01_at_HQ = fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'C01'].values -extra_C01_per_district_CenHos = C01_at_HQ / 31 -fund_staffing_table.loc[~fund_staffing_table['District_Or_Hospital'].isin(['HQ or missing']), 'C01'] = ( - fund_staffing_table.loc[~fund_staffing_table['District_Or_Hospital'].isin(['HQ or missing']), 'C01'] + - extra_C01_per_district_CenHos) -fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'C01'] = 0 +# In the funded staff table, it does not make sense that Likoma has no DCSA staff, +# whereas all other district has at least 250 DCSA staff +# As CHAI indicates Likoma's data is mostly bounded into Nhkata Bay, +# we draw some DCSA from Nhkata Bay to Likoma using population as the weight +# idx_likoma = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Likoma'].index +# idx_nkhatabay = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Nkhata Bay'].index +# fund_staffing_table.loc[idx_likoma, 'E01'] = fund_staffing_table.loc[idx_nkhatabay, 'E01'].values[0] * ( +# pop_by_district.loc['Likoma', 'Count'] / pop_by_district.loc['Nkhata Bay', 'Count']) +# fund_staffing_table.loc[idx_nkhatabay, 'E01'] = ( +# fund_staffing_table.loc[idx_nkhatabay, 'E01'].values[0] - fund_staffing_table.loc[idx_likoma, 'E01'].values[0]) # 
********************************************************************************************************************* # Sort out which are district allocations and which are central hospitals and above @@ -734,7 +740,7 @@ # We assign HQ to HQ; KCH as RefHos in Central region; MCH as RefHos in Northern region; # QECH and ZCH as RefHos in Southern region (QECH is in Southwest and ZCH is in Southeast). fund_staffing_table.loc[ - fund_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'District_Or_Hospital'] = 'Headquarter' + fund_staffing_table['District_Or_Hospital'] == 'HQ', 'District_Or_Hospital'] = 'Headquarter' fund_staffing_table.loc[ fund_staffing_table['District_Or_Hospital'] == 'KCH', 'District_Or_Hospital'] = 'Referral Hospital_Central' fund_staffing_table.loc[ @@ -747,6 +753,8 @@ fund_staffing_table['District_Or_Hospital'] == 'ZCH', 'District_Or_Hospital'] = 'Referral Hospital_Southern' # fund_staffing_table.loc[ # fund_staffing_table['District_Or_Hospital'] == 'ZCH', 'District_Or_Hospital'] = 'Referral Hospital_Southeast' +fund_staffing_table.loc[ + fund_staffing_table['District_Or_Hospital'] == 'ZMH', 'District_Or_Hospital'] = 'Zomba Mental Hospital' # Group the referral hospitals QECH and ZCH as Referral Hospital_Southern Is_DistrictLevel = fund_staffing_table['Is_DistrictLevel'].values # Save the column 'Is_DistrictLevel' first @@ -754,42 +762,14 @@ fund_staffing_table.groupby(by=['District_Or_Hospital'], sort=False).sum()).reset_index() fund_staffing_table.insert(1, 'Is_DistrictLevel', Is_DistrictLevel[:-1]) # Add the column 'Is_DistrictLevel' -# Add a row for Zomba Mental Hospital with 3 C01 mental health staff -# (according to data in 2018-03-09 Facility-level establishment MOH & CHAM) -# (This is much less than the current 12 C01.) 
-fund_ZMH = pd.DataFrame(columns=fund_staffing_table.columns.copy()) -fund_ZMH.loc[0, 'District_Or_Hospital'] = 'Zomba Mental Hospital' -fund_ZMH.loc[0, 'Is_DistrictLevel'] = False -fund_ZMH.loc[0, 'C01'] = 3 -# Alternatively, if consider all potential cadres from compiled staff return -# fund_cadres_ZMH = pd.DataFrame(index = [0], columns = ['M01','M02','M03','N01','N02','C01','P02','L02'], -# data = np.array([[2,13,14,8,30,3,1,1]])) -# for col in fund_cadres_ZMH.columns: -# fund_ZMH.loc[0,col] = fund_cadres_ZMH.loc[0,col].copy() - -# Concat -fund_staffing_table = pd.concat([fund_staffing_table, fund_ZMH]) -fund_staffing_table.reset_index(drop=True, inplace=True) -fund_staffing_table.fillna(0, inplace=True) - -# File 2018-03-09 Facility-level establishment MOH & CHAM indicates that ZMH is assigned to Zomba District, -# We therefore subtract the 3 C01 staff from Zomba District. -fund_idx_ZombaDist = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Zomba'].index -fund_staffing_table.loc[fund_idx_ZombaDist, 'C01'] = \ - fund_staffing_table.loc[fund_idx_ZombaDist, 'C01'] - fund_ZMH.loc[0, 'C01'] -# Alternatively, if consider all potential cadres from compiled staff return -# fund_staffing_table.loc[fund_idx_ZombaDist, :] =\ -# fund_staffing_table.loc[fund_idx_ZombaDist, :] - fund_ZMH.loc[0,:] - -# Check that fund_staffing_table.loc[fund_idx_ZombaDist, :] >=0 -assert (fund_staffing_table.loc[fund_idx_ZombaDist, 'M01':'R04'].values >= 0).all() +# Check that in fund_staffing_table every staff count entry >= 0 +assert (fund_staffing_table.loc[:, 'M01':'R04'].values >= 0).all() # The following districts are not in the CHAI data because they are included within other districts. # For now, we will say that the division of staff between these cities and the wide district (where they are included) -# is consistent with the population recorded for them. 
+# is consistent with the population recorded for them (Malawi 2018 census), # i.e., to use population-based weights to reallocate staff -# Add in Likoma (part Nkhata Bay) # Add in Lilongwe City (part of Lilongwe) # Add in Mzuzu City (part of Mziba) ASSUMED # Add in Zomba City (part of Zomba) @@ -797,7 +777,6 @@ # create mapping: the new districts : super_district split_districts = ( - ('Likoma', 'Nkhata Bay'), ('Lilongwe City', 'Lilongwe'), ('Mzuzu City', 'Mzimba'), ('Zomba City', 'Zomba'), @@ -821,7 +800,7 @@ total_staff = fund_staffing_table.loc[ fund_staffing_table['District_Or_Hospital'] == super_district, cols].values.squeeze() - # get the weight; The original weights w0 for the 5 new districts in order are 0.05,0.60,0.24,0.14,1.77(> 1) + # get the weight; The original weights w0 for the 4 new districts in order are 0.60,0.24,0.14,1.77(> 1) w0 = pop_by_district.loc[new_district, 'Count'] / pop_by_district.loc[super_district, 'Count'] if w0 < 1: w = w0 @@ -876,15 +855,15 @@ # Before split, update the funded C01 distributions at levels 1a, 1b and 2 using CHAI Optimal Workforce estimates. \ # This is because funded C01 are all at level 1b (100%), meanwhile appt time base requires C01 at level 2. \ # CHAI Optimal Workforce locates C01 47.92% at level 1b and 52.08% at level 2, which seems more sensible. 
-idx_c01_level_1b = fund_staff_distribution[ - (fund_staff_distribution['Cadre_Code'] == 'C01') & - (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1b')].index -fund_staff_distribution.loc[idx_c01_level_1b, 'Proportion_Fund'] = 0.4792 - -idx_c01_level_2 = fund_staff_distribution[ - (fund_staff_distribution['Cadre_Code'] == 'C01') & - (fund_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index -fund_staff_distribution.loc[idx_c01_level_2, 'Proportion_Fund'] = 0.5208 +# idx_c01_level_1b = fund_staff_distribution[ +# (fund_staff_distribution['Cadre_Code'] == 'C01') & +# (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1b')].index +# fund_staff_distribution.loc[idx_c01_level_1b, 'Proportion_Fund'] = 0.4792 +# +# idx_c01_level_2 = fund_staff_distribution[ +# (fund_staff_distribution['Cadre_Code'] == 'C01') & +# (fund_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index +# fund_staff_distribution.loc[idx_c01_level_2, 'Proportion_Fund'] = 0.5208 # ********************************************************************************************************************* # Split @@ -913,6 +892,8 @@ 'Facility_Level_3', 'Facility_Level_3', 'Facility_Level_4'] +# Check that in fund_staffing_table every staff count entry >= 0 +assert (fund_staffing_table.loc[:, 'M01':'R04'].values >= 0).all() # fund_staffing_table ready! 
# Save the table without column 'Is_DistrictLevel'; staff counts in floats @@ -925,7 +906,7 @@ # *** # --- Creating curr_staffing_table and curr_staff_list for current staff # Extract the section about "Current TOTAl Staff' -hcw_curr_extract = wb_import.loc[3:37, 1:21] +hcw_curr_extract = wb_import.loc[3:39, 1:21] hcw_curr_extract = hcw_curr_extract.drop([4, 5]) hcw_curr_extract.columns = hcw_curr_extract.iloc[0] hcw_curr_extract = hcw_curr_extract.drop([3]) @@ -943,25 +924,14 @@ # Check the cadre columns of curr_staffing_table is identical to fund_staffing_table assert set(curr_staffing_table.columns[0:21]) == set(fund_staffing_table.columns[-21:]) -# For curr_staffing_table, reallocating D01 from districts to referral hospitals -# Treat KCH, MCH, QECH, ZCH as referral hospitals +# For curr_staffing_table, do not re-allocate Dental officer with the same reason above for established staff + # The operation of reallocating E01 in HQ to districts is not needed for curr_staffing_table, # as no. of E01 in curr_staffing_table at HQ is zero. 
-curr_extra_D01 = curr_staffing_table.loc[ - ~curr_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), curr_staffing_table.columns[ - curr_staffing_table.columns == 'D01']].sum().values[0] -curr_staffing_table.loc[ - ~curr_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), curr_staffing_table.columns[ - curr_staffing_table.columns == 'D01']] = 0 -curr_extra_D01_per_referralhosp = curr_extra_D01 / 4 -curr_staffing_table.loc[curr_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), 'D01'] = \ - curr_staffing_table.loc[curr_staffing_table['District_Or_Hospital'].isin(['KCH', 'MCH', 'QECH', 'ZCH']), 'D01'] + \ - curr_extra_D01_per_referralhosp - # For curr_staffing_table, sort out the districts and central hospitals curr_staffing_table.loc[ - curr_staffing_table['District_Or_Hospital'] == 'HQ or missing', 'District_Or_Hospital'] = 'Headquarter' + curr_staffing_table['District_Or_Hospital'] == 'HQ', 'District_Or_Hospital'] = 'Headquarter' curr_staffing_table.loc[ curr_staffing_table['District_Or_Hospital'] == 'KCH', 'District_Or_Hospital'] = 'Referral Hospital_Central' curr_staffing_table.loc[ @@ -970,6 +940,8 @@ curr_staffing_table['District_Or_Hospital'] == 'QECH', 'District_Or_Hospital'] = 'Referral Hospital_Southern' curr_staffing_table.loc[ curr_staffing_table['District_Or_Hospital'] == 'ZCH', 'District_Or_Hospital'] = 'Referral Hospital_Southern' +curr_staffing_table.loc[ + curr_staffing_table['District_Or_Hospital'] == 'ZMH', 'District_Or_Hospital'] = 'Zomba Mental Hospital' # Group the referral hospitals QECH and ZCH as Referral Hospital_Southern Is_DistrictLevel = curr_staffing_table['Is_DistrictLevel'].values # Save the column 'Is_DistrictLevel' first @@ -977,41 +949,24 @@ curr_staffing_table.groupby(by=['District_Or_Hospital'], sort=False).sum()).reset_index() curr_staffing_table.insert(1, 'Is_DistrictLevel', Is_DistrictLevel[:-1]) # Add the column 'Is_DistrictLevel' -# Add a row for 
Zomba Mental Hospital, which has 12 mental health staff according to compiled staff return -curr_ZMH = pd.DataFrame(columns=curr_staffing_table.columns.copy()) -curr_ZMH.loc[0, 'District_Or_Hospital'] = 'Zomba Mental Hospital' -curr_ZMH.loc[0, 'Is_DistrictLevel'] = False -curr_ZMH.loc[0, 'C01'] = 12 -# Alternatively, if consider all potential cadres from compiled staff return -# curr_cadres_ZMH = pd.DataFrame(index = [0], columns = ['M01','M02','N01','N02','C01','P02','P03'], -# data = np.array([[2,5,19,27,12,1,1]])) -# for col in curr_cadres_ZMH.columns: -# curr_ZMH.loc[0,col] = curr_cadres_ZMH.loc[0,col].copy() - -curr_staffing_table = pd.concat([curr_staffing_table, curr_ZMH]) -curr_staffing_table.reset_index(drop=True, inplace=True) -curr_staffing_table.fillna(0, inplace=True) - -# For Zomba district, there are 12 mental health staff C01; -# However, compiled staff return does not record any C01 in Zomba district; -# We therefore assume that its 12 C01 are from Zomba Mental Hospital. -curr_idx_ZombaDist = curr_staffing_table[curr_staffing_table['District_Or_Hospital'] == 'Zomba'].index -curr_staffing_table.loc[curr_idx_ZombaDist, 'C01'] = \ - curr_staffing_table.loc[curr_idx_ZombaDist, 'C01'] - curr_ZMH.loc[0, 'C01'] -# Alternatively, if consider all potential cadres from compiled staff return -# curr_staffing_table.loc[curr_idx_ZombaDist, :] = curr_staffing_table.loc[curr_idx_ZombaDist, :] - curr_ZMH.loc[0,:] - -# Check that curr_staffing_table.loc[curr_idx_ZombaDist, :] >=0 -assert (curr_staffing_table.loc[curr_idx_ZombaDist, 'M01':'R04'].values >= 0).all() - -# Similarly split staff to 5 special districts as done for funded staff -# split_districts = ( -# ('Likoma', 'Nkhata Bay'), -# ('Lilongwe City', 'Lilongwe'), -# ('Mzuzu City', 'Mzimba'), -# ('Zomba City', 'Zomba'), -# ('Blantyre City', 'Blantyre') -# ) +# No need to add a row for Zomba Mental Hospital, as the updated CHAI data has this row for ZMH. 
+# Check that in curr_staffing_table each staff count entry >=0 +assert (curr_staffing_table.loc[:, 'M01':'R04'].values >= 0).all() + +# Split staff to 5 special districts; +# for current staff, we include Likoma here because CHAI has no current staff allocated in Likoma +# (the CHAI team will allocate some staff to Likoma, but this is not yet done) +split_districts = ( + ('Likoma', 'Nkhata Bay'), + ('Lilongwe City', 'Lilongwe'), + ('Mzuzu City', 'Mzimba'), + ('Zomba City', 'Zomba'), + ('Blantyre City', 'Blantyre') +) + +# drop the original placeholder row for Likoma +curr_staffing_table.drop([9], inplace=True) +curr_staffing_table.reset_index(inplace=True, drop=True) for i in np.arange(0, len(split_districts)): new_district = split_districts[i][0] @@ -1123,15 +1078,14 @@ # 'District Hospital', 'DHO', 'Referral Hospital', 'Zomba Mental Hospital'] # Facility_Types_Levels = dict(zip(Facility_Types, Facility_Levels)) - # Create empty dataframe that will be the Master Facilities List (mfl) mfl = pd.DataFrame(columns=['Facility_Level', 'District', 'Region']) pop_districts = pop['District'].values # array; the 'pop_districts' used in previous lines is a DataFrame pop_regions = pd.unique(pop['Region']) -# Each district is assigned with a set of community level facs, a set of primary level facs, -# and a set of second level facs. +# Each district is assigned with a set of community level facs (0), a set of primary level facs (1a, 1b), +# and a set of second level facs (2). # Therefore, the total sets of facs is 4 * no. 
of districts + 3 (RefHos per Region) + 1 (HQ) + 1 (ZMH) \ # = 4 * 32 + 5 = 133 for d in pop_districts: @@ -1202,8 +1156,8 @@ # index=False) # --------------------------------------------------------------------------------------------------------------------- -# *** Now look at the types of appointments -sheet = pd.read_excel(workingfile, sheet_name='Time_Base', header=None) +# *** Now look at the types of appointments from the sheet 'Time_Curr' +sheet = pd.read_excel(workingfile, sheet_name='Time_Curr', header=None) # get rid of the junk rows trimmed = sheet.loc[[7, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 24, 26, 27]] @@ -1212,10 +1166,14 @@ data_import = data_import.dropna(axis='columns', how='all') # get rid of the 'spacer' columns data_import = data_import.fillna(0) -# get rid of records for which there is no call on time of any type of officer +# get rid of records for which there is no call on time of any type of officer at any fac type data_import = data_import.drop(columns=data_import.columns[data_import.sum() == 0]) -# We note that the DCSA (CHW) never has a time requirement and that no appointments can be serviced at the HealthPost. +# Note that in the updated 'Time_Curr', Disp has no time requirements at all for medical assistant M03, +# which is different from the previous version +assert data_import.loc['Disp', :].sum() == 0 + +# Note that the DCSA (CHW) never has a time requirement and that no appointments can be serviced at the HealthPost. # We remedy this by inserting a new type of appointment, which only the DCSA can service, \ # and the time taken is 10 minutes. 
new_appt_for_CHW = pd.Series(index=data_import.index, @@ -1239,21 +1197,19 @@ ]) data_import = pd.concat([data_import, new_appt_for_CHW], axis=1) +assert data_import.loc['HP', :].sum() == 10.0 -# Add service times for DHOs, which has quite a few data in 'Incidence_Curr', by copying the data of DisHos -new_rows_for_DHO = pd.DataFrame(index=['DHO', 'DHO_Per'], columns=data_import.columns.copy(), - data=data_import.loc[['DisHos', 'DisHos_Per'], :].copy().values) +# We now do not add service time for DHO as we think DHO does not deliver services directly +# Also, DHO itself in both DHIS2 and CHAI updated data does not have service record -# Add service times (Mental OPD and Mental Clinic Visit) for Zomba Mental Hospital, by copying data of CenHos +# Add service times for Zomba Mental Hospital, by copying mental health appointment data of CenHos +# (Assuming ZMH only provide mental health services) new_rows_for_ZMH = pd.DataFrame(index=['ZMH', 'ZMH_Per'], columns=data_import.columns.copy(), data=0) new_rows_for_ZMH.loc[:, ['C01_MentOPD', 'C01_MentClinic']] = data_import.loc[ ['CenHos', 'CenHos_Per'], ['C01_MentOPD', 'C01_MentClinic']].copy().values -# If consider all potential cadres from compiled staff return and all associated services -# new_rows_for_ZMH = pd.DataFrame(index=['ZMH','ZMH_Per'],columns=data_import.columns.copy(), -# data=data_import.loc[['CenHos','CenHos_Per'],:].copy().values) -data_import = pd.concat([data_import, new_rows_for_DHO, new_rows_for_ZMH]) +data_import = pd.concat([data_import, new_rows_for_ZMH]) # data_import ready! 
@@ -1342,7 +1298,6 @@ # level 2 District_Hospital_ExpecTime = data_import.loc['DisHos'] * data_import.loc['DisHos_Per'] -DHO_ExpecTime = data_import.loc['DHO'] * data_import.loc['DHO_Per'] # level 1b Community_Hospital_ExpecTime = data_import.loc['ComHos'] * data_import.loc['ComHos_Per'] @@ -1350,23 +1305,21 @@ # level 1a Urban_HealthCentre_ExpecTime = data_import.loc['UrbHC'] * data_import.loc['UrbHC_Per'] Rural_HealthCentre_ExpecTime = data_import.loc['RurHC'] * data_import.loc['RurHC_Per'] -Disp_ExpecTime = data_import.loc['Disp'] * data_import.loc['Disp_Per'] # level 0 HealthPost_ExpecTime = data_import.loc['HP'] * data_import.loc['HP_Per'] -# Average time for levels 2 and 1a, which have data for more than 1 facility types -Avg_Level2_ExpectTime = (District_Hospital_ExpecTime + DHO_ExpecTime) / 2 # Identical to DisHos Expected Time -Avg_Level1a_ExpectTime = (Disp_ExpecTime + Urban_HealthCentre_ExpecTime + Rural_HealthCentre_ExpecTime) / 3 +# Average time for levels 1a, which have data for more than 1 facility types +Avg_Level1a_ExpectTime = (Urban_HealthCentre_ExpecTime + Rural_HealthCentre_ExpecTime) / 2 # Assemble X = pd.DataFrame({ 5: HQ_ExpecTime, # (Headquarter) 4: ZMH_ExpectTime, # (Zomba Mental Hospital) 3: Central_Hospital_ExpecTime, # (our "Referral Hospital" at region level) - 2: Avg_Level2_ExpectTime, # (DHO and DisHos at second level ) + 2: District_Hospital_ExpecTime, # (DisHos at second level ) '1b': Community_Hospital_ExpecTime, # (ComHos at primary level) - '1a': Avg_Level1a_ExpectTime, # (UrbHC,RurHC and Disp at primary level) + '1a': Avg_Level1a_ExpectTime, # (UrbHC,RurHC at primary level) 0: HealthPost_ExpecTime # (HP at community level) }) @@ -1400,8 +1353,12 @@ ).reset_index() # Save -appt_time_table_coarse.to_csv(outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table.csv', - index=False) +# ApptTimeTable.to_csv( +# outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table.csv', +# 
index=False) +appt_time_table_coarse.to_csv( + outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table.csv', + index=False) # --------------------------------------------------------------------------------------------------------------------- # *** Create a table that determines what kind of appointment can be serviced in each Facility Level @@ -1527,53 +1484,148 @@ # First, read-in the number of working hours and days for each type of officer pft_sheet = pd.read_excel(workingfile, sheet_name='PFT', header=None) -officer_types_import = pft_sheet.iloc[2, np.arange(2, 23)] +officer_types_import = pft_sheet.iloc[3, np.arange(2, 23)] assert set(officer_types_import) == set(officer_types_table['Officer_Type_Code']) assert len(officer_types_import) == len(officer_types_table['Officer_Type_Code']) -# patient facing hours daily at hospitals -hours_hospital = pft_sheet.iloc[38, np.arange(2, 23)] - -# patient facing hours daily at health centres -work_mins_hc = pft_sheet.iloc[26, np.arange(2, 23)] -admin_mins_hc = pft_sheet.iloc[34, np.arange(2, 23)] -hours_hc = (work_mins_hc - admin_mins_hc) / 60 - # Total working days per year -days_per_year_men = pft_sheet.iloc[15, np.arange(2, 23)] -days_per_year_women = pft_sheet.iloc[16, np.arange(2, 23)] -days_per_year_pregwomen = pft_sheet.iloc[17, np.arange(2, 23)] +days_per_year_men = pft_sheet.iloc[16, np.arange(2, 23)] +days_per_year_women = pft_sheet.iloc[17, np.arange(2, 23)] +days_per_year_pregwomen = pft_sheet.iloc[18, np.arange(2, 23)] # Percents of men, nonpregnant women, and pregnant women -fr_men = pft_sheet.iloc[53, np.arange(2, 23)] -fr_pregwomen = pft_sheet.iloc[55, np.arange(2, 23)] * pft_sheet.iloc[57, np.arange(2, 23)] -fr_nonpregwomen = pft_sheet.iloc[55, np.arange(2, 23)] * (1 - pft_sheet.iloc[57, np.arange(2, 23)]) +fr_men = pft_sheet.iloc[66, np.arange(2, 23)] +fr_pregwomen = pft_sheet.iloc[71, np.arange(2, 23)] +fr_nonpregwomen = pft_sheet.iloc[68, np.arange(2, 23)] - 
pft_sheet.iloc[71, np.arange(2, 23)] # Total average working days workingdays = (fr_men * days_per_year_men) + (fr_nonpregwomen * days_per_year_women) + ( fr_pregwomen * days_per_year_pregwomen) -# --- patient facing time -# Average mins per year, Average hours per day, Average number of mins per day in Malawi +# patient facing (i.e. non-admin working) minutes and hours daily at +# district hospitals, community hospitals, health centres +mins_daily_dishos = pft_sheet.iloc[37, np.arange(2, 23)] +hrs_daily_dishos = mins_daily_dishos / 60 -mins_per_day_hospital = hours_hospital * 60 -mins_per_day_hc = hours_hc * 60 +mins_daily_comhos = pft_sheet.iloc[42, np.arange(2, 23)] +hrs_daily_comhos = mins_daily_comhos / 60 -mins_per_year_hospital = mins_per_day_hospital * workingdays -mins_per_year_hc = mins_per_day_hc * workingdays +mins_daily_hc = pft_sheet.iloc[46, np.arange(2, 23)] +hrs_daily_hc = mins_daily_hc / 60 -av_mins_per_day_hospital = mins_per_year_hospital / 365.25 -av_mins_per_day_hc = mins_per_year_hc / 365.25 +# Total mins per year, Average number of mins per day at +# district hospitals, community hospitals, health centres +mins_yearly_dishos = mins_daily_dishos * workingdays +mins_yearly_comhos = mins_daily_comhos * workingdays +mins_yearly_hc = mins_daily_hc * workingdays -# PFT - hospital and health centre individually +av_mins_daily_dishos = mins_yearly_dishos / 365.25 +av_mins_daily_comhos = mins_yearly_comhos / 365.25 +av_mins_daily_hc = mins_yearly_hc / 365.25 + +# PFT - dishos, comhos, hc individual columns +# note that the average is calculated on 365.25 days (not the working days) per year HosHC_patient_facing_time = pd.DataFrame( - {'Officer_Type_Code': officer_types_import, 'Working_Days_Per_Year': workingdays, - 'Hospital_Hours_Per_Day': hours_hospital, 'HC_Hours_Per_Day': hours_hc, - 'Hospital_Av_Mins_Per_Day': av_mins_per_day_hospital, - 'HC_Av_Mins_Per_Day': av_mins_per_day_hc} + {'Officer_Type_Code': officer_types_import, + 
'DisHos_Av_Mins_Per_Day': av_mins_daily_dishos, + 'ComHos_Av_Mins_Per_Day': av_mins_daily_comhos, + 'HC_Av_Mins_Per_Day': av_mins_daily_hc, + 'Total_Av_Working_Days': workingdays, + 'DisHos_Hrs_Per_Day': hrs_daily_dishos, + 'ComHos_Hrs_Per_Day': hrs_daily_comhos, + 'HC_Hrs_Per_Day': hrs_daily_hc + } +).reset_index(drop=True) + +# The new PFT has no minutes for M01 at health centres, +# but in Time_Curr, IPAdmissions/RMNCH/... appointments at Urban HCs all need time from M01. +# We therefore assume the minutes for M01 at HCs are the average of those at DisHos and ComHos, +# to resolve the inconsistency between PFT and Time_Curr +HosHC_patient_facing_time.loc[0, 'HC_Av_Mins_Per_Day'] = ( + HosHC_patient_facing_time.loc[ + 0, 'DisHos_Av_Mins_Per_Day'] + + HosHC_patient_facing_time.loc[0, 'ComHos_Av_Mins_Per_Day'] + ) / 2 + +# How to deal with cadres (DCSA, Dental, Mental, Radiography) that do not have minutes at all in PFT, +# whereas they have time requirements in Time_Curr? +# (Compared to old PFT sheet, +# the new PFT has updated all info on available working days/non-admin daily minutes/portion of male/female/pregfemale) +# A quick fix is to use the average daily minutes of those cadres from old PFT table; +# The info required to calculate these minutes will be from the old PFT table. 
+pft_old = pd.read_excel(working_file_old, sheet_name='PFT', header=None) + +officer_types_old = pft_old.iloc[2, np.arange(2, 23)] +assert set(officer_types_old) == set(officer_types_table['Officer_Type_Code']) +assert len(officer_types_old) == len(officer_types_table['Officer_Type_Code']) + +# Total working days per year +days_men_old = pft_old.iloc[15, np.arange(2, 23)] +days_women_old = pft_old.iloc[16, np.arange(2, 23)] +days_pregwomen_old = pft_old.iloc[17, np.arange(2, 23)] + +# Percents of men, nonpregnant women, and pregnant women +fr_men_old = pft_old.iloc[53, np.arange(2, 23)] +fr_pregwomen_old = pft_old.iloc[55, np.arange(2, 23)] * pft_old.iloc[57, np.arange(2, 23)] +fr_nonpregwomen_old = pft_old.iloc[55, np.arange(2, 23)] * (1 - pft_old.iloc[57, np.arange(2, 23)]) + +# Total average working days +working_days_old = (fr_men_old * days_men_old) + (fr_nonpregwomen_old * days_women_old) + ( + fr_pregwomen_old * days_pregwomen_old) + +# patient facing (i.e. non-admin working) minutes and hours daily at +# hospitals and health centres +mins_daily_hos_old = pft_old.iloc[36, np.arange(2, 23)] +hrs_daily_hos_old = mins_daily_hos_old / 60 + +mins_daily_hc_old = pft_old.iloc[26, np.arange(2, 23)] - pft_old.iloc[34, np.arange(2, 23)] +hrs_daily_hc_old = mins_daily_hc_old / 60 + +# Total mins per year, Average number of mins per day at +# hospitals and health centres +mins_yearly_hos_old = mins_daily_hos_old * working_days_old +av_mins_daily_hos_old = mins_yearly_hos_old / 365.25 + +mins_yearly_hc_old = mins_daily_hc_old * working_days_old +av_mins_daily_hc_old = mins_yearly_hc_old / 365.25 + +# PFT - DisHos, ComHos, HC individually +# DisHos and ComHos both use hos data +HosHC_patient_facing_time_old = pd.DataFrame( + {'Officer_Type_Code': officer_types_old, + 'DisHos_Av_Mins_Per_Day': av_mins_daily_hos_old, + 'ComHos_Av_Mins_Per_Day': av_mins_daily_hos_old, + 'HC_Av_Mins_Per_Day': av_mins_daily_hc_old, + 'Total_Av_Working_Days': working_days_old, + 
'DisHos_Hrs_Per_Day': hrs_daily_hos_old, + 'ComHos_Hrs_Per_Day': hrs_daily_hos_old, + 'HC_Hrs_Per_Day': hrs_daily_hc_old + } ).reset_index(drop=True) +# check the new and old tables have same columns and officers (in the same order) +assert (HosHC_patient_facing_time_old['Officer_Type_Code'] == HosHC_patient_facing_time['Officer_Type_Code']).all() +assert (HosHC_patient_facing_time_old.columns == HosHC_patient_facing_time.columns).all() + +# check new and old pft difference +HosHC_pft_diff = pd.DataFrame(columns=HosHC_patient_facing_time.columns) +HosHC_pft_diff['Officer_Type_Code'] = HosHC_patient_facing_time['Officer_Type_Code'].values +HosHC_pft_diff.iloc[:, 1:] = ( + (HosHC_patient_facing_time.iloc[:, 1:].values - + HosHC_patient_facing_time_old.iloc[:, 1:].values) / + HosHC_patient_facing_time_old.iloc[:, 1:].values +) +HosHC_pft_diff = HosHC_pft_diff.append(HosHC_pft_diff.iloc[:, 1:].mean(axis=0), ignore_index=True) + +# save +# HosHC_pft_diff.to_csv( +# outputlocation / 'human_resources' / 'definitions' / 'New_Old_PFT_Difference.csv', +# index=False) + +# now add the old data of those blanks cadres to the updated PFT table +HosHC_patient_facing_time.iloc[11:, :] = HosHC_patient_facing_time_old.iloc[11:, :].copy() + # PFT table ready! 
# Create final tables of daily time available at each facility by officer type: Facility_ID, Facility_Type, @@ -1591,12 +1643,17 @@ t = (funded_staff_floats.loc[i, officer] * HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, 'HC_Av_Mins_Per_Day']) - funded_daily_minutes.loc[i, officer] = t.values - else: # Levels 1b, 2, and above; Hospital minutes + funded_daily_minutes.loc[i, officer] = t.values[0] + elif the_level == 'Facility_Level_1b': # Level 1b; ComHos minutes t = (funded_staff_floats.loc[i, officer] * HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, - 'Hospital_Av_Mins_Per_Day']) - funded_daily_minutes.loc[i, officer] = t.values + 'ComHos_Av_Mins_Per_Day']) + funded_daily_minutes.loc[i, officer] = t.values[0] + else: # Levels 2 and above; DisHos and CenHos minutes + t = (funded_staff_floats.loc[i, officer] * + HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, + 'DisHos_Av_Mins_Per_Day']) + funded_daily_minutes.loc[i, officer] = t.values[0] # Long format funded_staff_floats = pd.melt(funded_staff_floats, id_vars=['District_Or_Hospital', 'Facility_Level'], @@ -1609,7 +1666,8 @@ # Reset facility level column to exclude 'Facility_Level_' funded_daily_capability['Facility_Level'] = \ funded_daily_capability['Facility_Level'].str.split(pat='_', expand=True).iloc[:, 2] -# Drop row with zero minutes (also zero staff counts) +# Drop row with zero or nan minutes (due to either zero staff counts or nan daily minutes) +funded_daily_capability.fillna(0, inplace=True) funded_daily_capability.drop( index=funded_daily_capability[funded_daily_capability['Total_Mins_Per_Day'] == 0].index, inplace=True) # Reset index @@ -1645,9 +1703,6 @@ ['Facility_ID', 'Facility_Name', 'Facility_Level', 'District', 'Region', 'Officer_Category'], dropna=False).sum() ).reset_index() -# Drop columns of officer types 
-funded_daily_capability_coarse.drop(columns=['Officer_Type_Code', 'Officer_Type'], inplace=True) -funded_daily_capability_coarse.reset_index(drop=True, inplace=True) # --- Daily capability for current staff; staff counts in floats # For float staff counts, calculate total minutes per day @@ -1661,12 +1716,17 @@ t = (curr_staff_floats.loc[i, officer] * HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, 'HC_Av_Mins_Per_Day']) - curr_daily_minutes.loc[i, officer] = t.values - else: # Levels 1b, 2, and above; Hospital minutes + curr_daily_minutes.loc[i, officer] = t.values[0] + elif the_level == 'Facility_Level_1b': # Level 1b; ComHos minutes + t = (curr_staff_floats.loc[i, officer] * + HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, + 'ComHos_Av_Mins_Per_Day']) + curr_daily_minutes.loc[i, officer] = t.values[0] + else: # Levels 2 and above; DisHos and CenHos minutes t = (curr_staff_floats.loc[i, officer] * HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, - 'Hospital_Av_Mins_Per_Day']) - curr_daily_minutes.loc[i, officer] = t.values + 'DisHos_Av_Mins_Per_Day']) + curr_daily_minutes.loc[i, officer] = t.values[0] # Long format curr_staff_floats = pd.melt(curr_staff_floats, id_vars=['District_Or_Hospital', 'Facility_Level'], @@ -1680,6 +1740,7 @@ curr_daily_capability['Facility_Level'] = \ curr_daily_capability['Facility_Level'].str.split(pat='_', expand=True).iloc[:, 2] # Drop row with zero minutes (also zero staff counts) +curr_daily_capability.fillna(0, inplace=True) curr_daily_capability.drop( index=curr_daily_capability[curr_daily_capability['Total_Mins_Per_Day'] == 0].index, inplace=True) # Reset index @@ -1715,26 +1776,21 @@ ['Facility_ID', 'Facility_Name', 'Facility_Level', 'District', 'Region', 'Officer_Category'], dropna=False).sum() ).reset_index() -# Drop columns of officer types 
-curr_daily_capability_coarse.drop(columns=['Officer_Type_Code', 'Officer_Type'], inplace=True) -curr_daily_capability_coarse.reset_index(drop=True, inplace=True) # Save -# HosHC_patient_facing_time.to_csv( -# outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Patient_Facing_Time.csv', index=False) +curr_daily_capability_coarse.to_csv( + outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Daily_Capabilities.csv', index=False) -# Need to # two lines below when generate funded_plus capability -# funded_daily_capability_coarse.to_csv( -# outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Daily_Capabilities.csv', index=False) +# Need to # following lines below when generate funded_plus capability +funded_daily_capability_coarse.to_csv( + outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Daily_Capabilities.csv', index=False) # *** Only for funded_plus ******************************************************************************************** -funded_daily_capability_coarse.to_csv( - outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Daily_Capabilities.csv', index=False) -# ********************************************************************************************************************* +# funded_daily_capability_coarse.to_csv( +# outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Daily_Capabilities.csv', index=False) -curr_daily_capability_coarse.to_csv( - outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Daily_Capabilities.csv', index=False) +# ********************************************************************************************************************* # --------------------------------------------------------------------------------------------------------------------- # final check that for an appointment required at a particular level (in Appt_Time_Table), \ @@ -1802,13 +1858,17 @@ def all_appts_can_run(capability): return appt_have_or_miss_capability - # Save 
results for funded +# Need to # following lines below when generate funded_plus capability # appt_have_or_miss_capability_funded = all_appts_can_run(funded_daily_capability_coarse) # appt_have_or_miss_capability_funded.to_csv( # outputlocation / 'human_resources' / 'funded' / 'appt_have_or_miss_capability.csv', index=False) + +# *** Only for funded_plus ******************************************************************************************** +# appt_have_or_miss_capability_funded = all_appts_can_run(funded_daily_capability_coarse) # appt_have_or_miss_capability_funded.to_csv( # outputlocation / 'human_resources' / 'funded_plus' / 'appt_have_or_miss_capability.csv', index=False) +# ********************************************************************************************************************* # Save results for actual # appt_have_or_miss_capability_actual = all_appts_can_run(curr_daily_capability_coarse) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py b/src/scripts/data_file_processing/formatting_healthsystem_data_update.py deleted file mode 100644 index 43fb458394..0000000000 --- a/src/scripts/data_file_processing/formatting_healthsystem_data_update.py +++ /dev/null @@ -1,1879 +0,0 @@ -""" -This file sets up the health system resources for each district, each region, and also national level. - -It defines 7 levels for facility types, i.e., Facility_Levels = [0,1a,1b,2,3,4,5]. - -It creates one facility of each level for each district. - -It allocates health care workers ('officers') to one of the seven Facility Levels. 
- -The tables generated is listed below: -- capability tables to repo - Scenario 'actual' -> ResourceFile_Daily_Capabilities (./resources/healthsystem/human_resources/actual/) - Scenario 'funded' -> ResourceFile_Daily_Capabilities (./resources/healthsystem/human_resources/funded/) - Scenario 'funded_plus' -> ResourceFile_Daily_Capabilities (./resources/healthsystem/human_resources/funded_plus/) - -- definition tables to repo - ResourceFile_Appt_Time_Table (./resources/healthsystem/human_resources/definitions/) - ResourceFile_Appt_Types_Table (./resources/healthsystem/human_resources/definitions/) - ResourceFile_ApptType_By_FacLevel_Table (./resources/healthsystem/human_resources/definitions/) - ResourceFile_Officers_Types_Table (./resources/healthsystem/human_resources/definitions/) - -- organisation tables to repo - ResourceFile_Master_Facilities_List_Table (./resources/healthsystem/human_resources/organisation/) - -- other tables that can be generated by this file - Scenario 'actual' -> ResourceFile_Staff_Table - Scenario 'funded' -> ResourceFile_Staff_Table - Scenario 'funded_plus' -> ResourceFile_Staff_Table - Scenario 'actual' -> ResourceFile_Staff_Distribution_Assumption - Scenario 'funded' -> ResourceFile_Staff_Distribution_Assumption - ResourceFile_Staff_Distribution_Compare - ResourceFile_Patient_Facing_Time - ResourceFile_District_Population_Data - ResourceFile_Facilities_For_Each_District - Scenario 'actual' -> appt_have_or_miss_capability - Scenario 'funded' -> appt_have_or_miss_capability - Scenario 'funded_plus' -> appt_have_or_miss_capability -""" -# Task: incorporate the new version of human-resources input data -# Since the two data versions have quite a few differences, will first explore the new version -# and then decide how to use all the available at hand. 
- -from pathlib import Path - -import numpy as np -import pandas as pd - -resourcefilepath = Path('./resources') - -path_to_dropbox = Path( - '/Users/jdbb1/Dropbox/Thanzi La Onse') # <-- point to the TLO dropbox locally - -workingfile = (path_to_dropbox / - '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' / - 'Malawi optimization model import_2022-02-11.xlsx') -# <-- point to the new data locally; need upload the excel file to shared dropbox - -working_file_old = (path_to_dropbox / - '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' / - 'Optimization model import_Malawi_20180315 v10.xlsx') # <-- point to the old data locally - -path_to_auxiliaryfiles = (path_to_dropbox / - '05 - Resources' / - 'Module-healthsystem' / - 'chai ehp resource use data' / - 'Auxiliary CHAI Data from CHAI HR Team 12 Sep 2021') - -outputlocation = resourcefilepath / 'healthsystem' - -# --------------------------------------------------------------------------------------------------------------------- -# *** create and save population_by_district data -population = pd.read_csv( - resourcefilepath / 'demography' / 'ResourceFile_PopulationSize_2018Census.csv' -) - -pop_by_district = pd.DataFrame(population.groupby('District')['Count'].sum()) - -# Add the column of Region -for d in pop_by_district.index: - pop_by_district.loc[d, 'Region'] = population.loc[population['District'] == d, 'Region'].values[0] - -# Save -# pop_by_district.to_csv(outputlocation / 'organisation' / 'ResourceFile_District_Population_Data.csv', index=True) - -# --------------------------------------------------------------------------------------------------------------------- -# *** Below we generate staffing tables: fund_staffing_table for established staff, and\ -# curr_staffing_table for current staff -# Before generating the tables, we need to prepare wb_import, officer_types_table, and\ -# make assumptions of curr_staff_return distribution and 
fund_staff_return distribution using Auxiliary CHAI Data - -# --- wb_import for staff information - -# Import all of the 'Staff' sheet, including both data of current and funded staff -wb_import = pd.read_excel(workingfile, sheet_name='Staff', header=None) - -# --- officer_types_table -# Make dataframe summarising the officer types and the officer codes: -officer_types_table = wb_import.loc[2:3, 64:84].transpose().reset_index(drop=True).copy() -officer_types_table.columns = ['Officer_Type', 'Officer_Type_Code'] - -# Add the categories of officers -officer_types_table.loc[0:2, 'Officer_Category'] = 'Clinical' -officer_types_table.loc[3:4, 'Officer_Category'] = 'Nursing_and_Midwifery' -officer_types_table.loc[5:7, 'Officer_Category'] = 'Pharmacy' -officer_types_table.loc[8:10, 'Officer_Category'] = 'Laboratory' -officer_types_table.loc[11, 'Officer_Category'] = 'DCSA' -officer_types_table.loc[12:14, 'Officer_Category'] = 'Dental' -officer_types_table.loc[15, 'Officer_Category'] = 'Mental' -officer_types_table.loc[16, 'Officer_Category'] = 'Nutrition' -officer_types_table.loc[17:20, 'Officer_Category'] = 'Radiography' - -# Save -officer_types_table.to_csv(outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Officer_Types_Table.csv', - index=False) - -# --- Generate assumptions of current staff distribution at facility levels 0&1a&1b&2 -# Read compiled staff return data from CHAI auxiliary datasets -compiled_staff_return = pd.read_excel(path_to_auxiliaryfiles / 'Compiled Staff Returns.xlsx', - sheet_name='Compiled Staff Returns', skiprows=range(5)) - -# Get relevant columns -curr_staff_return = compiled_staff_return[['District / Central Hospital', 'MOH/ CHAM', 'Name of Incumbent', 'Cadre', - 'Health Facility', 'Health Facility Type']].copy() - -# Drop rows with missing elements -curr_staff_return.dropna(inplace=True) - -# Drop rows that associate to '_NOT INCLUDED' and '_MISSING' -curr_staff_return.drop(curr_staff_return[curr_staff_return['Cadre'] == 
'_NOT INCLUDED'].index, inplace=True) -curr_staff_return.drop(curr_staff_return[curr_staff_return['Cadre'] == '_MISSING'].index, inplace=True) - -# Drop rows that associate to 'Home Craft Worker' and 'Educ/Environ Health Officer', -# as these cadres are not included in 'Time_Base' and 'PFT'. -curr_staff_return.drop(curr_staff_return[curr_staff_return['Cadre'] == 'Home Craft Worker'].index, inplace=True) -curr_staff_return.drop(curr_staff_return[curr_staff_return['Cadre'] == 'Educ/Environ Health Officer'].index, - inplace=True) - -# Replace 'HSA' by 'DCSA', 'Nutrition Officer' by 'Nutrition Staff', -# 'Pharmacy Technician' by 'Pharm Technician', 'Pharmacy Assistant' by 'Pharm Assistant', -# to be consistent with officer_types_table -idx_hsa = curr_staff_return[curr_staff_return['Cadre'] == 'HSA'].index -curr_staff_return.loc[idx_hsa, 'Cadre'] = 'DCSA' - -idx_nutri = curr_staff_return[curr_staff_return['Cadre'] == 'Nutrition Officer'].index -curr_staff_return.loc[idx_nutri, 'Cadre'] = 'Nutrition Staff' - -idx_pt = curr_staff_return[curr_staff_return['Cadre'] == 'Pharmacy Technician'].index -curr_staff_return.loc[idx_pt, 'Cadre'] = 'Pharm Technician' - -idx_pa = curr_staff_return[curr_staff_return['Cadre'] == 'Pharmacy Assistant'].index -curr_staff_return.loc[idx_pa, 'Cadre'] = 'Pharm Assistant' - -# Replace health facility type "Karonga Hospital" to "District Hospital" -idx_Karonga = curr_staff_return[curr_staff_return['Health Facility Type'] == 'Karonga Hospital'].index -curr_staff_return.loc[idx_Karonga, 'Health Facility Type'] = 'District Hospital' - -# Reassign the facility type of Zomba Mental Hospital as 'Zomba Mental Hospital', instead of 'Central Hospital', -# to differentiate it with other central hospitals -idx_ZMH = curr_staff_return[curr_staff_return['Health Facility'] == 'Zomba Mental Hospital'].index -curr_staff_return.loc[idx_ZMH, 'Health Facility Type'] = 'Zomba Mental Hospital' - -# Add a column 'Staff_Count' to denote the no. 
of staff -curr_staff_return['Staff_Count'] = 1 - -# Reset index -curr_staff_return.reset_index(drop=True, inplace=True) - -# Important definition: Facility_Levels = [0, 1a, 1b, 2, 3, 4, 5] -# 0: Community/Local level - HP, Village Health Committee, Community initiatives -# 1a: Primary level - Dispensary, HC, Clinic, Maternity facility -# 1b: Primary level - Community/Rural Hospital, CHAM (Community) Hospitals -# 2: Second level - District hospital, DHO -# 3: Tertiary/Referral level - KCH, MCH, ZCH + QECH as referral hospitals -# 4: Zomba Mental Hospital, which has very limited data in CHAI dataset -# 5: Headquarter, which has staff data (but no Time_Base or Incidence_Curr data) - -# Get the Health Facility Type list and Cadre list -# Note three cadres of 'R04 Radiotherapy Technician', 'R03 Sonographer', 'D03 Dental Assistant' have no data -# in CHAI current and funded staff sheet and complied staff return dataset. -fac_types_list = pd.unique(curr_staff_return['Health Facility Type']) # Level_0 Facs and Headquarter not included -cadre_list = pd.unique(curr_staff_return['Cadre']) # Radiotherapy Technician/Sonographer/Dental Assistant not included - -# Add column 'Facility_Level'; HQ not listed in compiled staff return table -idx_urbhc = curr_staff_return[curr_staff_return['Health Facility Type'] == 'Urban Health Center'].index -curr_staff_return.loc[idx_urbhc, 'Facility_Level'] = 'Facility_Level_1a' # Including CHAM HCs - -idx_rurhc = curr_staff_return[curr_staff_return['Health Facility Type'] == 'Rural Health Center'].index -curr_staff_return.loc[idx_rurhc, 'Facility_Level'] = 'Facility_Level_1a' # Including CHAM HCs - -idx_comhos = curr_staff_return[curr_staff_return['Health Facility Type'] == 'Community Hospital'].index -curr_staff_return.loc[idx_comhos, 'Facility_Level'] = 'Facility_Level_1b' # Including CHAM community hospitals - -idx_dishos = curr_staff_return[curr_staff_return['Health Facility Type'] == 'District Hospital'].index 
-curr_staff_return.loc[idx_dishos, 'Facility_Level'] = 'Facility_Level_2' - -idx_cenhos = curr_staff_return[curr_staff_return['Health Facility Type'] == 'Central Hospital'].index -curr_staff_return.loc[idx_cenhos, 'Facility_Level'] = 'Facility_Level_3' - -idx_zmhfac = curr_staff_return[curr_staff_return['Health Facility Type'] == 'Zomba Mental Hospital'].index -curr_staff_return.loc[idx_zmhfac, 'Facility_Level'] = 'Facility_Level_4' - -# Add column 'Cadre_Code' -for c in cadre_list: - curr_staff_return.loc[curr_staff_return['Cadre'] == c, 'Cadre_Code'] = officer_types_table.loc[ - officer_types_table['Officer_Type'] == c, 'Officer_Type_Code'].copy().values[0] - -# Check no blanks in this table -assert not pd.isnull(curr_staff_return).any().any() - -# curr_staff_return ready! - -# Get curr_staff_return distribution among levels 0, 1a, 1b and 2, i.e., staff distribution within a district -# Specifically, only and all DCSAs/HSAs are to be allocated at level 0; -# Other cadres are to be allocated at level 1a and above. 
- -curr_staff_district = curr_staff_return[['Facility_Level', 'Cadre_Code', 'Staff_Count']].copy() - -# Group staff by facility level -curr_staff_distribution = pd.DataFrame( - curr_staff_district.groupby(by=['Cadre_Code', 'Facility_Level'], sort=False).sum()) -curr_staff_distribution.sort_index(level=[0, 1], inplace=True) -curr_staff_distribution.reset_index(drop=False, inplace=True) - -# Make the curr_staff_distribution includes all cadres and facility levels (0,1a,1b,2,3,4) as index and columns -cadre_faclevel = pd.DataFrame(columns=['Cadre_Code', 'Facility_Level_0', 'Facility_Level_1a', - 'Facility_Level_1b', 'Facility_Level_2', 'Facility_Level_3', - 'Facility_Level_4']) -cadre_faclevel['Cadre_Code'] = officer_types_table['Officer_Type_Code'] -cadre_faclevel = pd.melt(cadre_faclevel, id_vars='Cadre_Code', value_vars=cadre_faclevel.columns[1:], - var_name='Facility_Level') -# Merge -curr_staff_distribution = curr_staff_distribution.merge(cadre_faclevel, how='right') -# Fill null with 0 -curr_staff_distribution.fillna(0, inplace=True) -# Sort -curr_staff_distribution.set_index(['Cadre_Code', 'Facility_Level'], inplace=True) -curr_staff_distribution.sort_index(level=[0, 1], inplace=True) -curr_staff_distribution.reset_index(drop=False, inplace=True) -curr_staff_distribution.drop(['value'], axis=1, inplace=True) - -# Save the the complete current staff distribution table -# curr_staff_distribution_complete = curr_staff_distribution.copy() - -# Keep and focus on rows of levels 0, 1a, 1b, and 2 -idx_keep = curr_staff_distribution[(curr_staff_distribution['Facility_Level'] == 'Facility_Level_0') | - (curr_staff_distribution['Facility_Level'] == 'Facility_Level_1a') | - (curr_staff_distribution['Facility_Level'] == 'Facility_Level_1b') | - (curr_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index -curr_staff_distribution = curr_staff_distribution.loc[idx_keep, :].copy() -curr_staff_distribution.reset_index(drop=True, inplace=True) - -# Add column 
'Proportion', denoting the percents of staff per cadre between level 0, level_1a, level_1b, and level_2 -for i in range(21): - # Proportion; Cadres except DCSA are allocated at level 1a and above - if curr_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Staff_Count'].sum() > 0: # sum of 4i+1,4i+2,4i+3 - - curr_staff_distribution.loc[4 * i + 1, 'Proportion'] = ( - curr_staff_distribution.loc[4 * i + 1, 'Staff_Count'] / - curr_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Staff_Count'].sum() - ) - - curr_staff_distribution.loc[4 * i + 2, 'Proportion'] = ( - curr_staff_distribution.loc[4 * i + 2, 'Staff_Count'] / - curr_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Staff_Count'].sum() - ) - - curr_staff_distribution.loc[4 * i + 3, 'Proportion'] = ( - curr_staff_distribution.loc[4 * i + 3, 'Staff_Count'] / - curr_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Staff_Count'].sum() - ) - -# fillna -curr_staff_distribution.fillna(0, inplace=True) - -# For DCSA individually, reassign their proportions since we assume all DCSAs are located at level 0 -idx_dcsa = curr_staff_distribution[curr_staff_distribution['Cadre_Code'] == 'E01'].index -curr_staff_distribution.loc[idx_dcsa[0], 'Proportion'] = 1.00 -curr_staff_distribution.loc[idx_dcsa[1:4], 'Proportion'] = 0.00 -# Alternatively, DCSAs 50% at level 0 and 50% at level 1a? - -# curr_staff_distribution ready! 
- -# Save -# curr_staff_distribution.to_csv( -# outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Staff_Distribution_Assumption.csv', -# index=False) - -# --- Generate assumptions of established/funded staff distribution at facility levels 0&1a&1b&2 -# Read 2018-03-09 Facility-level establishment MOH & CHAM from CHAI auxiliary datasets -fund_staff_2018_raw = pd.read_excel(path_to_auxiliaryfiles / '2018-03-09 Facility-level establishment MOH & CHAM.xlsx', - sheet_name='Establishment listing') - -# Get relevant columns -fund_staff_2018 = fund_staff_2018_raw[['Number of positions', 'Facility', 'Facility Type', 'WFOM Cadre']].copy() - -# Drop rows with missing/blank elements -fund_staff_2018.dropna(inplace=True) -# Drop rows that associate to '_NOT INCLUDED' -fund_staff_2018.drop(fund_staff_2018[fund_staff_2018['WFOM Cadre'] == '_NOT INCLUDED'].index, inplace=True) -# Drop rows for 'Training Institution' -fund_staff_2018.drop(fund_staff_2018[fund_staff_2018['Facility Type'] == 'Training Institution'].index, inplace=True) -# Reset index after drop -fund_staff_2018.reset_index(drop=True, inplace=True) - -# Reform column 'WFOM Cadre' -# Note 'Cadre_Extra' records 'Clinical ' or 'Nursing ' for C01 and C02. -# We combine C01 and C02 into C01 denoting mental health staff cadre to be consistent with 'curr_staff_return'. -fund_staff_2018[['Cadre_No.', 'Cadre_Code', 'Cadre', 'Cadre_Extra']] = \ - fund_staff_2018['WFOM Cadre'].str.split(pat='-| - ', expand=True).copy() -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre_Code'] == 'C02'].index, 'Cadre_Code'] = 'C01' -# Drop columns ['WFOM Cadre','Cadre_No.','Cadre_Extra'] -fund_staff_2018.drop(columns=['WFOM Cadre', 'Cadre_No.', 'Cadre_Extra'], inplace=True) - -# Drop rows that associate to 'Home Craft Worker', 'Educ/Environ Health Officer', and 'Community Midwife Assistant' -# as these cadres are not included in 'Time_Base' and 'PFT'. 
-fund_staff_2018.drop(fund_staff_2018[fund_staff_2018['Cadre'] == 'Home Craft Worker'].index, inplace=True) -fund_staff_2018.drop(fund_staff_2018[fund_staff_2018['Cadre'] == 'Educ/Environ Health Officer'].index, inplace=True) -fund_staff_2018.drop(fund_staff_2018[fund_staff_2018['Cadre'] == 'Community Midwife Assistant'].index, inplace=True) -# Reset index -fund_staff_2018.reset_index(drop=True, inplace=True) - -# Replace { -# 'HSA' by 'DCSA' (and 'E02' by 'E01') , 'Medical Assistant' by 'Med. Assistant', 'Laboratory Officer' by 'Lab Officer', -# 'Laboratory Technician' by 'Lab Technician', 'Laboratory Assistant' by 'Lab Assistant' -# 'Nursing Officer/Registered Nurse' by 'Nurse Officer', 'Dentist' by 'Dental Officer', -# 'Nutrition Officer' by 'Nutrition Staff', 'Pharmacy Technician' by 'Pharm Technician', -# 'Pharmacy Assistant' by 'Pharm Assistant', 'Pharmacy Officer' by 'Pharmacist' } -# to be consistent with officer_types_table -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'HSA'].index, 'Cadre'] = 'DCSA' -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre_Code'] == 'E02'].index, 'Cadre_Code'] = 'E01' -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'Medical Assistant'].index, 'Cadre'] = 'Med. 
Assistant' -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'Laboratory Officer'].index, 'Cadre'] = 'Lab Officer' -fund_staff_2018.loc[ - fund_staff_2018[fund_staff_2018['Cadre'] == 'Laboratory Technician'].index, 'Cadre'] = 'Lab Technician' -fund_staff_2018.loc[ - fund_staff_2018[fund_staff_2018['Cadre'] == 'Laboratory Assistant'].index, 'Cadre'] = 'Lab Assistant' -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'Nursing Officer/Registered Nurse'].index, - 'Cadre'] = 'Nurse Officer' -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'Dentist'].index, 'Cadre'] = 'Dental Officer' -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'Nutrition Officer'].index, 'Cadre'] = 'Nutrition Staff' -fund_staff_2018.loc[ - fund_staff_2018[fund_staff_2018['Cadre'] == 'Pharmacy Technician'].index, 'Cadre'] = 'Pharm Technician' -fund_staff_2018.loc[ - fund_staff_2018[fund_staff_2018['Cadre'] == 'Pharmacy Assistant'].index, 'Cadre'] = 'Pharm Assistant' -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Cadre'] == 'Pharmacy Officer'].index, 'Cadre'] = 'Pharmacist' - -# Note that {D03 'Dental Assistant', R03 'Radiotherapy Technician', R04 'Sonographer'} are not included in this dataset. -# This is OK because CHAI current and funded staff sheet has no data regarding the three cadres. - -# Reassign the facility type of Zomba Mental Hospital as 'Zomba Mental Hospital'. 
-fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility'] == 'Zomba Mental Hospital'].index, - 'Facility Type'] = 'Zomba Mental Hospital' - -# Important definition: Facility_Levels = [0, 1a, 1b, 2, 3, 4, 5] -# 0: Community/Local level - HP, Village Health Committee, Community initiatives -# 1a: Primary level - Dispensary, HC, Clinic, Maternity facility -# 1b: Primary level - Community/Rural Hospital, CHAM (Community) Hospitals -# 2: Second level - District hospital, DHO -# 3: Tertiary/Referral level - KCH, MCH, ZCH + QECH as referral hospitals -# 4: Zomba Mental Hospital, which has very limited data in CHAI dataset -# 5: Headquarter, which has staff data (but no Time_Base or Incidence_Curr data) - -# Get the Health Facility Type list -# fac_types_list = pd.unique(fund_staff_2018['Facility Type']) # Level_0 Facs not included - -# Add column 'Facility_Level' -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Urban Health Center'].index, - 'Facility_Level'] = 'Facility_Level_1a' -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Rural Health Center'].index, - 'Facility_Level'] = 'Facility_Level_1a' -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Health Center (with maternity)'].index, - 'Facility_Level'] = 'Facility_Level_1a' -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Health Center (without maternity)'].index, - 'Facility_Level'] = 'Facility_Level_1a' - -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Rural/Community Hospital'].index, - 'Facility_Level'] = 'Facility_Level_1b' - -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'District Hospital'].index, - 'Facility_Level'] = 'Facility_Level_2' -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'DHO'].index, - 'Facility_Level'] = 'Facility_Level_2' - -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Central 
Hospital'].index, - 'Facility_Level'] = 'Facility_Level_3' - -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Zomba Mental Hospital'].index, - 'Facility_Level'] = 'Facility_Level_4' - -fund_staff_2018.loc[fund_staff_2018[fund_staff_2018['Facility Type'] == 'Headquarters'].index, - 'Facility_Level'] = 'Facility_Level_5' - -# Check no blanks in this table -assert not pd.isnull(fund_staff_2018).any().any() - -# fund_staff_2018 ready! - -# Get fund_staff_return distribution among levels 0, 1a, 1b and 2, i.e., staff distribution within a district -# Specifically, only and all DCSAs/HSAs are to be allocated at level 0; -# Other cadres are to be allocated at level 1a and above. - -fund_staff_district = fund_staff_2018[['Facility_Level', 'Cadre_Code', 'Number of positions']].copy() - -# Group staff by facility level -fund_staff_distribution = pd.DataFrame( - fund_staff_district.groupby(by=['Cadre_Code', 'Facility_Level'], sort=False).sum()) -fund_staff_distribution.sort_index(level=[0, 1], inplace=True) -fund_staff_distribution.reset_index(drop=False, inplace=True) - -# Make the fund_staff_distribution includes all cadres and facility levels (0,1a,1b,2,3,4,5) as index and columns -fund_cadre_faclevel = pd.DataFrame(columns=['Cadre_Code', 'Facility_Level_0', 'Facility_Level_1a', - 'Facility_Level_1b', 'Facility_Level_2', 'Facility_Level_3', - 'Facility_Level_4', 'Facility_Level_5']) -fund_cadre_faclevel['Cadre_Code'] = officer_types_table['Officer_Type_Code'] -fund_cadre_faclevel = pd.melt(fund_cadre_faclevel, id_vars='Cadre_Code', value_vars=fund_cadre_faclevel.columns[1:], - var_name='Facility_Level') -# Merge -fund_staff_distribution = fund_staff_distribution.merge(fund_cadre_faclevel, how='right') -# Fill null with 0 -fund_staff_distribution.fillna(0, inplace=True) -# Sort -fund_staff_distribution.set_index(['Cadre_Code', 'Facility_Level'], inplace=True) -fund_staff_distribution.sort_index(level=[0, 1], inplace=True) 
-fund_staff_distribution.reset_index(drop=False, inplace=True) -fund_staff_distribution.drop(['value'], axis=1, inplace=True) - -# Save the the complete funded staff distribution table -# fund_staff_distribution_complete = fund_staff_distribution.copy() - -# Keep and focus on rows of levels 0, 1a, 1b, and 2 -fund_idx_keep = fund_staff_distribution[(fund_staff_distribution['Facility_Level'] == 'Facility_Level_0') | - (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1a') | - (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1b') | - (fund_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index -fund_staff_distribution = fund_staff_distribution.loc[fund_idx_keep, :].copy() -fund_staff_distribution.reset_index(drop=True, inplace=True) - -# Add column 'Proportion', denoting the percents of staff per cadre between level 0, level_1a, level_1b, and level_2 -for i in range(21): - # Proportion; Cadres except DCSA are allocated at level 1a and above - if fund_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Number of positions'].sum() > 0: # sum of 4i+1,4i+2,4i+3 - - fund_staff_distribution.loc[4 * i + 1, 'Proportion_Fund'] = ( - fund_staff_distribution.loc[4 * i + 1, 'Number of positions'] / - fund_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Number of positions'].sum() - ) - - fund_staff_distribution.loc[4 * i + 2, 'Proportion_Fund'] = ( - fund_staff_distribution.loc[4 * i + 2, 'Number of positions'] / - fund_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Number of positions'].sum() - ) - - fund_staff_distribution.loc[4 * i + 3, 'Proportion_Fund'] = ( - fund_staff_distribution.loc[4 * i + 3, 'Number of positions'] / - fund_staff_distribution.loc[4 * i + 1:4 * i + 3, 'Number of positions'].sum() - ) - -# fillna -fund_staff_distribution.fillna(0, inplace=True) - -# For DCSA individually, reassign their proportions since we assume all DCSAs are located at level 0 -fund_idx_dcsa = fund_staff_distribution[fund_staff_distribution['Cadre_Code'] == 
'E01'].index -fund_staff_distribution.loc[fund_idx_dcsa[0], 'Proportion_Fund'] = 1.00 -fund_staff_distribution.loc[fund_idx_dcsa[1:4], 'Proportion_Fund'] = 0.00 -# Alternatively, DCSAs 50% at level 0 and 50% at level 1a? - -# fund_staff_distribution ready! - -# Save -# fund_staff_distribution.to_csv( -# outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Staff_Distribution_Assumption.csv', -# index=False) - -# We read info from CHAI estimates of optimal and immediately needed workforce for comparison wherever possible -# --- CHAI WFOM optimal workforce and immediately needed staff distribution - -# Preparing optimal workforce from CHAI auxiliary datasets -opt_workforce = pd.read_excel(path_to_auxiliaryfiles / 'MalawiOptimization_OUTPUT2022 SH 2019-10-19.xlsx', - sheet_name='Sums by facility type') -# Drop redundant row -opt_workforce.drop(0, inplace=True) -opt_workforce.reset_index(drop=True, inplace=True) - -# Add column 'Facility_level' -opt_workforce.insert(2, 'Facility_Level', ['Facility_Level_3', - 'Facility_Level_1b', - 'Facility_Level_2', - 'Facility_Level_1a', - 'Facility_Level_1a']) - -# Get staff distribution between level_1a, level_1b and level_2 per cadre -cols_matter = opt_workforce.columns[2:24] -opt_workforce_distribution = opt_workforce.loc[1:4, cols_matter].copy() # drop row Facility_Level_3 -opt_workforce_distribution = pd.DataFrame(opt_workforce_distribution.groupby(by=['Facility_Level'], sort=False).sum()) -opt_workforce_distribution.sort_index(inplace=True) -# Reset index -opt_workforce_distribution.reset_index(drop=False, inplace=True) - -# Transform to long format -opt_workforce_distribution = pd.melt(opt_workforce_distribution, id_vars='Facility_Level', value_vars=cols_matter[1:], - var_name='Cadre_Opt', value_name='Staff_Count_Opt') - -# Add column 'Cadre_Code' -for i in range(63): - opt_workforce_distribution.loc[i, 'Cadre_Code'] = str(opt_workforce_distribution.loc[i, 'Cadre_Opt'])[7:10] - -# Sort to be consistent with 
curr_staff_distribution -# Drop unnecessary column -opt_workforce_distribution.set_index(['Cadre_Code', 'Facility_Level'], inplace=True) -opt_workforce_distribution.sort_index(level=[0, 1], inplace=True) -opt_workforce_distribution.reset_index(drop=False, inplace=True) -opt_workforce_distribution.drop(columns=['Cadre_Opt'], inplace=True) - -# Add column 'Proportion', denoting the percents of staff per cadre between level_1a, level_1b and level_2 -for i in range(21): - if opt_workforce_distribution.loc[3 * i:3 * i + 2, 'Staff_Count_Opt'].sum() > 0: # sum of 3i,3i+1,3i+2 - opt_workforce_distribution.loc[3 * i, 'Proportion_Opt'] = ( - opt_workforce_distribution.loc[3 * i, 'Staff_Count_Opt'] / - opt_workforce_distribution.loc[3 * i:3 * i + 2, 'Staff_Count_Opt'].sum() - ) - - opt_workforce_distribution.loc[3 * i + 1, 'Proportion_Opt'] = ( - opt_workforce_distribution.loc[3 * i + 1, 'Staff_Count_Opt'] / - opt_workforce_distribution.loc[3 * i:3 * i + 2, 'Staff_Count_Opt'].sum() - ) - - opt_workforce_distribution.loc[3 * i + 2, 'Proportion_Opt'] = ( - opt_workforce_distribution.loc[3 * i + 2, 'Staff_Count_Opt'] / - opt_workforce_distribution.loc[3 * i:3 * i + 2, 'Staff_Count_Opt'].sum() - ) - -# fillna -opt_workforce_distribution.fillna(0, inplace=True) - -# opt_workforce_distribution ready! 
- -# Preparing immediately needed estimates from CHAI auxiliary datasets -immed_need = pd.read_excel(path_to_auxiliaryfiles / 'MalawiOptimization_OUTPUT_ALLYEARS_Curr.xlsx', - sheet_name='CurrBase Output') - -# Select relevant data -idx_year = immed_need[immed_need['OutputYear'] == 2016].index -immed_need_distribution = immed_need.loc[idx_year, immed_need.columns[np.r_[1, 3, 49:70]]] -immed_need_distribution.dropna(inplace=True) - -# Add column 'Facility_Level' -immed_need_distribution.loc[immed_need_distribution[immed_need_distribution['FacilityType'] == - 'UrbHC'].index, 'Facility_Level'] = 'Facility_Level_1a' - -immed_need_distribution.loc[immed_need_distribution[immed_need_distribution['FacilityType'] == - 'RurHC'].index, 'Facility_Level'] = 'Facility_Level_1a' - -immed_need_distribution.loc[immed_need_distribution[immed_need_distribution['FacilityType'] == - 'ComHos'].index, 'Facility_Level'] = 'Facility_Level_1b' - -immed_need_distribution.loc[immed_need_distribution[immed_need_distribution['FacilityType'] == - 'DisHos'].index, 'Facility_Level'] = 'Facility_Level_2' - -immed_need_distribution.loc[immed_need_distribution[immed_need_distribution['FacilityType'] == - 'CenHos'].index, 'Facility_Level'] = 'Facility_Level_3' - -# Group staff by levels -immed_need_distribution = pd.DataFrame(immed_need_distribution.groupby(by=['Facility_Level'], sort=False).sum()) -# Drop level 3 -immed_need_distribution.drop(index='Facility_Level_3', inplace=True) -# Reset index -immed_need_distribution.reset_index(inplace=True) - -# Transform to long format -assert set(immed_need_distribution.columns[1:]) == set(cols_matter[1:]) -immed_need_distribution = pd.melt(immed_need_distribution, id_vars='Facility_Level', value_vars=cols_matter[1:], - var_name='Cadre_ImmedNeed', value_name='Staff_Count_ImmedNeed') - -# Add column 'Cadre_Code' -for i in range(63): - immed_need_distribution.loc[i, 'Cadre_Code'] = str(immed_need_distribution.loc[i, 'Cadre_ImmedNeed'])[7:10] - -# Sort to be 
consistent with curr_staff_distribution -# Drop unnecessary column -immed_need_distribution.set_index(['Cadre_Code', 'Facility_Level'], inplace=True) -immed_need_distribution.sort_index(level=[0, 1], inplace=True) -immed_need_distribution.reset_index(drop=False, inplace=True) -immed_need_distribution.drop(columns=['Cadre_ImmedNeed'], inplace=True) - -# Add column 'Proportion', denoting the percents of staff per cadre among level_1a, level_1b, and level_2 -for i in range(21): - if immed_need_distribution.loc[3 * i:3 * i + 2, 'Staff_Count_ImmedNeed'].sum() > 0: # sum of 3i,3i+1,3i+2 - immed_need_distribution.loc[3 * i, 'Proportion_ImmedNeed'] = ( - immed_need_distribution.loc[3 * i, 'Staff_Count_ImmedNeed'] / - immed_need_distribution.loc[3 * i:3 * i + 2, 'Staff_Count_ImmedNeed'].sum() - ) - - immed_need_distribution.loc[3 * i + 1, 'Proportion_ImmedNeed'] = ( - immed_need_distribution.loc[3 * i + 1, 'Staff_Count_ImmedNeed'] / - immed_need_distribution.loc[3 * i:3 * i + 2, 'Staff_Count_ImmedNeed'].sum() - ) - - immed_need_distribution.loc[3 * i + 2, 'Proportion_ImmedNeed'] = ( - immed_need_distribution.loc[3 * i + 2, 'Staff_Count_ImmedNeed'] / - immed_need_distribution.loc[3 * i:3 * i + 2, 'Staff_Count_ImmedNeed'].sum() - ) - -# fillna -immed_need_distribution.fillna(0, inplace=True) - -# immed_need_distribution ready! 
- -# --- Combine curr_staff_distribution, fund_staff_distribution, opt_workforce_distribution, and immed_need_distribution -# Compare if possible - -# Merge curr and opt data -# First, drop rows of level_0 of curr_staff_distribution, for compare_staff_distribution -idx_level0 = curr_staff_distribution[curr_staff_distribution['Facility_Level'] == 'Facility_Level_0'].index -compare_staff_distribution = curr_staff_distribution.drop(idx_level0, axis=0, inplace=False).copy() -# Merge -compare_staff_distribution = curr_staff_distribution.merge(opt_workforce_distribution, how='right') - -# Check before adding ImmedNeed data -assert (compare_staff_distribution['Cadre_Code'] == immed_need_distribution['Cadre_Code']).all() -assert (compare_staff_distribution['Facility_Level'] == immed_need_distribution['Facility_Level']).all() -# Add Staff_Count_ImmedNeed and Proportion_ImmedNeed to the merged table -compare_staff_distribution['Staff_Count_ImmedNeed'] = immed_need_distribution['Staff_Count_ImmedNeed'].copy() -compare_staff_distribution['Proportion_ImmedNeed'] = immed_need_distribution['Proportion_ImmedNeed'].copy() - -# Add fund data -# First, drop rows of level_0 of fund_staff_distribution -fund_idx_level0 = fund_staff_distribution[fund_staff_distribution['Facility_Level'] == 'Facility_Level_0'].index -fund_staff_distribution_nolevel0 = fund_staff_distribution.drop(fund_idx_level0, axis=0, inplace=False).copy() -fund_staff_distribution_nolevel0.reset_index(drop=True, inplace=True) -# Check before combination -assert (compare_staff_distribution['Cadre_Code'] == fund_staff_distribution_nolevel0['Cadre_Code']).all() -assert (compare_staff_distribution['Facility_Level'] == fund_staff_distribution_nolevel0['Facility_Level']).all() -# Add Number of positions and Proportion_Fund to the merged table -compare_staff_distribution.insert(4, 'Staff_Count_Fund', fund_staff_distribution_nolevel0['Number of positions'].values) -compare_staff_distribution.insert(5, 'Proportion_Fund', 
fund_staff_distribution_nolevel0['Proportion_Fund'].values) - -# Calculate the difference -for i in range(63): - # Current data compared with Fund, Opt, and ImmedNeed - if compare_staff_distribution.loc[i, 'Proportion_Fund'] > 0: - compare_staff_distribution.loc[i, 'Curr_vs_Fund'] = ( - (compare_staff_distribution.loc[i, 'Proportion'] - compare_staff_distribution.loc[i, 'Proportion_Fund']) / - compare_staff_distribution.loc[i, 'Proportion_Fund'] - ) - - if compare_staff_distribution.loc[i, 'Proportion_Opt'] > 0: - compare_staff_distribution.loc[i, 'Curr_vs_Opt'] = ( - (compare_staff_distribution.loc[i, 'Proportion'] - compare_staff_distribution.loc[i, 'Proportion_Opt']) / - compare_staff_distribution.loc[i, 'Proportion_Opt'] - ) - - if compare_staff_distribution.loc[i, 'Proportion_ImmedNeed'] > 0: - compare_staff_distribution.loc[i, 'Curr_vs_ImmedNeed'] = ( - (compare_staff_distribution.loc[i, 'Proportion'] - - compare_staff_distribution.loc[i, 'Proportion_ImmedNeed']) / - compare_staff_distribution.loc[i, 'Proportion_ImmedNeed'] - ) - # Funded data compared with Opt and ImmedNeed - if compare_staff_distribution.loc[i, 'Proportion_Opt'] > 0: - compare_staff_distribution.loc[i, 'Fund_vs_Opt'] = ( - (compare_staff_distribution.loc[i, 'Proportion_Fund'] - - compare_staff_distribution.loc[i, 'Proportion_Opt']) / - compare_staff_distribution.loc[i, 'Proportion_Opt'] - ) - - if compare_staff_distribution.loc[i, 'Proportion_ImmedNeed'] > 0: - compare_staff_distribution.loc[i, 'Fund_vs_ImmedNeed'] = ( - (compare_staff_distribution.loc[i, 'Proportion_Fund'] - - compare_staff_distribution.loc[i, 'Proportion_ImmedNeed']) / - compare_staff_distribution.loc[i, 'Proportion_ImmedNeed'] - ) - -# Save -# compare_staff_distribution.to_csv(outputlocation / 'ResourceFile_Staff_Distribution_Compare.csv', index=False) - -# *** -# --- fund_staffing_table for established staff -# Extract just the section about "Funded TOTAl Staff' -wb_extract = wb_import.loc[3:39, 64:84] -wb_extract = 
wb_extract.drop([4, 5]) -wb_extract.columns = wb_extract.iloc[0] -wb_extract = wb_extract.drop([3]) -wb_extract = wb_extract.reset_index(drop=True) -wb_extract.fillna(0, inplace=True) # replace all null values with zero values - -# Add in the column to the dataframe for the labels that distinguishes whether -# these officers are allocated to the district-or-lower levels or one of the key hospitals. -labels = wb_import.loc[6:39, 0].reset_index(drop=True) -is_distlevel = labels.copy() -is_distlevel[0:28] = True # for district-or-lower levels -is_distlevel[28:] = False # for CenHos-or-above levels - -wb_extract.loc[:, 'District_Or_Hospital'] = labels -wb_extract.loc[:, 'Is_DistrictLevel'] = is_distlevel - -# Finished import from the CHAI excel: -fund_staffing_table = wb_extract.copy() - -# The imported staffing table suggest that there is some Dental officer (D01) in most districts, -# but the Time_Curr data (below) suggest that D01 is only needed at central hospitals (not yet validated by CHAI). -# This potential inconsistency can be solved by re-allocating D01 from districts to central hospitals, but -# currently we do not do such reallocation to reduce the assumptions we have to make; -# Also because the central/referral hospitals have Dental officer allocated to meet dental service demand, -# thus no risk of not able to meet such demand at level 3. - -# *** Only for funded_plus ******************************************************************************************** -# Districts Balaka/Machinga/Mwanza/Neno (4 in South), Nkhata Bay (1 in North), Ntchisi/ Salima (2 in Central) -# have 0 mental health staff C01 in establishment, -# whereas C01 is required by mental health appts at level 1b, level 2 and level 3. -# To fix this inconsistency, we have to move at least 1 C01 to each of these districts from the referral hospitals. 
-# (QECH and ZCH in South, MCH in North, KCH in Central; ZCH has no C01) -# non_c01_district_idx = fund_staffing_table[(fund_staffing_table['C01'] == 0) & -# (fund_staffing_table['Is_DistrictLevel'])].index -# non_c01_districts = pd.DataFrame(fund_staffing_table.loc[non_c01_district_idx, 'District_Or_Hospital']) -# non_c01_districts['Region'] = pop_by_district.loc[non_c01_districts['District_Or_Hospital'], 'Region'].values -# fund_staffing_table.loc[non_c01_district_idx, 'C01'] = 1 -# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'QECH', 'C01'] = ( -# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'QECH', 'C01'] - 4 -# ) -# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'MCH', 'C01'] = ( -# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'MCH', 'C01'] - 1 -# ) -# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'KCH', 'C01'] = ( -# fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == 'KCH', 'C01'] - 2 -# ) -# ********************************************************************************************************************* - -# *** Only for funded_plus ******************************************************************************************** -# In the funded staff table, it does not make sense that Likoma has no DCSA staff, -# whereas all other district has at least 250 DCSA staff -# As CHAI indicates Likoma's data is mostly bounded into Nhkata Bay, -# we draw some DCSA from Nhkata Bay to Likoma using population as the weight -# idx_likoma = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Likoma'].index -# idx_nkhatabay = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Nkhata Bay'].index -# fund_staffing_table.loc[idx_likoma, 'E01'] = fund_staffing_table.loc[idx_nkhatabay, 'E01'].values[0] * ( -# pop_by_district.loc['Likoma', 'Count'] / pop_by_district.loc['Nkhata Bay', 'Count']) -# 
fund_staffing_table.loc[idx_nkhatabay, 'E01'] = ( -# fund_staffing_table.loc[idx_nkhatabay, 'E01'].values[0] - fund_staffing_table.loc[idx_likoma, 'E01'].values[0]) -# ********************************************************************************************************************* - -# Sort out which are district allocations and which are central hospitals and above - -# We assign HQ to HQ; KCH as RefHos in Central region; MCH as RefHos in Northern region; -# QECH and ZCH as RefHos in Southern region (QECH is in Southwest and ZCH is in Southeast). -fund_staffing_table.loc[ - fund_staffing_table['District_Or_Hospital'] == 'HQ', 'District_Or_Hospital'] = 'Headquarter' -fund_staffing_table.loc[ - fund_staffing_table['District_Or_Hospital'] == 'KCH', 'District_Or_Hospital'] = 'Referral Hospital_Central' -fund_staffing_table.loc[ - fund_staffing_table['District_Or_Hospital'] == 'MCH', 'District_Or_Hospital'] = 'Referral Hospital_Northern' -fund_staffing_table.loc[ - fund_staffing_table['District_Or_Hospital'] == 'QECH', 'District_Or_Hospital'] = 'Referral Hospital_Southern' -# fund_staffing_table.loc[ -# fund_staffing_table['District_Or_Hospital'] == 'QECH', 'District_Or_Hospital'] = 'Referral Hospital_Southwest' -fund_staffing_table.loc[ - fund_staffing_table['District_Or_Hospital'] == 'ZCH', 'District_Or_Hospital'] = 'Referral Hospital_Southern' -# fund_staffing_table.loc[ -# fund_staffing_table['District_Or_Hospital'] == 'ZCH', 'District_Or_Hospital'] = 'Referral Hospital_Southeast' -fund_staffing_table.loc[ - fund_staffing_table['District_Or_Hospital'] == 'ZMH', 'District_Or_Hospital'] = 'Zomba Mental Hospital' - -# Group the referral hospitals QECH and ZCH as Referral Hospital_Southern -Is_DistrictLevel = fund_staffing_table['Is_DistrictLevel'].values # Save the column 'Is_DistrictLevel' first -fund_staffing_table = pd.DataFrame( - fund_staffing_table.groupby(by=['District_Or_Hospital'], sort=False).sum()).reset_index() -fund_staffing_table.insert(1, 
'Is_DistrictLevel', Is_DistrictLevel[:-1]) # Add the column 'Is_DistrictLevel' - -# Check that in fund_staffing_table every staff count entry >= 0 -assert (fund_staffing_table.loc[:, 'M01':'R04'].values >= 0).all() - -# The following districts are not in the CHAI data because they are included within other districts. -# For now, we will say that the division of staff between these cities and the wide district (where they are included) -# is consistent with the population recorded for them (Malawi 2018 census), -# i.e., to use population-based weights to reallocate staff - -# Add in Lilongwe City (part of Lilongwe) -# Add in Mzuzu City (part of Mziba) ASSUMED -# Add in Zomba City (part of Zomba) -# Add in Blantyre City (part of Blantyre) - -# create mapping: the new districts : super_district -split_districts = ( - ('Lilongwe City', 'Lilongwe'), - ('Mzuzu City', 'Mzimba'), - ('Zomba City', 'Zomba'), - ('Blantyre City', 'Blantyre') -) - -# reallocating staff to the new districts -for i in np.arange(0, len(split_districts)): - new_district = split_districts[i][0] - super_district = split_districts[i][1] - - record = fund_staffing_table.iloc[0].copy() # get a row of the staffing table - - # make a the record for the new district - record['District_Or_Hospital'] = new_district - record['Is_DistrictLevel'] = True - - # get total staff level from the super districts - cols = set(fund_staffing_table.columns).intersection(set(officer_types_table.Officer_Type_Code)) - - total_staff = fund_staffing_table.loc[ - fund_staffing_table['District_Or_Hospital'] == super_district, cols].values.squeeze() - - # get the weight; The original weights w0 for the 4 new districts in order are 0.60,0.24,0.14,1.77(> 1) - w0 = pop_by_district.loc[new_district, 'Count'] / pop_by_district.loc[super_district, 'Count'] - if w0 < 1: - w = w0 - else: - w = 0.5 - - # assign w * 100% staff to the new district - record.loc[cols] = w * total_staff - fund_staffing_table = 
fund_staffing_table.append(record).reset_index(drop=True) - - # take staff away from the super district - fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == super_district, cols] = \ - fund_staffing_table.loc[ - fund_staffing_table[ - 'District_Or_Hospital'] == super_district, cols] - record.loc[cols] - -# Confirm the merging will be perfect: -pop = pop_by_district.reset_index(drop=False, inplace=False) -assert set(pop['District'].values) == set( - fund_staffing_table.loc[fund_staffing_table['Is_DistrictLevel'], 'District_Or_Hospital']) -assert len(pop['District'].values) == len( - fund_staffing_table.loc[fund_staffing_table['Is_DistrictLevel'], 'District_Or_Hospital']) - -# ... double check by doing the merge explicitly -pop_districts = pd.DataFrame({'District': pd.unique(pop['District'])}) # data frame -chai_districts = pd.DataFrame( - {'District': fund_staffing_table.loc[fund_staffing_table['Is_DistrictLevel'], 'District_Or_Hospital']}) - -merge_result = pop_districts.merge(chai_districts, how='inner', indicator=True) -assert all(merge_result['_merge'] == 'both') -assert len(merge_result) == len(pop_districts) - -# Split staff within each district to level 0 (All DCSAs at HP), level 1a (Disp, HC, etc.), -# level 1b (ComHos, CHAM ComHos), and level 2 (DisHos, etc.), according to fund_staff_distribution. 
- -# First, generate a df with all districts and facility levels 0 - 2 per district -district_faclevel = pd.DataFrame(columns=['District_Or_Hospital', 'Facility_Level_0', 'Facility_Level_1a', - 'Facility_Level_1b', 'Facility_Level_2']) -district_faclevel['District_Or_Hospital'] = pop['District'].values.copy() -district_faclevel = pd.melt(district_faclevel, id_vars='District_Or_Hospital', value_vars=district_faclevel.columns[1:], - var_name='Facility_Level') -district_faclevel.set_index(['District_Or_Hospital', 'Facility_Level'], inplace=True) -district_faclevel.sort_index(level=[0, 1], inplace=True) -district_faclevel.reset_index(drop=False, inplace=True) -district_faclevel.drop(columns=['value'], axis=1, inplace=True) -# Merge -fund_staffing_table = district_faclevel.merge(fund_staffing_table, how='outer') - -# Split staff among levels - -# *** Only for funded_plus ******************************************************************************************** -# Before split, update the funded C01 distributions at levels 1a, 1b and 2 using CHAI Optimal Workforce estimates. \ -# This is because funded C01 are all at level 1b (100%), meanwhile appt time base requires C01 at level 2. \ -# CHAI Optimal Workforce locates C01 47.92% at level 1b and 52.08% at level 2, which seems more sensible. 
-# idx_c01_level_1b = fund_staff_distribution[ -# (fund_staff_distribution['Cadre_Code'] == 'C01') & -# (fund_staff_distribution['Facility_Level'] == 'Facility_Level_1b')].index -# fund_staff_distribution.loc[idx_c01_level_1b, 'Proportion_Fund'] = 0.4792 -# -# idx_c01_level_2 = fund_staff_distribution[ -# (fund_staff_distribution['Cadre_Code'] == 'C01') & -# (fund_staff_distribution['Facility_Level'] == 'Facility_Level_2')].index -# fund_staff_distribution.loc[idx_c01_level_2, 'Proportion_Fund'] = 0.5208 -# ********************************************************************************************************************* - -# Split -for district in pop['District']: - for cadre in set(fund_staffing_table.columns[3:]): - # The proportions - weight = fund_staff_distribution.loc[fund_staff_distribution['Cadre_Code'] == cadre, - ['Facility_Level', 'Proportion_Fund']].copy() - # The staff count before splitting - old_count = fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == district, - ['Facility_Level', cadre]].copy() - - # Check that Facility levels of weight and old_count are consistent - assert (weight['Facility_Level'].values == old_count['Facility_Level'].values).all() - - # Check that if old_count is not 0, then weight is not 0, guaranteeing that staff are split - if (old_count[cadre] > 0).any(): - assert (weight['Proportion_Fund'] > 0).any() - - # Split - fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == district, cadre] = ( - old_count[cadre].values * weight['Proportion_Fund'].values) - -# Add facility levels for HQ, CenHos and ZMH -fund_staffing_table.loc[128:132, 'Facility_Level'] = ['Facility_Level_5', 'Facility_Level_3', - 'Facility_Level_3', 'Facility_Level_3', - 'Facility_Level_4'] - -# Check that in fund_staffing_table every staff count entry >= 0 -assert (fund_staffing_table.loc[:, 'M01':'R04'].values >= 0).all() -# fund_staffing_table ready! 
- -# Save the table without column 'Is_DistrictLevel'; staff counts in floats -fund_staffing_table_to_save = fund_staffing_table.drop(columns='Is_DistrictLevel', inplace=False) -# fund_staffing_table_to_save.to_csv( -# outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Staff_Table.csv', index=False) -# fund_staffing_table_to_save.to_csv( -# outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Staff_Table.csv', index=False) - -# *** -# --- Creating curr_staffing_table and curr_staff_list for current staff -# Extract the section about "Current TOTAl Staff' -hcw_curr_extract = wb_import.loc[3:39, 1:21] -hcw_curr_extract = hcw_curr_extract.drop([4, 5]) -hcw_curr_extract.columns = hcw_curr_extract.iloc[0] -hcw_curr_extract = hcw_curr_extract.drop([3]) -hcw_curr_extract = hcw_curr_extract.reset_index(drop=True) -hcw_curr_extract.fillna(0, inplace=True) - -# Add in the columns to the dataframe for the labels that distinguishes whether -# these officers are allocated to the district-or-lower levels or one of the key hospitals. -hcw_curr_extract.loc[:, 'District_Or_Hospital'] = labels -hcw_curr_extract.loc[:, 'Is_DistrictLevel'] = is_distlevel - -# Finished import from the CHAI excel -curr_staffing_table = hcw_curr_extract.copy() - -# Check the cadre columns of curr_staffing_table is identical to fund_staffing_table -assert set(curr_staffing_table.columns[0:21]) == set(fund_staffing_table.columns[-21:]) - -# For curr_staffing_table, do not re-allocate Dental officer with the same reason above for established staff - -# The operation of reallocating E01 in HQ to districts is not needed for curr_staffing_table, -# as no. of E01 in curr_staffing_table at HQ is zero. 
- -# For curr_staffing_table, sort out the districts and central hospitals -curr_staffing_table.loc[ - curr_staffing_table['District_Or_Hospital'] == 'HQ', 'District_Or_Hospital'] = 'Headquarter' -curr_staffing_table.loc[ - curr_staffing_table['District_Or_Hospital'] == 'KCH', 'District_Or_Hospital'] = 'Referral Hospital_Central' -curr_staffing_table.loc[ - curr_staffing_table['District_Or_Hospital'] == 'MCH', 'District_Or_Hospital'] = 'Referral Hospital_Northern' -curr_staffing_table.loc[ - curr_staffing_table['District_Or_Hospital'] == 'QECH', 'District_Or_Hospital'] = 'Referral Hospital_Southern' -curr_staffing_table.loc[ - curr_staffing_table['District_Or_Hospital'] == 'ZCH', 'District_Or_Hospital'] = 'Referral Hospital_Southern' -curr_staffing_table.loc[ - curr_staffing_table['District_Or_Hospital'] == 'ZMH', 'District_Or_Hospital'] = 'Zomba Mental Hospital' - -# Group the referral hospitals QECH and ZCH as Referral Hospital_Southern -Is_DistrictLevel = curr_staffing_table['Is_DistrictLevel'].values # Save the column 'Is_DistrictLevel' first -curr_staffing_table = pd.DataFrame( - curr_staffing_table.groupby(by=['District_Or_Hospital'], sort=False).sum()).reset_index() -curr_staffing_table.insert(1, 'Is_DistrictLevel', Is_DistrictLevel[:-1]) # Add the column 'Is_DistrictLevel' - -# No need to add a row for Zomba Mental Hospital, as the updated CHAI data has this row for ZMH. 
-# Check that in curr_staffing_table each staff count entry >=0 -assert (curr_staffing_table.loc[:, 'M01':'R04'].values >= 0).all() - -# Split staff to 5 special districts; -# for current staff, we include Likoma here because CHAI has no current staff allocated in Likoma -# (CHAI team they will allocate some staff to Likoma but not yet done) -split_districts = ( - ('Likoma', 'Nkhata Bay'), - ('Lilongwe City', 'Lilongwe'), - ('Mzuzu City', 'Mzimba'), - ('Zomba City', 'Zomba'), - ('Blantyre City', 'Blantyre') -) - -# drop the original placeholder row for Likoma -curr_staffing_table.drop([9], inplace=True) -curr_staffing_table.reset_index(inplace=True, drop=True) - -for i in np.arange(0, len(split_districts)): - new_district = split_districts[i][0] - super_district = split_districts[i][1] - - record = curr_staffing_table.iloc[0].copy() # get a row of the staffing table - - # make a the record for the new district - record['District_Or_Hospital'] = new_district - record['Is_DistrictLevel'] = True - - # get total staff level from the super districts - cols = set(curr_staffing_table.columns).intersection(set(officer_types_table.Officer_Type_Code)) - - total_staff = curr_staffing_table.loc[ - curr_staffing_table['District_Or_Hospital'] == super_district, cols].values.squeeze() - - # get the weight - w0 = pop_by_district.loc[new_district, 'Count'] / pop_by_district.loc[ - super_district, 'Count'] # The values in order are 0.05,0.60,0.24,0.14,1.77 - if w0 < 1: - w = w0 - else: - w = 0.5 - - # assign w * 100% staff to the new district - record.loc[cols] = w * total_staff - curr_staffing_table = curr_staffing_table.append(record).reset_index(drop=True) - - # take staff away from the super district - curr_staffing_table.loc[curr_staffing_table['District_Or_Hospital'] == super_district, cols] = \ - curr_staffing_table.loc[ - curr_staffing_table[ - 'District_Or_Hospital'] == super_district, cols] - record.loc[cols] - -# Confirm the merging will be perfect: -# pop = 
pop_by_district.reset_index(drop = False, inplace = False) -assert set(pop['District'].values) == set( - curr_staffing_table.loc[curr_staffing_table['Is_DistrictLevel'], 'District_Or_Hospital']) -assert len(pop['District'].values) == len( - curr_staffing_table.loc[curr_staffing_table['Is_DistrictLevel'], 'District_Or_Hospital']) - -# ... double check by doing the merge explicitly -# pop_districts = pd.DataFrame({'District': pd.unique(pop['District'])}) -chai_districts = pd.DataFrame( - {'District': curr_staffing_table.loc[curr_staffing_table['Is_DistrictLevel'], 'District_Or_Hospital']}) - -merge_result = pop_districts.merge(chai_districts, how='inner', indicator=True) -assert all(merge_result['_merge'] == 'both') -assert len(merge_result) == len(pop_districts) - -# Split staff within each district to level 0 (All DCSAs at HP), level 1a (Disp, HC, etc.), -# level 1b (ComHos, CHAM ComHos), and level 2 (DisHos, etc.), according to curr_staff_distribution. - -# First, make the table including all districts and facility levels 0 - 2 per district,\ -# by merging with district_faclevel defined previously. 
-curr_staffing_table = district_faclevel.merge(curr_staffing_table, how='outer') - -# Split staff among levels -for district in pop['District']: - for cadre in set(curr_staffing_table.columns[3:]): - # The proportions - weight = curr_staff_distribution.loc[curr_staff_distribution['Cadre_Code'] == cadre, - ['Facility_Level', 'Proportion']].copy() - # The staff count before splitting - old_count = curr_staffing_table.loc[curr_staffing_table['District_Or_Hospital'] == district, - ['Facility_Level', cadre]].copy() - - # Check that Facility levels of weight and old_count are consistent - assert (weight['Facility_Level'].values == old_count['Facility_Level'].values).all() - - # Check that if old_count is not 0, then weight is not 0, guaranteeing that staff are split - if (old_count[cadre] > 0).any(): - assert (weight['Proportion'] > 0).any() - - # Split - curr_staffing_table.loc[curr_staffing_table['District_Or_Hospital'] == district, cadre] = ( - old_count[cadre].values * weight['Proportion'].values) - -# Add facility levels for HQ, CenHos and ZMH -curr_staffing_table.loc[128:133, 'Facility_Level'] = ['Facility_Level_5', 'Facility_Level_3', - 'Facility_Level_3', 'Facility_Level_3', - 'Facility_Level_4'] # 128:132 also OK - -# Save the table without column 'Is_DistrictLevel'; staff counts in floats -curr_staffing_table_to_save = curr_staffing_table.drop(columns='Is_DistrictLevel', inplace=False) -# curr_staffing_table_to_save.to_csv( -# outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Staff_Table.csv', index=False) - -# --------------------------------------------------------------------------------------------------------------------- -# *** Create the Master Facilities List -# This will be a listing of each facility and the district(s) to which they attach -# The different Facility Types are notional at this stage -# The Facility Level is the important variable for the staffing: staff are assumed to be allocated -# to a particular level within a 
district,or a referral hospital, or others -# They do not associate with a particular type of Facility - -Facility_Levels = [0, '1a', '1b', 2, 3, 4, 5] -# 0: Community/Local level - HP, Village Health Committee, Community initiatives -# 1a: Primary level - Dispensary, HC, Clinic, Maternity facility -# 1b: Primary level - Community/Rural Hospital, CHAM (Community) Hospitals -# 2: Second level - District hospital, DHO -# 3: Tertiary/Referral level - KCH, MCH, ZCH + QECH as referral hospitals -# 4: Zomba Mental Hospital, which has very limited data in CHAI dataset -# 5: Headquarter, which has staff data (but no Time_Base or Incidence_Curr data) - -# declare the Facility_Type variable -# Facility_Types = ['Health Post', 'Dispensary', 'Health Centre', 'Community or Rural Hospital', 'CHAM Hospital', -# 'District Hospital', 'DHO', 'Referral Hospital', 'Zomba Mental Hospital'] -# Facility_Types_Levels = dict(zip(Facility_Types, Facility_Levels)) - -# Create empty dataframe that will be the Master Facilities List (mfl) -mfl = pd.DataFrame(columns=['Facility_Level', 'District', 'Region']) - -pop_districts = pop['District'].values # array; the 'pop_districts' used in previous lines is a DataFrame -pop_regions = pd.unique(pop['Region']) - -# Each district is assigned with a set of community level facs (0), a set of primary level facs (1a, 1b), -# and a set of second level facs (2). -# Therefore, the total sets of facs is 4 * no. 
of districts + 3 (RefHos per Region) + 1 (HQ) + 1 (ZMH) \ -# = 4 * 32 + 5 = 133 -for d in pop_districts: - df = pd.DataFrame({'Facility_Level': Facility_Levels[0:4], 'District': d, - 'Region': pop.loc[pop['District'] == d, 'Region'].values[0]}) - mfl = mfl.append(df, ignore_index=True, sort=True) - -# Add in the Referral Hospitals, one for each region -for r in pop_regions: - mfl = mfl.append(pd.DataFrame({ - 'Facility_Level': Facility_Levels[4], 'District': None, 'Region': r - }, index=[0]), ignore_index=True, sort=True) - -# Add the ZMH -mfl = mfl.append(pd.DataFrame({ - 'Facility_Level': Facility_Levels[5], 'District': None, 'Region': None -}, index=[0]), ignore_index=True, sort=True) - -# Add the HQ -mfl = mfl.append(pd.DataFrame({ - 'Facility_Level': Facility_Levels[6], 'District': None, 'Region': None -}, index=[0]), ignore_index=True, sort=True) - -# Create the Facility_ID -mfl.loc[:, 'Facility_ID'] = mfl.index - -# Create a unique name for each Facility -name = 'Facility_Level_' + mfl['Facility_Level'].astype(str) + '_' + mfl['District'] -name.loc[mfl['Facility_Level'] == 3] = 'Referral Hospital' + '_' + mfl.loc[ - mfl['Facility_Level'] == 3, 'Region'] -name.loc[mfl['Facility_Level'] == 4] = 'Zomba Mental Hospital' -name.loc[mfl['Facility_Level'] == 5] = 'Headquarter' - -mfl.loc[:, 'Facility_Name'] = name - -# Save -mfl.to_csv(outputlocation / 'organisation' / 'ResourceFile_Master_Facilities_List.csv', index=False) - -# --------------------------------------------------------------------------------------------------------------------- -# *** Create a simple mapping of all the facilities that persons in a district can access -facilities_by_district = pd.DataFrame(columns=mfl.columns) - -# Each district in pop_districts has access to five facility levels. 
-for d in pop_districts: - the_region = pop.loc[pop['District'] == d, 'Region'].copy().values[0] - - district_facs = mfl.loc[mfl['District'] == d] # Include facs from level 0 to level 2 - - region_fac = mfl.loc[pd.isnull(mfl['District']) & (mfl['Region'] == the_region)].copy().reset_index(drop=True) - region_fac.loc[0, 'District'] = d # Level 3, referral hospital - - zmh_fac = mfl.loc[pd.isnull(mfl['District']) & pd.isnull(mfl['Region']) & - (mfl['Facility_Name'] == 'Zomba Mental Hospital')].copy().reset_index(drop=True) - zmh_fac.loc[0, 'District'] = d # Level 4, Zomba Mental Hospital - - headquarter_fac = mfl.loc[pd.isnull(mfl['District']) & pd.isnull(mfl['Region']) & - (mfl['Facility_Name'] == 'Headquarter')].copy().reset_index(drop=True) - headquarter_fac.loc[0, 'District'] = d # Level 5, Headquarter - - facilities_by_district = pd.concat([facilities_by_district, district_facs, region_fac, zmh_fac, headquarter_fac], - ignore_index=True) - -# check that the no. of facs is no. of districts times no. 
of fac levels = 32 * 7 = 224 -assert len(facilities_by_district) == len(pop_districts) * len(Facility_Levels) - -# Save -# facilities_by_district.to_csv(outputlocation / 'organisation' / 'ResourceFile_Facilities_For_Each_District.csv', -# index=False) - -# --------------------------------------------------------------------------------------------------------------------- -# *** Now look at the types of appointments from the sheet 'Time_Curr' -sheet = pd.read_excel(workingfile, sheet_name='Time_Curr', header=None) - -# get rid of the junk rows -trimmed = sheet.loc[[7, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, 24, 26, 27]] -data_import = pd.DataFrame(data=trimmed.iloc[1:, 2:].values, columns=trimmed.iloc[0, 2:], index=trimmed.iloc[1:, 1]) - -data_import = data_import.dropna(axis='columns', how='all') # get rid of the 'spacer' columns -data_import = data_import.fillna(0) - -# get rid of records for which there is no call on time of any type of officer at any fac type -data_import = data_import.drop(columns=data_import.columns[data_import.sum() == 0]) - -# Note that in the updated 'Time_Curr', Disp has no time requirements at all for medical assistant M03, -# which is different from the previous version -assert data_import.loc['Disp', :].sum() == 0 - -# Note that the DCSA (CHW) never has a time requirement and that no appointments can be serviced at the HealthPost. -# We remedy this by inserting a new type of appointment, which only the DCSA can service, \ -# and the time taken is 10 minutes. 
-new_appt_for_CHW = pd.Series(index=data_import.index, - name='E01_ConWithDCSA', - # New appointment type is a consultation with the DCSA (community health worker) - data=[ - 0, # Central Hosp - Time - 0, # Central Hosp - Percent - 0, # District Hosp - Time - 0, # District Hosp - Percent - 0, # Comm Hosp - Time - 0, # Comm Hosp - Percent - 0, # Urban Health Centre - Time #10 mins - 0, # Urban Health Centre - Percent #100% - 0, # Rural Health Centre - Time #10 mins - 0, # Rural Health Centre - Percent #100% - 10.0, # Health Post - Time - 1.0, # Health Post - Percent - 0, # Dispensary - Time #10 mins - 0, # Dispensary - Percent #100% - ]) - -data_import = pd.concat([data_import, new_appt_for_CHW], axis=1) -assert data_import.loc['HP', :].sum() == 10.0 - -# We now do not add service time for DHO as we think DHO does not deliver services directly -# Also, DHO itself in both DHIS2 and CHAI updated data does not have service record - -# Add service times for Zomba Mental Hospital, by copying mental health appointment data of CenHos -# (Assuming ZMH only provide mental health services) -new_rows_for_ZMH = pd.DataFrame(index=['ZMH', 'ZMH_Per'], columns=data_import.columns.copy(), - data=0) -new_rows_for_ZMH.loc[:, ['C01_MentOPD', 'C01_MentClinic']] = data_import.loc[ - ['CenHos', 'CenHos_Per'], ['C01_MentOPD', 'C01_MentClinic']].copy().values - -data_import = pd.concat([data_import, new_rows_for_ZMH]) - -# data_import ready! - -# Break apart composite to give the appt_type and the officer_type -# This is used to know which column to read below... 
-chai_composite_code = pd.Series(data_import.columns) -chai_code = chai_composite_code.str.split(pat='_', expand=True).reset_index(drop=True) -chai_code = chai_code.rename(columns={0: 'Officer_Type_Code', 1: 'Appt_Type_Code'}) - -# check that officer codes line up with the officer codes already imported -assert set(chai_code['Officer_Type_Code']).issubset(set(officer_types_table['Officer_Type_Code'])) - -# Make dataframe summarising the types of appointments - -retained_appt_type_code = pd.unique(chai_code['Appt_Type_Code']) - -appt_types_table_import = sheet.loc[(1, 2, 6), 2:].transpose().reset_index(drop=True).copy() -appt_types_table_import = appt_types_table_import.rename(columns={1: 'Appt_Cat', 2: 'Appt_Type', 6: 'Appt_Type_Code'}) -appt_types_table_import['Appt_Cat'] = pd.Series(appt_types_table_import['Appt_Cat']).fillna(method='ffill') -appt_types_table_import['Appt_Type'] = pd.Series(appt_types_table_import['Appt_Type']).fillna(method='ffill') -appt_types_table_import['Appt_Type_Code'] = pd.Series(appt_types_table_import['Appt_Type_Code']).fillna(method='ffill') -appt_types_table_import = appt_types_table_import.drop_duplicates().reset_index(drop=True) - -# starting with the retained appt codes, merge in these descriptions -appt_types_table = pd.DataFrame(data={'Appt_Type_Code': retained_appt_type_code}).merge(appt_types_table_import, - on='Appt_Type_Code', how='left', - indicator=True) - -# Fill in the missing information about the appointment type that was added above -appt_types_table.loc[appt_types_table['Appt_Type_Code'] == new_appt_for_CHW.name.split('_')[1], 'Appt_Cat'] = \ - new_appt_for_CHW.name.split('_')[1] -appt_types_table.loc[appt_types_table['Appt_Type_Code'] == new_appt_for_CHW.name.split('_')[1], 'Appt_Type'] = \ - new_appt_for_CHW.name.split('_')[1] - -# drop the merge check column -appt_types_table.drop(columns='_merge', inplace=True) - -# Replace space with underscore in the Appt_Cat -appt_types_table['Appt_Cat'].replace(to_replace=' ', 
value='_', regex=True, inplace=True) -appt_types_table['Appt_Cat'].replace(to_replace=' ', value='_', regex=True, inplace=True) - -# Check no holes -assert not pd.isnull(appt_types_table).any().any() - -# Save -appt_types_table.to_csv(outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Types_Table.csv', - index=False) - -# --------------------------------------------------------------------------------------------------------------------- -# *** Now, make the ApptTimeTable -# (Table that gives for each appointment, when occurring in each appt_type at each facility type, the time of each \ -# type of officer required - -# The sheet gives the % of appointments that require a particular type of officer and the time taken if it does -# So, turn that into an Expectation of the time taken for each type of officer (multiplying together) - -# This sheet distinguished between different types of facility in terms of the time taken by appointments occurring \ -# at each. -# But the CHAI data do not distinguish how many officers work at each different level of facility -# (Available staff counts for only districts (level = 0,1a,1b,2), CenHos (level = 3), and HQ (level = 5)) -# Therefore, we will map these to the facility level that have been defined. -# NB. 
In doing this, we: -# - assume that the time taken for all appointments at each level is modelled by that for the average of \ -# facility types at that level - -# CHAI: Headquarter ---> our "headquarter" (level = 5) -# CHAI: Zomba Mental Hospital ---> our 'Zomba Mental Hospital' / 'ZMH' (level = 4) -# CHAI: Central_Hospital ---> our "Referral Hospital" (level = 3) -# CHAI: District_Hospital ---> averaged into our "second level" facilities (level = 2) -# CHAI: DHO ---> averaged into our "second level" facilities (level = 2) -# CHAI: Community_Hospital ---> averaged into our "primary level" facilities (level = 1b) -# CHAI: Urban_HealthCentre ---> averaged into our "primary level" facilities (level = 1a) -# CHAI: Rural_HealthCentre ---> averaged into our "primary level" facilities (level = 1a) -# CHAI: Dispensary ---> averaged into our "primary level" facilities (level = 1a) -# CHAI: HealthPost ---> averaged into our "community level" facilities (level = 0) - -# level 4 -ZMH_ExpectTime = data_import.loc['ZMH'] * data_import.loc['ZMH_Per'] - -# Level 3 -Central_Hospital_ExpecTime = data_import.loc['CenHos'] * data_import.loc['CenHos_Per'] - -# level 5; No data available for Headquarter; we assign NAN to it -HQ_ExpecTime = Central_Hospital_ExpecTime.copy() -HQ_ExpecTime.loc[:] = np.nan - -# level 2 -District_Hospital_ExpecTime = data_import.loc['DisHos'] * data_import.loc['DisHos_Per'] - -# level 1b -Community_Hospital_ExpecTime = data_import.loc['ComHos'] * data_import.loc['ComHos_Per'] - -# level 1a -Urban_HealthCentre_ExpecTime = data_import.loc['UrbHC'] * data_import.loc['UrbHC_Per'] -Rural_HealthCentre_ExpecTime = data_import.loc['RurHC'] * data_import.loc['RurHC_Per'] - -# level 0 -HealthPost_ExpecTime = data_import.loc['HP'] * data_import.loc['HP_Per'] - -# Average time for levels 1a, which have data for more than 1 facility types -Avg_Level1a_ExpectTime = (Urban_HealthCentre_ExpecTime + Rural_HealthCentre_ExpecTime) / 2 - -# Assemble -X = pd.DataFrame({ - 5: 
HQ_ExpecTime, # (Headquarter) - 4: ZMH_ExpectTime, # (Zomba Mental Hospital) - 3: Central_Hospital_ExpecTime, # (our "Referral Hospital" at region level) - 2: District_Hospital_ExpecTime, # (DisHos at second level ) - '1b': Community_Hospital_ExpecTime, # (ComHos at primary level) - '1a': Avg_Level1a_ExpectTime, # (UrbHC,RurHC at primary level) - 0: HealthPost_ExpecTime # (HP at community level) -}) - -assert set(X.columns) == set(Facility_Levels) - -# Split out the index into appointment type and officer type -labels = pd.Series(X.index, index=X.index).str.split(pat='_', expand=True) -labels = labels.rename(columns={0: 'Officer_Type_Code', 1: 'Appt_Type_Code'}) -Y = pd.concat([X, labels], axis=1) -ApptTimeTable = pd.melt(Y, id_vars=['Officer_Type_Code', 'Appt_Type_Code'], - var_name='Facility_Level', value_name='Time_Taken_Mins') - -# Confirm that Facility_Level is an int ---> No longer needed, as level 1a and 1b are not integers -# ApptTimeTable['Facility_Level'] = ApptTimeTable['Facility_Level'].astype(int) - -# Merge in Officer_Type -ApptTimeTable = ApptTimeTable.merge(officer_types_table, on='Officer_Type_Code') - -# confirm that we have the same number of entries as we were expecting -assert len(ApptTimeTable) == len(Facility_Levels) * len(data_import.columns) - -# drop the rows that contain no call on resources, including NAN values -ApptTimeTable = ApptTimeTable.drop(ApptTimeTable[ApptTimeTable['Time_Taken_Mins'] == 0].index) -ApptTimeTable = ApptTimeTable.drop(ApptTimeTable[pd.isnull(ApptTimeTable['Time_Taken_Mins'])].index) -# reset index -ApptTimeTable.reset_index(drop=True, inplace=True) - -# Generate appt_time_table_coarse with officer_category, instead of officer_type -appt_time_table_coarse = pd.DataFrame( - ApptTimeTable.groupby(['Appt_Type_Code', 'Facility_Level', 'Officer_Category']).sum() -).reset_index() - -# Save -# ApptTimeTable.to_csv( -# outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table.csv', -# index=False) 
-appt_time_table_coarse.to_csv( - outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table.csv', - index=False) - -# --------------------------------------------------------------------------------------------------------------------- -# *** Create a table that determines what kind of appointment can be serviced in each Facility Level -ApptType_By_FacLevel = pd.DataFrame(index=appt_types_table['Appt_Type_Code'], - columns=Facility_Levels, - data=False, - dtype=bool) - -for appt_type in ApptType_By_FacLevel.index: - for fac_level in ApptType_By_FacLevel.columns: - # Can this appt_type happen at this facility_level? - # Check to see if ApptTimeTable has any time requirement - - ApptType_By_FacLevel.at[appt_type, fac_level] = \ - ((ApptTimeTable['Facility_Level'] == fac_level) & (ApptTimeTable['Appt_Type_Code'] == appt_type)).any() - -ApptType_By_FacLevel = ApptType_By_FacLevel.add_prefix('Facility_Level_') - -# Generate appt_type_by_level_coarse consider officer_category, instead of officer_type -appt_type_by_level_coarse = pd.DataFrame(index=appt_types_table['Appt_Type_Code'], - columns=Facility_Levels, - data=False, - dtype=bool) - -for appt_type in appt_type_by_level_coarse.index: - for fac_level in appt_type_by_level_coarse.columns: - # Can this appt_type happen at this facility_level? 
- # Check to see if appt_time_table_coarse has any time requirement - - appt_type_by_level_coarse.at[appt_type, fac_level] = \ - ((appt_time_table_coarse['Facility_Level'] == fac_level) & ( - appt_time_table_coarse['Appt_Type_Code'] == appt_type)).any() - -appt_type_by_level_coarse = appt_type_by_level_coarse.add_prefix('Facility_Level_') - -# Check; The two tables should be equal -assert (appt_type_by_level_coarse == ApptType_By_FacLevel).all().all() - -# Save -ApptType_By_FacLevel.to_csv( - outputlocation / 'human_resources' / 'definitions' / 'ResourceFile_ApptType_By_FacLevel.csv', index=True) - -# --- check -# Look to see where different types of staff member need to be located: -# This is just a reverse reading of where there are non-zero requests for time of particular officer-types - -Officers_Need_For_Appt = pd.DataFrame(columns=['Facility_Level', 'Appt_Type_Code', 'Officer_Type_Codes']) - -for a in appt_types_table['Appt_Type_Code'].values: - for f in Facility_Levels: - - # get the staff types required for this appt - - block = ApptTimeTable.loc[(ApptTimeTable['Appt_Type_Code'] == a) & (ApptTimeTable['Facility_Level'] == f)] - - if len(block) == 0: - # no requirement expressed => The appt is not possible at this location - Officers_Need_For_Appt = Officers_Need_For_Appt.append( - {'Facility_Level': f, - 'Appt_Type_Code': a, - 'Officer_Type_Codes': False - }, ignore_index=True) - - else: - need_officer_types = list(block['Officer_Type_Code']) - Officers_Need_For_Appt = Officers_Need_For_Appt.append( - {'Facility_Level': f, - 'Appt_Type_Code': a, - 'Officer_Type_Codes': need_officer_types - }, ignore_index=True) - -# Turn this into the the set of staff that are required for each type of appointment -FacLevel_By_Officer = pd.DataFrame(columns=Facility_Levels, - index=officer_types_table['Officer_Type_Code'].values) -FacLevel_By_Officer = FacLevel_By_Officer.fillna(False) - -for o in officer_types_table['Officer_Type_Code'].values: - - for i in 
Officers_Need_For_Appt.index: - - fac_level = Officers_Need_For_Appt.loc[i].Facility_Level - officer_types = Officers_Need_For_Appt.loc[i].Officer_Type_Codes - - if officer_types is not False: # (i.e. such an appointment at such a a facility is possible) - - if o in officer_types: - FacLevel_By_Officer.loc[(FacLevel_By_Officer.index == o), fac_level] = True - -# We note that three officer_types ("T01: Nutrition Staff", "R03: Sonographer" and "RO4: Radiotherapy technician") are\ -# apparently not called by any appointment type - -# Assign that the Nutrition Staff will go to the Referral Hospitals (level = 3) -FacLevel_By_Officer.loc['T01', 3] = True - -# Assign that the Sonographer will go to the Referral Hospitals (level = 3) -FacLevel_By_Officer.loc['R03', 3] = True - -# Assign that the Radiotherapist will go to the Referral Hospitals (level = 3) -FacLevel_By_Officer.loc['R04', 3] = True - -# As an option, we could assign staff at HQ to level 5 according to the info of staff -# Get the sets of officers of funded and current staff -fund_staff_HQ = fund_staffing_table[fund_staffing_table['District_Or_Hospital'] == 'Headquarter'].copy() -curr_staff_HQ = curr_staffing_table[curr_staffing_table['District_Or_Hospital'] == 'Headquarter'].copy() -fund_staff_HQ.drop(columns=['District_Or_Hospital', 'Facility_Level', 'Is_DistrictLevel'], inplace=True) -curr_staff_HQ.drop(columns=['District_Or_Hospital', 'Facility_Level', 'Is_DistrictLevel'], inplace=True) -fund_staff_HQ_Positive = fund_staff_HQ.loc[:, (fund_staff_HQ > 0).any(axis=0)] -curr_staff_HQ_Positive = curr_staff_HQ.loc[:, (curr_staff_HQ > 0).any(axis=0)] -# The union of the two sets -staff_call_at_HQ = fund_staff_HQ_Positive.columns.union(curr_staff_HQ_Positive.columns) -# Assign true value to staff_call_at_HQ -for s in staff_call_at_HQ: - FacLevel_By_Officer.loc[s, 5] = True - -# Check that all types of officer are allocated to at least one type of facility excl. 
HQ/Level_5 -assert (FacLevel_By_Officer.iloc[:, 0:6].sum(axis=1) > 0).all() - -# Change columns names: 0 -> Facility_Level_0 -FacLevel_By_Officer = FacLevel_By_Officer.add_prefix('Facility_Level_') - -# --------------------------------------------------------------------------------------------------------------------- -# *** Get Hours and Minutes Worked Per Staff Member, i.e., the daily capabilities -# First, read-in the number of working hours and days for each type of officer - -pft_sheet = pd.read_excel(workingfile, sheet_name='PFT', header=None) -officer_types_import = pft_sheet.iloc[3, np.arange(2, 23)] - -assert set(officer_types_import) == set(officer_types_table['Officer_Type_Code']) -assert len(officer_types_import) == len(officer_types_table['Officer_Type_Code']) - -# Total working days per year -days_per_year_men = pft_sheet.iloc[16, np.arange(2, 23)] -days_per_year_women = pft_sheet.iloc[17, np.arange(2, 23)] -days_per_year_pregwomen = pft_sheet.iloc[18, np.arange(2, 23)] - -# Percents of men, nonpregnant women, and pregnant women -fr_men = pft_sheet.iloc[66, np.arange(2, 23)] -fr_pregwomen = pft_sheet.iloc[71, np.arange(2, 23)] -fr_nonpregwomen = pft_sheet.iloc[68, np.arange(2, 23)] - pft_sheet.iloc[71, np.arange(2, 23)] - -# Total average working days -workingdays = (fr_men * days_per_year_men) + (fr_nonpregwomen * days_per_year_women) + ( - fr_pregwomen * days_per_year_pregwomen) - -# patient facing (i.e. 
non-admin working) minutes and hours daily at -# district hospitals, community hospitals, health centres -mins_daily_dishos = pft_sheet.iloc[37, np.arange(2, 23)] -hrs_daily_dishos = mins_daily_dishos / 60 - -mins_daily_comhos = pft_sheet.iloc[42, np.arange(2, 23)] -hrs_daily_comhos = mins_daily_comhos / 60 - -mins_daily_hc = pft_sheet.iloc[46, np.arange(2, 23)] -hrs_daily_hc = mins_daily_hc / 60 - -# Total mins per year, Average number of mins per day at -# district hospitals, community hospitals, health centres -mins_yearly_dishos = mins_daily_dishos * workingdays -mins_yearly_comhos = mins_daily_comhos * workingdays -mins_yearly_hc = mins_daily_hc * workingdays - -av_mins_daily_dishos = mins_yearly_dishos / 365.25 -av_mins_daily_comhos = mins_yearly_comhos / 365.25 -av_mins_daily_hc = mins_yearly_hc / 365.25 - -# PFT - dishos, comhos, hc individual columns -# note that the average is calculated on 365.25 days (not the working days) per year -HosHC_patient_facing_time = pd.DataFrame( - {'Officer_Type_Code': officer_types_import, - 'DisHos_Av_Mins_Per_Day': av_mins_daily_dishos, - 'ComHos_Av_Mins_Per_Day': av_mins_daily_comhos, - 'HC_Av_Mins_Per_Day': av_mins_daily_hc, - 'Total_Av_Working_Days': workingdays, - 'DisHos_Hrs_Per_Day': hrs_daily_dishos, - 'ComHos_Hrs_Per_Day': hrs_daily_comhos, - 'HC_Hrs_Per_Day': hrs_daily_hc - } -).reset_index(drop=True) - -# The new PFT has no minutes for M01 at health centres, -# but in Time_Curr, IPAdmissions/RMNCH/... appointments at Urban HCs all need time from M01. 
-# We therefore assume the minutes for M01 at HCs are the average of those at DisHos and CenHos, -# to solve inconsistency between PFT and Time_Curr -HosHC_patient_facing_time.loc[0, 'HC_Av_Mins_Per_Day'] = ( - HosHC_patient_facing_time.loc[ - 0, 'DisHos_Av_Mins_Per_Day'] + - HosHC_patient_facing_time.loc[0, 'ComHos_Av_Mins_Per_Day'] - ) / 2 - -# How to deal with cadres (DCSA, Dental, Mental, Radiography) that do not have minutes at all in PFT, -# whereas they have time requirements in Time_Curr? -# (Compared to old PFT sheet, -# the new PFT has updated all info on available working days/non-admin daily minutes/portion of male/female/pregfemale) -# A quick fix is to use the average daily minutes of those cadres from old PFT table; -# The info required to calculate these minutes will be from the old PFT table. -pft_old = pd.read_excel(working_file_old, sheet_name='PFT', header=None) - -officer_types_old = pft_old.iloc[2, np.arange(2, 23)] -assert set(officer_types_old) == set(officer_types_table['Officer_Type_Code']) -assert len(officer_types_old) == len(officer_types_table['Officer_Type_Code']) - -# Total working days per year -days_men_old = pft_old.iloc[15, np.arange(2, 23)] -days_women_old = pft_old.iloc[16, np.arange(2, 23)] -days_pregwomen_old = pft_old.iloc[17, np.arange(2, 23)] - -# Percents of men, nonpregnant women, and pregnant women -fr_men_old = pft_old.iloc[53, np.arange(2, 23)] -fr_pregwomen_old = pft_old.iloc[55, np.arange(2, 23)] * pft_old.iloc[57, np.arange(2, 23)] -fr_nonpregwomen_old = pft_old.iloc[55, np.arange(2, 23)] * (1 - pft_old.iloc[57, np.arange(2, 23)]) - -# Total average working days -working_days_old = (fr_men_old * days_men_old) + (fr_nonpregwomen_old * days_women_old) + ( - fr_pregwomen_old * days_pregwomen_old) - -# patient facing (i.e. 
non-admin working) minutes and hours daily at -# hospitals and health centres -mins_daily_hos_old = pft_old.iloc[36, np.arange(2, 23)] -hrs_daily_hos_old = mins_daily_hos_old / 60 - -mins_daily_hc_old = pft_old.iloc[26, np.arange(2, 23)] - pft_old.iloc[34, np.arange(2, 23)] -hrs_daily_hc_old = mins_daily_hc_old / 60 - -# Total mins per year, Average number of mins per day at -# hospitals and health centres -mins_yearly_hos_old = mins_daily_hos_old * working_days_old -av_mins_daily_hos_old = mins_yearly_hos_old / 365.25 - -mins_yearly_hc_old = mins_daily_hc_old * working_days_old -av_mins_daily_hc_old = mins_yearly_hc_old / 365.25 - -# PFT - DisHos, ComHos, HC individually -# DisHos and ComHos both use hos data -HosHC_patient_facing_time_old = pd.DataFrame( - {'Officer_Type_Code': officer_types_old, - 'DisHos_Av_Mins_Per_Day': av_mins_daily_hos_old, - 'ComHos_Av_Mins_Per_Day': av_mins_daily_hos_old, - 'HC_Av_Mins_Per_Day': av_mins_daily_hc_old, - 'Total_Av_Working_Days': working_days_old, - 'DisHos_Hrs_Per_Day': hrs_daily_hos_old, - 'ComHos_Hrs_Per_Day': hrs_daily_hos_old, - 'HC_Hrs_Per_Day': hrs_daily_hc_old - } -).reset_index(drop=True) - -# check the new and old tables have same columns and officers (in the same order) -assert (HosHC_patient_facing_time_old['Officer_Type_Code'] == HosHC_patient_facing_time['Officer_Type_Code']).all() -assert (HosHC_patient_facing_time_old.columns == HosHC_patient_facing_time.columns).all() - -# check new and old pft difference -HosHC_pft_diff = pd.DataFrame(columns=HosHC_patient_facing_time.columns) -HosHC_pft_diff['Officer_Type_Code'] = HosHC_patient_facing_time['Officer_Type_Code'].values -HosHC_pft_diff.iloc[:, 1:] = ( - (HosHC_patient_facing_time.iloc[:, 1:].values - - HosHC_patient_facing_time_old.iloc[:, 1:].values) / - HosHC_patient_facing_time_old.iloc[:, 1:].values -) -HosHC_pft_diff = HosHC_pft_diff.append(HosHC_pft_diff.iloc[:, 1:].mean(axis=0), ignore_index=True) - -# save -# HosHC_pft_diff.to_csv( -# outputlocation / 
'human_resources' / 'definitions' / 'New_Old_PFT_Difference.csv', -# index=False) - -# now add the old data of those blanks cadres to the updated PFT table -HosHC_patient_facing_time.iloc[11:, :] = HosHC_patient_facing_time_old.iloc[11:, :].copy() - -# PFT table ready! - -# Create final tables of daily time available at each facility by officer type: Facility_ID, Facility_Type, -# Facility_Level, Officer_Type, Officer_Type_Code, Total Average Minutes Per Day, Staff_Count - -# --- Daily capability for funded staff; staff counts in floats -# For float staff counts, calculate total minutes per day -funded_staff_floats = fund_staffing_table_to_save.copy() # staff counts -funded_daily_minutes = funded_staff_floats.copy() # total minutes per day - -for i in funded_daily_minutes.index: - the_level = funded_daily_minutes.loc[i, 'Facility_Level'] - for officer in officer_types_table['Officer_Type_Code']: - if the_level in ['Facility_Level_0', 'Facility_Level_1a']: # Levels 0, 1a; HC minutes - t = (funded_staff_floats.loc[i, officer] * - HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, - 'HC_Av_Mins_Per_Day']) - funded_daily_minutes.loc[i, officer] = t.values[0] - elif the_level == 'Facility_Level_1b': # Level 1b; ComHos minutes - t = (funded_staff_floats.loc[i, officer] * - HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, - 'ComHos_Av_Mins_Per_Day']) - funded_daily_minutes.loc[i, officer] = t.values[0] - else: # Levels 2 and above; DisHos and CenHos minutes - t = (funded_staff_floats.loc[i, officer] * - HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, - 'DisHos_Av_Mins_Per_Day']) - funded_daily_minutes.loc[i, officer] = t.values[0] - -# Long format -funded_staff_floats = pd.melt(funded_staff_floats, id_vars=['District_Or_Hospital', 'Facility_Level'], - var_name='Officer_Type_Code', value_name='Staff_Count') -funded_daily_minutes = 
pd.melt(funded_daily_minutes, id_vars=['District_Or_Hospital', 'Facility_Level'], - var_name='Officer_Type_Code', value_name='Total_Mins_Per_Day') -# Merge into daily capability table -funded_daily_capability = funded_daily_minutes.merge(funded_staff_floats, how='left') - -# Reset facility level column to exclude 'Facility_Level_' -funded_daily_capability['Facility_Level'] = \ - funded_daily_capability['Facility_Level'].str.split(pat='_', expand=True).iloc[:, 2] -# Drop row with zero or nan minutes (due to either zero staff counts or nan daily minutes) -funded_daily_capability.fillna(0, inplace=True) -funded_daily_capability.drop( - index=funded_daily_capability[funded_daily_capability['Total_Mins_Per_Day'] == 0].index, inplace=True) -# Reset index -funded_daily_capability.reset_index(drop=True, inplace=True) - -# Add 'District' and 'Facility_Name' columns -for i in funded_daily_capability.index: - the_level = funded_daily_capability.loc[i, 'Facility_Level'] - if the_level in ['0', '1a', '1b', '2']: - the_district = funded_daily_capability.loc[i, 'District_Or_Hospital'] - funded_daily_capability.loc[i, 'District'] = the_district - funded_daily_capability.loc[i, 'Facility_Name'] = 'Facility_Level_' + str(the_level) + '_' + the_district - else: - funded_daily_capability.loc[i, 'Facility_Name'] = funded_daily_capability.loc[i, 'District_Or_Hospital'] -# Drop column 'District_Or_Hospital' -funded_daily_capability.drop(columns='District_Or_Hospital', inplace=True) - -# Add info from mfl: Region and Facility ID -for i in funded_daily_capability.index: - the_facility_name = funded_daily_capability.loc[i, 'Facility_Name'] - the_ID = mfl.loc[mfl['Facility_Name'] == the_facility_name, 'Facility_ID'] - the_region = mfl.loc[mfl['Facility_Name'] == the_facility_name, 'Region'] - - funded_daily_capability.loc[i, 'Facility_ID'] = the_ID.values - funded_daily_capability.loc[i, 'Region'] = the_region.values - -# Add 'officer_category' info -funded_daily_capability = 
funded_daily_capability.merge(officer_types_table, on='Officer_Type_Code', how='left') - -# Group by officer categories; consider coarse officers -funded_daily_capability_coarse = pd.DataFrame( - funded_daily_capability.groupby( - ['Facility_ID', 'Facility_Name', 'Facility_Level', 'District', 'Region', 'Officer_Category'], - dropna=False).sum() -).reset_index() - -# --- Daily capability for current staff; staff counts in floats -# For float staff counts, calculate total minutes per day -curr_staff_floats = curr_staffing_table_to_save.copy() # staff counts -curr_daily_minutes = curr_staff_floats.copy() # total minutes per day - -for i in curr_daily_minutes.index: - the_level = curr_daily_minutes.loc[i, 'Facility_Level'] - for officer in officer_types_table['Officer_Type_Code']: - if the_level in ['Facility_Level_0', 'Facility_Level_1a']: # Levels 0, 1a; HC minutes - t = (curr_staff_floats.loc[i, officer] * - HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, - 'HC_Av_Mins_Per_Day']) - curr_daily_minutes.loc[i, officer] = t.values[0] - elif the_level == 'Facility_Level_1b': # Level 1b; ComHos minutes - t = (curr_staff_floats.loc[i, officer] * - HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, - 'ComHos_Av_Mins_Per_Day']) - curr_daily_minutes.loc[i, officer] = t.values[0] - else: # Levels 2 and above; DisHos and CenHos minutes - t = (curr_staff_floats.loc[i, officer] * - HosHC_patient_facing_time.loc[HosHC_patient_facing_time['Officer_Type_Code'] == officer, - 'DisHos_Av_Mins_Per_Day']) - curr_daily_minutes.loc[i, officer] = t.values[0] - -# Long format -curr_staff_floats = pd.melt(curr_staff_floats, id_vars=['District_Or_Hospital', 'Facility_Level'], - var_name='Officer_Type_Code', value_name='Staff_Count') -curr_daily_minutes = pd.melt(curr_daily_minutes, id_vars=['District_Or_Hospital', 'Facility_Level'], - var_name='Officer_Type_Code', value_name='Total_Mins_Per_Day') -# Merge into daily 
capability table -curr_daily_capability = curr_daily_minutes.merge(curr_staff_floats, how='left') - -# Reset facility level column to exclude 'Facility_Level_' -curr_daily_capability['Facility_Level'] = \ - curr_daily_capability['Facility_Level'].str.split(pat='_', expand=True).iloc[:, 2] -# Drop row with zero minutes (also zero staff counts) -curr_daily_capability.fillna(0, inplace=True) -curr_daily_capability.drop( - index=curr_daily_capability[curr_daily_capability['Total_Mins_Per_Day'] == 0].index, inplace=True) -# Reset index -curr_daily_capability.reset_index(drop=True, inplace=True) - -# Add 'District' and 'Facility_Name' columns -for i in curr_daily_capability.index: - the_level = curr_daily_capability.loc[i, 'Facility_Level'] - if the_level in ['0', '1a', '1b', '2']: - the_district = curr_daily_capability.loc[i, 'District_Or_Hospital'] - curr_daily_capability.loc[i, 'District'] = the_district - curr_daily_capability.loc[i, 'Facility_Name'] = 'Facility_Level_' + str(the_level) + '_' + the_district - else: - curr_daily_capability.loc[i, 'Facility_Name'] = curr_daily_capability.loc[i, 'District_Or_Hospital'] -# Drop column 'District_Or_Hospital' -curr_daily_capability.drop(columns='District_Or_Hospital', inplace=True) - -# Add info from mfl: Region and Facility ID -for i in curr_daily_capability.index: - the_facility_name = curr_daily_capability.loc[i, 'Facility_Name'] - the_ID = mfl.loc[mfl['Facility_Name'] == the_facility_name, 'Facility_ID'] - the_region = mfl.loc[mfl['Facility_Name'] == the_facility_name, 'Region'] - - curr_daily_capability.loc[i, 'Facility_ID'] = the_ID.values - curr_daily_capability.loc[i, 'Region'] = the_region.values - -# Add 'officer_category' info -curr_daily_capability = curr_daily_capability.merge(officer_types_table, on='Officer_Type_Code', how='left') - -# Group by officer categories; consider coarse officers -curr_daily_capability_coarse = pd.DataFrame( - curr_daily_capability.groupby( - ['Facility_ID', 'Facility_Name', 
'Facility_Level', 'District', 'Region', 'Officer_Category'], - dropna=False).sum() -).reset_index() - -# Save -curr_daily_capability_coarse.to_csv( - outputlocation / 'human_resources' / 'actual' / 'ResourceFile_Daily_Capabilities.csv', index=False) - -# Need to # following lines below when generate funded_plus capability -funded_daily_capability_coarse.to_csv( - outputlocation / 'human_resources' / 'funded' / 'ResourceFile_Daily_Capabilities.csv', index=False) - -# *** Only for funded_plus ******************************************************************************************** -# funded_daily_capability_coarse.to_csv( -# outputlocation / 'human_resources' / 'funded_plus' / 'ResourceFile_Daily_Capabilities.csv', index=False) - - -# ********************************************************************************************************************* - -# --------------------------------------------------------------------------------------------------------------------- -# final check that for an appointment required at a particular level (in Appt_Time_Table), \ -# then indeed, the staff capabilities are available to satisfy that, for a person in any district \ -# (including the regional and national facilities) - -# Define the check function -def all_appts_can_run(capability): - # Creat a table storing whether the appts have consistent requirements/demand and capabilities/supply - appt_have_or_miss_capability = appt_time_table_coarse.copy() - # Delete the column of minutes - appt_have_or_miss_capability.drop(columns=['Time_Taken_Mins'], inplace=True) - # Store the info of district (including central hospital, ZMH) that fails - appt_have_or_miss_capability.loc[:, 'fail_district'] = '' - - for _I in appt_have_or_miss_capability.index: # Loop through all appts - # Get the info of app, level and officer_category - # the_appt = appt_have_or_miss_capability.loc[I, 'Appt_Type_Code'] - L = appt_have_or_miss_capability.loc[_I, 'Facility_Level'] - the_officer_category = 
appt_have_or_miss_capability.loc[_I, 'Officer_Category'] - - # Check in daily_capabilities that the required officer_category at a level is there or not, for every district - # Store the info of district (including central hospital, ZMH) that fails - if L in Facility_Levels[0:4]: # Levels 0, 1a, 1b, 2 - k = 0 # Record how many districts fail - for D in pop_districts: - idx = capability[ - (capability['District'] == D) & - (capability['Facility_Level'] == str(L)) & - (capability['Officer_Category'] == the_officer_category)].index - if idx.size == 0: - # Store the district that fails to provide required officer_category - appt_have_or_miss_capability.loc[_I, 'fail_district'] = \ - appt_have_or_miss_capability.loc[_I, 'fail_district'] + D + ',' - k += 1 - if k == 0: - appt_have_or_miss_capability.loc[_I, 'fail_district'] = 'All districts pass' - elif L == 3: # Level 3 central hospital - m = 0 # Record how many regions fail - for region in pop_regions: - idx1 = capability[ - (capability['Region'] == region) & - (capability['Facility_Level'] == str(L)) & - (capability['Officer_Category'] == the_officer_category)].index - if idx1.size == 0: - # Store the regional hospital that fails - appt_have_or_miss_capability.loc[_I, 'fail_district'] = \ - appt_have_or_miss_capability.loc[_I, 'fail_district'] + 'Referral Hospital_' + region + ',' - m += 1 - if m == 0: - appt_have_or_miss_capability.loc[_I, 'fail_district'] = 'All districts pass' - elif L == 4: # Zomba Mental Hospital - n = 0 # Record is ZMH failed - idx2 = capability[ - (capability['Facility_Level'] == str(L)) & - (capability['Officer_Category'] == the_officer_category)].index - if idx2.size == 0: - appt_have_or_miss_capability.loc[_I, 'fail_district'] = \ - appt_have_or_miss_capability.loc[_I, 'fail_district'] + 'Zomba Mental Hospital,' - n += 1 - if n == 0: - appt_have_or_miss_capability.loc[_I, 'fail_district'] = 'All districts pass' - else: - assert 0 == 1 # There should be no 'else'; otherwise, the generated 
tables above is incorrect - - return appt_have_or_miss_capability - -# Save results for funded -# Need to # following lines below when generate funded_plus capability -# appt_have_or_miss_capability_funded = all_appts_can_run(funded_daily_capability_coarse) -# appt_have_or_miss_capability_funded.to_csv( -# outputlocation / 'human_resources' / 'funded' / 'appt_have_or_miss_capability.csv', index=False) - -# *** Only for funded_plus ******************************************************************************************** -# appt_have_or_miss_capability_funded = all_appts_can_run(funded_daily_capability_coarse) -# appt_have_or_miss_capability_funded.to_csv( -# outputlocation / 'human_resources' / 'funded_plus' / 'appt_have_or_miss_capability.csv', index=False) -# ********************************************************************************************************************* - -# Save results for actual -# appt_have_or_miss_capability_actual = all_appts_can_run(curr_daily_capability_coarse) -# appt_have_or_miss_capability_actual.to_csv( -# outputlocation / 'human_resources' / 'actual' / 'appt_have_or_miss_capability.csv', index=False) From eac331e0828488e388d51a2413cec2862b7a3845 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 21 Feb 2022 12:58:48 +0000 Subject: [PATCH 065/131] Update the simulation period of scenario hsi --- .../hsi_in_typical_run/scenario_hsi_in_typical_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py index 385cb63422..d6469a0b3b 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py @@ -1,5 +1,5 @@ """ -This file is used to capture the HSI that are run during a typical simulation, 2010-2014. 
It defines a large population +This file is used to capture the HSI that are run during a typical simulation, 2010-2021. It defines a large population with all disease modules registered and an unconstrained (mode_appt_constraints=0) HealthSystem. Run on the remote batch system using: @@ -48,7 +48,7 @@ def __init__(self): super().__init__() self.seed = 0 self.start_date = Date(2010, 1, 1) - self.end_date = Date(2019, 1, 31) + self.end_date = Date(2021, 12, 31) self.pop_size = 20_000 self.number_of_draws = 1 self.runs_per_draw = 1 From 458bee43dccf543a4480de8a19976267be10ca21 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 21 Feb 2022 21:28:35 +0000 Subject: [PATCH 066/131] Refactor (and see if all tests pass) --- .../data_file_processing/formatting_healthsystem_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/scripts/data_file_processing/formatting_healthsystem_data.py b/src/scripts/data_file_processing/formatting_healthsystem_data.py index ea9d57edb8..795b78c240 100644 --- a/src/scripts/data_file_processing/formatting_healthsystem_data.py +++ b/src/scripts/data_file_processing/formatting_healthsystem_data.py @@ -50,11 +50,10 @@ workingfile = (path_to_dropbox / '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' / 'Malawi optimization model import_2022-02-11.xlsx') -# <-- point to the new data locally; need upload the excel file to shared dropbox working_file_old = (path_to_dropbox / '05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' / - 'Optimization model import_Malawi_20180315 v10.xlsx') # <-- point to the old data locally + 'Optimization model import_Malawi_20180315 v10.xlsx') path_to_auxiliaryfiles = (path_to_dropbox / '05 - Resources' / From 72dd1bb7eb2e0bf439ef6c03c1300ba86123adfb Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 21 Feb 2022 21:58:16 +0000 Subject: [PATCH 067/131] Update the simulation period of scenario hsi --- 
.../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index aa218f81ac..981f66da68 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -78,7 +78,7 @@ plt.savefig(make_graph_file_name('HSI_per_module_per_month')) plt.show() -# Plot the breakdown of all HSI, over all the years 2010-2018 +# Plot the breakdown of all HSI, over all the years 2010-2021 evs = pd.DataFrame(hsi.groupby(by=['Module']).size()) # Calculate the fraction evs[1] = 100*evs[0]/evs[0].sum() @@ -92,7 +92,7 @@ key=lambda x: x[2], reverse=True)) plt.legend(patches, labels, ncol=3, loc='lower center', fontsize='xx-small') -plt.title("HSI by Module (year 2010-2018)") +plt.title("HSI by Module (year 2010-2021)") plt.tight_layout() plt.savefig(make_graph_file_name('HSI_per_module')) plt.show() From f4ab441fe2bbe37ac934354b75ac7549500f7c90 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 21 Feb 2022 21:59:17 +0000 Subject: [PATCH 068/131] Update the simulation period of scenario hsi --- .../hsi_in_typical_run/scenario_hsi_in_typical_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py index d6469a0b3b..6baf307c76 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py @@ -48,7 +48,7 @@ def __init__(self): super().__init__() self.seed = 0 self.start_date = Date(2010, 1, 1) - self.end_date = Date(2021, 12, 31) + self.end_date = Date(2022, 1, 1) self.pop_size = 20_000 self.number_of_draws = 1 
self.runs_per_draw = 1 From 06371e83ff5cfee39b5152ef4185a433a7403da5 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 23 Feb 2022 15:26:33 +0000 Subject: [PATCH 069/131] Update figures --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 9 ++++++--- .../analysis_sankey_appt_and_hsi.ipynb | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 981f66da68..2a518a21f8 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -116,10 +116,13 @@ appts_by_treatment_id_short = \ hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0) -# A possible issue: +# Possible issues: # set(appts_by_treatment_id_short.columns)-set(appts_by_treatment_id.columns) -# the output is: {'NormalDelivery', 'VCTPositive'} -# not clear yet why the two appts are not in the table appts_by_treatment_id +# the output is: {'ComDelivery', 'VCTPositive'}, not clear why the two appts are not in the table appts_by_treatment_id +# There are inconsistencies in appts_by_treatment_id_short: +# e.g. breastCancer_StartTreatment and oesophagealCancer_StartTreatment call for different appts, +# Labour_ReceivesComprehensiveEmergencyObstetricCare calls for non appt. + # Plot... 
# See the Sankey plot in analysis_sankey_appt_and_hsi.ipynb (in the same folder) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb index c3c933a6b1..72dd747e99 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb @@ -179,7 +179,7 @@ " 'CareOfWomenDuringPregnancy_AntenatalOutpatientManagementOfAnaemia',\n", " 'CareOfWomenDuringPregnancy_PostAbortionCaseManagement', 'Labour_ReceivesSkilledBirthAttendanceDuringLabour',\n", " 'Contraception_FamilyPlanningAppt', 'GenericEmergencyFirstApptAtFacilityLevel1',\n", - " 'GenericFirstApptAtFacilityLevel0', 'OesophagealCancer_StartTreatment', 'OtherAdultCancer_StartTreatment',\n", + " 'GenericFirstApptAtFacilityLevel0', 'OesophagealCancer_StartTreatment', 'breastCancer_StartTreatment',\n", " 'Hiv_Circumcision', 'Hiv_Treatment_InitiationOrContinuation', 'Hiv_TestAndRefer']))\n", "\n", "nodes = {\n", From 7010aecf9a817a1db7cc3df690211c186ddeda9c Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 23 Feb 2022 16:32:48 +0000 Subject: [PATCH 070/131] Update fontsize in figures --- ...ysis_describe_healthsystem_capabilities.py | 84 +++++++++---------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py index b3b050a338..1159920391 100644 --- a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py @@ -10,10 +10,10 @@ from matplotlib.ticker import ScalarFormatter # Get the path of the folder that stores the data - three scenarios: actual, 
funded, funded_plus -workingpath = Path('./resources/healthsystem/human_resources/funded_plus') +workingpath = Path('./resources/healthsystem/human_resources/actual') # Define the path of output histograms - three scenarios: actual, funded, funded_plus -outputpath = Path('./outputs/healthsystem/human_resources/funded_plus') +outputpath = Path('./outputs/healthsystem/human_resources/actual') # Read data data = pd.read_csv(workingpath / 'ResourceFile_Daily_Capabilities.csv') @@ -23,12 +23,12 @@ data_districts = data.dropna(inplace=False) dat = pd.DataFrame(data_districts.groupby(['District', 'Officer_Category'], as_index=False)['Total_Mins_Per_Day'].sum()) tab = dat.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Minutes per day') -plt.xlabel('District') +ax = tab.plot.bar(stacked=True, fontsize='medium') +plt.ylabel('Average Total Minutes per Day', fontsize='large') +plt.xlabel('District', fontsize='large') ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') + loc='lower left', fontsize='medium') plt.savefig(outputpath / 'health_officer_minutes_per_district.pdf', bbox_inches='tight') @@ -36,12 +36,12 @@ data_districts = data.dropna(inplace=False) dat = pd.DataFrame(data_districts.groupby(['District', 'Officer_Category'], as_index=False)['Staff_Count'].sum()) tab = dat.pivot(index='District', columns='Officer_Category', values='Staff_Count') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Staff counts') -plt.xlabel('District') +ax = tab.plot.bar(stacked=True, fontsize='medium') +plt.ylabel('Staff counts', fontsize='large') +plt.xlabel('District', fontsize='large') ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') + loc='lower left', fontsize='medium') plt.savefig(outputpath / 'staff_allocation_per_district.pdf', bbox_inches='tight') @@ -49,10 +49,10 @@ # MINUTES PER HEALTH OFFICER CATEGORY BY LEVEL dat = 
pd.DataFrame(data.groupby(['Facility_Level', 'Officer_Category'], as_index=False)['Total_Mins_Per_Day'].sum()) tab = dat.pivot(index='Facility_Level', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) +ax = tab.plot.bar(stacked=True, fontsize='medium') # ax = tab.plot.bar(stacked=True, log=True) -plt.ylabel('Minutes per day') -plt.xlabel('Facility level') +plt.ylabel('Average Total Minutes per Day', fontsize='large') +plt.xlabel('Facility level', fontsize='large') ax.tick_params(axis='x', rotation=0) @@ -61,22 +61,22 @@ ax.yaxis.set_major_formatter(formatter) ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') + loc='lower left', fontsize='medium') plt.savefig(outputpath / 'health_officer_minutes_per_level.pdf', bbox_inches='tight') # STAFF COUNTS PER HEALTH OFFICER CATEGORY BY LEVEL dat = pd.DataFrame(data.groupby(['Facility_Level', 'Officer_Category'], as_index=False)['Staff_Count'].sum()) tab = dat.pivot(index='Facility_Level', columns='Officer_Category', values='Staff_Count') -ax = tab.plot.bar(stacked=True) +ax = tab.plot.bar(stacked=True, fontsize='medium') # ax = tab.plot.bar(stacked=True, log=True) -plt.ylabel('Staff counts') -plt.xlabel('Facility level') +plt.ylabel('Staff counts', fontsize='large') +plt.xlabel('Facility level', fontsize='large') ax.tick_params(axis='x', rotation=0) ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') + loc='lower left', fontsize='medium') plt.savefig(outputpath / 'staff_allocation_per_level.pdf', bbox_inches='tight') @@ -86,73 +86,73 @@ # Level 0 data_level = data.loc[data['Facility_Level'] == '0', :] tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Minutes per day at level 0') -plt.xlabel('District') +ax = tab.plot.bar(stacked=True, fontsize='medium') +plt.ylabel('Average Total Minutes per Day at Level 0', fontsize='large') 
+plt.xlabel('District', fontsize='large') ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') + loc='lower left', fontsize='medium') plt.savefig(outputpath / 'health_officer_minutes_per_district_level_0.pdf', bbox_inches='tight') # Level 1a data_level = data.loc[data['Facility_Level'] == '1a', :] tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Minutes per day at level 1a') -plt.xlabel('District') +ax = tab.plot.bar(stacked=True, fontsize='medium') +plt.ylabel('Average Total Minutes per Day at Level 1a', fontsize='large') +plt.xlabel('District', fontsize='large') ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') + loc='lower left', fontsize='medium') plt.savefig(outputpath / 'health_officer_minutes_per_district_level_1a.pdf', bbox_inches='tight') # Level 1b data_level = data.loc[data['Facility_Level'] == '1b', :] tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Minutes per day at level 1b') -plt.xlabel('District') +ax = tab.plot.bar(stacked=True, fontsize='medium') +plt.ylabel('Average Total Minutes per Day at Level 1b', fontsize='large') +plt.xlabel('District', fontsize='large') ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') + loc='lower left', fontsize='medium') plt.savefig(outputpath / 'health_officer_minutes_per_district_level_1b.pdf', bbox_inches='tight') # Level 2 data_level = data.loc[data['Facility_Level'] == '2', :] tab = data_level.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Minutes per day at level 2') -plt.xlabel('District') +ax = tab.plot.bar(stacked=True, fontsize='medium') +plt.ylabel('Average Total Minutes per Day at Level 2', fontsize='large') +plt.xlabel('District', fontsize='large') 
ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') + loc='lower left', fontsize='medium') plt.savefig(outputpath / 'health_officer_minutes_per_district_level_2.pdf', bbox_inches='tight') # Level 3 data_level = data.loc[data['Facility_Level'] == '3', :] tab = data_level.pivot(index='Region', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True) -plt.ylabel('Minutes per day at level 3') -plt.xlabel('Regional Referral Hospital') +ax = tab.plot.bar(stacked=True, fontsize='medium') +plt.ylabel('Average Total Minutes per Day at Level 3', fontsize='large') +plt.xlabel('Regional Referral Hospital', fontsize='large') ax.tick_params(axis='x', rotation=0) ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') + loc='lower left', fontsize='medium') plt.savefig(outputpath / 'health_officer_minutes_per_district_level_3.pdf', bbox_inches='tight') # Level 4 data_level = data.loc[data['Facility_Level'] == '4', :] tab = data_level.pivot(index='Facility_Name', columns='Officer_Category', values='Total_Mins_Per_Day') -ax = tab.plot.bar(stacked=True, width=0.1) -plt.ylabel('Minutes per day at level 4') -plt.xlabel('National resource hospital') +ax = tab.plot.bar(stacked=True, width=0.1, fontsize='medium') +plt.ylabel('Average Total Minutes per Day at Level 4', fontsize='large') +plt.xlabel('National resource hospital', fontsize='large') ax.tick_params(axis='x', rotation=0) ax.legend(ncol=3, bbox_to_anchor=(0, 1), - loc='lower left', fontsize='small') + loc='lower left', fontsize='medium') plt.savefig(outputpath / 'health_officer_minutes_per_district_level_4.pdf', bbox_inches='tight') From 2e691f203cc3398f7e52043e403b20e01e19ef1b Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 28 Mar 2022 22:12:30 +0100 Subject: [PATCH 071/131] get the appointment usage data of year 2019 --- .../analysis_hsi_in_typical_run.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git 
a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 2a518a21f8..8ce9afb67c 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -116,6 +116,18 @@ appts_by_treatment_id_short = \ hsi.set_index('TREATMENT_ID')['Number_By_Appt_Type_Code'].drop_duplicates().apply(pd.Series).fillna(0.0) +# get the appointment usage per month year 2019 +hsi_2019 = hsi.loc[hsi.date.dt.year == 2019].copy() +M = range(1, 13) +D = {} +appt_usage_2019 = pd.DataFrame() +for m in M: + a = hsi_2019.loc[hsi_2019.month == m, 'Number_By_Appt_Type_Code'].apply(pd.Series) + D[m] = pd.DataFrame(columns=[m], data=a.sum(axis=0)) + appt_usage_2019 = appt_usage_2019.join(D[m], how='outer') +# save +# appt_usage_2019.to_csv(outputspath / 'appt_usage_2019.csv') + # Possible issues: # set(appts_by_treatment_id_short.columns)-set(appts_by_treatment_id.columns) # the output is: {'ComDelivery', 'VCTPositive'}, not clear why the two appts are not in the table appts_by_treatment_id @@ -123,6 +135,5 @@ # e.g. breastCancer_StartTreatment and oesophagealCancer_StartTreatment call for different appts, # Labour_ReceivesComprehensiveEmergencyObstetricCare calls for non appt. - # Plot... 
# See the Sankey plot in analysis_sankey_appt_and_hsi.ipynb (in the same folder) From 35f451c8325e990c496c2842ffa01e2d3dfcf801 Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 29 Mar 2022 12:30:19 +0100 Subject: [PATCH 072/131] add log of demography module --- .../hsi_in_typical_run/scenario_hsi_in_typical_run.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py index 6baf307c76..aaf3b51d32 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py @@ -60,6 +60,7 @@ def log_configuration(self): 'custom_levels': { '*': logging.WARNING, 'tlo.methods.healthsystem': logging.INFO, + 'tlo.methods.demography': logging.INFO, } } From 35463d436cc38eedf2ed196b7d0455a11a679f60 Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 7 Jun 2022 13:13:13 +0100 Subject: [PATCH 073/131] update simulation end_date --- .../hsi_in_typical_run/scenario_hsi_in_typical_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py index aaf3b51d32..489b1e6f0a 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/scenario_hsi_in_typical_run.py @@ -48,7 +48,7 @@ def __init__(self): super().__init__() self.seed = 0 self.start_date = Date(2010, 1, 1) - self.end_date = Date(2022, 1, 1) + self.end_date = Date(2020, 1, 1) # looking at the usage from 2010 to 2019 self.pop_size = 20_000 self.number_of_draws = 1 self.runs_per_draw = 1 From 9de67b2807fe7bd4c7f6f32f4c35fbb4d73fa9c7 Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 7 Jun 2022 15:26:16 +0100 Subject: [PATCH 074/131] add year column to 
hsi event table --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index 8ce9afb67c..f826d9cd74 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -57,6 +57,7 @@ hsi = log['HSI_Event'] hsi["date"] = pd.to_datetime(hsi["date"]) hsi["month"] = hsi["date"].dt.month +hsi["year"] = hsi["date"].dt.year # Number of HSI that are taking place by originating module, by month year = 2016 From ae057a45bc264069886f7a7c2240ab9b93e59a80 Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 8 Nov 2022 18:38:38 +0000 Subject: [PATCH 075/131] Re-run jupter notebook to check sankey diagram; need to re-generate outputs from scenario_hsi_in_typical_run --- ...lysis_sankey_coarse_officer_and_appt.ipynb | 64 +++++++----------- .../analysis_sankey_appt_and_hsi.ipynb | 65 +++++-------------- 2 files changed, 39 insertions(+), 90 deletions(-) diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb index 94890a6ad6..a328f17124 100644 --- a/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb @@ -2,8 +2,6 @@ "cells": [ { "cell_type": "markdown", - "id": "4103c24d", - "metadata": {}, "source": [ "This file uses floweaver to generate Sankey diagrams that map coarse officers to appointments.\n", "\n", @@ -27,14 +25,21 @@ "Open in Browser\n", "\n", "Find the script and run all cells" - ] + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } }, { 
"cell_type": "code", - "execution_count": 30, + "execution_count": 1, "id": "913300ab", "metadata": { "pycharm": { + "is_executing": true, "name": "#%%\n" } }, @@ -94,29 +99,15 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "b3e06de5", "metadata": { "pycharm": { - "name": "#%%\n" + "name": "#%%\n", + "is_executing": true } }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7264a742fef54db29db4bcc7a1b84194", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "SankeyWidget(groups=[{'id': 'Officer', 'type': 'process', 'title': '', 'nodes': ['Officer^DCSA', 'Officer^Clin…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# The flow maps 9 officer categories and 11 appt cateogories at all levels\n", "flow_coarse_officer_appt = pd.DataFrame(\n", @@ -192,27 +183,16 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "fe0021cb", "metadata": { + "pycharm": { + "name": "#%%\n", + "is_executing": true + }, "scrolled": true }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bb749fcbe713456bbcfdeaeb940c8251", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(HBox(children=(SankeyWidget(groups=[{'id': 'Officer', 'type': 'process', 'title': '', 'nodes': …" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# The flow maps 9 officer categories and 51 appt types at an individaul level\n", "flow_coarse_officer = pd.DataFrame(\n", @@ -290,9 +270,9 @@ ], "metadata": { "kernelspec": { - "display_name": "PyCharm (TLOmodel)", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pycharm-551d1069" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -304,7 +284,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - 
"version": "3.8.10" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb index 72dd747e99..1aec511003 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_and_hsi.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 1, "metadata": { "pycharm": { "name": "#%%\n" @@ -63,21 +63,14 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": { "pycharm": { - "name": "#%%\n" + "name": "#%%\n", + "is_executing": true } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Results folder is: C:\\Users\\jdbb1\\Desktop\\TLOmodel\\outputs\\bshe@ic.ac.uk\\scenario_hsi_in_typical_run-2021-12-17T212823Z\n" - ] - } - ], + "outputs": [], "source": [ "# Declare the name of the file that specified the scenarios used in this run.\n", "scenario_filename = 'scenario_hsi_in_typical_run.py'\n", @@ -125,26 +118,14 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "metadata": { "pycharm": { - "name": "#%%\n" + "name": "#%%\n", + "is_executing": true } }, - "outputs": [ - { - "data": { - "text/plain": "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^IPAdmission', 'Appt^Inpati…", - "application/vnd.jupyter.widget-view+json": { - "version_major": 2, - "version_minor": 0, - "model_id": "7e191fcd3cd24c41a1f3476f35698a4c" - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Sankey 0 that map appt to hsi considering only appt footprint for each hsi\n", "\n", @@ -217,26 +198,14 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "metadata": { "pycharm": { - "name": "#%%\n" + "name": 
"#%%\n", + "is_executing": true } }, - "outputs": [ - { - "data": { - "text/plain": "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ANCSubsequent', 'Appt^Ante…", - "application/vnd.jupyter.widget-view+json": { - "version_major": 2, - "version_minor": 0, - "model_id": "f62cf5beaf3e4bc096cf5e3181722cbd" - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Sankey 1 that maps appt to hsi considering appt footprint for each hsi and number of each hsi\n", "\n", @@ -289,15 +258,15 @@ "# Generate and save Sankey\n", "sankey_num_appt_by_hsi = weave(sdd, num_appt_by_hsi, palette=palette, measures='value').to_widget(**size)\n", "\n", - "sankey_num_appt_by_hsi.auto_save_png(make_graph_file_name('Sankey_num_appt_by_hsi'))\n" + "sankey_num_appt_by_hsi.auto_save_png(make_graph_file_name('Sankey_num_appt_by_hsi'))" ] } ], "metadata": { "kernelspec": { - "display_name": "PyCharm (TLOmodel)", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pycharm-551d1069" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -309,7 +278,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.8.13" } }, "nbformat": 4, From 49b434fbd3c2ab191f69333bcf4ba67c42ff6f87 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 21 Nov 2022 16:13:34 +0000 Subject: [PATCH 076/131] change scenario_filename --- .../hsi_in_typical_run/analysis_hsi_in_typical_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py index f826d9cd74..37f0bcc781 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hsi_in_typical_run.py @@ -11,7 +11,7 @@ from tlo.analysis.utils import 
get_scenario_outputs, load_pickled_dataframes -scenario_filename = 'scenario_hsi_in_typical_run.py' +scenario_filename = 'long_run_all_diseases.py' # %% Declare usual paths: outputspath = Path('./outputs/bshe@ic.ac.uk') From 1cd75948074791a52df9b7f085996ae5954a2467 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 9 Jun 2023 16:36:17 +0100 Subject: [PATCH 077/131] add 'PharmDispensing' to the diagrams --- .../analysis_sankey_coarse_officer_and_appt.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb index a328f17124..6a874ce64d 100644 --- a/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_sankey_coarse_officer_and_appt.ipynb @@ -140,7 +140,7 @@ "# np.unique(flow_coarse_officer_appt['Appt_Cat']))\n", "partition_appt_cat = Partition.Simple('Appt_Cat',\n", " pd.array(['ConWithDCSA', 'IPOP', 'RMNCH', 'MISC',\n", - " 'HIV', 'TB', 'NUTRITION', 'LABORATORY',\n", + " 'HIV', 'TB', 'NUTRITION', 'PharmDispensing', 'LABORATORY',\n", " 'DENTAL', 'RADIOGRAPHY', 'MENTAL']))\n", "\n", "partition_facility_level = Partition.Simple('Facility_Level',\n", From 3eef04fcbd43b36c790cf6d78b59d84c5b63edf7 Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 22 Jun 2023 16:11:47 +0100 Subject: [PATCH 078/131] compare actual and funded plus scenarios --- ...ysis_describe_healthsystem_capabilities.py | 40 +++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py index 1159920391..2786d0af96 100644 --- 
a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py @@ -10,15 +10,23 @@ from matplotlib.ticker import ScalarFormatter # Get the path of the folder that stores the data - three scenarios: actual, funded, funded_plus -workingpath = Path('./resources/healthsystem/human_resources/actual') +workingpath = Path('./resources/healthsystem/human_resources') +wp_actual = workingpath / 'actual' +wp_funded_plus = workingpath / 'funded_plus' # Define the path of output histograms - three scenarios: actual, funded, funded_plus outputpath = Path('./outputs/healthsystem/human_resources/actual') +op_actual = outputpath / 'actual' +op_funded_plus = outputpath / 'funded_plus' -# Read data -data = pd.read_csv(workingpath / 'ResourceFile_Daily_Capabilities.csv') +# Read actual data +data = pd.read_csv(wp_actual / 'ResourceFile_Daily_Capabilities.csv') +# Read funded_plus data +data_funded_plus = pd.read_csv(wp_funded_plus / 'ResourceFile_Daily_Capabilities.csv') + +# ***for actual scenario*** # MINUTES PER HEALTH OFFICER CATEGORY BY DISTRICT data_districts = data.dropna(inplace=False) dat = pd.DataFrame(data_districts.groupby(['District', 'Officer_Category'], as_index=False)['Total_Mins_Per_Day'].sum()) @@ -156,3 +164,29 @@ loc='lower left', fontsize='medium') plt.savefig(outputpath / 'health_officer_minutes_per_district_level_4.pdf', bbox_inches='tight') + +# ***end of actual scenario*** + +# ***compare actual and funded_plus scenarios*** +total_actual = data.drop_duplicates().groupby(['Officer_Category']).agg( + {'Total_Mins_Per_Day': 'sum', 'Staff_Count': 'sum'}).reset_index() +total_actual['Total_Mins_Per_Year'] = total_actual['Total_Mins_Per_Day'] * 365.25 +total_actual['Scenario'] = 'Actual' +total_actual[['Abs_Change_Staff_Count', 'Rel_Change_Staff_Count', 'Abs_Change_Total_Mins', 'Rel_Change_Total_Mins']] = 0 + 
+total_funded_plus = data_funded_plus.drop_duplicates().groupby(['Officer_Category']).agg( + {'Total_Mins_Per_Day': 'sum', 'Staff_Count': 'sum'}).reset_index() +total_funded_plus['Total_Mins_Per_Year'] = total_funded_plus['Total_Mins_Per_Day'] * 365.25 +total_funded_plus['Scenario'] = 'Establishment' + +assert (total_actual.Officer_Category == total_funded_plus.Officer_Category).all() +total_funded_plus['Abs_Change_Staff_Count'] = total_funded_plus['Staff_Count'] - total_actual['Staff_Count'] +total_funded_plus['Rel_Change_Staff_Count'] = (total_funded_plus['Staff_Count'] - total_actual['Staff_Count'] + ) / total_actual['Staff_Count'] +total_funded_plus['Abs_Change_Total_Mins'] = (total_funded_plus['Total_Mins_Per_Year'] - + total_actual['Total_Mins_Per_Year']) +total_funded_plus['Rel_Change_Total_Mins'] = (total_funded_plus['Total_Mins_Per_Year'] - + total_actual['Total_Mins_Per_Year'] + ) / total_actual['Total_Mins_Per_Year'] + +total = pd.concat([total_actual, total_funded_plus]).reset_index(drop=True) From 81071ca910c745396d6e264ee8c1429d2cee1449 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 14 Jul 2023 14:18:28 +0100 Subject: [PATCH 079/131] create the scale run script to get most recent simulation results --- .../hsi_in_typical_run/10_year_scale_run.py | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 src/scripts/healthsystem/hsi_in_typical_run/10_year_scale_run.py diff --git a/src/scripts/healthsystem/hsi_in_typical_run/10_year_scale_run.py b/src/scripts/healthsystem/hsi_in_typical_run/10_year_scale_run.py new file mode 100644 index 0000000000..3c45709813 --- /dev/null +++ b/src/scripts/healthsystem/hsi_in_typical_run/10_year_scale_run.py @@ -0,0 +1,86 @@ +""" +This file defines a batch run of a large population for a long time with all disease modules and full use of HSIs +It's used for analysis of TLO implementation re. HCW and health services usage, for the paper on HCW. 
+ +Run on the batch system using: +```tlo batch-submit src/scripts/healthsystem/hsi_in_typical_run/10_year_scale_run.py``` + +or locally using: + ```tlo scenario-run src/scripts/healthsystem/hsi_in_typical_run/10_year_scale_run.py``` + +""" +from pathlib import Path +from typing import Dict + +from tlo import Date, logging +from tlo.analysis.utils import get_parameters_for_status_quo, mix_scenarios +from tlo.methods.fullmodel import fullmodel +from tlo.methods.scenario_switcher import ScenarioSwitcher +from tlo.scenario import BaseScenario + + +class LongRun(BaseScenario): + def __init__(self): + super().__init__() + self.seed = 0 + self.start_date = Date(2010, 1, 1) + self.end_date = Date(2020, 1, 1) + self.pop_size = 20_000 + self._scenarios = self._get_scenarios() + self.number_of_draws = len(self._scenarios) + self.runs_per_draw = 10 + + def log_configuration(self): + return { + 'filename': 'long_run', # <- (specified only for local running) + 'directory': './outputs', # <- (specified only for local running) + 'custom_levels': { + '*': logging.WARNING, + 'tlo.methods.demography': logging.INFO, + 'tlo.methods.demography.detail': logging.WARNING, + 'tlo.methods.healthburden': logging.INFO, + 'tlo.methods.healthsystem': logging.INFO, + 'tlo.methods.healthsystem.summary': logging.INFO, + } + } + + def modules(self): + return fullmodel(resourcefilepath=self.resources) + [ScenarioSwitcher(resourcefilepath=self.resources)] + + def draw_parameters(self, draw_number, rng): + return list(self._scenarios.values())[draw_number] + + def _get_scenarios(self) -> Dict[str, Dict]: + """Return the Dict with values for the parameters that are changed, keyed by a name for the scenario.""" + + return { + "Status Quo": + mix_scenarios( + get_parameters_for_status_quo() + ), + + "Establishment HCW": + mix_scenarios( + get_parameters_for_status_quo(), + {'HealthSystem': {'use_funded_or_actual_staffing': 'funded_plus'}} + ), + + "Perfect Healthcare Seeking": + mix_scenarios( + 
get_parameters_for_status_quo(), + {'ScenarioSwitcher': {'max_healthsystem_function': False, 'max_healthcare_seeking': True}}, + ), + + "Establishment HCW + Perfect Healthcare Seeking": + mix_scenarios( + get_parameters_for_status_quo(), + {'HealthSystem': {'use_funded_or_actual_staffing': 'funded_plus'}}, + {'ScenarioSwitcher': {'max_healthsystem_function': False, 'max_healthcare_seeking': True}}, + ), + } + + +if __name__ == '__main__': + from tlo.cli import scenario_run + + scenario_run([__file__]) From 750778b89210fd69dbaca0a8dcb11e08aed0acca Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 14 Jul 2023 15:17:11 +0100 Subject: [PATCH 080/131] minor update --- .../healthsystem/hsi_in_typical_run/10_year_scale_run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/10_year_scale_run.py b/src/scripts/healthsystem/hsi_in_typical_run/10_year_scale_run.py index 3c45709813..a4b4beeac0 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/10_year_scale_run.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/10_year_scale_run.py @@ -32,8 +32,8 @@ def __init__(self): def log_configuration(self): return { - 'filename': 'long_run', # <- (specified only for local running) - 'directory': './outputs', # <- (specified only for local running) + 'filename': 'scale_run_for_hcw_analysis', + 'directory': Path('./outputs'), # <- (specified only for local running) 'custom_levels': { '*': logging.WARNING, 'tlo.methods.demography': logging.INFO, From cffd1eaedd117ab230237c91bdf4052697fcb71f Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 17 Jul 2023 10:57:08 +0100 Subject: [PATCH 081/131] Revert "Merge HR resources and Consumables at facility levels 1b and 2 so that squeeze is more balanced (#979)" This reverts commit 0f139b2b3d32f5342c7d5ac1793b9c7fce9f9db2. 
--- ..._compare_appt_usage_real_and_simulation.py | 2 - .../analysis_hsi_descriptions.py | 3 +- src/tlo/methods/healthsystem.py | 142 +----------------- tests/test_alri.py | 37 +++-- tests/test_healthcareseeking.py | 20 ++- 5 files changed, 32 insertions(+), 172 deletions(-) diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py index e96bd591eb..54410ea030 100644 --- a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py +++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py @@ -162,11 +162,9 @@ def format_and_save(_fig, _ax, _name_of_plot): make_graph_file_name = lambda stub: output_folder / f"{PREFIX_ON_FILENAME}_{stub}.png" # noqa: E731 simulation_usage = get_simulation_usage(results_folder) - simulation_usage = simulation_usage.reset_index().replace({'index': {'1b': '2'}}).groupby('index').sum() real_usage = get_real_usage(resourcefilepath) real_usage = adjust_real_usage_on_mentalall(real_usage) - real_usage = real_usage.reset_index().replace({'Facility_Level': {'1b': '2'}}).groupby('Facility_Level').sum() # Plot Simulation vs Real usage (Across all levels) (trimmed to 0.1 and 10) rel_diff_all_levels = ( diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_hsi_descriptions.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_hsi_descriptions.py index ae5471f59b..47d00d14ce 100644 --- a/src/scripts/calibration_analyses/analysis_scripts/analysis_hsi_descriptions.py +++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_hsi_descriptions.py @@ -391,7 +391,7 @@ def get_share_of_time_used_for_each_officer_at_each_level(_df): ).set_index('Facility_ID') def find_level_for_facility(id): - return mfl.loc[id].Facility_Level if mfl.loc[id].Facility_Level != '1b' else '2' + 
return mfl.loc[id].Facility_Level color_for_level = {'0': 'blue', '1a': 'yellow', '1b': 'green', '2': 'grey', '3': 'orange', '4': 'black', '5': 'white'} @@ -422,7 +422,6 @@ def find_level_for_facility(id): capacity_unstacked_average = capacity_by_facility.unstack().mean() # levels = [find_level_for_facility(i) if i != 'All' else 'All' for i in capacity_unstacked_average.index] xpos_for_level = dict(zip((color_for_level.keys()), range(len(color_for_level)))) - xpos_for_level.update({'1b': 2, '2': 2, '3': 3, '4': 4, '5': 5}) for id, val in capacity_unstacked_average.items(): if id != 'All': _level = find_level_for_facility(id) diff --git a/src/tlo/methods/healthsystem.py b/src/tlo/methods/healthsystem.py index b76b37c980..24a522154a 100644 --- a/src/tlo/methods/healthsystem.py +++ b/src/tlo/methods/healthsystem.py @@ -16,7 +16,6 @@ import numpy as np import pandas as pd -from pandas.testing import assert_series_equal import tlo from tlo import Date, DateOffset, Module, Parameter, Property, Types, logging @@ -37,74 +36,6 @@ logger_summary = logging.getLogger(f"{__name__}.summary") logger_summary.setLevel(logging.INFO) -# Declare the level which will be used to represent the merging of levels '1b' and '2' -LABEL_FOR_MERGED_FACILITY_LEVELS_1B_AND_2 = '2' - -# Declare the assumption for the availability of consumables at the merged levels '1b' and '2'. This can be a -# list of facility_levels over which an average is taken (within a district): e.g. ['1b', '2']. -AVAILABILITY_OF_CONSUMABLES_AT_MERGED_LEVELS_1B_AND_2 = ['1b'] # <-- Implies that availability at merged level '1b & 2' -# is equal to availability at level '1b'. This is -# reasonable because the '1b' are more numerous than -# those of '2' and have more overall capacity, so -# probably account for the majority of the -# interactions. 
- - -def adjust_facility_level_to_merge_1b_and_2(level: str) -> str: - """Adjust the facility level of an HSI_Event so that HSI_Events scheduled at level '1b' and '2' are both directed - to level '2'""" - return level if level not in ('1b', '2') else LABEL_FOR_MERGED_FACILITY_LEVELS_1B_AND_2 - - -def pool_capabilities_at_levels_1b_and_2(df_original: pd.DataFrame) -> pd.DataFrame: - """Return a modified version of the imported capabilities DataFrame to reflect that the capabilities of level 1b - are pooled with those of level 2, and all labelled as level 2.""" - - # Find total minutes and staff count after the re-allocation of capabilities from '1b' to '2' - tots_after_reallocation = df_original \ - .assign(Facility_Level=lambda df: df.Facility_Level.replace({ - '1b': LABEL_FOR_MERGED_FACILITY_LEVELS_1B_AND_2, - '2': LABEL_FOR_MERGED_FACILITY_LEVELS_1B_AND_2}) - ) \ - .groupby(by=['Facility_Level', 'District', 'Region', 'Officer_Category'], dropna=False)[[ - 'Total_Mins_Per_Day', 'Staff_Count']] \ - .sum() \ - .reset_index() - - # Construct a new version of the dataframe that uses the new totals - df_updated = df_original \ - .drop(columns=['Total_Mins_Per_Day', 'Staff_Count'])\ - .merge(tots_after_reallocation, - on=['Facility_Level', 'District', 'Region', 'Officer_Category'], - how='left', - ) \ - .assign( - Total_Mins_Per_Day=lambda df: df.Total_Mins_Per_Day.fillna(0.0), - Staff_Count=lambda df: df.Staff_Count.fillna(0.0) - ) - - # Check that the *total* number of minutes per officer in each district/region is the same as before the change - assert_series_equal( - df_updated.groupby(by=['District', 'Region', 'Officer_Category'], dropna=False)['Total_Mins_Per_Day'].sum(), - df_original.groupby(by=['District', 'Region', 'Officer_Category'], dropna=False)['Total_Mins_Per_Day'].sum() - ) - - df_updated.groupby('Facility_Level')['Total_Mins_Per_Day'].sum() - - # Check size/shape of the updated dataframe is as expected - assert df_updated.shape == df_original.shape - 
assert (df_updated.dtypes == df_original.dtypes).all() - - for _level in ['0', '1a', '3', '4']: - assert df_original.loc[df_original.Facility_Level == _level].equals( - df_updated.loc[df_updated.Facility_Level == _level]) - - assert df_updated.loc[df_updated.Facility_Level == LABEL_FOR_MERGED_FACILITY_LEVELS_1B_AND_2, - 'Total_Mins_Per_Day'].sum() \ - == df_updated.loc[df_updated.Facility_Level.isin(['1b', '2']), 'Total_Mins_Per_Day'].sum() - - return df_updated - class FacilityInfo(NamedTuple): """Information about a specific health facility.""" @@ -346,9 +277,6 @@ def initialise(self): """ health_system = self.sim.modules['HealthSystem'] - # Over-write ACCEPTED_FACILITY_LEVEL to to redirect all '1b' appointments to '2' - self.ACCEPTED_FACILITY_LEVEL = adjust_facility_level_to_merge_1b_and_2(self.ACCEPTED_FACILITY_LEVEL) - if not isinstance(self.target, tlo.population.Population): self.facility_info = health_system.get_facility_info(self) @@ -803,12 +731,9 @@ def pre_initialise_population(self): self.bed_days.pre_initialise_population() # Initialise the Consumables class - self.consumables = Consumables( - data=self.update_consumables_availability_to_represent_merging_of_levels_1b_and_2( - self.parameters['availability_estimates']), - rng=rng_for_consumables, - availability=self.get_cons_availability() - ) + self.consumables = Consumables(data=self.parameters['availability_estimates'], + rng=rng_for_consumables, + availability=self.get_cons_availability()) # Convert PriorityRank dataframe to dictionary if self.adopt_priority_policy: @@ -1000,8 +925,7 @@ def format_daily_capabilities(self, use_funded_or_actual_staffing: str) -> pd.Se """ # Get the capabilities data imported (according to the specified underlying assumptions). 
- capabilities = pool_capabilities_at_levels_1b_and_2( - self.parameters[f'Daily_Capabilities_{use_funded_or_actual_staffing}']) + capabilities = self.parameters[f'Daily_Capabilities_{use_funded_or_actual_staffing}'] capabilities = capabilities.rename(columns={'Officer_Category': 'Officer_Type_Code'}) # neaten # Create dataframe containing background information about facility and officer types @@ -1054,63 +978,6 @@ def format_daily_capabilities(self, use_funded_or_actual_staffing: str) -> pd.Se # return the pd.Series of `Total_Minutes_Per_Day' indexed for each type of officer at each facility return capabilities_ex['Total_Minutes_Per_Day'] - def update_consumables_availability_to_represent_merging_of_levels_1b_and_2(self, df_original): - """To represent that facility levels '1b' and '2' are merged together under the label '2', we replace the - availability of consumables at level 2 with new values.""" - - # get master facilities list - mfl = self.parameters['Master_Facilities_List'] - - # merge in facility level - dfx = df_original.merge( - mfl[['Facility_ID', 'District', 'Facility_Level']], - left_on='Facility_ID', - right_on='Facility_ID', - how='left' - ) - - # compute the updated availability at the merged level '1b' and '2' - availability_at_1b_and_2 = \ - dfx.drop(dfx.index[~dfx['Facility_Level'].isin(AVAILABILITY_OF_CONSUMABLES_AT_MERGED_LEVELS_1B_AND_2)]) \ - .groupby(by=['District', 'month', 'item_code'])['available_prop'] \ - .mean() \ - .reset_index()\ - .assign(Facility_Level=LABEL_FOR_MERGED_FACILITY_LEVELS_1B_AND_2) - - # assign facility_id - availability_at_1b_and_2 = availability_at_1b_and_2.merge( - mfl[['Facility_ID', 'District', 'Facility_Level']], - left_on=['District', 'Facility_Level'], - right_on=['District', 'Facility_Level'], - how='left' - ) - - # assign these availabilities to the corresponding level 2 facilities (dropping the original values) - df_updated = pd.concat([ - dfx.drop(dfx.index[dfx['Facility_Level'] == 
LABEL_FOR_MERGED_FACILITY_LEVELS_1B_AND_2]), - availability_at_1b_and_2[dfx.columns], - ] - ).drop(columns=['Facility_Level', 'District'])\ - .sort_values(['Facility_ID', 'month', 'item_code']).reset_index(drop=True) - - # check size/shape/dtypes preserved - assert df_updated.shape == df_original.shape - assert (df_updated.columns == df_original.columns).all() - assert (df_updated.dtypes == df_original.dtypes).all() - - # check values the same for everything apart from the facility level '2' facilities - facilities_with_any_differences = set( - df_updated.loc[ - ~(df_original == df_updated).all(axis=1), - 'Facility_ID'] - ) - level2_facilities = set( - mfl.loc[mfl['Facility_Level'] == '2', 'Facility_ID'] - ) - assert facilities_with_any_differences.issubset(level2_facilities) - - return df_updated - def get_service_availability(self) -> List[str]: """Returns service availability. (Should be equal to what is specified by the parameter, but overwrite with what was provided in argument if an argument was specified -- provided for backward compatibility/debugging.)""" @@ -1726,7 +1593,6 @@ def log_current_capabilities_and_usage(self): # Compute Fraction of Time For Each Officer and level officer = [_f.rsplit('Officer_')[1] for _f in comparison.index] level = [self._facility_by_facility_id[int(_fac_id)].level for _fac_id in facility_id] - level = list(map(lambda x: x.replace('1b', '2'), level)) summary_by_officer = comparison.groupby(by=[officer, level])[['Total_Minutes_Per_Day', 'Minutes_Used']].sum() summary_by_officer['Fraction_Time_Used'] = ( summary_by_officer['Minutes_Used'] / summary_by_officer['Total_Minutes_Per_Day'] diff --git a/tests/test_alri.py b/tests/test_alri.py index 8f79f912b2..ffcf66c97f 100644 --- a/tests/test_alri.py +++ b/tests/test_alri.py @@ -1101,9 +1101,9 @@ def test_treatment_pathway_if_all_consumables_severe_case(seed, tmpdir): # If the child is older than 2 months (classification will be `danger_signs_pneumonia`). 
# - If Treatments Works --> No follow-up assert [ - ('FirstAttendance_Emergency', '2'), # <-- these would all be '1b' if levels '1b' and '2' are separate - ('Alri_Pneumonia_Treatment_Outpatient', '2'), - ('Alri_Pneumonia_Treatment_Inpatient', '2'), + ('FirstAttendance_Emergency', '1b'), + ('Alri_Pneumonia_Treatment_Outpatient', '1b'), + ('Alri_Pneumonia_Treatment_Inpatient', '1b'), ] == generate_hsi_sequence(sim=get_sim(seed=seed, tmpdir=tmpdir, cons_available='all'), incident_case_event=AlriIncidentCase_Lethal_DangerSigns_Pneumonia, treatment_effect='perfectly_effective', @@ -1112,10 +1112,10 @@ def test_treatment_pathway_if_all_consumables_severe_case(seed, tmpdir): # - If Treatment Does Not Work --> One follow-up as an inpatient. assert [ - ('FirstAttendance_Emergency', '2'), # <-- these would all be '1b' if levels '1b' and '2' are separate - ('Alri_Pneumonia_Treatment_Outpatient', '2'), - ('Alri_Pneumonia_Treatment_Inpatient', '2'), - ('Alri_Pneumonia_Treatment_Inpatient_Followup', '2') + ('FirstAttendance_Emergency', '1b'), + ('Alri_Pneumonia_Treatment_Outpatient', '1b'), + ('Alri_Pneumonia_Treatment_Inpatient', '1b'), + ('Alri_Pneumonia_Treatment_Inpatient_Followup', '1b') ] == generate_hsi_sequence(sim=get_sim(seed=seed, tmpdir=tmpdir, cons_available='all'), incident_case_event=AlriIncidentCase_Lethal_DangerSigns_Pneumonia, treatment_effect='perfectly_ineffective', @@ -1125,9 +1125,9 @@ def test_treatment_pathway_if_all_consumables_severe_case(seed, tmpdir): # If the child is younger than 2 months # - If Treatments Works --> No follow-up assert [ - ('FirstAttendance_Emergency', '2'), # <-- these would all be '1b' if levels '1b' and '2' are separate - ('Alri_Pneumonia_Treatment_Outpatient', '2'), - ('Alri_Pneumonia_Treatment_Inpatient', '2'), + ('FirstAttendance_Emergency', '1b'), + ('Alri_Pneumonia_Treatment_Outpatient', '1b'), + ('Alri_Pneumonia_Treatment_Inpatient', '1b'), ] == generate_hsi_sequence(sim=get_sim(seed=seed, tmpdir=tmpdir, 
cons_available='all'), incident_case_event=AlriIncidentCase_Lethal_DangerSigns_Pneumonia, age_of_person_under_2_months=True, @@ -1136,10 +1136,10 @@ def test_treatment_pathway_if_all_consumables_severe_case(seed, tmpdir): # - If Treatment Does Not Work --> One follow-up as an inpatient. assert [ - ('FirstAttendance_Emergency', '2'), # <-- these would all be '1b' if levels '1b' and '2' are separate - ('Alri_Pneumonia_Treatment_Outpatient', '2'), - ('Alri_Pneumonia_Treatment_Inpatient', '2'), - ('Alri_Pneumonia_Treatment_Inpatient_Followup', '2'), + ('FirstAttendance_Emergency', '1b'), + ('Alri_Pneumonia_Treatment_Outpatient', '1b'), + ('Alri_Pneumonia_Treatment_Inpatient', '1b'), + ('Alri_Pneumonia_Treatment_Inpatient_Followup', '1b'), ] == generate_hsi_sequence(sim=get_sim(seed=seed, tmpdir=tmpdir, cons_available='all'), incident_case_event=AlriIncidentCase_Lethal_DangerSigns_Pneumonia, age_of_person_under_2_months=True, @@ -1156,8 +1156,7 @@ def test_treatment_pathway_if_no_consumables_mild_case(seed, tmpdir): ('FirstAttendance_NonEmergency', '0'), ('Alri_Pneumonia_Treatment_Outpatient', '0'), ('Alri_Pneumonia_Treatment_Outpatient', '1a'), # <-- referral due to lack of consumables - # ('Alri_Pneumonia_Treatment_Outpatient', '1b'), # <-- referral due to lack of consumables - # (would occur if levels '1b' and '2' are separate) + ('Alri_Pneumonia_Treatment_Outpatient', '1b'), # <-- referral due to lack of consumables ('Alri_Pneumonia_Treatment_Outpatient', '2'), # <-- referral due to lack of consumables ('Alri_Pneumonia_Treatment_Inpatient_Followup', '2'), # <-- follow-up because treatment not successful ] == generate_hsi_sequence(sim=get_sim(seed=seed, tmpdir=tmpdir, cons_available='none'), @@ -1171,9 +1170,9 @@ def test_treatment_pathway_if_no_consumables_severe_case(seed, tmpdir): # Severe case and not available consumables --> successive referrals up to level 2, following emergency # appointment, plus follow-up appointment because treatment was not successful. 
assert [ - ('FirstAttendance_Emergency', '2'), - ('Alri_Pneumonia_Treatment_Outpatient', '2'), - # ('Alri_Pneumonia_Treatment_Inpatient', '1b'), # <-- would occur if levels '1b' and '2' are separate + ('FirstAttendance_Emergency', '1b'), + ('Alri_Pneumonia_Treatment_Outpatient', '1b'), + ('Alri_Pneumonia_Treatment_Inpatient', '1b'), ('Alri_Pneumonia_Treatment_Inpatient', '2'), # <-- referral due to lack of consumables ('Alri_Pneumonia_Treatment_Inpatient_Followup', '2'), # <-- follow-up because treatment not successful ] == generate_hsi_sequence(sim=get_sim(seed=seed, tmpdir=tmpdir, cons_available='none'), diff --git a/tests/test_healthcareseeking.py b/tests/test_healthcareseeking.py index a53006166c..e85ee4226c 100644 --- a/tests/test_healthcareseeking.py +++ b/tests/test_healthcareseeking.py @@ -1579,19 +1579,17 @@ def on_birth(self, mother, child): assert {'1a': 1.0} == get_events_scheduled_following_hcs_poll( prob_non_emergency_care_seeking_by_level=[0.0, 1.0, 0.0, 0.0]) - # 100% chance that non-emergency-appointment is at level ('1b') {occurs at level labelled as '2' with merge of - # levels '1b & 2') - assert {'2': 1.0} == get_events_scheduled_following_hcs_poll( + # 100% chance that non-emergency-appointment is at level ('1b') + assert {'1b': 1.0} == get_events_scheduled_following_hcs_poll( prob_non_emergency_care_seeking_by_level=[0.0, 0.0, 1.0, 0.0]) - # 100% chance that non-emergency-appointment is at level ('2') - assert {'2': 1.0} == get_events_scheduled_following_hcs_poll( - prob_non_emergency_care_seeking_by_level=[0.0, 0.0, 0.0, 1.0]) - - # A mixture of 0 / 1a / (1b) / 2 - props = get_events_scheduled_following_hcs_poll(prob_non_emergency_care_seeking_by_level=[0.25, 0.25, 0.25, 0.25]) - assert ('0' in set(props.keys())) and ('1a' in set(props.keys())) and ('2' in set(props.keys())) - assert all(np.array(list(props.values())) > 0) + # A mixture of 0 / 1a / 1b / 2 + props = np.array(list( + get_events_scheduled_following_hcs_poll( + 
prob_non_emergency_care_seeking_by_level=[0.25, 0.25, 0.25, 0.25]).values() + )) + assert 4 == len(props) + assert all(props > 0) def test_custom_function_is_equivalent_to_linear_model(seed): From 5f73293c425bdc335d257d0958f1bcdf8a5a39bc Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 21 Jul 2023 17:17:24 +0100 Subject: [PATCH 082/131] revert the changes of merging levels 1b and 2 --- .../analysis_compare_appt_usage_real_and_simulation.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py index 2f6a9d2dd6..f7f2ee887c 100644 --- a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py +++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py @@ -51,7 +51,6 @@ def unpack_nested_dict_in_series(_raw: pd.Series): .loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), 'Number_By_Appt_Type_Code_And_Level'] \ .pipe(unpack_nested_dict_in_series) \ .rename(columns=appt_dict, level=1) \ - .rename(columns={'1b': '2'}, level=0) \ .groupby(level=[0, 1], axis=1).sum() \ .mean(axis=0) # mean over each year (row) @@ -87,7 +86,6 @@ def unpack_nested_dict_in_series(_raw: pd.Series): .loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), 'Number_By_Appt_Type_Code_And_Level'] \ .pipe(unpack_nested_dict_in_series) \ .rename(columns=appt_dict, level=1) \ - .rename(columns={'1b': '2'}, level=0) \ .groupby(level=[0, 1], axis=1).sum() \ .mean(axis=0) # mean over each year (row) @@ -270,7 +268,7 @@ def get_real_usage(resourcefilepath, adjusted=True) -> pd.DataFrame: annual_usage_by_level = pd.concat([totals_by_year.reset_index(), totals_by_year_TB.reset_index()], axis=0) # group levels 1b and 2 into 2 - annual_usage_by_level['Facility_Level'] = 
annual_usage_by_level['Facility_Level'].replace({'1b': '2'}) + # annual_usage_by_level['Facility_Level'] = annual_usage_by_level['Facility_Level'].replace({'1b': '2'}) annual_usage_by_level = annual_usage_by_level.groupby( ['Year', 'Appt_Type', 'Facility_Level'])['Usage'].sum().reset_index() From d1933482330c82352233ad5e7317353923954b95 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 21 Jul 2023 17:38:44 +0100 Subject: [PATCH 083/131] trim rel diff (for new plots) to [0.1, 10.0] --- .../analysis_compare_appt_usage_real_and_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py index f7f2ee887c..ec7fb5da81 100644 --- a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py +++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py @@ -390,7 +390,7 @@ def format_real_usage(): _rel_diff['upper_error'] = (_rel_diff['upper'] - _rel_diff['mean']) _asymmetric_error = [_rel_diff['lower_error'].values, _rel_diff['upper_error'].values] - _rel_diff = pd.DataFrame(_rel_diff['mean']) + _rel_diff = pd.DataFrame(_rel_diff['mean'].clip(lower=0.1, upper=10.0)) return _rel_diff, _asymmetric_error From 4e7eae2d3ac1d68f52983c8a013a8279a497bd6b Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 21 Jul 2023 17:39:05 +0100 Subject: [PATCH 084/131] Revert "trim rel diff (for new plots) to [0.1, 10.0]" This reverts commit d1933482330c82352233ad5e7317353923954b95. 
--- .../analysis_compare_appt_usage_real_and_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py index ec7fb5da81..f7f2ee887c 100644 --- a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py +++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py @@ -390,7 +390,7 @@ def format_real_usage(): _rel_diff['upper_error'] = (_rel_diff['upper'] - _rel_diff['mean']) _asymmetric_error = [_rel_diff['lower_error'].values, _rel_diff['upper_error'].values] - _rel_diff = pd.DataFrame(_rel_diff['mean'].clip(lower=0.1, upper=10.0)) + _rel_diff = pd.DataFrame(_rel_diff['mean']) return _rel_diff, _asymmetric_error From 25162a41e6e9d770e15978eb267e007e4ba243be Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 25 Jul 2023 17:24:37 +0100 Subject: [PATCH 085/131] add todo for HCW paper plots --- .../analysis_compare_appt_usage_real_and_simulation.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py index f7f2ee887c..713d2652a8 100644 --- a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py +++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py @@ -300,6 +300,16 @@ def get_real_usage(resourcefilepath, adjusted=True) -> pd.DataFrame: return average_annual_by_level, annual_usage_by_level_with_ci, annual_usage_with_ci +# todo: plot hcw time usage against capability per cadre/facility level/disease (represented by short treatment id), +# and comparing 4 
scenarios, i.e., Actual/Establishment(funded_plus) HCW * Default/Maximal health care seeking +def get_expected_appt_time(resourcefilepath) -> pd.DataFrame: + """ This is to return the expected time requirements per appointment type per coarse cadre per facility level.""" + expected_appt_time = pd.read_csv( + resourcefilepath / 'healthsystem' / 'organisation' / 'ResourceFile_Appt_Time_Table.csv') + + return expected_appt_time + + def apply(results_folder: Path, output_folder: Path, resourcefilepath: Path = None): """Compare appointment usage from model output with real appointment usage. The real appointment usage is collected from DHIS2 system and HIV Dept.""" From 648e223760b0037b64f3f3738bdcfd5cf952369f Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 26 Jul 2023 10:10:29 +0100 Subject: [PATCH 086/131] create a separate file for HCW paper plots --- ...alysis_hcw_usage_by_appt_and_by_disease.py | 570 ++++++++++++++++++ 1 file changed, 570 insertions(+) create mode 100644 src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py new file mode 100644 index 0000000000..58a5f66a69 --- /dev/null +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -0,0 +1,570 @@ +from pathlib import Path + +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +from tlo import Date +from tlo.analysis.utils import extract_results, get_scenario_outputs, summarize + +PREFIX_ON_FILENAME = '6' + +# Declare period for which the results will be generated (defined inclusively) +TARGET_PERIOD = (Date(2015, 1, 1), Date(2019, 12, 31)) + +# appointment dict to match model and data +appt_dict = {'Under5OPD': 'OPD', + 'Over5OPD': 'OPD', + 'AntenatalFirst': 'AntenatalTotal', + 'ANCSubsequent': 
'AntenatalTotal', + 'NormalDelivery': 'Delivery', + 'CompDelivery': 'Delivery', + 'EstMedCom': 'EstAdult', + 'EstNonCom': 'EstAdult', + 'VCTPositive': 'VCTTests', + 'VCTNegative': 'VCTTests', + 'DentAccidEmerg': 'DentalAll', + 'DentSurg': 'DentalAll', + 'DentU5': 'DentalAll', + 'DentO5': 'DentalAll', + 'MentOPD': 'MentalAll', + 'MentClinic': 'MentalAll' + } + + +def get_annual_num_appts_by_level(results_folder: Path) -> pd.DataFrame: + """Return pd.DataFrame gives the (mean) simulated annual number of appointments of each type at each level.""" + + def get_counts_of_appts(_df): + """Get the mean number of appointments of each type being used each year at each level. + Need to rename appts to match standardized categories from the DHIS2 data.""" + + def unpack_nested_dict_in_series(_raw: pd.Series): + return pd.concat( + { + idx: pd.DataFrame.from_dict(mydict) for idx, mydict in _raw.items() + } + ).unstack().fillna(0.0).astype(int) + + return _df \ + .loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), 'Number_By_Appt_Type_Code_And_Level'] \ + .pipe(unpack_nested_dict_in_series) \ + .rename(columns=appt_dict, level=1) \ + .groupby(level=[0, 1], axis=1).sum() \ + .mean(axis=0) # mean over each year (row) + + return summarize( + extract_results( + results_folder, + module='tlo.methods.healthsystem.summary', + key='HSI_Event', + custom_generate_series=get_counts_of_appts, + do_scaling=True + ), + only_mean=True, + collapse_columns=True, + ).unstack().astype(int) + + +def get_annual_num_appts_by_level_with_confidence_interval(results_folder: Path) -> pd.DataFrame: + """Return pd.DataFrame gives the (mean) simulated annual number of appointments of each type at each level, + with 95% confidence interval.""" + + def get_counts_of_appts(_df): + """Get the mean number of appointments of each type being used each year at each level. 
+        Need to rename appts to match standardized categories from the DHIS2 data."""
+
+        def unpack_nested_dict_in_series(_raw: pd.Series):
+            return pd.concat(
+                {
+                    idx: pd.DataFrame.from_dict(mydict) for idx, mydict in _raw.items()
+                }
+            ).unstack().fillna(0.0).astype(int)
+
+        return _df \
+            .loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), 'Number_By_Appt_Type_Code_And_Level'] \
+            .pipe(unpack_nested_dict_in_series) \
+            .rename(columns=appt_dict, level=1) \
+            .groupby(level=[0, 1], axis=1).sum() \
+            .mean(axis=0)  # mean over each year (row)
+
+    return summarize(
+        extract_results(
+            results_folder,
+            module='tlo.methods.healthsystem.summary',
+            key='HSI_Event',
+            custom_generate_series=get_counts_of_appts,
+            do_scaling=True
+        ),
+        only_mean=False,
+        collapse_columns=True,
+    ).unstack().astype(int)
+
+
+def get_annual_num_appts_with_confidence_interval(results_folder: Path) -> pd.DataFrame:
+    """Return pd.DataFrame gives the (mean) simulated annual number of appointments of each type at all levels,
+    with 95% confidence interval."""
+
+    def get_counts_of_appts(_df) -> pd.Series:
+        """Get the mean number of appointments of each type being used each year at all levels.
+        Need to rename appts to match standardized categories from the DHIS2 data."""
+
+        return _df \
+            .loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), 'Number_By_Appt_Type_Code'] \
+            .apply(pd.Series) \
+            .rename(columns=appt_dict) \
+            .groupby(level=0, axis=1).sum() \
+            .mean(axis=0)  # mean over each year (row)
+
+    return summarize(
+        extract_results(
+            results_folder,
+            module='tlo.methods.healthsystem.summary',
+            key='HSI_Event',
+            custom_generate_series=get_counts_of_appts,
+            do_scaling=True
+        ),
+        only_mean=False,
+        collapse_columns=True,
+    ).unstack().astype(int)
+
+
+def get_simulation_usage_by_level(results_folder: Path) -> pd.DataFrame:
+    """Returns the simulated MEAN USAGE PER YEAR DURING THE TIME_PERIOD, by appointment type and level.
+ """ + + # Get model outputs + model_output = get_annual_num_appts_by_level(results_folder=results_folder) + + return model_output + + +def get_simulation_usage_by_level_with_confidence_interval(results_folder: Path) -> pd.DataFrame: + """Returns the simulated MEAN USAGE PER YEAR DURING THE TIME_PERIOD with 95% confidence interval, + by appointment type and level. + """ + + # Get model outputs + model_output = get_annual_num_appts_by_level_with_confidence_interval(results_folder=results_folder) + + # Reformat + model_output.columns = [' '.join(col).strip() for col in model_output.columns.values] + model_output = model_output.melt(var_name='name', value_name='value', ignore_index=False) + model_output['name'] = model_output['name'].str.split(' ') + model_output['value_type'] = model_output['name'].str[0] + model_output['appt_type'] = model_output['name'].str[1] + model_output.drop(columns='name', inplace=True) + model_output.reset_index(drop=False, inplace=True) + model_output.rename(columns={'index': 'facility_level'}, inplace=True) + + return model_output + + +def get_simulation_usage_with_confidence_interval(results_folder: Path) -> pd.DataFrame: + """Returns the simulated MEAN USAGE PER YEAR DURING THE TIME_PERIOD with 95% confidence interval, + by appointment type. 
+ """ + + # Get model outputs + model_output = get_annual_num_appts_with_confidence_interval(results_folder=results_folder) + + # Reformat + model_output = pd.DataFrame(model_output).T + model_output.columns = [' '.join(col).strip() for col in model_output.columns.values] + model_output = model_output.melt(var_name='name', value_name='value', ignore_index=False) + model_output['name'] = model_output['name'].str.split(' ') + model_output['value_type'] = model_output['name'].str[0] + model_output['appt_type'] = model_output['name'].str[1] + model_output.drop(columns='name', inplace=True) + model_output.reset_index(drop=True, inplace=True) + + return model_output + + +def adjust_real_usage_on_mentalall(real_usage_df) -> pd.DataFrame: + """This is to adjust the annual MentalAll usage in real usage dataframe. + The MentalAll usage was not adjusted in the preprocessing stage considering individual facilities and very low + reporting rates. + We now directly adjust its annual usage by facility level using the aggregated annual reporting rates by + facility level. 
The latter is calculated based on DHIS2 Mental Health Report reporting rates.""" + # the annual reporting rates for Mental Health Report by facility level (%), 2015-2019 + # could turn the reporting rates data into a ResourceFile if necessary + rr = pd.DataFrame(index=['1a', '1b', '2', '3'], columns=list(range(2015, 2020)), + data=[[44.00, 39.33, 79.00, 97.33, 95.00], + [10.42, 12.50, 25.00, 40.00, 68.33], + [36.67, 39.44, 37.22, 63.89, 56.67], + [50.00, 45.83, 45.83, 50.00, 45.83]]) + # make the adjustment assuming 100% reporting rates for each year + for level in ['1a', '1b', '2', '3']: + for y in range(2015, 2020): + real_usage_df.loc[(real_usage_df.Facility_Level == level) + & (real_usage_df.Year == y) + & (real_usage_df.Appt_Type == 'MentalAll'), 'Usage'] = ( + real_usage_df.loc[(real_usage_df.Facility_Level == level) + & (real_usage_df.Year == y) + & (real_usage_df.Appt_Type == 'MentalAll'), 'Usage'] * 100 / rr.loc[level, y] + ) + + return real_usage_df + + +def get_real_usage(resourcefilepath, adjusted=True) -> pd.DataFrame: + """ + Returns the adjusted (default) or unadjusted real data on the (MEAN) USAGE PER YEAR DURING THE TIME_PERIOD + for each appointment at each level and all levels. 
+ """ + + # add facility level and district columns to both real and simulation usage + mfl = pd.read_csv( + resourcefilepath / 'healthsystem' / 'organisation' / 'ResourceFile_Master_Facilities_List.csv') + + # Get real usage data + # For the details of adjustment of real usage data, refer to Paper + # "The Changes in Health Service Utilisation in Malawi during the COVID-19 Pandemic" + if adjusted: + real_usage = pd.read_csv( + resourcefilepath / 'healthsystem' / 'real_appt_usage_data' / + 'real_monthly_usage_of_appt_type.csv') + else: + real_usage = pd.read_csv( + resourcefilepath / 'healthsystem' / 'real_appt_usage_data' / + 'unadjusted_real_monthly_usage_of_appt_type.csv') + + # add Csection usage to Delivery, as Delivery has excluded Csection in real data file (to avoid overlap) + # whereas Delivery in tlo output has included Csection + real_delivery = real_usage.loc[(real_usage.Appt_Type == 'Delivery') | (real_usage.Appt_Type == 'Csection') + ].groupby(['Year', 'Month', 'Facility_ID']).agg({'Usage': 'sum'}).reset_index() + real_delivery['Appt_Type'] = 'Delivery' + real_usage = pd.concat([real_usage.drop(real_usage[real_usage.Appt_Type == 'Delivery'].index), + real_delivery]) + + # get facility_level for each record + real_usage = real_usage.merge(mfl[['Facility_ID', 'Facility_Level']], left_on='Facility_ID', right_on='Facility_ID') + + # adjust annual MentalAll usage using annual reporting rates + if adjusted: + real_usage = adjust_real_usage_on_mentalall(real_usage) + + # assign date to each record + real_usage['date'] = pd.to_datetime({'year': real_usage['Year'], 'month': real_usage['Month'], 'day': 1}) + + # Produce table of the AVERAGE NUMBER PER YEAR DURING THE TIME_PERIOD of appointment type by level + # limit to date + totals_by_year = real_usage \ + .loc[real_usage['date'].between(*TARGET_PERIOD)] \ + .groupby(['Year', 'Appt_Type', 'Facility_Level'])['Usage'].sum() + + # Combine the TB data [which is yearly] (after dropping period outside 2017-2019 
according to data consistency + # and pandemic) with the rest of the data. + # Note that TB data is not adjusted considering comparability with NTP reports. + real_usage_TB = pd.read_csv( + resourcefilepath / 'healthsystem' / 'real_appt_usage_data' / 'real_yearly_usage_of_TBNotifiedAll.csv') + real_usage_TB = real_usage_TB.loc[real_usage_TB['Year'].isin([2017, 2018, 2019])] + real_usage_TB = real_usage_TB.merge(mfl[['Facility_ID', 'Facility_Level']], + left_on='Facility_ID', right_on='Facility_ID') + totals_by_year_TB = real_usage_TB.groupby(['Year', 'Appt_Type', 'Facility_Level'])['Usage'].sum() + + annual_usage_by_level = pd.concat([totals_by_year.reset_index(), totals_by_year_TB.reset_index()], axis=0) + + # group levels 1b and 2 into 2 + # annual_usage_by_level['Facility_Level'] = annual_usage_by_level['Facility_Level'].replace({'1b': '2'}) + annual_usage_by_level = annual_usage_by_level.groupby( + ['Year', 'Appt_Type', 'Facility_Level'])['Usage'].sum().reset_index() + + # prepare annual usage by level with mean, 97.5% percentile, and 2.5% percentile + annual_usage_by_level_with_ci = annual_usage_by_level.drop(columns='Year').groupby( + ['Appt_Type', 'Facility_Level'] + ).describe(percentiles=[0.025, 0.975] + ).stack(level=[0])[['mean', '2.5%', '97.5%']].reset_index().drop(columns='level_2') + + average_annual_by_level = annual_usage_by_level_with_ci[['Appt_Type', 'Facility_Level', 'mean']].set_index( + ['Appt_Type', 'Facility_Level']).unstack() + average_annual_by_level.columns = average_annual_by_level.columns.get_level_values(1) + average_annual_by_level = average_annual_by_level.T + + annual_usage_by_level_with_ci = pd.melt(annual_usage_by_level_with_ci, + id_vars=['Appt_Type', 'Facility_Level'], var_name='value_type') + annual_usage_by_level_with_ci.value_type = annual_usage_by_level_with_ci.value_type.replace({'2.5%': 'lower', + '97.5%': 'upper'}) + + # prepare annual usage at all levels with mean, 97.5% percentile, and 2.5% percentile + 
annual_usage_with_ci = annual_usage_by_level.groupby( + ['Year', 'Appt_Type'])['Usage'].sum().reset_index().drop(columns='Year').groupby( + 'Appt_Type').describe(percentiles=[0.025, 0.975] + ).stack(level=[0])[['mean', '2.5%', '97.5%']].reset_index().drop(columns='level_1') + annual_usage_with_ci = pd.melt(annual_usage_with_ci, + id_vars='Appt_Type', var_name='value_type') + annual_usage_with_ci.value_type = annual_usage_with_ci.value_type.replace({'2.5%': 'lower', '97.5%': 'upper'}) + + return average_annual_by_level, annual_usage_by_level_with_ci, annual_usage_with_ci + + +# todo: plot hcw time usage against capability per cadre/facility level/disease (represented by short treatment id), +# and comparing 4 scenarios, i.e., Actual/Establishment(funded_plus) HCW * Default/Maximal health care seeking +def get_expected_appt_time(resourcefilepath) -> pd.DataFrame: + """ This is to return the expected time requirements per appointment type per coarse cadre per facility level.""" + expected_appt_time = pd.read_csv( + resourcefilepath / 'healthsystem' / 'organisation' / 'ResourceFile_Appt_Time_Table.csv') + + return expected_appt_time + + +def apply(results_folder: Path, output_folder: Path, resourcefilepath: Path = None): + """Compare appointment usage from model output with real appointment usage. 
+ The real appointment usage is collected from DHIS2 system and HIV Dept.""" + + make_graph_file_name = lambda stub: output_folder / f"{PREFIX_ON_FILENAME}_{stub}.png" # noqa: E731 + + # Plot Simulation vs Real usage (Across all levels and At each level) (trimmed to 0.1 and 10) + # format plot + def format_and_save(_fig, _ax, _name_of_plot): + _ax.set_title(_name_of_plot) + _ax.set_yscale('log') + _ax.set_ylim(1 / 20, 20) + _ax.set_yticks([1 / 10, 1.0, 10]) + _ax.set_yticklabels(("<= 1/10", "1.0", ">= 10")) + _ax.set_ylabel('Model / Data') + _ax.set_xlabel('Appointment Type') + _ax.tick_params(axis='x', labelrotation=90) + _ax.xaxis.grid(True, which='major', linestyle='--') + _ax.yaxis.grid(True, which='both', linestyle='--') + _ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) + _fig.tight_layout() + _fig.savefig(make_graph_file_name(_name_of_plot.replace(',', '').replace('\n', '_').replace(' ', '_'))) + _fig.show() + plt.close(_fig) + + # get average annual usage by level for Simulation and Real + simulation_usage = get_simulation_usage_by_level(results_folder) + + real_usage = get_real_usage(resourcefilepath)[0] + + # find appts that are not included in both simulation and real usage dataframe + appts_real_only = set(real_usage.columns.values) - set(simulation_usage.columns.values) + appts_simulation_only = set(simulation_usage.columns.values) - set(real_usage.columns.values) + + # format data + rel_diff_all_levels = ( + simulation_usage.sum(axis=0) / real_usage.sum(axis=0) + ).clip(lower=0.1, upper=10.0) + + # plot for all levels + name_of_plot = 'Model vs Data usage per appt type at all facility levels' \ + '\n[Model average annual, Adjusted Data average annual]' + fig, ax = plt.subplots() + ax.stem(rel_diff_all_levels.index, rel_diff_all_levels.values, bottom=1.0, label='All levels') + for idx in rel_diff_all_levels.index: + if not pd.isna(rel_diff_all_levels[idx]): + ax.text(idx, rel_diff_all_levels[idx]*(1+0.2), round(rel_diff_all_levels[idx], 1), + 
ha='left', fontsize=8) + format_and_save(fig, ax, name_of_plot) + + # plot for each level + rel_diff_by_levels = ( + simulation_usage / real_usage + ).clip(upper=10, lower=0.1).dropna(how='all', axis=0) + + name_of_plot = 'Model vs Data usage per appt type per facility level' \ + '\n[Model average annual, Adjusted Data average annual]' + fig, ax = plt.subplots() + marker_dict = {'0': 0, + '1a': 4, + '1b': 5, + '2': 6, + '3': 7, + '4': 1} # Note that level 0/3/4 has very limited data + for _level, _results in rel_diff_by_levels.iterrows(): + ax.plot(_results.index, _results.values, label=_level, linestyle='none', marker=marker_dict[_level]) + ax.axhline(1.0, color='r') + format_and_save(fig, ax, name_of_plot) + + # Plot Simulation with 95% CI vs Adjusted Real usage by appt type, across all levels (trimmed to 0.1 and 10) + # format data + def format_rel_diff(adjusted=True): + def format_real_usage(): + _real_usage = get_real_usage(resourcefilepath, adjusted)[0] + _real_usage_all_levels = _real_usage.sum(axis=0).reset_index().rename( + columns={0: 'real_usage_all_levels', 'Appt_Type': 'appt_type'}) + return _real_usage_all_levels + + simulation_usage_all_levels_with_ci = get_simulation_usage_with_confidence_interval(results_folder) + _rel_diff = simulation_usage_all_levels_with_ci.merge(format_real_usage(), on='appt_type', how='outer') + + _rel_diff['ratio'] = (_rel_diff['value'] / _rel_diff['real_usage_all_levels']) + + _rel_diff = _rel_diff[['appt_type', 'value_type', 'ratio']].pivot( + index='appt_type', columns='value_type', values='ratio').dropna(axis=1, how='all') + + _rel_diff['lower_error'] = (_rel_diff['mean'] - _rel_diff['lower']) + _rel_diff['upper_error'] = (_rel_diff['upper'] - _rel_diff['mean']) + _asymmetric_error = [_rel_diff['lower_error'].values, _rel_diff['upper_error'].values] + + _rel_diff = pd.DataFrame(_rel_diff['mean']) + + return _rel_diff, _asymmetric_error + + rel_diff_real, err_real = format_rel_diff(adjusted=True) + + # plot + name_of_plot 
= 'Model vs Data usage per appointment type at all facility levels' \ + '\n[Model average annual 95% CI, Adjusted Data average annual]' + fig, ax = plt.subplots() + ax.errorbar(rel_diff_real.index.values, + rel_diff_real['mean'].values, + err_real, fmt='.', capsize=3.0, label='All levels') + ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5)) + for idx in rel_diff_real.index: + if not pd.isna(rel_diff_real.loc[idx, 'mean']): + ax.text(idx, + rel_diff_real.loc[idx, 'mean'] * (1 + 0.2), + round(rel_diff_real.loc[idx, 'mean'], 1), + ha='left', fontsize=8) + ax.axhline(1.0, color='r') + format_and_save(fig, ax, name_of_plot) + + # Plot Simulation vs Real usage by appt type and show fraction of usage at each level + # Model, Adjusted real and Unadjusted real average annual usage all normalised to 1 + # format data + def format_simulation_usage_fraction(): + _usage = get_simulation_usage_by_level(results_folder) + _usage_all_levels = _usage.sum(axis=0).reset_index().rename( + columns={0: '_usage_all_levels', 'index': 'appt_type'}) + + _usage = pd.melt(_usage.reset_index(), id_vars='index', + var_name='appt_type', value_name='_usage' + ).rename(columns={'index': 'facility_level'}) + + _usage_fraction = _usage.merge(_usage_all_levels, on='appt_type', how='outer') + _usage_fraction['ratio'] = (_usage_fraction['_usage'] / + _usage_fraction['_usage_all_levels']) + + _usage_fraction = pd.pivot(_usage_fraction, index='appt_type', columns='facility_level', values='ratio') + + # add nan rows of appts_real_only + nan_df = pd.DataFrame(index=appts_real_only, columns=_usage_fraction.columns) + _usage_fraction = pd.concat([_usage_fraction, nan_df]).sort_index() + + # make row of appts_simulation_only nan + _usage_fraction.loc[_usage_fraction.index.isin(appts_simulation_only), :] = np.NaN + + return _usage_fraction + + def format_real_usage_fraction(adjusted=True): + _usage = get_real_usage(resourcefilepath, adjusted)[0] + _usage_all_levels = 
_usage.sum(axis=0).reset_index().rename( + columns={0: '_usage_all_levels', 'Appt_Type': 'appt_type'}) + + _usage = pd.melt(_usage.reset_index(), id_vars='Facility_Level', + var_name='appt_type', value_name='_usage' + ).rename(columns={'Facility_Level': 'facility_level'}) + + _usage_fraction = _usage.merge(_usage_all_levels, on='appt_type', how='outer') + _usage_fraction['ratio'] = (_usage_fraction['_usage'] / + _usage_fraction['_usage_all_levels']) + + _usage_fraction = pd.pivot(_usage_fraction, index='appt_type', columns='facility_level', values='ratio') + + # add nan rows of appts_simulation_only + nan_df = pd.DataFrame(index=appts_simulation_only, columns=_usage_fraction.columns) + _usage_fraction = pd.concat([_usage_fraction, nan_df]).sort_index() + + # make row of appts_real_only nan + _usage_fraction.loc[_usage_fraction.index.isin(appts_real_only), :] = np.NaN + + return _usage_fraction + + simulation_usage_plot = format_simulation_usage_fraction() + real_usage_plot = format_real_usage_fraction(adjusted=True) + unadjusted_real_usage_plot = format_real_usage_fraction(adjusted=False) + assert simulation_usage_plot.index.equals(real_usage_plot.index) + assert simulation_usage_plot.index.equals(unadjusted_real_usage_plot.index) + + # plot + name_of_plot = 'Model vs Data usage per appointment type on fraction per level' \ + '\n[Model average annual, Adjusted & Unadjusted Data average annual]' + fig, ax = plt.subplots(figsize=(12, 5)) + simulation_usage_plot.plot(kind='bar', stacked=True, width=0.3, + edgecolor='dimgrey', hatch='', + ax=ax, position=0) + real_usage_plot.plot(kind='bar', stacked=True, width=0.25, + edgecolor='dimgrey', hatch='.', + ax=ax, position=1) + unadjusted_real_usage_plot.plot(kind='bar', stacked=True, width=0.25, + edgecolor='dimgrey', hatch='//', + ax=ax, position=2) + ax.set_xlim(right=len(simulation_usage_plot) - 0.45) + ax.set_ylabel('Usage per level / Usage all levels') + ax.set_xlabel('Appointment Type') + ax.set_title(name_of_plot) + 
legend_1 = plt.legend(simulation_usage_plot.columns, loc='upper left', bbox_to_anchor=(1.0, 0.5), + title='Facility Level') + patch_simulation = matplotlib.patches.Patch(facecolor='lightgrey', hatch='', edgecolor="dimgrey", label='Model') + patch_real = matplotlib.patches.Patch(facecolor='lightgrey', hatch='...', edgecolor="dimgrey", + label='Adjusted Data') + patch_unadjusted_real = matplotlib.patches.Patch(facecolor='lightgrey', hatch='///', edgecolor="dimgrey", + label='Unadjusted Data') + + plt.legend(handles=[patch_unadjusted_real, patch_real, patch_simulation], + loc='lower left', bbox_to_anchor=(1.0, 0.6)) + fig.add_artist(legend_1) + fig.tight_layout() + fig.savefig(make_graph_file_name(name_of_plot.replace(',', '').replace('\n', '_').replace(' ', '_'))) + plt.show() + + # appendix - plot Simulation with 95% CI vs Adjusted & Unadjusted real, across all levels + def format_data_for_bar_plot(_usage): + """reduce the model/data ratio by 1.0, for the bar plot that starts from y=1.0 instead of y=0.0.""" + _usage['mean'] = _usage['mean'] - 1.0 + return _usage + + rel_diff_unadjusted_real, err_unadjusted_real = format_rel_diff(adjusted=False) + rel_diff_unadjusted_real = format_data_for_bar_plot(rel_diff_unadjusted_real) + rel_diff_real = format_data_for_bar_plot(rel_diff_real) + assert (rel_diff_unadjusted_real.index == rel_diff_real.index).all() + + name_of_plot = 'Model vs Data usage per appointment type at all facility levels' \ + '\n[Model average annual 95% CI, Adjusted & Unadjusted Data average annual]' + fig, ax = plt.subplots(figsize=(8, 5)) + rel_diff_unadjusted_real.plot(kind='bar', yerr=err_unadjusted_real, width=0.4, + ax=ax, position=0, bottom=1.0, + legend=False, color='salmon') + rel_diff_real.plot(kind='bar', yerr=err_real, width=0.4, + ax=ax, position=1, bottom=1.0, + legend=False, color='yellowgreen') + ax.axhline(1.0, color='r') + ax.set_xlim(right=len(rel_diff_real) - 0.3) + ax.set_yscale('log') + ax.set_ylim(1 / 20, 20) + ax.set_yticks([1 / 
10, 1.0, 10]) + ax.set_yticklabels(("<= 1/10", "1.0", ">= 10")) + ax.set_ylabel('Model / Data') + ax.set_xlabel('Appointment Type') + ax.xaxis.grid(True, which='major', linestyle='--') + ax.yaxis.grid(True, which='both', linestyle='--') + ax.set_title(name_of_plot) + patch_real = matplotlib.patches.Patch(facecolor='yellowgreen', label='Adjusted Data') + patch_unadjusted_real = matplotlib.patches.Patch(facecolor='salmon', label='Unadjusted Data') + legend = plt.legend(handles=[patch_real, patch_unadjusted_real], loc='center left', bbox_to_anchor=(1.0, 0.5)) + fig.add_artist(legend) + fig.tight_layout() + fig.savefig(make_graph_file_name(name_of_plot.replace(',', '').replace('\n', '_').replace(' ', '_'))) + plt.show() + + +if __name__ == "__main__": + outputspath = Path('./outputs/bshe@ic.ac.uk') + rfp = Path('./resources') + + # Find results folder (most recent run generated using that scenario_filename) + scenario_filename = 'long_run_all_diseases.py' + results_folder = get_scenario_outputs(scenario_filename, outputspath)[-1] + + # Test dataset: + # results_folder = Path('/Users/tbh03/GitHub/TLOmodel/outputs/tbh03@ic.ac.uk/long_run_all_diseases-small') + + # If needed -- in the case that pickles were not created remotely during batch + # create_pickles_locally(results_folder) + + # Run all the calibrations + apply(results_folder=results_folder, output_folder=results_folder, resourcefilepath=rfp) From 6b2e505905fc55227121337c4a816c596d788169 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 26 Jul 2023 10:10:42 +0100 Subject: [PATCH 087/131] Revert "add todo for HCW paper plots" This reverts commit 25162a41e6e9d770e15978eb267e007e4ba243be. 
--- .../analysis_compare_appt_usage_real_and_simulation.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py index 713d2652a8..f7f2ee887c 100644 --- a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py +++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py @@ -300,16 +300,6 @@ def get_real_usage(resourcefilepath, adjusted=True) -> pd.DataFrame: return average_annual_by_level, annual_usage_by_level_with_ci, annual_usage_with_ci -# todo: plot hcw time usage against capability per cadre/facility level/disease (represented by short treatment id), -# and comparing 4 scenarios, i.e., Actual/Establishment(funded_plus) HCW * Default/Maximal health care seeking -def get_expected_appt_time(resourcefilepath) -> pd.DataFrame: - """ This is to return the expected time requirements per appointment type per coarse cadre per facility level.""" - expected_appt_time = pd.read_csv( - resourcefilepath / 'healthsystem' / 'organisation' / 'ResourceFile_Appt_Time_Table.csv') - - return expected_appt_time - - def apply(results_folder: Path, output_folder: Path, resourcefilepath: Path = None): """Compare appointment usage from model output with real appointment usage. 
The real appointment usage is collected from DHIS2 system and HIV Dept.""" From 192242ebf1c7ce8f7666cd658d343bb23772974b Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 26 Jul 2023 12:33:36 +0100 Subject: [PATCH 088/131] formate hcw usage --- ...alysis_hcw_usage_by_appt_and_by_disease.py | 364 ++---------------- 1 file changed, 41 insertions(+), 323 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 58a5f66a69..be58927d8c 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -50,7 +50,6 @@ def unpack_nested_dict_in_series(_raw: pd.Series): return _df \ .loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), 'Number_By_Appt_Type_Code_And_Level'] \ .pipe(unpack_nested_dict_in_series) \ - .rename(columns=appt_dict, level=1) \ .groupby(level=[0, 1], axis=1).sum() \ .mean(axis=0) # mean over each year (row) @@ -67,69 +66,6 @@ def unpack_nested_dict_in_series(_raw: pd.Series): ).unstack().astype(int) -def get_annual_num_appts_by_level_with_confidence_interval(results_folder: Path) -> pd.DataFrame: - """Return pd.DataFrame gives the (mean) simulated annual number of appointments of each type at each level, - with 95% confidence interval.""" - - def get_counts_of_appts(_df): - """Get the mean number of appointments of each type being used each year at each level. 
- Need to rename appts to match standardized categories from the DHIS2 data.""" - - def unpack_nested_dict_in_series(_raw: pd.Series): - return pd.concat( - { - idx: pd.DataFrame.from_dict(mydict) for idx, mydict in _raw.iteritems() - } - ).unstack().fillna(0.0).astype(int) - - return _df \ - .loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), 'Number_By_Appt_Type_Code_And_Level'] \ - .pipe(unpack_nested_dict_in_series) \ - .rename(columns=appt_dict, level=1) \ - .groupby(level=[0, 1], axis=1).sum() \ - .mean(axis=0) # mean over each year (row) - - return summarize( - extract_results( - results_folder, - module='tlo.methods.healthsystem.summary', - key='HSI_Event', - custom_generate_series=get_counts_of_appts, - do_scaling=True - ), - only_mean=False, - collapse_columns=True, - ).unstack().astype(int) - - -def get_annual_num_appts_with_confidence_interval(results_folder: Path) -> pd.DataFrame: - """Return pd.DataFrame gives the (mean) simulated annual number of appointments of each type at all levels, - with 95% confidence interval.""" - - def get_counts_of_appts(_df) -> pd.Series: - """Get the mean number of appointments of each type being used each year at all levels. - Need to rename appts to match standardized categories from the DHIS2 data.""" - - return _df \ - .loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), 'Number_By_Appt_Type_Code'] \ - .apply(pd.Series) \ - .rename(columns=appt_dict) \ - .groupby(level=0, axis=1).sum() \ - .mean(axis=0) # mean over each year (row) - - return summarize( - extract_results( - results_folder, - module='tlo.methods.healthsystem.summary', - key='HSI_Event', - custom_generate_series=get_counts_of_appts, - do_scaling=True - ), - only_mean=False, - collapse_columns=True, - ).unstack().astype(int) - - def get_simulation_usage_by_level(results_folder: Path) -> pd.DataFrame: """Returns the simulated MEAN USAGE PER YEAR DURING THE TIME_PERIOD, by appointment type and level. 
""" @@ -140,48 +76,6 @@ def get_simulation_usage_by_level(results_folder: Path) -> pd.DataFrame: return model_output -def get_simulation_usage_by_level_with_confidence_interval(results_folder: Path) -> pd.DataFrame: - """Returns the simulated MEAN USAGE PER YEAR DURING THE TIME_PERIOD with 95% confidence interval, - by appointment type and level. - """ - - # Get model outputs - model_output = get_annual_num_appts_by_level_with_confidence_interval(results_folder=results_folder) - - # Reformat - model_output.columns = [' '.join(col).strip() for col in model_output.columns.values] - model_output = model_output.melt(var_name='name', value_name='value', ignore_index=False) - model_output['name'] = model_output['name'].str.split(' ') - model_output['value_type'] = model_output['name'].str[0] - model_output['appt_type'] = model_output['name'].str[1] - model_output.drop(columns='name', inplace=True) - model_output.reset_index(drop=False, inplace=True) - model_output.rename(columns={'index': 'facility_level'}, inplace=True) - - return model_output - - -def get_simulation_usage_with_confidence_interval(results_folder: Path) -> pd.DataFrame: - """Returns the simulated MEAN USAGE PER YEAR DURING THE TIME_PERIOD with 95% confidence interval, - by appointment type. 
- """ - - # Get model outputs - model_output = get_annual_num_appts_with_confidence_interval(results_folder=results_folder) - - # Reformat - model_output = pd.DataFrame(model_output).T - model_output.columns = [' '.join(col).strip() for col in model_output.columns.values] - model_output = model_output.melt(var_name='name', value_name='value', ignore_index=False) - model_output['name'] = model_output['name'].str.split(' ') - model_output['value_type'] = model_output['name'].str[0] - model_output['appt_type'] = model_output['name'].str[1] - model_output.drop(columns='name', inplace=True) - model_output.reset_index(drop=True, inplace=True) - - return model_output - - def adjust_real_usage_on_mentalall(real_usage_df) -> pd.DataFrame: """This is to adjust the annual MentalAll usage in real usage dataframe. The MentalAll usage was not adjusted in the preprocessing stage considering individual facilities and very low @@ -300,23 +194,34 @@ def get_real_usage(resourcefilepath, adjusted=True) -> pd.DataFrame: return average_annual_by_level, annual_usage_by_level_with_ci, annual_usage_with_ci -# todo: plot hcw time usage against capability per cadre/facility level/disease (represented by short treatment id), -# and comparing 4 scenarios, i.e., Actual/Establishment(funded_plus) HCW * Default/Maximal health care seeking def get_expected_appt_time(resourcefilepath) -> pd.DataFrame: - """ This is to return the expected time requirements per appointment type per coarse cadre per facility level.""" + """This is to return the expected time requirements per appointment type per coarse cadre per facility level.""" expected_appt_time = pd.read_csv( - resourcefilepath / 'healthsystem' / 'organisation' / 'ResourceFile_Appt_Time_Table.csv') + resourcefilepath / 'healthsystem' / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table.csv') return expected_appt_time +def get_hcw_capability(resourcefilepath, hcwscenario='actual') -> pd.DataFrame: + """This is to return the annual 
hcw capabilities per cadre per facility level. + Argument hcwscenario can be actual, funded_plus.""" + hcw_capability = pd.read_csv( + resourcefilepath / 'healthsystem' / 'human_resources' / hcwscenario / 'ResourceFile_Daily_Capabilities.csv' + ) + hcw_capability = hcw_capability.groupby(['Facility_Level', 'Officer_Category'] + )['Total_Mins_Per_Day'].sum().reset_index() + hcw_capability['Total_Mins_Per_Year'] = hcw_capability['Total_Mins_Per_Day'] * 365.25 + hcw_capability.drop(columns='Total_Mins_Per_Day', inplace=True) + + return hcw_capability + + def apply(results_folder: Path, output_folder: Path, resourcefilepath: Path = None): """Compare appointment usage from model output with real appointment usage. The real appointment usage is collected from DHIS2 system and HIV Dept.""" make_graph_file_name = lambda stub: output_folder / f"{PREFIX_ON_FILENAME}_{stub}.png" # noqa: E731 - # Plot Simulation vs Real usage (Across all levels and At each level) (trimmed to 0.1 and 10) # format plot def format_and_save(_fig, _ax, _name_of_plot): _ax.set_title(_name_of_plot) @@ -340,216 +245,29 @@ def format_and_save(_fig, _ax, _name_of_plot): real_usage = get_real_usage(resourcefilepath)[0] - # find appts that are not included in both simulation and real usage dataframe - appts_real_only = set(real_usage.columns.values) - set(simulation_usage.columns.values) - appts_simulation_only = set(simulation_usage.columns.values) - set(real_usage.columns.values) - - # format data - rel_diff_all_levels = ( - simulation_usage.sum(axis=0) / real_usage.sum(axis=0) - ).clip(lower=0.1, upper=10.0) - - # plot for all levels - name_of_plot = 'Model vs Data usage per appt type at all facility levels' \ - '\n[Model average annual, Adjusted Data average annual]' - fig, ax = plt.subplots() - ax.stem(rel_diff_all_levels.index, rel_diff_all_levels.values, bottom=1.0, label='All levels') - for idx in rel_diff_all_levels.index: - if not pd.isna(rel_diff_all_levels[idx]): - ax.text(idx, 
rel_diff_all_levels[idx]*(1+0.2), round(rel_diff_all_levels[idx], 1), - ha='left', fontsize=8) - format_and_save(fig, ax, name_of_plot) - - # plot for each level - rel_diff_by_levels = ( - simulation_usage / real_usage - ).clip(upper=10, lower=0.1).dropna(how='all', axis=0) - - name_of_plot = 'Model vs Data usage per appt type per facility level' \ - '\n[Model average annual, Adjusted Data average annual]' - fig, ax = plt.subplots() - marker_dict = {'0': 0, - '1a': 4, - '1b': 5, - '2': 6, - '3': 7, - '4': 1} # Note that level 0/3/4 has very limited data - for _level, _results in rel_diff_by_levels.iterrows(): - ax.plot(_results.index, _results.values, label=_level, linestyle='none', marker=marker_dict[_level]) - ax.axhline(1.0, color='r') - format_and_save(fig, ax, name_of_plot) - - # Plot Simulation with 95% CI vs Adjusted Real usage by appt type, across all levels (trimmed to 0.1 and 10) - # format data - def format_rel_diff(adjusted=True): - def format_real_usage(): - _real_usage = get_real_usage(resourcefilepath, adjusted)[0] - _real_usage_all_levels = _real_usage.sum(axis=0).reset_index().rename( - columns={0: 'real_usage_all_levels', 'Appt_Type': 'appt_type'}) - return _real_usage_all_levels - - simulation_usage_all_levels_with_ci = get_simulation_usage_with_confidence_interval(results_folder) - _rel_diff = simulation_usage_all_levels_with_ci.merge(format_real_usage(), on='appt_type', how='outer') - - _rel_diff['ratio'] = (_rel_diff['value'] / _rel_diff['real_usage_all_levels']) - - _rel_diff = _rel_diff[['appt_type', 'value_type', 'ratio']].pivot( - index='appt_type', columns='value_type', values='ratio').dropna(axis=1, how='all') - - _rel_diff['lower_error'] = (_rel_diff['mean'] - _rel_diff['lower']) - _rel_diff['upper_error'] = (_rel_diff['upper'] - _rel_diff['mean']) - _asymmetric_error = [_rel_diff['lower_error'].values, _rel_diff['upper_error'].values] - - _rel_diff = pd.DataFrame(_rel_diff['mean']) - - return _rel_diff, _asymmetric_error - - 
rel_diff_real, err_real = format_rel_diff(adjusted=True) - - # plot - name_of_plot = 'Model vs Data usage per appointment type at all facility levels' \ - '\n[Model average annual 95% CI, Adjusted Data average annual]' - fig, ax = plt.subplots() - ax.errorbar(rel_diff_real.index.values, - rel_diff_real['mean'].values, - err_real, fmt='.', capsize=3.0, label='All levels') - ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5)) - for idx in rel_diff_real.index: - if not pd.isna(rel_diff_real.loc[idx, 'mean']): - ax.text(idx, - rel_diff_real.loc[idx, 'mean'] * (1 + 0.2), - round(rel_diff_real.loc[idx, 'mean'], 1), - ha='left', fontsize=8) - ax.axhline(1.0, color='r') - format_and_save(fig, ax, name_of_plot) - - # Plot Simulation vs Real usage by appt type and show fraction of usage at each level - # Model, Adjusted real and Unadjusted real average annual usage all normalised to 1 - # format data - def format_simulation_usage_fraction(): - _usage = get_simulation_usage_by_level(results_folder) - _usage_all_levels = _usage.sum(axis=0).reset_index().rename( - columns={0: '_usage_all_levels', 'index': 'appt_type'}) - - _usage = pd.melt(_usage.reset_index(), id_vars='index', - var_name='appt_type', value_name='_usage' - ).rename(columns={'index': 'facility_level'}) - - _usage_fraction = _usage.merge(_usage_all_levels, on='appt_type', how='outer') - _usage_fraction['ratio'] = (_usage_fraction['_usage'] / - _usage_fraction['_usage_all_levels']) - - _usage_fraction = pd.pivot(_usage_fraction, index='appt_type', columns='facility_level', values='ratio') - - # add nan rows of appts_real_only - nan_df = pd.DataFrame(index=appts_real_only, columns=_usage_fraction.columns) - _usage_fraction = pd.concat([_usage_fraction, nan_df]).sort_index() - - # make row of appts_simulation_only nan - _usage_fraction.loc[_usage_fraction.index.isin(appts_simulation_only), :] = np.NaN - - return _usage_fraction - - def format_real_usage_fraction(adjusted=True): - _usage = 
get_real_usage(resourcefilepath, adjusted)[0] - _usage_all_levels = _usage.sum(axis=0).reset_index().rename( - columns={0: '_usage_all_levels', 'Appt_Type': 'appt_type'}) - - _usage = pd.melt(_usage.reset_index(), id_vars='Facility_Level', - var_name='appt_type', value_name='_usage' - ).rename(columns={'Facility_Level': 'facility_level'}) - - _usage_fraction = _usage.merge(_usage_all_levels, on='appt_type', how='outer') - _usage_fraction['ratio'] = (_usage_fraction['_usage'] / - _usage_fraction['_usage_all_levels']) - - _usage_fraction = pd.pivot(_usage_fraction, index='appt_type', columns='facility_level', values='ratio') - - # add nan rows of appts_simulation_only - nan_df = pd.DataFrame(index=appts_simulation_only, columns=_usage_fraction.columns) - _usage_fraction = pd.concat([_usage_fraction, nan_df]).sort_index() - - # make row of appts_real_only nan - _usage_fraction.loc[_usage_fraction.index.isin(appts_real_only), :] = np.NaN - - return _usage_fraction - - simulation_usage_plot = format_simulation_usage_fraction() - real_usage_plot = format_real_usage_fraction(adjusted=True) - unadjusted_real_usage_plot = format_real_usage_fraction(adjusted=False) - assert simulation_usage_plot.index.equals(real_usage_plot.index) - assert simulation_usage_plot.index.equals(unadjusted_real_usage_plot.index) - - # plot - name_of_plot = 'Model vs Data usage per appointment type on fraction per level' \ - '\n[Model average annual, Adjusted & Unadjusted Data average annual]' - fig, ax = plt.subplots(figsize=(12, 5)) - simulation_usage_plot.plot(kind='bar', stacked=True, width=0.3, - edgecolor='dimgrey', hatch='', - ax=ax, position=0) - real_usage_plot.plot(kind='bar', stacked=True, width=0.25, - edgecolor='dimgrey', hatch='.', - ax=ax, position=1) - unadjusted_real_usage_plot.plot(kind='bar', stacked=True, width=0.25, - edgecolor='dimgrey', hatch='//', - ax=ax, position=2) - ax.set_xlim(right=len(simulation_usage_plot) - 0.45) - ax.set_ylabel('Usage per level / Usage all 
levels') - ax.set_xlabel('Appointment Type') - ax.set_title(name_of_plot) - legend_1 = plt.legend(simulation_usage_plot.columns, loc='upper left', bbox_to_anchor=(1.0, 0.5), - title='Facility Level') - patch_simulation = matplotlib.patches.Patch(facecolor='lightgrey', hatch='', edgecolor="dimgrey", label='Model') - patch_real = matplotlib.patches.Patch(facecolor='lightgrey', hatch='...', edgecolor="dimgrey", - label='Adjusted Data') - patch_unadjusted_real = matplotlib.patches.Patch(facecolor='lightgrey', hatch='///', edgecolor="dimgrey", - label='Unadjusted Data') - - plt.legend(handles=[patch_unadjusted_real, patch_real, patch_simulation], - loc='lower left', bbox_to_anchor=(1.0, 0.6)) - fig.add_artist(legend_1) - fig.tight_layout() - fig.savefig(make_graph_file_name(name_of_plot.replace(',', '').replace('\n', '_').replace(' ', '_'))) - plt.show() - - # appendix - plot Simulation with 95% CI vs Adjusted & Unadjusted real, across all levels - def format_data_for_bar_plot(_usage): - """reduce the model/data ratio by 1.0, for the bar plot that starts from y=1.0 instead of y=0.0.""" - _usage['mean'] = _usage['mean'] - 1.0 - return _usage - - rel_diff_unadjusted_real, err_unadjusted_real = format_rel_diff(adjusted=False) - rel_diff_unadjusted_real = format_data_for_bar_plot(rel_diff_unadjusted_real) - rel_diff_real = format_data_for_bar_plot(rel_diff_real) - assert (rel_diff_unadjusted_real.index == rel_diff_real.index).all() - - name_of_plot = 'Model vs Data usage per appointment type at all facility levels' \ - '\n[Model average annual 95% CI, Adjusted & Unadjusted Data average annual]' - fig, ax = plt.subplots(figsize=(8, 5)) - rel_diff_unadjusted_real.plot(kind='bar', yerr=err_unadjusted_real, width=0.4, - ax=ax, position=0, bottom=1.0, - legend=False, color='salmon') - rel_diff_real.plot(kind='bar', yerr=err_real, width=0.4, - ax=ax, position=1, bottom=1.0, - legend=False, color='yellowgreen') - ax.axhline(1.0, color='r') - ax.set_xlim(right=len(rel_diff_real) - 
0.3) - ax.set_yscale('log') - ax.set_ylim(1 / 20, 20) - ax.set_yticks([1 / 10, 1.0, 10]) - ax.set_yticklabels(("<= 1/10", "1.0", ">= 10")) - ax.set_ylabel('Model / Data') - ax.set_xlabel('Appointment Type') - ax.xaxis.grid(True, which='major', linestyle='--') - ax.yaxis.grid(True, which='both', linestyle='--') - ax.set_title(name_of_plot) - patch_real = matplotlib.patches.Patch(facecolor='yellowgreen', label='Adjusted Data') - patch_unadjusted_real = matplotlib.patches.Patch(facecolor='salmon', label='Unadjusted Data') - legend = plt.legend(handles=[patch_real, patch_unadjusted_real], loc='center left', bbox_to_anchor=(1.0, 0.5)) - fig.add_artist(legend) - fig.tight_layout() - fig.savefig(make_graph_file_name(name_of_plot.replace(',', '').replace('\n', '_').replace(' ', '_'))) - plt.show() + # get expected appt time and hcw capability + appt_time = get_expected_appt_time(resourcefilepath) + + hcw_capability = get_hcw_capability(resourcefilepath, hcwscenario='actual') + + # check that appts in simulation_usage are in appt_time + appts_def = set(appt_time.Appt_Type_Code) + appts_sim = set(simulation_usage.columns.values) + assert appts_sim.issubset(appts_def) + + # todo: plot hcw time usage against capability per cadre + # /facility level (focus on level 1a, 1b and 2)/disease (represented by short treatment id), + # and comparing 4 scenarios, i.e., Actual/Establishment(funded_plus) HCW * Default/Maximal health care seeking + + # hcw usage per cadre per facility level (1a, 1b, 2), against actual capability + hcw_usage = appt_time.drop(index=appt_time[~appt_time.Appt_Type_Code.isin(appts_sim)].index).reset_index(drop=True) + for idx in hcw_usage.index: + hcw_usage.loc[idx, 'Total_Mins_Used_Per_Year'] = (hcw_usage.loc[idx, 'Time_Taken_Mins'] * + simulation_usage.loc[hcw_usage.loc[idx, 'Facility_Level'], + hcw_usage.loc[idx, 'Appt_Type_Code']]) + hcw_usage = hcw_usage.groupby(['Facility_Level', 'Officer_Category'] + )['Total_Mins_Used_Per_Year'].sum().reset_index() + 
hcw_usage = hcw_usage.merge(hcw_capability, on=['Facility_Level', 'Officer_Category'], how='left') if __name__ == "__main__": @@ -557,8 +275,8 @@ def format_data_for_bar_plot(_usage): rfp = Path('./resources') # Find results folder (most recent run generated using that scenario_filename) - scenario_filename = 'long_run_all_diseases.py' - results_folder = get_scenario_outputs(scenario_filename, outputspath)[-1] + scenario_filename = '10_year_scale_run.py' + results_folder = get_scenario_outputs(scenario_filename, outputspath)[-4] # Test dataset: # results_folder = Path('/Users/tbh03/GitHub/TLOmodel/outputs/tbh03@ic.ac.uk/long_run_all_diseases-small') From 6822950bdbc37ae515ec1c5966d72999fb0b2c12 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 26 Jul 2023 16:49:29 +0100 Subject: [PATCH 089/131] further format the hcw usage --- ...analysis_hcw_usage_by_appt_and_by_disease.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index be58927d8c..3d0135e9f7 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -198,6 +198,17 @@ def get_expected_appt_time(resourcefilepath) -> pd.DataFrame: """This is to return the expected time requirements per appointment type per coarse cadre per facility level.""" expected_appt_time = pd.read_csv( resourcefilepath / 'healthsystem' / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Time_Table.csv') + appt_type = pd.read_csv( + resourcefilepath / 'healthsystem' / 'human_resources' / 'definitions' / 'ResourceFile_Appt_Types_Table.csv') + expected_appt_time = expected_appt_time.merge( + appt_type[['Appt_Type_Code', 'Appt_Cat']], on='Appt_Type_Code', how='left') + # rename 
Appt_Cat + appt_cat = {'GENERAL_INPATIENT_AND_OUTPATIENT_CARE': 'IPOP', + 'Nutrition': 'NUTRITION', + 'Misc': 'MISC', + 'Mental_Health': 'MENTAL'} + expected_appt_time['Appt_Cat'] = expected_appt_time['Appt_Cat'].replace(appt_cat) + expected_appt_time.rename(columns={'Appt_Cat': 'Appt_Category'}, inplace=True) return expected_appt_time @@ -259,15 +270,15 @@ def format_and_save(_fig, _ax, _name_of_plot): # /facility level (focus on level 1a, 1b and 2)/disease (represented by short treatment id), # and comparing 4 scenarios, i.e., Actual/Establishment(funded_plus) HCW * Default/Maximal health care seeking - # hcw usage per cadre per facility level (1a, 1b, 2), against actual capability + # hcw usage per cadre per facility level (1a, 1b, 2) per appointment type hcw_usage = appt_time.drop(index=appt_time[~appt_time.Appt_Type_Code.isin(appts_sim)].index).reset_index(drop=True) for idx in hcw_usage.index: hcw_usage.loc[idx, 'Total_Mins_Used_Per_Year'] = (hcw_usage.loc[idx, 'Time_Taken_Mins'] * simulation_usage.loc[hcw_usage.loc[idx, 'Facility_Level'], hcw_usage.loc[idx, 'Appt_Type_Code']]) - hcw_usage = hcw_usage.groupby(['Facility_Level', 'Officer_Category'] + hcw_usage.drop(columns='Time_Taken_Mins', inplace=True) + hcw_usage = hcw_usage.groupby(['Facility_Level', 'Officer_Category', 'Appt_Category'] )['Total_Mins_Used_Per_Year'].sum().reset_index() - hcw_usage = hcw_usage.merge(hcw_capability, on=['Facility_Level', 'Officer_Category'], how='left') if __name__ == "__main__": From d71d4a3278e123fe8a719aea3ad14ea03e44eb6b Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 26 Jul 2023 20:41:26 +0100 Subject: [PATCH 090/131] try get hsi count by appt type and facility level --- ...alysis_hcw_usage_by_appt_and_by_disease.py | 40 ++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py 
b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 3d0135e9f7..d787632840 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -1,4 +1,5 @@ from pathlib import Path +from collections import Counter import matplotlib import matplotlib.pyplot as plt @@ -6,7 +7,13 @@ import pandas as pd from tlo import Date -from tlo.analysis.utils import extract_results, get_scenario_outputs, summarize +from tlo.analysis.utils import ( + bin_hsi_event_details, + compute_mean_across_runs, + extract_results, + get_coarse_appt_type, + get_scenario_outputs, + summarize) PREFIX_ON_FILENAME = '6' @@ -66,6 +73,34 @@ def unpack_nested_dict_in_series(_raw: pd.Series): ).unstack().astype(int) +# todo: add level info +def get_annual_num_hsi_by_appt_and_level(results_folder: Path) -> pd.DataFrame: + """Return pd.DataFrame gives the (mean) simulated annual count of hsi + per treatment id per each appt type per level.""" + counts_by_treatment_id_and_coarse_appt_type = compute_mean_across_runs( + bin_hsi_event_details( + results_folder, + lambda event_details, count: sum( + [ + Counter({ + ( + event_details["treatment_id"].split("_")[0], + get_coarse_appt_type(appt_type) + ): + count * appt_number + }) + for appt_type, appt_number in event_details["appt_footprint"] + ], + Counter() + ), + *TARGET_PERIOD, + True + ) + )[0] + + return counts_by_treatment_id_and_coarse_appt_type + + def get_simulation_usage_by_level(results_folder: Path) -> pd.DataFrame: """Returns the simulated MEAN USAGE PER YEAR DURING THE TIME_PERIOD, by appointment type and level. 
""" @@ -280,6 +315,9 @@ def format_and_save(_fig, _ax, _name_of_plot): hcw_usage = hcw_usage.groupby(['Facility_Level', 'Officer_Category', 'Appt_Category'] )['Total_Mins_Used_Per_Year'].sum().reset_index() + # hcw usage per cadre per facility level per hsi + hsi_count = get_annual_num_hsi_by_appt_and_level(results_folder) + if __name__ == "__main__": outputspath = Path('./outputs/bshe@ic.ac.uk') From 4c93e17fc4fcd629eaa55e5ab3dde1665b4f106e Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 26 Jul 2023 21:13:44 +0100 Subject: [PATCH 091/131] get hsi count by treatment id and appt type and facility level --- .../analysis_hcw_usage_by_appt_and_by_disease.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index d787632840..742dd29859 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -73,19 +73,19 @@ def unpack_nested_dict_in_series(_raw: pd.Series): ).unstack().astype(int) -# todo: add level info def get_annual_num_hsi_by_appt_and_level(results_folder: Path) -> pd.DataFrame: """Return pd.DataFrame gives the (mean) simulated annual count of hsi per treatment id per each appt type per level.""" - counts_by_treatment_id_and_coarse_appt_type = compute_mean_across_runs( + hsi_count = compute_mean_across_runs( bin_hsi_event_details( results_folder, lambda event_details, count: sum( [ Counter({ ( - event_details["treatment_id"].split("_")[0], - get_coarse_appt_type(appt_type) + event_details['treatment_id'], + appt_type, + event_details['facility_level'], ): count * appt_number }) @@ -98,7 +98,12 @@ def get_annual_num_hsi_by_appt_and_level(results_folder: Path) -> pd.DataFrame: ) )[0] - return 
counts_by_treatment_id_and_coarse_appt_type + hsi_count = pd.DataFrame.from_dict(hsi_count, orient='index').reset_index().rename(columns={0: 'count'}) + hsi_count[['treatment_id', 'appt', 'facility_level']] = pd.DataFrame(hsi_count['index'].tolist(), + index=hsi_count.index) + hsi_count = hsi_count.groupby(['treatment_id', 'appt', 'facility_level'])['count'].sum().reset_index() + + return hsi_count def get_simulation_usage_by_level(results_folder: Path) -> pd.DataFrame: From a3e24190f099816ed88260878838811451c4ecee Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 28 Jul 2023 11:44:29 +0100 Subject: [PATCH 092/131] omit level info (as hsi calibration on faciliy level is not yet donw) and add todo tasks --- ...alysis_hcw_usage_by_appt_and_by_disease.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 742dd29859..cf89a747a1 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -73,6 +73,7 @@ def unpack_nested_dict_in_series(_raw: pd.Series): ).unstack().astype(int) +# todo: fix the issue that this func may over count the hsi def get_annual_num_hsi_by_appt_and_level(results_folder: Path) -> pd.DataFrame: """Return pd.DataFrame gives the (mean) simulated annual count of hsi per treatment id per each appt type per level.""" @@ -98,10 +99,10 @@ def get_annual_num_hsi_by_appt_and_level(results_folder: Path) -> pd.DataFrame: ) )[0] - hsi_count = pd.DataFrame.from_dict(hsi_count, orient='index').reset_index().rename(columns={0: 'count'}) - hsi_count[['treatment_id', 'appt', 'facility_level']] = pd.DataFrame(hsi_count['index'].tolist(), - index=hsi_count.index) - hsi_count = hsi_count.groupby(['treatment_id', 
'appt', 'facility_level'])['count'].sum().reset_index() + hsi_count = pd.DataFrame.from_dict(hsi_count, orient='index').reset_index().rename(columns={0: 'Count'}) + hsi_count[['Treatment_ID', 'Appt_Type_Code', 'Facility_Level']] = pd.DataFrame(hsi_count['index'].tolist(), + index=hsi_count.index) + hsi_count = hsi_count.groupby(['Treatment_ID', 'Appt_Type_Code', 'Facility_Level'])['Count'].sum().reset_index() return hsi_count @@ -310,18 +311,32 @@ def format_and_save(_fig, _ax, _name_of_plot): # /facility level (focus on level 1a, 1b and 2)/disease (represented by short treatment id), # and comparing 4 scenarios, i.e., Actual/Establishment(funded_plus) HCW * Default/Maximal health care seeking - # hcw usage per cadre per facility level (1a, 1b, 2) per appointment type + # hcw usage per cadre per appointment type hcw_usage = appt_time.drop(index=appt_time[~appt_time.Appt_Type_Code.isin(appts_sim)].index).reset_index(drop=True) for idx in hcw_usage.index: hcw_usage.loc[idx, 'Total_Mins_Used_Per_Year'] = (hcw_usage.loc[idx, 'Time_Taken_Mins'] * simulation_usage.loc[hcw_usage.loc[idx, 'Facility_Level'], hcw_usage.loc[idx, 'Appt_Type_Code']]) - hcw_usage.drop(columns='Time_Taken_Mins', inplace=True) - hcw_usage = hcw_usage.groupby(['Facility_Level', 'Officer_Category', 'Appt_Category'] + hcw_usage = hcw_usage.groupby(['Officer_Category', 'Appt_Category'] )['Total_Mins_Used_Per_Year'].sum().reset_index() - # hcw usage per cadre per facility level per hsi + # hcw usage per cadre per appt per hsi hsi_count = get_annual_num_hsi_by_appt_and_level(results_folder) + hsi_count_alt = hsi_count.groupby(['Appt_Type_Code', 'Facility_Level'])['Count'].sum().reset_index().pivot( + index='Facility_Level', columns='Appt_Type_Code', values='Count').fillna(0.0) + # todo: fix this assert error + assert (hsi_count_alt == simulation_usage.drop(index='4')).all().all() + + hcw_usage_hsi = appt_time.drop(index=appt_time[~appt_time.Appt_Type_Code.isin(appts_sim)].index + 
).reset_index(drop=True) + hcw_usage_hsi = hsi_count.merge(hcw_usage_hsi, on=['Facility_Level', 'Appt_Type_Code'], how='left') + hcw_usage_hsi['Total_Mins_Used_Per_Year'] = hcw_usage_hsi['Count'] * hcw_usage_hsi['Time_Taken_Mins'] + hcw_usage_hsi = hcw_usage_hsi.groupby(['Treatment_ID', 'Appt_Category', 'Officer_Category'] + )['Total_Mins_Used_Per_Year'].sum().reset_index() + hcw_usage_alt = hcw_usage_hsi.groupby(['Officer_Category', 'Appt_Category'] + )['Total_Mins_Used_Per_Year'].sum().reset_index() + # todo: check + assert (hcw_usage_alt == hcw_usage).all().all() if __name__ == "__main__": From 46c251b4ca9e4595473c267968801dab9376a9cc Mon Sep 17 00:00:00 2001 From: Bingling Date: Sat, 29 Jul 2023 18:39:40 +0100 Subject: [PATCH 093/131] plot simulated hcw time usage per cadre --- ...alysis_hcw_usage_by_appt_and_by_disease.py | 156 +++++++++++++++--- 1 file changed, 135 insertions(+), 21 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index cf89a747a1..86f2789d27 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -73,6 +73,72 @@ def unpack_nested_dict_in_series(_raw: pd.Series): ).unstack().astype(int) +def get_annual_hcw_time_used_with_confidence_interval(results_folder: Path, resourcefilepath: Path) -> pd.DataFrame: + """Return pd.DataFrame gives the (mean) simulated annual hcw time used per cadre across all levels, + with 95% confidence interval.""" + + def get_annual_hcw_time_used(_df) -> pd.Series: + """Get the annual hcw time used per cadre across all levels""" + + # get annual counts of appt per level + def unpack_nested_dict_in_series(_raw: pd.Series): + return pd.concat( + { + idx: pd.DataFrame.from_dict(mydict) for idx, mydict in _raw.items() + } + 
).unstack().fillna(0.0).astype(int) + + annual_counts_of_appts_per_level = _df \ + .loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), 'Number_By_Appt_Type_Code_And_Level'] \ + .pipe(unpack_nested_dict_in_series) \ + .groupby(level=[0, 1], axis=1).sum() \ + .mean(axis=0) \ + .to_frame().reset_index() \ + .rename(columns={'level_0': 'Facility_Level', 'level_1': 'Appt_Type_Code', 0: 'Count'}) \ + .pivot(index='Facility_Level', columns='Appt_Type_Code', values='Count') + + # get appt time definitions + appt_time = get_expected_appt_time(resourcefilepath) + + appts_def = set(appt_time.Appt_Type_Code) + appts_sim = set(annual_counts_of_appts_per_level.columns.values) + assert appts_sim.issubset(appts_def) + + # get hcw time used per cadre per level + _hcw_usage = appt_time.drop(index=appt_time[~appt_time.Appt_Type_Code.isin(appts_sim)].index).reset_index( + drop=True) + for idx in _hcw_usage.index: + fl = _hcw_usage.loc[idx, 'Facility_Level'] + appt = _hcw_usage.loc[idx, 'Appt_Type_Code'] + _hcw_usage.loc[idx, 'Total_Mins_Used_Per_Year'] = (_hcw_usage.loc[idx, 'Time_Taken_Mins'] * + annual_counts_of_appts_per_level.loc[fl, appt]) + + # get hcw time used per cadre + _hcw_usage = _hcw_usage.groupby(['Officer_Category'])['Total_Mins_Used_Per_Year'].sum() + + return _hcw_usage + + # get hcw time used per cadre with CI + hcw_usage = summarize( + extract_results( + results_folder, + module='tlo.methods.healthsystem.summary', + key='HSI_Event', + custom_generate_series=get_annual_hcw_time_used, + do_scaling=True + ), + only_mean=False, + collapse_columns=True, + ).unstack().astype(int) + + # reformat + hcw_usage = hcw_usage.to_frame().reset_index() \ + .rename(columns={'stat': 'Value_Type', 0: 'Value'}) \ + .pivot(index='Officer_Category', columns='Value_Type', values='Value') + + return hcw_usage + + # todo: fix the issue that this func may over count the hsi def get_annual_num_hsi_by_appt_and_level(results_folder: Path) -> pd.DataFrame: """Return pd.DataFrame gives the 
(mean) simulated annual count of hsi @@ -274,34 +340,14 @@ def apply(results_folder: Path, output_folder: Path, resourcefilepath: Path = No make_graph_file_name = lambda stub: output_folder / f"{PREFIX_ON_FILENAME}_{stub}.png" # noqa: E731 - # format plot - def format_and_save(_fig, _ax, _name_of_plot): - _ax.set_title(_name_of_plot) - _ax.set_yscale('log') - _ax.set_ylim(1 / 20, 20) - _ax.set_yticks([1 / 10, 1.0, 10]) - _ax.set_yticklabels(("<= 1/10", "1.0", ">= 10")) - _ax.set_ylabel('Model / Data') - _ax.set_xlabel('Appointment Type') - _ax.tick_params(axis='x', labelrotation=90) - _ax.xaxis.grid(True, which='major', linestyle='--') - _ax.yaxis.grid(True, which='both', linestyle='--') - _ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) - _fig.tight_layout() - _fig.savefig(make_graph_file_name(_name_of_plot.replace(',', '').replace('\n', '_').replace(' ', '_'))) - _fig.show() - plt.close(_fig) - # get average annual usage by level for Simulation and Real simulation_usage = get_simulation_usage_by_level(results_folder) real_usage = get_real_usage(resourcefilepath)[0] - # get expected appt time and hcw capability + # get expected appt time appt_time = get_expected_appt_time(resourcefilepath) - hcw_capability = get_hcw_capability(resourcefilepath, hcwscenario='actual') - # check that appts in simulation_usage are in appt_time appts_def = set(appt_time.Appt_Type_Code) appts_sim = set(simulation_usage.columns.values) @@ -320,6 +366,74 @@ def format_and_save(_fig, _ax, _name_of_plot): hcw_usage = hcw_usage.groupby(['Officer_Category', 'Appt_Category'] )['Total_Mins_Used_Per_Year'].sum().reset_index() + # check that hcw time simulated derived from different methods are equal (or with negligible difference) + hcw_time_used_1 = hcw_usage.groupby(['Officer_Category'])['Total_Mins_Used_Per_Year'].sum().to_frame() + hcw_time_used_0 = get_annual_hcw_time_used_with_confidence_interval(results_folder, resourcefilepath) + assert (hcw_time_used_1.index == 
hcw_time_used_0.index).all() + assert (abs( + (hcw_time_used_1['Total_Mins_Used_Per_Year'] - hcw_time_used_0['mean']) / hcw_time_used_0['mean']) < 1e-5 + ).all() + + # todo: get actual hcw time used derived from DHIS2 data and plot + + # format data and plot bar chart + def format_hcw_usage(hcwscenario='actual'): + """format data for bar plot""" + # get hcw capability in actual or establishment (funded_plus) scenarios + hcw_capability = get_hcw_capability(resourcefilepath, hcwscenario=hcwscenario) \ + .groupby('Officer_Category')['Total_Mins_Per_Year'].sum().to_frame() \ + .rename(columns={'Total_Mins_Per_Year': 'capability'}) + + # calculate hcw time usage ratio against capability with CI + hcw_usage_ratio = hcw_time_used_0.join(hcw_capability) + hcw_usage_ratio.loc['All'] = hcw_usage_ratio.sum() + hcw_usage_ratio['mean'] = hcw_usage_ratio['mean'] / hcw_usage_ratio['capability'] + hcw_usage_ratio['lower'] = hcw_usage_ratio['lower'] / hcw_usage_ratio['capability'] + hcw_usage_ratio['upper'] = hcw_usage_ratio['upper'] / hcw_usage_ratio['capability'] + + hcw_usage_ratio['lower_error'] = (hcw_usage_ratio['mean'] - hcw_usage_ratio['lower']) + hcw_usage_ratio['upper_error'] = (hcw_usage_ratio['upper'] - hcw_usage_ratio['mean']) + + asymmetric_error = [hcw_usage_ratio['lower_error'].values, hcw_usage_ratio['upper_error'].values] + hcw_usage_ratio = pd.DataFrame(hcw_usage_ratio['mean']) \ + .clip(lower=0.1, upper=10.0) + + # reduce the mean ratio by 1.0, for the bar plot that starts from y=1.0 instead of y=0.0 + hcw_usage_ratio['mean'] = hcw_usage_ratio['mean'] - 1.0 + + return hcw_usage_ratio, asymmetric_error + + hcw_usage_ratio_actual, error_actual = format_hcw_usage(hcwscenario='actual') + hcw_usage_ratio_establishment, error_establishment = format_hcw_usage(hcwscenario='funded_plus') + + name_of_plot = 'Simulated annual working time vs Capability per cadre' + fig, ax = plt.subplots(figsize=(8, 5)) + hcw_usage_ratio_establishment.plot(kind='bar', 
yerr=error_establishment, width=0.4, + ax=ax, position=0, bottom=1.0, + legend=False, color='c') + hcw_usage_ratio_actual.plot(kind='bar', yerr=error_actual, width=0.4, + ax=ax, position=1, bottom=1.0, + legend=False, color='y') + ax.axhline(1.0, color='r') + ax.set_xlim(right=len(hcw_usage_ratio_establishment) - 0.3) + ax.set_yscale('log') + ax.set_ylim(1 / 20, 20) + ax.set_yticks([1 / 10, 1.0, 10]) + ax.set_yticklabels(("<= 1/10", "1.0", ">= 10")) + ax.set_ylabel('Working time / Capability') + ax.set_xlabel('Cadre Category') + plt.xticks(rotation=60, ha='right') + ax.xaxis.grid(True, which='major', linestyle='--') + ax.yaxis.grid(True, which='both', linestyle='--') + ax.set_title(name_of_plot) + patch_establishment = matplotlib.patches.Patch(facecolor='c', label='Establishment capability') + patch_actual = matplotlib.patches.Patch(facecolor='y', label='Actual capability') + legend = plt.legend(handles=[patch_actual, patch_establishment], loc='center left', bbox_to_anchor=(1.0, 0.5)) + fig.add_artist(legend) + fig.tight_layout() + fig.savefig(make_graph_file_name(name_of_plot.replace(',', '').replace('\n', '_').replace(' ', '_'))) + plt.show() + # hcw usage per cadre per appt per hsi hsi_count = get_annual_num_hsi_by_appt_and_level(results_folder) hsi_count_alt = hsi_count.groupby(['Appt_Type_Code', 'Facility_Level'])['Count'].sum().reset_index().pivot( From 8e2789719b5942896b37c38cc114b1060724c105 Mon Sep 17 00:00:00 2001 From: Bingling Date: Sat, 29 Jul 2023 18:55:47 +0100 Subject: [PATCH 094/131] minor update --- .../analysis_hcw_usage_by_appt_and_by_disease.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 86f2789d27..91d27ac298 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ 
b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -84,7 +84,7 @@ def get_annual_hcw_time_used(_df) -> pd.Series: def unpack_nested_dict_in_series(_raw: pd.Series): return pd.concat( { - idx: pd.DataFrame.from_dict(mydict) for idx, mydict in _raw.items() + _idx: pd.DataFrame.from_dict(mydict) for _idx, mydict in _raw.items() } ).unstack().fillna(0.0).astype(int) @@ -111,7 +111,7 @@ def unpack_nested_dict_in_series(_raw: pd.Series): fl = _hcw_usage.loc[idx, 'Facility_Level'] appt = _hcw_usage.loc[idx, 'Appt_Type_Code'] _hcw_usage.loc[idx, 'Total_Mins_Used_Per_Year'] = (_hcw_usage.loc[idx, 'Time_Taken_Mins'] * - annual_counts_of_appts_per_level.loc[fl, appt]) + annual_counts_of_appts_per_level.loc[fl, appt]) # get hcw time used per cadre _hcw_usage = _hcw_usage.groupby(['Officer_Category'])['Total_Mins_Used_Per_Year'].sum() @@ -371,7 +371,7 @@ def apply(results_folder: Path, output_folder: Path, resourcefilepath: Path = No hcw_time_used_0 = get_annual_hcw_time_used_with_confidence_interval(results_folder, resourcefilepath) assert (hcw_time_used_1.index == hcw_time_used_0.index).all() assert (abs( - (hcw_time_used_1['Total_Mins_Used_Per_Year'] - hcw_time_used_0['mean']) / hcw_time_used_0['mean']) < 1e-5 + (hcw_time_used_1['Total_Mins_Used_Per_Year'] - hcw_time_used_0['mean']) / hcw_time_used_0['mean']) < 1e-4 ).all() # todo: get actual hcw time used derived from DHIS2 data and plot From 0025b1ac8525bbd07c40a4836e811ecd7877880e Mon Sep 17 00:00:00 2001 From: Bingling Date: Sat, 29 Jul 2023 20:54:35 +0100 Subject: [PATCH 095/131] minor but crucial update --- .../analysis_hcw_usage_by_appt_and_by_disease.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 91d27ac298..6ca9407303 100644 --- 
a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -129,7 +129,7 @@ def unpack_nested_dict_in_series(_raw: pd.Series): ), only_mean=False, collapse_columns=True, - ).unstack().astype(int) + ).unstack() # reformat hcw_usage = hcw_usage.to_frame().reset_index() \ From 53fb6412cdb29f012fb482338e05b6c7e312702f Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 31 Jul 2023 10:56:37 +0100 Subject: [PATCH 096/131] fix the hsi count issue: should calculate average annual count --- ...alysis_hcw_usage_by_appt_and_by_disease.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 6ca9407303..8b41357698 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -168,7 +168,10 @@ def get_annual_num_hsi_by_appt_and_level(results_folder: Path) -> pd.DataFrame: hsi_count = pd.DataFrame.from_dict(hsi_count, orient='index').reset_index().rename(columns={0: 'Count'}) hsi_count[['Treatment_ID', 'Appt_Type_Code', 'Facility_Level']] = pd.DataFrame(hsi_count['index'].tolist(), index=hsi_count.index) - hsi_count = hsi_count.groupby(['Treatment_ID', 'Appt_Type_Code', 'Facility_Level'])['Count'].sum().reset_index() + # average annual count by treatment id, appt type and facility level + yr_count = TARGET_PERIOD[1].year - TARGET_PERIOD[0].year + 1 + hsi_count = hsi_count.groupby(['Treatment_ID', 'Appt_Type_Code', 'Facility_Level'])['Count'].sum()/yr_count + hsi_count = hsi_count.to_frame().reset_index() return hsi_count @@ -353,10 +356,6 @@ def apply(results_folder: Path, output_folder: Path, 
resourcefilepath: Path = No appts_sim = set(simulation_usage.columns.values) assert appts_sim.issubset(appts_def) - # todo: plot hcw time usage against capability per cadre - # /facility level (focus on level 1a, 1b and 2)/disease (represented by short treatment id), - # and comparing 4 scenarios, i.e., Actual/Establishment(funded_plus) HCW * Default/Maximal health care seeking - # hcw usage per cadre per appointment type hcw_usage = appt_time.drop(index=appt_time[~appt_time.Appt_Type_Code.isin(appts_sim)].index).reset_index(drop=True) for idx in hcw_usage.index: @@ -436,21 +435,31 @@ def format_hcw_usage(hcwscenario='actual'): # hcw usage per cadre per appt per hsi hsi_count = get_annual_num_hsi_by_appt_and_level(results_folder) + + # first check that hsi count by different methods are equal (or with small difference) hsi_count_alt = hsi_count.groupby(['Appt_Type_Code', 'Facility_Level'])['Count'].sum().reset_index().pivot( index='Facility_Level', columns='Appt_Type_Code', values='Count').fillna(0.0) - # todo: fix this assert error - assert (hsi_count_alt == simulation_usage.drop(index='4')).all().all() + assert (hsi_count_alt - simulation_usage.drop(index='4') < 1.0).all().all() + # then calculate the hcw working time per treatment id, appt type and cadre hcw_usage_hsi = appt_time.drop(index=appt_time[~appt_time.Appt_Type_Code.isin(appts_sim)].index ).reset_index(drop=True) hcw_usage_hsi = hsi_count.merge(hcw_usage_hsi, on=['Facility_Level', 'Appt_Type_Code'], how='left') hcw_usage_hsi['Total_Mins_Used_Per_Year'] = hcw_usage_hsi['Count'] * hcw_usage_hsi['Time_Taken_Mins'] hcw_usage_hsi = hcw_usage_hsi.groupby(['Treatment_ID', 'Appt_Category', 'Officer_Category'] )['Total_Mins_Used_Per_Year'].sum().reset_index() + + # also check that the hcw time from different methods are equal (or with small difference) hcw_usage_alt = hcw_usage_hsi.groupby(['Officer_Category', 'Appt_Category'] )['Total_Mins_Used_Per_Year'].sum().reset_index() - # todo: check - assert 
(hcw_usage_alt == hcw_usage).all().all() + assert (hcw_usage_alt.Officer_Category == hcw_usage.Officer_Category).all + assert (hcw_usage_alt.Appt_Category == hcw_usage.Appt_Category).all() + assert ((abs(hcw_usage_alt.Total_Mins_Used_Per_Year - hcw_usage.Total_Mins_Used_Per_Year) / + hcw_usage.Total_Mins_Used_Per_Year) < 1e-4 + ).all().all() + + # save the data to draw sankey diagram + hcw_usage_hsi.to_csv(output_folder/'hcw_working_time_per_hsi.csv', index=False) if __name__ == "__main__": From 872555c2128423e769671aeb2917cbfd223d1f5f Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 31 Jul 2023 11:06:01 +0100 Subject: [PATCH 097/131] reorganise the script --- ...alysis_hcw_usage_by_appt_and_by_disease.py | 164 +++++++++--------- 1 file changed, 82 insertions(+), 82 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 8b41357698..fb9690715d 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -73,6 +73,52 @@ def unpack_nested_dict_in_series(_raw: pd.Series): ).unstack().astype(int) +def get_annual_num_hsi_by_appt_and_level(results_folder: Path) -> pd.DataFrame: + """Return pd.DataFrame gives the (mean) simulated annual count of hsi + per treatment id per each appt type per level.""" + hsi_count = compute_mean_across_runs( + bin_hsi_event_details( + results_folder, + lambda event_details, count: sum( + [ + Counter({ + ( + event_details['treatment_id'], + appt_type, + event_details['facility_level'], + ): + count * appt_number + }) + for appt_type, appt_number in event_details["appt_footprint"] + ], + Counter() + ), + *TARGET_PERIOD, + True + ) + )[0] + + hsi_count = pd.DataFrame.from_dict(hsi_count, orient='index').reset_index().rename(columns={0: 'Count'}) + 
hsi_count[['Treatment_ID', 'Appt_Type_Code', 'Facility_Level']] = pd.DataFrame(hsi_count['index'].tolist(), + index=hsi_count.index) + # average annual count by treatment id, appt type and facility level + yr_count = TARGET_PERIOD[1].year - TARGET_PERIOD[0].year + 1 + hsi_count = hsi_count.groupby(['Treatment_ID', 'Appt_Type_Code', 'Facility_Level'])['Count'].sum()/yr_count + hsi_count = hsi_count.to_frame().reset_index() + + return hsi_count + + +def get_simulation_usage_by_level(results_folder: Path) -> pd.DataFrame: + """Returns the simulated MEAN USAGE PER YEAR DURING THE TIME_PERIOD, by appointment type and level. + """ + + # Get model outputs + model_output = get_annual_num_appts_by_level(results_folder=results_folder) + + return model_output + + def get_annual_hcw_time_used_with_confidence_interval(results_folder: Path, resourcefilepath: Path) -> pd.DataFrame: """Return pd.DataFrame gives the (mean) simulated annual hcw time used per cadre across all levels, with 95% confidence interval.""" @@ -139,53 +185,6 @@ def unpack_nested_dict_in_series(_raw: pd.Series): return hcw_usage -# todo: fix the issue that this func may over count the hsi -def get_annual_num_hsi_by_appt_and_level(results_folder: Path) -> pd.DataFrame: - """Return pd.DataFrame gives the (mean) simulated annual count of hsi - per treatment id per each appt type per level.""" - hsi_count = compute_mean_across_runs( - bin_hsi_event_details( - results_folder, - lambda event_details, count: sum( - [ - Counter({ - ( - event_details['treatment_id'], - appt_type, - event_details['facility_level'], - ): - count * appt_number - }) - for appt_type, appt_number in event_details["appt_footprint"] - ], - Counter() - ), - *TARGET_PERIOD, - True - ) - )[0] - - hsi_count = pd.DataFrame.from_dict(hsi_count, orient='index').reset_index().rename(columns={0: 'Count'}) - hsi_count[['Treatment_ID', 'Appt_Type_Code', 'Facility_Level']] = pd.DataFrame(hsi_count['index'].tolist(), - index=hsi_count.index) - # average 
annual count by treatment id, appt type and facility level - yr_count = TARGET_PERIOD[1].year - TARGET_PERIOD[0].year + 1 - hsi_count = hsi_count.groupby(['Treatment_ID', 'Appt_Type_Code', 'Facility_Level'])['Count'].sum()/yr_count - hsi_count = hsi_count.to_frame().reset_index() - - return hsi_count - - -def get_simulation_usage_by_level(results_folder: Path) -> pd.DataFrame: - """Returns the simulated MEAN USAGE PER YEAR DURING THE TIME_PERIOD, by appointment type and level. - """ - - # Get model outputs - model_output = get_annual_num_appts_by_level(results_folder=results_folder) - - return model_output - - def adjust_real_usage_on_mentalall(real_usage_df) -> pd.DataFrame: """This is to adjust the annual MentalAll usage in real usage dataframe. The MentalAll usage was not adjusted in the preprocessing stage considering individual facilities and very low @@ -343,39 +342,7 @@ def apply(results_folder: Path, output_folder: Path, resourcefilepath: Path = No make_graph_file_name = lambda stub: output_folder / f"{PREFIX_ON_FILENAME}_{stub}.png" # noqa: E731 - # get average annual usage by level for Simulation and Real - simulation_usage = get_simulation_usage_by_level(results_folder) - - real_usage = get_real_usage(resourcefilepath)[0] - - # get expected appt time - appt_time = get_expected_appt_time(resourcefilepath) - - # check that appts in simulation_usage are in appt_time - appts_def = set(appt_time.Appt_Type_Code) - appts_sim = set(simulation_usage.columns.values) - assert appts_sim.issubset(appts_def) - - # hcw usage per cadre per appointment type - hcw_usage = appt_time.drop(index=appt_time[~appt_time.Appt_Type_Code.isin(appts_sim)].index).reset_index(drop=True) - for idx in hcw_usage.index: - hcw_usage.loc[idx, 'Total_Mins_Used_Per_Year'] = (hcw_usage.loc[idx, 'Time_Taken_Mins'] * - simulation_usage.loc[hcw_usage.loc[idx, 'Facility_Level'], - hcw_usage.loc[idx, 'Appt_Type_Code']]) - hcw_usage = hcw_usage.groupby(['Officer_Category', 'Appt_Category'] - 
)['Total_Mins_Used_Per_Year'].sum().reset_index() - - # check that hcw time simulated derived from different methods are equal (or with negligible difference) - hcw_time_used_1 = hcw_usage.groupby(['Officer_Category'])['Total_Mins_Used_Per_Year'].sum().to_frame() - hcw_time_used_0 = get_annual_hcw_time_used_with_confidence_interval(results_folder, resourcefilepath) - assert (hcw_time_used_1.index == hcw_time_used_0.index).all() - assert (abs( - (hcw_time_used_1['Total_Mins_Used_Per_Year'] - hcw_time_used_0['mean']) / hcw_time_used_0['mean']) < 1e-4 - ).all() - - # todo: get actual hcw time used derived from DHIS2 data and plot - - # format data and plot bar chart + # format data and plot bar chart for hcw working time per cadre def format_hcw_usage(hcwscenario='actual'): """format data for bar plot""" # get hcw capability in actual or establishment (funded_plus) scenarios @@ -384,7 +351,8 @@ def format_hcw_usage(hcwscenario='actual'): .rename(columns={'Total_Mins_Per_Year': 'capability'}) # calculate hcw time usage ratio against capability with CI - hcw_usage_ratio = hcw_time_used_0.join(hcw_capability) + hcw_usage = get_annual_hcw_time_used_with_confidence_interval(results_folder, resourcefilepath) + hcw_usage_ratio = hcw_usage.join(hcw_capability) hcw_usage_ratio.loc['All'] = hcw_usage_ratio.sum() hcw_usage_ratio['mean'] = hcw_usage_ratio['mean'] / hcw_usage_ratio['capability'] hcw_usage_ratio['lower'] = hcw_usage_ratio['lower'] / hcw_usage_ratio['capability'] @@ -433,6 +401,38 @@ def format_hcw_usage(hcwscenario='actual'): fig.savefig(make_graph_file_name(name_of_plot.replace(',', '').replace('\n', '_').replace(' ', '_'))) plt.show() + # todo: get actual hcw time used derived from DHIS2 data and plot + + # get average annual usage by level for Simulation and Real + simulation_usage = get_simulation_usage_by_level(results_folder) + + real_usage = get_real_usage(resourcefilepath)[0] + + # get expected appt time + appt_time = 
get_expected_appt_time(resourcefilepath) + + # check that appts in simulation_usage are in appt_time + appts_def = set(appt_time.Appt_Type_Code) + appts_sim = set(simulation_usage.columns.values) + assert appts_sim.issubset(appts_def) + + # hcw usage per cadre per appointment type + hcw_usage = appt_time.drop(index=appt_time[~appt_time.Appt_Type_Code.isin(appts_sim)].index).reset_index(drop=True) + for idx in hcw_usage.index: + hcw_usage.loc[idx, 'Total_Mins_Used_Per_Year'] = (hcw_usage.loc[idx, 'Time_Taken_Mins'] * + simulation_usage.loc[hcw_usage.loc[idx, 'Facility_Level'], + hcw_usage.loc[idx, 'Appt_Type_Code']]) + hcw_usage = hcw_usage.groupby(['Officer_Category', 'Appt_Category'] + )['Total_Mins_Used_Per_Year'].sum().reset_index() + + # check that hcw time simulated derived from different methods are equal (or with negligible difference) + hcw_time_used_1 = hcw_usage.groupby(['Officer_Category'])['Total_Mins_Used_Per_Year'].sum().to_frame() + hcw_time_used_0 = get_annual_hcw_time_used_with_confidence_interval(results_folder, resourcefilepath) + assert (hcw_time_used_1.index == hcw_time_used_0.index).all() + assert (abs( + (hcw_time_used_1['Total_Mins_Used_Per_Year'] - hcw_time_used_0['mean']) / hcw_time_used_0['mean']) < 1e-4 + ).all() + # hcw usage per cadre per appt per hsi hsi_count = get_annual_num_hsi_by_appt_and_level(results_folder) @@ -468,7 +468,7 @@ def format_hcw_usage(hcwscenario='actual'): # Find results folder (most recent run generated using that scenario_filename) scenario_filename = '10_year_scale_run.py' - results_folder = get_scenario_outputs(scenario_filename, outputspath)[-4] + results_folder = get_scenario_outputs(scenario_filename, outputspath)[-1] # Test dataset: # results_folder = Path('/Users/tbh03/GitHub/TLOmodel/outputs/tbh03@ic.ac.uk/long_run_all_diseases-small') From 5ebd88f1b88463fed145dc5e0f04edf289752555 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 31 Jul 2023 11:20:20 +0100 Subject: [PATCH 098/131] delete experimental 
coding --- ...alysis_hcw_usage_by_appt_and_by_disease.py | 89 +------------------ 1 file changed, 4 insertions(+), 85 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index fb9690715d..db55aa7600 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -40,39 +40,6 @@ } -def get_annual_num_appts_by_level(results_folder: Path) -> pd.DataFrame: - """Return pd.DataFrame gives the (mean) simulated annual number of appointments of each type at each level.""" - - def get_counts_of_appts(_df): - """Get the mean number of appointments of each type being used each year at each level. - Need to rename appts to match standardized categories from the DHIS2 data.""" - - def unpack_nested_dict_in_series(_raw: pd.Series): - return pd.concat( - { - idx: pd.DataFrame.from_dict(mydict) for idx, mydict in _raw.items() - } - ).unstack().fillna(0.0).astype(int) - - return _df \ - .loc[pd.to_datetime(_df['date']).between(*TARGET_PERIOD), 'Number_By_Appt_Type_Code_And_Level'] \ - .pipe(unpack_nested_dict_in_series) \ - .groupby(level=[0, 1], axis=1).sum() \ - .mean(axis=0) # mean over each year (row) - - return summarize( - extract_results( - results_folder, - module='tlo.methods.healthsystem.summary', - key='HSI_Event', - custom_generate_series=get_counts_of_appts, - do_scaling=True - ), - only_mean=True, - collapse_columns=True, - ).unstack().astype(int) - - def get_annual_num_hsi_by_appt_and_level(results_folder: Path) -> pd.DataFrame: """Return pd.DataFrame gives the (mean) simulated annual count of hsi per treatment id per each appt type per level.""" @@ -109,16 +76,6 @@ def get_annual_num_hsi_by_appt_and_level(results_folder: Path) -> pd.DataFrame: return hsi_count -def 
get_simulation_usage_by_level(results_folder: Path) -> pd.DataFrame: - """Returns the simulated MEAN USAGE PER YEAR DURING THE TIME_PERIOD, by appointment type and level. - """ - - # Get model outputs - model_output = get_annual_num_appts_by_level(results_folder=results_folder) - - return model_output - - def get_annual_hcw_time_used_with_confidence_interval(results_folder: Path, resourcefilepath: Path) -> pd.DataFrame: """Return pd.DataFrame gives the (mean) simulated annual hcw time used per cadre across all levels, with 95% confidence interval.""" @@ -403,44 +360,15 @@ def format_hcw_usage(hcwscenario='actual'): # todo: get actual hcw time used derived from DHIS2 data and plot - # get average annual usage by level for Simulation and Real - simulation_usage = get_simulation_usage_by_level(results_folder) - - real_usage = get_real_usage(resourcefilepath)[0] + # hcw usage per cadre per appt per hsi + hsi_count = get_annual_num_hsi_by_appt_and_level(results_folder) - # get expected appt time + # first compare appts defined and appts in simulation/model appt_time = get_expected_appt_time(resourcefilepath) - - # check that appts in simulation_usage are in appt_time appts_def = set(appt_time.Appt_Type_Code) - appts_sim = set(simulation_usage.columns.values) + appts_sim = set(hsi_count.Appt_Type_Code) assert appts_sim.issubset(appts_def) - # hcw usage per cadre per appointment type - hcw_usage = appt_time.drop(index=appt_time[~appt_time.Appt_Type_Code.isin(appts_sim)].index).reset_index(drop=True) - for idx in hcw_usage.index: - hcw_usage.loc[idx, 'Total_Mins_Used_Per_Year'] = (hcw_usage.loc[idx, 'Time_Taken_Mins'] * - simulation_usage.loc[hcw_usage.loc[idx, 'Facility_Level'], - hcw_usage.loc[idx, 'Appt_Type_Code']]) - hcw_usage = hcw_usage.groupby(['Officer_Category', 'Appt_Category'] - )['Total_Mins_Used_Per_Year'].sum().reset_index() - - # check that hcw time simulated derived from different methods are equal (or with negligible difference) - hcw_time_used_1 = 
hcw_usage.groupby(['Officer_Category'])['Total_Mins_Used_Per_Year'].sum().to_frame() - hcw_time_used_0 = get_annual_hcw_time_used_with_confidence_interval(results_folder, resourcefilepath) - assert (hcw_time_used_1.index == hcw_time_used_0.index).all() - assert (abs( - (hcw_time_used_1['Total_Mins_Used_Per_Year'] - hcw_time_used_0['mean']) / hcw_time_used_0['mean']) < 1e-4 - ).all() - - # hcw usage per cadre per appt per hsi - hsi_count = get_annual_num_hsi_by_appt_and_level(results_folder) - - # first check that hsi count by different methods are equal (or with small difference) - hsi_count_alt = hsi_count.groupby(['Appt_Type_Code', 'Facility_Level'])['Count'].sum().reset_index().pivot( - index='Facility_Level', columns='Appt_Type_Code', values='Count').fillna(0.0) - assert (hsi_count_alt - simulation_usage.drop(index='4') < 1.0).all().all() - # then calculate the hcw working time per treatment id, appt type and cadre hcw_usage_hsi = appt_time.drop(index=appt_time[~appt_time.Appt_Type_Code.isin(appts_sim)].index ).reset_index(drop=True) @@ -449,15 +377,6 @@ def format_hcw_usage(hcwscenario='actual'): hcw_usage_hsi = hcw_usage_hsi.groupby(['Treatment_ID', 'Appt_Category', 'Officer_Category'] )['Total_Mins_Used_Per_Year'].sum().reset_index() - # also check that the hcw time from different methods are equal (or with small difference) - hcw_usage_alt = hcw_usage_hsi.groupby(['Officer_Category', 'Appt_Category'] - )['Total_Mins_Used_Per_Year'].sum().reset_index() - assert (hcw_usage_alt.Officer_Category == hcw_usage.Officer_Category).all - assert (hcw_usage_alt.Appt_Category == hcw_usage.Appt_Category).all() - assert ((abs(hcw_usage_alt.Total_Mins_Used_Per_Year - hcw_usage.Total_Mins_Used_Per_Year) / - hcw_usage.Total_Mins_Used_Per_Year) < 1e-4 - ).all().all() - # save the data to draw sankey diagram hcw_usage_hsi.to_csv(output_folder/'hcw_working_time_per_hsi.csv', index=False) From 6dae0678609f1ad356f4d4dee475aeb169f93399 Mon Sep 17 00:00:00 2001 From: Bingling Date: 
Mon, 31 Jul 2023 11:55:26 +0100 Subject: [PATCH 099/131] comment on todo task of drawing plot for DHIS2 data --- .../analysis_hcw_usage_by_appt_and_by_disease.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index db55aa7600..192239da5a 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -358,8 +358,6 @@ def format_hcw_usage(hcwscenario='actual'): fig.savefig(make_graph_file_name(name_of_plot.replace(',', '').replace('\n', '_').replace(' ', '_'))) plt.show() - # todo: get actual hcw time used derived from DHIS2 data and plot - # hcw usage per cadre per appt per hsi hsi_count = get_annual_num_hsi_by_appt_and_level(results_folder) @@ -380,6 +378,18 @@ def format_hcw_usage(hcwscenario='actual'): # save the data to draw sankey diagram hcw_usage_hsi.to_csv(output_folder/'hcw_working_time_per_hsi.csv', index=False) + # todo: get actual hcw time used derived from DHIS2 data and plot + # It is tricky that dhis2 datasets have different appt types, thus + # to make the intended comparison of hcw time usage between simulation and data, + # we need first to select the common appts/remap some appts and then make the comparison. + # A simpler way is to not draw this plot, + # but add discussion and appendix to explain the difference between simulation and data usage per appts. 
+ + # dhis_usage = get_real_usage(resourcefilepath)[0] + + # first compare appts defined and appts in DHIS2 and HIV Dept datasets + # appts_dhis = set(dhis_usage.columns.values) + if __name__ == "__main__": outputspath = Path('./outputs/bshe@ic.ac.uk') From 703313631a9bf25c0efb8a77bcf2d040019e6bb6 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 31 Jul 2023 11:58:04 +0100 Subject: [PATCH 100/131] currently delete the todo task of drawing the plots for DHIS2 data --- ...alysis_hcw_usage_by_appt_and_by_disease.py | 130 ------------------ 1 file changed, 130 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 192239da5a..42e8c90f1e 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -142,124 +142,6 @@ def unpack_nested_dict_in_series(_raw: pd.Series): return hcw_usage -def adjust_real_usage_on_mentalall(real_usage_df) -> pd.DataFrame: - """This is to adjust the annual MentalAll usage in real usage dataframe. - The MentalAll usage was not adjusted in the preprocessing stage considering individual facilities and very low - reporting rates. - We now directly adjust its annual usage by facility level using the aggregated annual reporting rates by - facility level. 
The latter is calculated based on DHIS2 Mental Health Report reporting rates.""" - # the annual reporting rates for Mental Health Report by facility level (%), 2015-2019 - # could turn the reporting rates data into a ResourceFile if necessary - rr = pd.DataFrame(index=['1a', '1b', '2', '3'], columns=list(range(2015, 2020)), - data=[[44.00, 39.33, 79.00, 97.33, 95.00], - [10.42, 12.50, 25.00, 40.00, 68.33], - [36.67, 39.44, 37.22, 63.89, 56.67], - [50.00, 45.83, 45.83, 50.00, 45.83]]) - # make the adjustment assuming 100% reporting rates for each year - for level in ['1a', '1b', '2', '3']: - for y in range(2015, 2020): - real_usage_df.loc[(real_usage_df.Facility_Level == level) - & (real_usage_df.Year == y) - & (real_usage_df.Appt_Type == 'MentalAll'), 'Usage'] = ( - real_usage_df.loc[(real_usage_df.Facility_Level == level) - & (real_usage_df.Year == y) - & (real_usage_df.Appt_Type == 'MentalAll'), 'Usage'] * 100 / rr.loc[level, y] - ) - - return real_usage_df - - -def get_real_usage(resourcefilepath, adjusted=True) -> pd.DataFrame: - """ - Returns the adjusted (default) or unadjusted real data on the (MEAN) USAGE PER YEAR DURING THE TIME_PERIOD - for each appointment at each level and all levels. 
- """ - - # add facility level and district columns to both real and simulation usage - mfl = pd.read_csv( - resourcefilepath / 'healthsystem' / 'organisation' / 'ResourceFile_Master_Facilities_List.csv') - - # Get real usage data - # For the details of adjustment of real usage data, refer to Paper - # "The Changes in Health Service Utilisation in Malawi during the COVID-19 Pandemic" - if adjusted: - real_usage = pd.read_csv( - resourcefilepath / 'healthsystem' / 'real_appt_usage_data' / - 'real_monthly_usage_of_appt_type.csv') - else: - real_usage = pd.read_csv( - resourcefilepath / 'healthsystem' / 'real_appt_usage_data' / - 'unadjusted_real_monthly_usage_of_appt_type.csv') - - # add Csection usage to Delivery, as Delivery has excluded Csection in real data file (to avoid overlap) - # whereas Delivery in tlo output has included Csection - real_delivery = real_usage.loc[(real_usage.Appt_Type == 'Delivery') | (real_usage.Appt_Type == 'Csection') - ].groupby(['Year', 'Month', 'Facility_ID']).agg({'Usage': 'sum'}).reset_index() - real_delivery['Appt_Type'] = 'Delivery' - real_usage = pd.concat([real_usage.drop(real_usage[real_usage.Appt_Type == 'Delivery'].index), - real_delivery]) - - # get facility_level for each record - real_usage = real_usage.merge(mfl[['Facility_ID', 'Facility_Level']], left_on='Facility_ID', right_on='Facility_ID') - - # adjust annual MentalAll usage using annual reporting rates - if adjusted: - real_usage = adjust_real_usage_on_mentalall(real_usage) - - # assign date to each record - real_usage['date'] = pd.to_datetime({'year': real_usage['Year'], 'month': real_usage['Month'], 'day': 1}) - - # Produce table of the AVERAGE NUMBER PER YEAR DURING THE TIME_PERIOD of appointment type by level - # limit to date - totals_by_year = real_usage \ - .loc[real_usage['date'].between(*TARGET_PERIOD)] \ - .groupby(['Year', 'Appt_Type', 'Facility_Level'])['Usage'].sum() - - # Combine the TB data [which is yearly] (after dropping period outside 2017-2019 
according to data consistency - # and pandemic) with the rest of the data. - # Note that TB data is not adjusted considering comparability with NTP reports. - real_usage_TB = pd.read_csv( - resourcefilepath / 'healthsystem' / 'real_appt_usage_data' / 'real_yearly_usage_of_TBNotifiedAll.csv') - real_usage_TB = real_usage_TB.loc[real_usage_TB['Year'].isin([2017, 2018, 2019])] - real_usage_TB = real_usage_TB.merge(mfl[['Facility_ID', 'Facility_Level']], - left_on='Facility_ID', right_on='Facility_ID') - totals_by_year_TB = real_usage_TB.groupby(['Year', 'Appt_Type', 'Facility_Level'])['Usage'].sum() - - annual_usage_by_level = pd.concat([totals_by_year.reset_index(), totals_by_year_TB.reset_index()], axis=0) - - # group levels 1b and 2 into 2 - # annual_usage_by_level['Facility_Level'] = annual_usage_by_level['Facility_Level'].replace({'1b': '2'}) - annual_usage_by_level = annual_usage_by_level.groupby( - ['Year', 'Appt_Type', 'Facility_Level'])['Usage'].sum().reset_index() - - # prepare annual usage by level with mean, 97.5% percentile, and 2.5% percentile - annual_usage_by_level_with_ci = annual_usage_by_level.drop(columns='Year').groupby( - ['Appt_Type', 'Facility_Level'] - ).describe(percentiles=[0.025, 0.975] - ).stack(level=[0])[['mean', '2.5%', '97.5%']].reset_index().drop(columns='level_2') - - average_annual_by_level = annual_usage_by_level_with_ci[['Appt_Type', 'Facility_Level', 'mean']].set_index( - ['Appt_Type', 'Facility_Level']).unstack() - average_annual_by_level.columns = average_annual_by_level.columns.get_level_values(1) - average_annual_by_level = average_annual_by_level.T - - annual_usage_by_level_with_ci = pd.melt(annual_usage_by_level_with_ci, - id_vars=['Appt_Type', 'Facility_Level'], var_name='value_type') - annual_usage_by_level_with_ci.value_type = annual_usage_by_level_with_ci.value_type.replace({'2.5%': 'lower', - '97.5%': 'upper'}) - - # prepare annual usage at all levels with mean, 97.5% percentile, and 2.5% percentile - 
annual_usage_with_ci = annual_usage_by_level.groupby( - ['Year', 'Appt_Type'])['Usage'].sum().reset_index().drop(columns='Year').groupby( - 'Appt_Type').describe(percentiles=[0.025, 0.975] - ).stack(level=[0])[['mean', '2.5%', '97.5%']].reset_index().drop(columns='level_1') - annual_usage_with_ci = pd.melt(annual_usage_with_ci, - id_vars='Appt_Type', var_name='value_type') - annual_usage_with_ci.value_type = annual_usage_with_ci.value_type.replace({'2.5%': 'lower', '97.5%': 'upper'}) - - return average_annual_by_level, annual_usage_by_level_with_ci, annual_usage_with_ci - - def get_expected_appt_time(resourcefilepath) -> pd.DataFrame: """This is to return the expected time requirements per appointment type per coarse cadre per facility level.""" expected_appt_time = pd.read_csv( @@ -378,18 +260,6 @@ def format_hcw_usage(hcwscenario='actual'): # save the data to draw sankey diagram hcw_usage_hsi.to_csv(output_folder/'hcw_working_time_per_hsi.csv', index=False) - # todo: get actual hcw time used derived from DHIS2 data and plot - # It is tricky that dhis2 datasets have different appt types, thus - # to make the intended comparison of hcw time usage between simulation and data, - # we need first to select the common appts/remap some appts and then make the comparison. - # A simpler way is to not draw this plot, - # but add discussion and appendix to explain the difference between simulation and data usage per appts. 
- - # dhis_usage = get_real_usage(resourcefilepath)[0] - - # first compare appts defined and appts in DHIS2 and HIV Dept datasets - # appts_dhis = set(dhis_usage.columns.values) - if __name__ == "__main__": outputspath = Path('./outputs/bshe@ic.ac.uk') From f8b106b3bac56983f85db0a5b23c3af1db49d040 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 31 Jul 2023 14:51:53 +0100 Subject: [PATCH 101/131] rename Nursing_and_Midwifery --- .../analysis_hcw_usage_by_appt_and_by_disease.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 42e8c90f1e..0f28655046 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -207,6 +207,9 @@ def format_hcw_usage(hcwscenario='actual'): # reduce the mean ratio by 1.0, for the bar plot that starts from y=1.0 instead of y=0.0 hcw_usage_ratio['mean'] = hcw_usage_ratio['mean'] - 1.0 + # rename cadre Nursing_and_Midwifery + hcw_usage_ratio.rename(index={'Nursing_and_Midwifery': 'Nursing and Midwifery'}, inplace=True) + return hcw_usage_ratio, asymmetric_error hcw_usage_ratio_actual, error_actual = format_hcw_usage(hcwscenario='actual') From 133c71c3db9c758f8aa87d5e7d2285808d3912f6 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 31 Jul 2023 14:58:30 +0100 Subject: [PATCH 102/131] rename Nursing_and_Midwifery --- .../analysis_hcw_usage_by_appt_and_by_disease.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 0f28655046..a0821b6d53 100644 --- 
a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -260,6 +260,10 @@ def format_hcw_usage(hcwscenario='actual'): hcw_usage_hsi = hcw_usage_hsi.groupby(['Treatment_ID', 'Appt_Category', 'Officer_Category'] )['Total_Mins_Used_Per_Year'].sum().reset_index() + # rename Nursing_and_Midwifery + hcw_usage_hsi.Officer_Category = hcw_usage_hsi.Officer_Category.replace( + {'Nursing_and_Midwifery': 'Nursing and Midwifery'}) + # save the data to draw sankey diagram hcw_usage_hsi.to_csv(output_folder/'hcw_working_time_per_hsi.csv', index=False) From 883e2c3a7ef74afcbccdbc9c028eb309ee092ef4 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 31 Jul 2023 15:54:22 +0100 Subject: [PATCH 103/131] plot hcw sankey flow via appt to module --- .../analysis_sankey_hcwtime_appt_hsi.ipynb | 187 ++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb new file mode 100644 index 0000000000..5c5e59c2c4 --- /dev/null +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb @@ -0,0 +1,187 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# This file uses the run generated by `scenario_hsi_in_typical_run.py` and floweaver\n", + "to produce a Sankey diagram that maps appointments with HSI events.\n", + "\n", + "Below is the instruction to run the file.\n", + "\n", + "### Install floweaver in Anaconda Prompt (if use Jupyter Notebook) / PyCharm Terminal:\n", + "\n", + "pip install floweaver\n", + "\n", + "pip install ipysankeywidget\n", + "\n", + "jupyter nbextension enable --py --sys-prefix widgetsnbextension\n", + "\n", + "jupyter 
nbextension enable --py --sys-prefix ipysankeywidget\n", + "\n", + "jupyter notebook (to open jupyter notebook)\n", + "\n", + "### To display and save the output figures:\n", + "Select Start Jupyter Server from the Jupyter Actions Menu (lightbulb icon next to Run All cells icon)\n", + "\n", + "Open Event Log\n", + "\n", + "Open in Browser\n", + "\n", + "Find the script and run all cells\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import tlo\n", + "\n", + "from pathlib import Path\n", + "\n", + "from tlo.analysis.utils import get_scenario_outputs, load_pickled_dataframes\n", + "\n", + "import pandas as pd\n", + "\n", + "import numpy as np\n", + "\n", + "from matplotlib import pyplot as plt\n", + "\n", + "from ipysankeywidget import SankeyWidget\n", + "\n", + "from floweaver import *" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Declare the name of the file that specified the scenarios used in this run.\n", + "scenario_filename = '10_year_scale_run.py'\n", + "\n", + "# Declare usual paths:\n", + "# Get the tlo path\n", + "tlopath = Path(tlo.__file__).parent.parent.parent\n", + "outputspath = tlopath / Path('outputs/bshe@ic.ac.uk')\n", + "\n", + "# Find results folder (most recent run generated using that scenario_filename)\n", + "results_folder = get_scenario_outputs(scenario_filename, outputspath)[-4]\n", + "print(f\"Results folder is: {results_folder}\")\n", + "\n", + "# Declare path for output graphs from this script\n", + "make_graph_file_name = lambda stub: results_folder / f\"{stub}.png\" # noqa: E731\n", + "\n", + "# Extract results\n", + "hcw_time = pd.read_csv(results_folder / 'hcw_working_time_per_hsi.csv')\n", + "\n", + "# Format data for flow\n", + "hcw_time['Module'] = 
hcw_time['Treatment_ID'].str.split('_').apply(lambda x: x[0])\n", + "hcw_time = hcw_time.groupby(['Officer_Category', 'Appt_Category', 'Module'])['Total_Mins_Used_Per_Year'].sum().reset_index()\n", + "\n", + "hcw_time['source'] = 'Officer_Category'\n", + "hcw_time['target'] = 'Module'\n", + "hcw_time['value'] = hcw_time['Total_Mins_Used_Per_Year']\n", + "\n", + "# Format the flow\n", + "\n", + "partition_officer_cat = Partition.Simple('Officer_Category',\n", + " pd.array(['DCSA', 'Clinical', 'Nursing and Midwifery', 'Pharmacy',\n", + " 'Laboratory', 'Radiography', 'Mental']))\n", + "partition_module = Partition.Simple('Module',\n", + " np.unique(hcw_time['Module']))\n", + "partition_appt_cat = Partition.Simple('Appt_Category',\n", + " pd.array(['ConWithDCSA', 'IPOP', 'RMNCH', 'MISC',\n", + " 'HIV', 'TB', 'NUTRITION', 'PharmDispensing', 'LABORATORY',\n", + " 'RADIOGRAPHY', 'MENTAL']))\n", + "\n", + "\n", + "nodes = {\n", + " 'Officer': ProcessGroup(['Officer_Category'], partition_officer_cat),\n", + " 'Module': ProcessGroup(['Module'], partition_module),\n", + "}\n", + "\n", + "# Add nodes Waypoint\n", + "nodes['waypoint'] = Waypoint(partition_appt_cat)\n", + "\n", + "bundles = [\n", + " Bundle('Officer', 'Module', waypoints=['waypoint']),\n", + "]\n", + "\n", + "ordering = [\n", + " ['Officer'], # left\n", + " ['waypoint'], # middle\n", + " ['Module'], # right\n", + " ]\n", + "\n", + "# Set the color for each officer category\n", + "palette = {'Clinical': 'skyblue', 'Nursing and Midwifery': 'lightpink',\n", + " 'Pharmacy': 'khaki', 'Laboratory': 'cadetblue',\n", + " 'Radiography': 'yellowgreen',\n", + " 'Mental': 'mediumorchid', 'DCSA': 'royalblue'\n", + " }\n", + "\n", + "# Set the size for the Sankey\n", + "size = dict(width=800, height=800, margins=dict(left=180, right=180))\n", + "\n", + "# Sankey diagram definition (SDD)\n", + "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_officer_cat)\n", + "\n", + "hcw_time_flow = weave(sdd, 
hcw_time, palette=palette, measures='value').to_widget(**size)\n", + "\n", + "hcw_time_flow.auto_save_png(results_folder /'Sankey_hcw_time_flow.png')\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} \ No newline at end of file From 907d53a68fcc63aeebb8b67878a55fb8150f4544 Mon Sep 17 00:00:00 2001 From: Bingling Date: Tue, 1 Aug 2023 15:50:40 +0100 Subject: [PATCH 104/131] fix failed checks --- .../analysis_hcw_usage_by_appt_and_by_disease.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index a0821b6d53..34fd015e41 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -3,7 +3,6 @@ import matplotlib import matplotlib.pyplot as plt -import numpy as np import pandas as pd from tlo import Date @@ -11,7 +10,6 @@ bin_hsi_event_details, compute_mean_across_runs, extract_results, - get_coarse_appt_type, get_scenario_outputs, summarize) @@ -274,7 +272,7 @@ def format_hcw_usage(hcwscenario='actual'): # Find results folder (most recent run generated using that scenario_filename) scenario_filename = 
'10_year_scale_run.py' - results_folder = get_scenario_outputs(scenario_filename, outputspath)[-1] + results_folder = get_scenario_outputs(scenario_filename, outputspath)[-4] # Test dataset: # results_folder = Path('/Users/tbh03/GitHub/TLOmodel/outputs/tbh03@ic.ac.uk/long_run_all_diseases-small') From 234df052d05c87eb4a8238f1ad0b33a2c2764fb3 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 2 Aug 2023 14:08:18 +0100 Subject: [PATCH 105/131] fix failed checks --- .../analysis_hcw_usage_by_appt_and_by_disease.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 34fd015e41..b95ba54c63 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -1,5 +1,5 @@ -from pathlib import Path from collections import Counter +from pathlib import Path import matplotlib import matplotlib.pyplot as plt @@ -11,7 +11,8 @@ compute_mean_across_runs, extract_results, get_scenario_outputs, - summarize) + summarize, +) PREFIX_ON_FILENAME = '6' From 6052f9bc0ed74a5a5779b18f53399f7cc8f6ab39 Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 3 Aug 2023 11:19:53 +0100 Subject: [PATCH 106/131] temporary change for HCW paper --- ...nalysis_compare_appt_usage_real_and_simulation.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py index f7f2ee887c..f468e82722 100644 --- a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py +++ 
b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py @@ -179,6 +179,10 @@ def get_simulation_usage_with_confidence_interval(results_folder: Path) -> pd.Da model_output.drop(columns='name', inplace=True) model_output.reset_index(drop=True, inplace=True) + # drop dummy PharmDispensing for HCW paper + model_output = model_output.drop(index=model_output[model_output.appt_type == 'PharmDispensing'].index + ).reset_index(drop=True) + return model_output @@ -390,7 +394,7 @@ def format_real_usage(): _rel_diff['upper_error'] = (_rel_diff['upper'] - _rel_diff['mean']) _asymmetric_error = [_rel_diff['lower_error'].values, _rel_diff['upper_error'].values] - _rel_diff = pd.DataFrame(_rel_diff['mean']) + _rel_diff = pd.DataFrame(_rel_diff['mean']).clip(lower=0.1, upper=10.0) return _rel_diff, _asymmetric_error @@ -398,7 +402,7 @@ def format_real_usage(): # plot name_of_plot = 'Model vs Data usage per appointment type at all facility levels' \ - '\n[Model average annual 95% CI, Adjusted Data average annual]' + '\n[Model average annual 95% CI, Data average annual]' fig, ax = plt.subplots() ax.errorbar(rel_diff_real.index.values, rel_diff_real['mean'].values, @@ -547,8 +551,8 @@ def format_data_for_bar_plot(_usage): rfp = Path('./resources') # Find results folder (most recent run generated using that scenario_filename) - scenario_filename = 'long_run_all_diseases.py' - results_folder = get_scenario_outputs(scenario_filename, outputspath)[-1] + scenario_filename = '10_year_scale_run.py' + results_folder = get_scenario_outputs(scenario_filename, outputspath)[-4] # Test dataset: # results_folder = Path('/Users/tbh03/GitHub/TLOmodel/outputs/tbh03@ic.ac.uk/long_run_all_diseases-small') From f7d6e3f3fa002612630f89f9e52759abbac89219 Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 3 Aug 2023 11:30:15 +0100 Subject: [PATCH 107/131] temporary change for HCW paper --- .../analysis_compare_appt_usage_real_and_simulation.py | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py index f468e82722..4ad97636b8 100644 --- a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py +++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py @@ -411,9 +411,9 @@ def format_real_usage(): for idx in rel_diff_real.index: if not pd.isna(rel_diff_real.loc[idx, 'mean']): ax.text(idx, - rel_diff_real.loc[idx, 'mean'] * (1 + 0.2), + rel_diff_real.loc[idx, 'mean'] * (1 + 0.3), round(rel_diff_real.loc[idx, 'mean'], 1), - ha='left', fontsize=8) + ha='center', fontsize=8) ax.axhline(1.0, color='r') format_and_save(fig, ax, name_of_plot) From 4a650608d98f074f0052883ef730da36a3b37b49 Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 3 Aug 2023 11:41:01 +0100 Subject: [PATCH 108/131] temporary change for HCW paper --- .../analysis_compare_appt_usage_real_and_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py index 4ad97636b8..a7315b92a0 100644 --- a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py +++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py @@ -401,7 +401,7 @@ def format_real_usage(): rel_diff_real, err_real = format_rel_diff(adjusted=True) # plot - name_of_plot = 'Model vs Data usage per appointment type at all facility levels' \ + name_of_plot = 'Model vs Data on health service volume per appointment type' \ '\n[Model average annual 95% CI, Data average annual]' fig, ax = 
plt.subplots() ax.errorbar(rel_diff_real.index.values, From c2390aca392301b8d3f658aa48e9c3d6c8a0205c Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 4 Aug 2023 22:36:40 +0100 Subject: [PATCH 109/131] plot stacked bar for total appts usage of Model vs Data --- ..._compare_appt_usage_real_and_simulation.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py index a7315b92a0..93c0b6ed41 100644 --- a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py +++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py @@ -373,6 +373,55 @@ def format_and_save(_fig, _ax, _name_of_plot): ax.axhline(1.0, color='r') format_and_save(fig, ax, name_of_plot) + # Plot two stacked bars for Model and Data to compare the total appt usage, + # to show the overall and main appts like OPD have been calibrated well. 
+ real_usage_all = real_usage.sum(axis=0).reset_index().rename(columns={0: 'Data'}) + simulation_usage_all = simulation_usage.sum(axis=0).reset_index().rename(columns={'index': 'Appt_Type', 0: 'Model'}) + usage_all = simulation_usage_all.merge(real_usage_all, on='Appt_Type', how='inner').melt( + id_vars='Appt_Type', value_vars=['Model', 'Data'], var_name='Type', value_name='Value').pivot( + index='Type', columns='Appt_Type', values='Value') + usage_all = usage_all / 1e6 + appt_color_dict = { + 'OPD': 'lightpink', + 'IPAdmission': 'palevioletred', + 'InpatientDays': 'mediumvioletred', + + 'U5Malnutr': 'orchid', + + 'FamPlan': 'darkseagreen', + 'AntenatalTotal': 'green', + 'Delivery': 'limegreen', + 'Csection': 'springgreen', + 'EPI': 'paleturquoise', + 'STI': 'mediumaquamarine', + + 'AccidentsandEmerg': 'orange', + + 'TBNew': 'yellow', + + 'VCTTests': 'lightsteelblue', + 'NewAdult': 'cornflowerblue', + 'EstAdult': 'royalblue', + 'Peds': 'lightskyblue', + 'PMTCT': 'deepskyblue', + 'MaleCirc': 'mediumslateblue', + + 'MentalAll': 'darkgrey', + + 'DentalAll': 'silver', + } + + name_of_plot = 'Model vs Data on average annual health service volume' + fig, ax = plt.subplots() + usage_all.plot(kind='bar', stacked=True, color=appt_color_dict, rot=0, ax=ax) + ax.set_ylabel('Health service volume in millions') + ax.set(xlabel=None) + plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title='Appointment type', fontsize=9) + plt.title(name_of_plot) + fig.tight_layout() + fig.savefig(make_graph_file_name(name_of_plot.replace(',', '').replace('\n', '_').replace(' ', '_'))) + plt.show() + # Plot Simulation with 95% CI vs Adjusted Real usage by appt type, across all levels (trimmed to 0.1 and 10) # format data def format_rel_diff(adjusted=True): From 28088bf524694067eb62db30529e7a56d7022fa8 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 4 Aug 2023 22:56:10 +0100 Subject: [PATCH 110/131] modify time for dummy ConWithDCSA so that no overworking/underworking --- 
.../analysis_hcw_usage_by_appt_and_by_disease.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index b95ba54c63..9990ef92cb 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -157,6 +157,9 @@ def get_expected_appt_time(resourcefilepath) -> pd.DataFrame: expected_appt_time['Appt_Cat'] = expected_appt_time['Appt_Cat'].replace(appt_cat) expected_appt_time.rename(columns={'Appt_Cat': 'Appt_Category'}, inplace=True) + # modify time for dummy ConWithDCSA so that no overworking/underworking + expected_appt_time.loc[expected_appt_time['Appt_Category'] == 'ConWithDCSA', 'Time_Taken_Mins'] = 20.0 + return expected_appt_time From ce2dd3f47b1a820403e6b06151ec53422ad9e5e4 Mon Sep 17 00:00:00 2001 From: Bingling Date: Sat, 5 Aug 2023 19:17:54 +0100 Subject: [PATCH 111/131] drop dummy PharmDispensing for HCW paper results and plots --- .../analysis_hcw_usage_by_appt_and_by_disease.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 9990ef92cb..0512fd3323 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -72,6 +72,9 @@ def get_annual_num_hsi_by_appt_and_level(results_folder: Path) -> pd.DataFrame: hsi_count = hsi_count.groupby(['Treatment_ID', 'Appt_Type_Code', 'Facility_Level'])['Count'].sum()/yr_count hsi_count = hsi_count.to_frame().reset_index() + # drop dummy PharmDispensing for HCW paper 
results and plots + hsi_count = hsi_count.drop(index=hsi_count[hsi_count['Appt_Type_Code'] == 'PharmDispensing'].index) + return hsi_count @@ -97,7 +100,8 @@ def unpack_nested_dict_in_series(_raw: pd.Series): .mean(axis=0) \ .to_frame().reset_index() \ .rename(columns={'level_0': 'Facility_Level', 'level_1': 'Appt_Type_Code', 0: 'Count'}) \ - .pivot(index='Facility_Level', columns='Appt_Type_Code', values='Count') + .pivot(index='Facility_Level', columns='Appt_Type_Code', values='Count') \ + .drop(columns='PharmDispensing') # do not include this dummy appt for HCW paper results and plots # get appt time definitions appt_time = get_expected_appt_time(resourcefilepath) From d295657a1e017601a39a5e284cb37e916dd052e1 Mon Sep 17 00:00:00 2001 From: Bingling Date: Sat, 5 Aug 2023 20:11:12 +0100 Subject: [PATCH 112/131] update plot title --- .../analysis_hcw_usage_by_appt_and_by_disease.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 0512fd3323..3cc5a183fa 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -221,7 +221,7 @@ def format_hcw_usage(hcwscenario='actual'): hcw_usage_ratio_actual, error_actual = format_hcw_usage(hcwscenario='actual') hcw_usage_ratio_establishment, error_establishment = format_hcw_usage(hcwscenario='funded_plus') - name_of_plot = 'Simulated annual working time vs Capability per cadre' + name_of_plot = 'Simulated average annual working time (95% CI) vs Capability per cadre' fig, ax = plt.subplots(figsize=(8, 5)) hcw_usage_ratio_establishment.plot(kind='bar', yerr=error_establishment, width=0.4, ax=ax, position=0, bottom=1.0, From 4a49c7edee538b71130d40daf5f58a737e173aee Mon Sep 17 00:00:00 2001 From: 
Bingling Date: Sat, 5 Aug 2023 20:11:44 +0100 Subject: [PATCH 113/131] increase height for maximal health care seeking sankey --- .../analysis_sankey_hcwtime_appt_hsi.ipynb | 90 ++++++++++++------- 1 file changed, 58 insertions(+), 32 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb index 5c5e59c2c4..1247ee410e 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb @@ -2,9 +2,14 @@ "cells": [ { "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ - "# This file uses the run generated by `scenario_hsi_in_typical_run.py` and floweaver\n", - "to produce a Sankey diagram that maps appointments with HSI events.\n", + "# This file uses the run generated by `10_year_scale_run.py` and floweaver\n", + "to produce a Sankey diagram that maps hcw working time per cadre, appointments and disease modules.\n", "\n", "Below is the instruction to run the file.\n", "\n", @@ -28,17 +33,16 @@ "Open in Browser\n", "\n", "Find the script and run all cells\n" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 119, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "import tlo\n", @@ -56,18 +60,40 @@ "from ipysankeywidget import SankeyWidget\n", "\n", "from floweaver import *" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 120, "metadata": { - "collapsed": false, "pycharm": { "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Results folder is: 
c:\\users\\jdbb1\\pycharmprojects\\tlomodel\\outputs\\bshe@ic.ac.uk\\10_year_scale_run-2023-08-01T145207Z_draw3\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "afc9210895044f57bbd059e73e266138", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "SankeyWidget(groups=[{'id': 'Officer', 'type': 'process', 'title': '', 'nodes': ['Officer^DCSA', 'Officer^Clin…" + ] + }, + "execution_count": 120, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Declare the name of the file that specified the scenarios used in this run.\n", "scenario_filename = '10_year_scale_run.py'\n", @@ -78,9 +104,16 @@ "outputspath = tlopath / Path('outputs/bshe@ic.ac.uk')\n", "\n", "# Find results folder (most recent run generated using that scenario_filename)\n", - "results_folder = get_scenario_outputs(scenario_filename, outputspath)[-4]\n", + "f = -1 # -4: Actual + Default health care seeking, -2: Actual + Maximal health care seeking\n", + "results_folder = get_scenario_outputs(scenario_filename, outputspath)[f]\n", "print(f\"Results folder is: {results_folder}\")\n", "\n", + "# Sankey diagram height scale factor\n", + "if (f == -4) or (f == -3):\n", + " sankey_scale = 1\n", + "elif (f == -2) or (f == -1):\n", + " sankey_scale = 7186802559.49815 / 2253171916.6190553 # the total working time of file -2 / ... 
of file -4\n", + " \n", "# Declare path for output graphs from this script\n", "make_graph_file_name = lambda stub: results_folder / f\"{stub}.png\" # noqa: E731\n", "\n", @@ -134,7 +167,7 @@ " }\n", "\n", "# Set the size for the Sankey\n", - "size = dict(width=800, height=800, margins=dict(left=180, right=180))\n", + "size = dict(width=800, height=600*sankey_scale, margins=dict(left=180, right=180))\n", "\n", "# Sankey diagram definition (SDD)\n", "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_officer_cat)\n", @@ -142,25 +175,18 @@ "hcw_time_flow = weave(sdd, hcw_time, palette=palette, measures='value').to_widget(**size)\n", "\n", "hcw_time_flow.auto_save_png(results_folder /'Sankey_hcw_time_flow.png')\n" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + ] }, { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [], "metadata": { - "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [] } ], "metadata": { @@ -184,4 +210,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} \ No newline at end of file +} From e81a163c2eabcb7a4ebf9ee174ee2d7ff798e114 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 16 Aug 2023 14:27:12 +0100 Subject: [PATCH 114/131] adjust figure size to fix margin issues of the long sankey --- .../analysis_sankey_hcwtime_appt_hsi.ipynb | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb index 1247ee410e..2ef713f5f0 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 38, "metadata": { "pycharm": { "name": "#%%\n" @@ 
-64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 39, "metadata": { "pycharm": { "name": "#%%\n" @@ -75,13 +75,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Results folder is: c:\\users\\jdbb1\\pycharmprojects\\tlomodel\\outputs\\bshe@ic.ac.uk\\10_year_scale_run-2023-08-01T145207Z_draw3\n" + "Results folder is: c:\\users\\jdbb1\\pycharmprojects\\tlomodel\\outputs\\bshe@ic.ac.uk\\10_year_scale_run-2023-08-01T145207Z_draw0\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "afc9210895044f57bbd059e73e266138", + "model_id": "752fc8430270487fba9537301a8ef8c0", "version_major": 2, "version_minor": 0 }, @@ -89,7 +89,7 @@ "SankeyWidget(groups=[{'id': 'Officer', 'type': 'process', 'title': '', 'nodes': ['Officer^DCSA', 'Officer^Clin…" ] }, - "execution_count": 120, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -104,7 +104,7 @@ "outputspath = tlopath / Path('outputs/bshe@ic.ac.uk')\n", "\n", "# Find results folder (most recent run generated using that scenario_filename)\n", - "f = -1 # -4: Actual + Default health care seeking, -2: Actual + Maximal health care seeking\n", + "f = -4 # -4: Actual + Default health care seeking, -2: Actual + Maximal health care seeking\n", "results_folder = get_scenario_outputs(scenario_filename, outputspath)[f]\n", "print(f\"Results folder is: {results_folder}\")\n", "\n", @@ -137,7 +137,7 @@ " np.unique(hcw_time['Module']))\n", "partition_appt_cat = Partition.Simple('Appt_Category',\n", " pd.array(['ConWithDCSA', 'IPOP', 'RMNCH', 'MISC',\n", - " 'HIV', 'TB', 'NUTRITION', 'PharmDispensing', 'LABORATORY',\n", + " 'HIV', 'TB', 'NUTRITION', 'LABORATORY',\n", " 'RADIOGRAPHY', 'MENTAL']))\n", "\n", "\n", @@ -167,7 +167,7 @@ " }\n", "\n", "# Set the size for the Sankey\n", - "size = dict(width=800, height=600*sankey_scale, margins=dict(left=180, right=180))\n", + "size = dict(width=800, height=560*sankey_scale, margins=dict(left=180, 
right=180))\n", "\n", "# Sankey diagram definition (SDD)\n", "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_officer_cat)\n", @@ -180,11 +180,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [] } From 7f708a8c7e4db0b32b20385ff95ace9fb713e663 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 16 Aug 2023 14:33:02 +0100 Subject: [PATCH 115/131] fix failing checks --- .../analysis_compare_appt_usage_real_and_simulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py index fc91764e99..a394cbbb66 100644 --- a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py +++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py @@ -7,7 +7,7 @@ import pandas as pd from tlo import Date -from tlo.analysis.utils import extract_results, get_scenario_outputs, summarize +from tlo.analysis.utils import extract_results, summarize PREFIX_ON_FILENAME = '4' From a954c6abeb76cb5d0988e3f09505f57045b7eb60 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 18 Aug 2023 11:42:55 +0100 Subject: [PATCH 116/131] save data for sankey diagram of appt to hsi/treatment id --- ...alysis_hcw_usage_by_appt_and_by_disease.py | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 3cc5a183fa..0863d58da3 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ 
b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -1,3 +1,4 @@ +import argparse from collections import Counter from pathlib import Path @@ -262,6 +263,8 @@ def format_hcw_usage(hcwscenario='actual'): hcw_usage_hsi = appt_time.drop(index=appt_time[~appt_time.Appt_Type_Code.isin(appts_sim)].index ).reset_index(drop=True) hcw_usage_hsi = hsi_count.merge(hcw_usage_hsi, on=['Facility_Level', 'Appt_Type_Code'], how='left') + # save the data to draw sankey diagram of appt to hsi + hcw_usage_hsi.to_csv(output_folder / 'hsi_count_by_treatment_appt_level.csv', index=False) hcw_usage_hsi['Total_Mins_Used_Per_Year'] = hcw_usage_hsi['Count'] * hcw_usage_hsi['Time_Taken_Mins'] hcw_usage_hsi = hcw_usage_hsi.groupby(['Treatment_ID', 'Appt_Category', 'Officer_Category'] )['Total_Mins_Used_Per_Year'].sum().reset_index() @@ -270,23 +273,17 @@ def format_hcw_usage(hcwscenario='actual'): hcw_usage_hsi.Officer_Category = hcw_usage_hsi.Officer_Category.replace( {'Nursing_and_Midwifery': 'Nursing and Midwifery'}) - # save the data to draw sankey diagram + # save the data to draw sankey diagram of hcw time via appt to disease hcw_usage_hsi.to_csv(output_folder/'hcw_working_time_per_hsi.csv', index=False) if __name__ == "__main__": - outputspath = Path('./outputs/bshe@ic.ac.uk') - rfp = Path('./resources') - - # Find results folder (most recent run generated using that scenario_filename) - scenario_filename = '10_year_scale_run.py' - results_folder = get_scenario_outputs(scenario_filename, outputspath)[-4] - - # Test dataset: - # results_folder = Path('/Users/tbh03/GitHub/TLOmodel/outputs/tbh03@ic.ac.uk/long_run_all_diseases-small') - - # If needed -- in the case that pickles were not created remotely during batch - # create_pickles_locally(results_folder) - - # Run all the calibrations - apply(results_folder=results_folder, output_folder=results_folder, resourcefilepath=rfp) + parser = argparse.ArgumentParser() + 
parser.add_argument("results_folder", type=Path) + args = parser.parse_args() + + apply( + results_folder=args.results_folder, + output_folder=args.results_folder, + resourcefilepath=Path('./resources') + ) From ea6a800a82697f3cb2a89e342cb098f2a26466c9 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 18 Aug 2023 14:49:25 +0100 Subject: [PATCH 117/131] draft to plot sankey diagram from appt to hsi via facility level --- .../analysis_sankey_appt_level_hsi.ipynb | 263 ++++++++++++++++++ 1 file changed, 263 insertions(+) create mode 100644 src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb new file mode 100644 index 0000000000..0fcdea5652 --- /dev/null +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "This file uses the run generated by `10_year_scale_run.py` and floweaver\n", + "to produce a Sankey diagram that maps appointments with HSI events via facility levels.\n", + "\n", + "Below is the instruction to run the file.\n", + "\n", + "### Install floweaver in Anaconda Prompt (if use Jupyter Notebook) / PyCharm Terminal:\n", + "\n", + "pip install floweaver\n", + "\n", + "pip install ipysankeywidget\n", + "\n", + "jupyter nbextension enable --py --sys-prefix ipysankeywidget\n", + "\n", + "jupyter notebook (to open jupyter notebook, which should be installed first) \n", + "\n", + "### To display and save the output figures:\n", + "Select Start Jupyter Server from the Jupyter Actions Menu (lightbulb icon next to Run All cells icon) -> Open Event Log -> Open in Browser\n", + "Or \n", + "Type jupyter notebook in PyCharm Terminal\n", + "\n", + "Find the script and run 
all cells\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import tlo\n", + "\n", + "from pathlib import Path\n", + "\n", + "from tlo.analysis.utils import get_scenario_outputs, load_pickled_dataframes\n", + "\n", + "import pandas as pd\n", + "\n", + "import numpy as np\n", + "\n", + "from matplotlib import pyplot as plt\n", + "\n", + "from ipysankeywidget import SankeyWidget\n", + "\n", + "from floweaver import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "is_executing": true, + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# Declare the name of the file that specified the scenarios used in this run.\n", + "scenario_filename = '10_year_scale_run.py'\n", + "\n", + "# Declare usual paths:\n", + "# Get the tlo path\n", + "tlopath = Path(tlo.__file__).parent.parent.parent\n", + "outputspath = tlopath / Path('outputs/bshe@ic.ac.uk')\n", + "\n", + "# Find results folder (most recent run generated using that scenario_filename)\n", + "f = -4 # -4: Actual + Default health care seeking, -2: Actual + Maximal health care seeking\n", + "results_folder = get_scenario_outputs(scenario_filename, outputspath)[f]\n", + "print(f\"Results folder is: {results_folder}\")\n", + "\n", + "# Declare path for output graphs from this script\n", + "make_graph_file_name = lambda stub: results_folder / f\"{stub}.png\" # noqa: E731\n", + "\n", + "# Extract results\n", + "hsi = pd.read_csv(results_folder / 'hsi_count_by_treatment_appt_level.csv')\n", + "hsi[\"Module\"] = hsi[\"TREATMENT_ID\"].str.split('_').apply(lambda x: x[0])\n", + "\n", + "# todo: Format data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "is_executing": true, + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# Sankey 0 that map appt to hsi considering only appt footprint for each hsi\n", + 
"\n", + "# Prepare the data: appt type and number per hsi\n", + "appt_and_hsi = appts_by_treatment_id_short.reset_index().copy()\n", + "appt_and_hsi.rename(columns={'index': 'TREATMENT_ID'}, inplace=True)\n", + "appt_and_hsi = pd.melt(appt_and_hsi, id_vars=['TREATMENT_ID'], value_vars=appt_and_hsi.columns[1:],\n", + " var_name='Appt_Type')\n", + "\n", + "# Define the flow\n", + "appt_and_hsi['source'] = 'Appt_Type'\n", + "appt_and_hsi['target'] = 'TREATMENT_ID'\n", + "\n", + "size = dict(width=1000, height=800, margins=dict(left=120, right=520))\n", + "\n", + "# Nodes in alphabetic order\n", + "# partition_appt_type = Partition.Simple('Appt_Type', np.unique(appt_and_hsi['Appt_Type']))\n", + "# partition_treatment_id = Partition.Simple('TREATMENT_ID', np.unique(appt_and_hsi['TREATMENT_ID']))\n", + "# if to keep the order in the dataframe\n", + "# partition_appt_type = Partition.Simple('Appt_Type', pd.unique(pd.Series(appt_and_hsi['Appt_Type'])))\n", + "# partition_treatment_id = Partition.Simple('TREATMENT_ID', pd.unique(pd.Series(appt_and_hsi['TREATMENT_ID'])))\n", + "# if to fix the oder of the nodes in the way we want\n", + "partition_appt_type = Partition.Simple('Appt_Type', pd.array([\n", + " 'IPAdmission', 'InpatientDays', 'Over5OPD', 'Under5OPD',\n", + " 'AntenatalFirst', 'ANCSubsequent', 'CompDelivery', 'NormalDelivery',\n", + " 'FamPlan', 'MajorSurg', 'ConWithDCSA',\n", + " 'MaleCirc', 'NewAdult', 'VCTNegative', 'VCTPositive']))\n", + "partition_treatment_id = Partition.Simple('TREATMENT_ID', pd.array([\n", + " 'Malaria_treatment_complicated_child', 'Malaria_IPTp',\n", + " 'Diarrhoea_Treatment_Inpatient', 'Depression_Antidepressant_Refill',\n", + " 'PostnatalSupervisor_NeonatalWardInpatientCare', 'CareOfWomenDuringPregnancy_FirstAntenatalCareContact',\n", + " 'CareOfWomenDuringPregnancy_AntenatalOutpatientManagementOfAnaemia',\n", + " 'CareOfWomenDuringPregnancy_PostAbortionCaseManagement', 'Labour_ReceivesSkilledBirthAttendanceDuringLabour',\n", + " 
'Contraception_FamilyPlanningAppt', 'GenericEmergencyFirstApptAtFacilityLevel1',\n", + " 'GenericFirstApptAtFacilityLevel0', 'OesophagealCancer_StartTreatment', 'breastCancer_StartTreatment',\n", + " 'Hiv_Circumcision', 'Hiv_Treatment_InitiationOrContinuation', 'Hiv_TestAndRefer']))\n", + "\n", + "nodes = {\n", + " 'Appt': ProcessGroup(['Appt_Type'], partition_appt_type),\n", + " 'HSI': ProcessGroup(['TREATMENT_ID'], partition_treatment_id),\n", + "}\n", + "\n", + "bundles = [\n", + " Bundle('Appt', 'HSI'),\n", + "]\n", + "\n", + "ordering = [\n", + " ['Appt'], # left\n", + " ['HSI'], # right\n", + " ]\n", + "\n", + "# Set the color for each appt type\n", + "palette = {'IPAdmission': 'lightsteelblue', 'InpatientDays': 'skyblue',\n", + " 'Over5OPD': 'cornflowerblue', 'Under5OPD': 'steelblue',\n", + " 'AntenatalFirst': 'plum', 'ANCSubsequent': 'hotpink',\n", + " 'CompDelivery': 'tomato', 'NormalDelivery': 'darksalmon',\n", + " 'FamPlan': 'gold', 'MajorSurg': 'orange', 'ConWithDCSA': 'mediumpurple',\n", + " 'MaleCirc': 'lightgreen', 'NewAdult': 'mediumseagreen',\n", + " 'VCTNegative': 'greenyellow', 'VCTPositive': 'olivedrab',\n", + " }\n", + "\n", + "# Sankey diagram definition (SDD)\n", + "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_appt_type)\n", + "\n", + "# Generate and save Sankey\n", + "sankey_appt_and_hsi = weave(sdd, appt_and_hsi, palette=palette, measures='value').to_widget(**size)\n", + "\n", + "sankey_appt_and_hsi.auto_save_png(make_graph_file_name('Sankey_appt_and_hsi'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "is_executing": true, + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# Sankey 1 that maps appt to hsi considering appt footprint for each hsi and number of each hsi\n", + "\n", + "# Prepare the data: total number of appts per hsi for year 2010-2018\n", + "num_appt_by_hsi = appts_by_treatment_id_full.copy()\n", + "for event in 
num_appt_by_hsi.index:\n", + " num_appt_by_hsi.loc[event,:] = appts_by_treatment_id_full.loc[event,:] * num_hsi_by_treatment_id.loc[event, 'Number_of_HSI']\n", + "num_appt_by_hsi = num_appt_by_hsi.reset_index().copy()\n", + "num_appt_by_hsi.rename(columns={'index': 'TREATMENT_ID'}, inplace=True)\n", + "num_appt_by_hsi = pd.melt(num_appt_by_hsi, id_vars=['TREATMENT_ID'], value_vars=num_appt_by_hsi.columns[1:],\n", + " var_name='Appt_Type')\n", + "\n", + "# Define the flow\n", + "num_appt_by_hsi['source'] = 'Appt_Type'\n", + "num_appt_by_hsi['target'] = 'TREATMENT_ID'\n", + "\n", + "size = dict(width=1000, height=800, margins=dict(left=120, right=520))\n", + "\n", + "partition_appt_type = Partition.Simple('Appt_Type', np.unique(num_appt_by_hsi['Appt_Type']))\n", + "\n", + "partition_treatment_id = Partition.Simple('TREATMENT_ID', np.unique(num_appt_by_hsi['TREATMENT_ID']))\n", + "\n", + "nodes = {\n", + " 'Appt': ProcessGroup(['Appt_Type'], partition_appt_type),\n", + " 'HSI': ProcessGroup(['TREATMENT_ID'], partition_treatment_id),\n", + "}\n", + "\n", + "bundles = [\n", + " Bundle('Appt', 'HSI'),\n", + "]\n", + "\n", + "ordering = [\n", + " ['Appt'], # left\n", + " ['HSI'], # right\n", + " ]\n", + "\n", + "# Set the color for each appt type\n", + "palette = {'IPAdmission': 'lightsteelblue', 'InpatientDays': 'skyblue',\n", + " 'Over5OPD': 'cornflowerblue', 'Under5OPD': 'steelblue',\n", + " 'AntenatalFirst': 'plum', 'ANCSubsequent': 'hotpink',\n", + " 'CompDelivery': 'tomato', 'NormalDelivery': 'darksalmon',\n", + " 'FamPlan': 'gold', 'MajorSurg': 'orange', 'ConWithDCSA': 'mediumpurple',\n", + " 'MaleCirc': 'lightgreen', 'NewAdult': 'mediumseagreen',\n", + " 'VCTNegative': 'greenyellow', 'VCTPositive': 'olivedrab',\n", + " }\n", + "\n", + "# Sankey diagram definition (SDD)\n", + "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_appt_type)\n", + "\n", + "# Generate and save Sankey\n", + "sankey_num_appt_by_hsi = weave(sdd, num_appt_by_hsi, 
palette=palette, measures='value').to_widget(**size)\n", + "\n", + "sankey_num_appt_by_hsi.auto_save_png(make_graph_file_name('Sankey_num_appt_by_hsi'))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From 28dd29f4ff92cc42e36a4cd61a8953fb29f18d70 Mon Sep 17 00:00:00 2001 From: Bingling Date: Fri, 18 Aug 2023 20:11:50 +0100 Subject: [PATCH 118/131] plot sankey diagram from appt to hsi via facility level --- .../analysis_sankey_appt_level_hsi.ipynb | 202 ++++++++---------- 1 file changed, 87 insertions(+), 115 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb index 0fcdea5652..786b5d6b8f 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "metadata": { "pycharm": { "name": "#%%\n" @@ -61,14 +61,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "metadata": { "pycharm": { "is_executing": true, "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Results folder is: c:\\users\\jdbb1\\pycharmprojects\\tlomodel\\outputs\\bshe@ic.ac.uk\\10_year_scale_run-2023-08-01T145207Z_draw0\n" + ] + } + ], "source": [ "# Declare the name of the file that specified the scenarios used in this run.\n", "scenario_filename = 
'10_year_scale_run.py'\n", @@ -88,155 +96,119 @@ "\n", "# Extract results\n", "hsi = pd.read_csv(results_folder / 'hsi_count_by_treatment_appt_level.csv')\n", - "hsi[\"Module\"] = hsi[\"TREATMENT_ID\"].str.split('_').apply(lambda x: x[0])\n", "\n", - "# todo: Format data" + "# todo: Format data\n", + "hsi = hsi[[\"Appt_Type_Code\", \"Facility_Level\", \"Treatment_ID\"]].drop_duplicates().reset_index(drop=True)\n", + "hsi['Facility_Level'] = 'Facility_Level_' + hsi['Facility_Level'].astype(str)\n", + "hsi['source'] = 'Appt_Type_Code'\n", + "hsi['target'] = 'Treatment_ID'\n", + "hsi['value'] = 1.0" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "metadata": { "pycharm": { "is_executing": true, "name": "#%%\n" } }, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cd57dcf703ee438099eaac317b281520", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ConWithDCSA', 'Appt^Under5…" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Sankey 0 that map appt to hsi considering only appt footprint for each hsi\n", - "\n", - "# Prepare the data: appt type and number per hsi\n", - "appt_and_hsi = appts_by_treatment_id_short.reset_index().copy()\n", - "appt_and_hsi.rename(columns={'index': 'TREATMENT_ID'}, inplace=True)\n", - "appt_and_hsi = pd.melt(appt_and_hsi, id_vars=['TREATMENT_ID'], value_vars=appt_and_hsi.columns[1:],\n", - " var_name='Appt_Type')\n", - "\n", - "# Define the flow\n", - "appt_and_hsi['source'] = 'Appt_Type'\n", - "appt_and_hsi['target'] = 'TREATMENT_ID'\n", - "\n", - "size = dict(width=1000, height=800, margins=dict(left=120, right=520))\n", + "# Format the flow\n", + "size = dict(width=1000, height=1600, margins=dict(left=150, right=520))\n", "\n", "# Nodes in alphabetic order\n", - "# partition_appt_type 
= Partition.Simple('Appt_Type', np.unique(appt_and_hsi['Appt_Type']))\n", - "# partition_treatment_id = Partition.Simple('TREATMENT_ID', np.unique(appt_and_hsi['TREATMENT_ID']))\n", - "# if to keep the order in the dataframe\n", - "# partition_appt_type = Partition.Simple('Appt_Type', pd.unique(pd.Series(appt_and_hsi['Appt_Type'])))\n", - "# partition_treatment_id = Partition.Simple('TREATMENT_ID', pd.unique(pd.Series(appt_and_hsi['TREATMENT_ID'])))\n", - "# if to fix the oder of the nodes in the way we want\n", - "partition_appt_type = Partition.Simple('Appt_Type', pd.array([\n", - " 'IPAdmission', 'InpatientDays', 'Over5OPD', 'Under5OPD',\n", - " 'AntenatalFirst', 'ANCSubsequent', 'CompDelivery', 'NormalDelivery',\n", - " 'FamPlan', 'MajorSurg', 'ConWithDCSA',\n", - " 'MaleCirc', 'NewAdult', 'VCTNegative', 'VCTPositive']))\n", - "partition_treatment_id = Partition.Simple('TREATMENT_ID', pd.array([\n", - " 'Malaria_treatment_complicated_child', 'Malaria_IPTp',\n", - " 'Diarrhoea_Treatment_Inpatient', 'Depression_Antidepressant_Refill',\n", - " 'PostnatalSupervisor_NeonatalWardInpatientCare', 'CareOfWomenDuringPregnancy_FirstAntenatalCareContact',\n", - " 'CareOfWomenDuringPregnancy_AntenatalOutpatientManagementOfAnaemia',\n", - " 'CareOfWomenDuringPregnancy_PostAbortionCaseManagement', 'Labour_ReceivesSkilledBirthAttendanceDuringLabour',\n", - " 'Contraception_FamilyPlanningAppt', 'GenericEmergencyFirstApptAtFacilityLevel1',\n", - " 'GenericFirstApptAtFacilityLevel0', 'OesophagealCancer_StartTreatment', 'breastCancer_StartTreatment',\n", - " 'Hiv_Circumcision', 'Hiv_Treatment_InitiationOrContinuation', 'Hiv_TestAndRefer']))\n", + "partition_appt_type = Partition.Simple('Appt_Type_Code', pd.array([\n", + " 'ConWithDCSA',\n", + " 'Under5OPD', 'Over5OPD', 'IPAdmission', 'InpatientDays',\n", + " 'AntenatalFirst', 'ANCSubsequent', 'FamPlan', 'EPI', \n", + " 'CompDelivery', 'NormalDelivery', 'Csection',\n", + " 'AccidentsandEmerg', 'MajorSurg', 'MinorSurg',\n", + " 
'U5Malnutr',\n", + " 'MentOPD',\n", + " 'Mammography', 'DiagRadio', 'Tomography', 'LabMolec', 'LabTBMicro',\n", + " 'MaleCirc', 'Peds', 'VCTNegative', 'VCTPositive', 'NewAdult', 'EstNonCom',\n", + " 'TBNew', 'TBFollowUp']))\n", + "partition_treatment_id = Partition.Simple('Treatment_ID', np.unique(hsi['Treatment_ID']))\n", + "partition_facility_level = Partition.Simple('Facility_Level',np.unique(hsi['Facility_Level']))\n", "\n", "nodes = {\n", - " 'Appt': ProcessGroup(['Appt_Type'], partition_appt_type),\n", - " 'HSI': ProcessGroup(['TREATMENT_ID'], partition_treatment_id),\n", + " 'Appt': ProcessGroup(['Appt_Type_Code'], partition_appt_type),\n", + " 'HSI': ProcessGroup(['Treatment_ID'], partition_treatment_id),\n", "}\n", "\n", + "# Add nodes Waypoint\n", + "nodes['waypoint'] = Waypoint(partition_facility_level)\n", + "\n", "bundles = [\n", - " Bundle('Appt', 'HSI'),\n", + " Bundle('Appt', 'HSI', waypoints=['waypoint']),\n", "]\n", "\n", "ordering = [\n", " ['Appt'], # left\n", - " ['HSI'], # right\n", + " ['waypoint'], # middle\n", + " ['HSI'], # right\n", " ]\n", "\n", + "\n", "# Set the color for each appt type\n", - "palette = {'IPAdmission': 'lightsteelblue', 'InpatientDays': 'skyblue',\n", - " 'Over5OPD': 'cornflowerblue', 'Under5OPD': 'steelblue',\n", - " 'AntenatalFirst': 'plum', 'ANCSubsequent': 'hotpink',\n", - " 'CompDelivery': 'tomato', 'NormalDelivery': 'darksalmon',\n", - " 'FamPlan': 'gold', 'MajorSurg': 'orange', 'ConWithDCSA': 'mediumpurple',\n", - " 'MaleCirc': 'lightgreen', 'NewAdult': 'mediumseagreen',\n", - " 'VCTNegative': 'greenyellow', 'VCTPositive': 'olivedrab',\n", - " }\n", + "palette = {'Under5OPD': 'lightpink', 'Over5OPD': 'lightpink',\n", + " 'IPAdmission': 'palevioletred', 'InpatientDays': 'mediumvioletred',\n", + " \n", + " 'AntenatalFirst': 'green', 'ANCSubsequent': 'green',\n", + " 'FamPlan': 'darkseagreen', 'EPI': 'paleturquoise', \n", + " 'CompDelivery': 'limegreen', 'NormalDelivery': 'limegreen', 'Csection': 'springgreen',\n", 
+ " \n", + " 'AccidentsandEmerg': 'darkorange', 'MajorSurg': 'orange', 'MinorSurg': 'gold',\n", + " \n", + " 'ConWithDCSA': 'violet',\n", + " \n", + " 'U5Malnutr': 'orchid',\n", + " \n", + " 'MentOPD': 'darkgrey',\n", + " \n", + " 'Mammography': 'lightgrey', 'DiagRadio': 'lightgrey', 'Tomography': 'lightgrey', \n", + " 'LabMolec': 'gainsboro', 'LabTBMicro': 'gainsboro',\n", + " \n", + " 'MaleCirc': 'mediumslateblue', 'Peds': 'lightskyblue', \n", + " 'VCTNegative': 'lightsteelblue', 'VCTPositive': 'lightsteelblue', \n", + " 'NewAdult': 'cornflowerblue', 'EstNonCom': 'royalblue',\n", + " \n", + " 'TBNew': 'yellow', 'TBFollowUp': 'yellow'}\n", "\n", "# Sankey diagram definition (SDD)\n", "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_appt_type)\n", "\n", "# Generate and save Sankey\n", - "sankey_appt_and_hsi = weave(sdd, appt_and_hsi, palette=palette, measures='value').to_widget(**size)\n", + "sankey_appt_level_hsi = weave(sdd, hsi, palette=palette, measures='value').to_widget(**size)\n", "\n", - "sankey_appt_and_hsi.auto_save_png(make_graph_file_name('Sankey_appt_and_hsi'))" + "sankey_appt_level_hsi.auto_save_png(make_graph_file_name('Sankey_appt_level_hsi'))" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "pycharm": { - "is_executing": true, - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], - "source": [ - "# Sankey 1 that maps appt to hsi considering appt footprint for each hsi and number of each hsi\n", - "\n", - "# Prepare the data: total number of appts per hsi for year 2010-2018\n", - "num_appt_by_hsi = appts_by_treatment_id_full.copy()\n", - "for event in num_appt_by_hsi.index:\n", - " num_appt_by_hsi.loc[event,:] = appts_by_treatment_id_full.loc[event,:] * num_hsi_by_treatment_id.loc[event, 'Number_of_HSI']\n", - "num_appt_by_hsi = num_appt_by_hsi.reset_index().copy()\n", - "num_appt_by_hsi.rename(columns={'index': 'TREATMENT_ID'}, inplace=True)\n", - "num_appt_by_hsi = pd.melt(num_appt_by_hsi, 
id_vars=['TREATMENT_ID'], value_vars=num_appt_by_hsi.columns[1:],\n", - " var_name='Appt_Type')\n", - "\n", - "# Define the flow\n", - "num_appt_by_hsi['source'] = 'Appt_Type'\n", - "num_appt_by_hsi['target'] = 'TREATMENT_ID'\n", - "\n", - "size = dict(width=1000, height=800, margins=dict(left=120, right=520))\n", - "\n", - "partition_appt_type = Partition.Simple('Appt_Type', np.unique(num_appt_by_hsi['Appt_Type']))\n", - "\n", - "partition_treatment_id = Partition.Simple('TREATMENT_ID', np.unique(num_appt_by_hsi['TREATMENT_ID']))\n", - "\n", - "nodes = {\n", - " 'Appt': ProcessGroup(['Appt_Type'], partition_appt_type),\n", - " 'HSI': ProcessGroup(['TREATMENT_ID'], partition_treatment_id),\n", - "}\n", - "\n", - "bundles = [\n", - " Bundle('Appt', 'HSI'),\n", - "]\n", - "\n", - "ordering = [\n", - " ['Appt'], # left\n", - " ['HSI'], # right\n", - " ]\n", - "\n", - "# Set the color for each appt type\n", - "palette = {'IPAdmission': 'lightsteelblue', 'InpatientDays': 'skyblue',\n", - " 'Over5OPD': 'cornflowerblue', 'Under5OPD': 'steelblue',\n", - " 'AntenatalFirst': 'plum', 'ANCSubsequent': 'hotpink',\n", - " 'CompDelivery': 'tomato', 'NormalDelivery': 'darksalmon',\n", - " 'FamPlan': 'gold', 'MajorSurg': 'orange', 'ConWithDCSA': 'mediumpurple',\n", - " 'MaleCirc': 'lightgreen', 'NewAdult': 'mediumseagreen',\n", - " 'VCTNegative': 'greenyellow', 'VCTPositive': 'olivedrab',\n", - " }\n", - "\n", - "# Sankey diagram definition (SDD)\n", - "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_appt_type)\n", - "\n", - "# Generate and save Sankey\n", - "sankey_num_appt_by_hsi = weave(sdd, num_appt_by_hsi, palette=palette, measures='value').to_widget(**size)\n", - "\n", - "sankey_num_appt_by_hsi.auto_save_png(make_graph_file_name('Sankey_num_appt_by_hsi'))" - ] + "source": [] } ], "metadata": { @@ -260,4 +232,4 @@ }, "nbformat": 4, "nbformat_minor": 1 -} +} \ No newline at end of file From 32f0f746407da9e225a904bcdc968efde277148a Mon Sep 17 00:00:00 
2001 From: Bingling Date: Fri, 18 Aug 2023 20:15:48 +0100 Subject: [PATCH 119/131] fix failed check --- .../analysis_hcw_usage_by_appt_and_by_disease.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 0863d58da3..b5c7135666 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -11,7 +11,6 @@ bin_hsi_event_details, compute_mean_across_runs, extract_results, - get_scenario_outputs, summarize, ) From b24952096bb107958f39a22aff5f93f30fc132da Mon Sep 17 00:00:00 2001 From: Bingling Date: Sat, 19 Aug 2023 21:21:24 +0100 Subject: [PATCH 120/131] redo the merge of level 1b and level 2 in healthsystem module --- src/tlo/methods/healthsystem.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/tlo/methods/healthsystem.py b/src/tlo/methods/healthsystem.py index 5db4e4f054..fa19cc7ca5 100644 --- a/src/tlo/methods/healthsystem.py +++ b/src/tlo/methods/healthsystem.py @@ -345,7 +345,7 @@ def initialise(self): health_system = self.sim.modules['HealthSystem'] # Over-write ACCEPTED_FACILITY_LEVEL to to redirect all '1b' appointments to '2' - self.ACCEPTED_FACILITY_LEVEL = adjust_facility_level_to_merge_1b_and_2(self.ACCEPTED_FACILITY_LEVEL) + # self.ACCEPTED_FACILITY_LEVEL = adjust_facility_level_to_merge_1b_and_2(self.ACCEPTED_FACILITY_LEVEL) if not isinstance(self.target, tlo.population.Population): self.facility_info = health_system.get_facility_info(self) @@ -821,8 +821,7 @@ def pre_initialise_population(self): # Initialise the Consumables class self.consumables = Consumables( - data=self.update_consumables_availability_to_represent_merging_of_levels_1b_and_2( - self.parameters['availability_estimates']), + 
data=self.parameters['availability_estimates'], rng=rng_for_consumables, availability=self.get_cons_availability() ) @@ -1044,8 +1043,7 @@ def format_daily_capabilities(self, use_funded_or_actual_staffing: str) -> pd.Se """ # Get the capabilities data imported (according to the specified underlying assumptions). - capabilities = pool_capabilities_at_levels_1b_and_2( - self.parameters[f'Daily_Capabilities_{use_funded_or_actual_staffing}']) + capabilities = self.parameters[f'Daily_Capabilities_{use_funded_or_actual_staffing}'] capabilities = capabilities.rename(columns={'Officer_Category': 'Officer_Type_Code'}) # neaten # Create dataframe containing background information about facility and officer types @@ -1860,7 +1858,7 @@ def log_current_capabilities_and_usage(self): # Compute Fraction of Time For Each Officer and level officer = [_f.rsplit('Officer_')[1] for _f in comparison.index] level = [self._facility_by_facility_id[int(_fac_id)].level for _fac_id in facility_id] - level = list(map(lambda x: x.replace('1b', '2'), level)) + # level = list(map(lambda x: x.replace('1b', '2'), level)) summary_by_officer = comparison.groupby(by=[officer, level])[['Total_Minutes_Per_Day', 'Minutes_Used']].sum() summary_by_officer['Fraction_Time_Used'] = ( summary_by_officer['Minutes_Used'] / summary_by_officer['Total_Minutes_Per_Day'] From 889bdf5f7244ff795f971f1f6c10bd33105c9e72 Mon Sep 17 00:00:00 2001 From: Bingling Date: Sun, 20 Aug 2023 08:57:46 +0100 Subject: [PATCH 121/131] undo the modification of test_healthsystem considering the merge of levels 1b and 2 --- tests/test_healthsystem.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/test_healthsystem.py b/tests/test_healthsystem.py index ee9c64d22b..e9b7d6678c 100644 --- a/tests/test_healthsystem.py +++ b/tests/test_healthsystem.py @@ -2047,19 +2047,19 @@ def check_appt_works(district, level, appt_type) -> Tuple: # (If they can happen at either, then this test will make 
it look like they are happening at both!) # The file on the HSI expected not to run should show such appointments as not happening at either '1b' or '2'. # .... work out which appointment cannot happen at either '1b' or '2' - _levels_at_which_appts_dont_run = appts_not_run.groupby( - by=['use_funded_or_actual_staffing', 'appt_type', 'district'])['level'].sum() - _levels_at_which_appts_dont_run = _levels_at_which_appts_dont_run.drop( - _levels_at_which_appts_dont_run.index[_levels_at_which_appts_dont_run.isin(['1b', '2'])] - ) - appts_not_run = _levels_at_which_appts_dont_run.reset_index().dropna() - appts_not_run['level'] = appts_not_run['level'].replace({'21b': '2'}) # ... label such appointments for level '2' - # ... reproduce that block labelled for level '1b' - appts_not_run_level2 = appts_not_run.loc[appts_not_run.level == '2'].copy() - appts_not_run_level2['level'] = '1b' - appts_not_run = pd.concat([appts_not_run, appts_not_run_level2]) - # ... re-order columns to suit. - appts_not_run = appts_not_run[['use_funded_or_actual_staffing', 'level', 'appt_type', 'district']] + # _levels_at_which_appts_dont_run = appts_not_run.groupby( + # by=['use_funded_or_actual_staffing', 'appt_type', 'district'])['level'].sum() + # _levels_at_which_appts_dont_run = _levels_at_which_appts_dont_run.drop( + # _levels_at_which_appts_dont_run.index[_levels_at_which_appts_dont_run.isin(['1b', '2'])] + # ) + # appts_not_run = _levels_at_which_appts_dont_run.reset_index().dropna() + # appts_not_run['level'] = appts_not_run['level'].replace({'21b': '2'}) # ... label such appointments for level '2' + # # ... reproduce that block labelled for level '1b' + # appts_not_run_level2 = appts_not_run.loc[appts_not_run.level == '2'].copy() + # appts_not_run_level2['level'] = '1b' + # appts_not_run = pd.concat([appts_not_run, appts_not_run_level2]) + # # ... re-order columns to suit. 
+ # appts_not_run = appts_not_run[['use_funded_or_actual_staffing', 'level', 'appt_type', 'district']] # reformat the 'district' info at levels 3 and 4 in results to map with appts_not_run file for convenience districts_per_region = mfl[['District', 'Region']].drop_duplicates().dropna(axis='index', how='any').set_index( From 75bc4732af959e7ea0e82a48aec9ed9c30dc323f Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 21 Aug 2023 22:32:17 +0100 Subject: [PATCH 122/131] more sankeys for illustrating HSI and appt --- .../analysis_sankey_appt_level_hsi.ipynb | 212 +++++++++++++++--- 1 file changed, 178 insertions(+), 34 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb index 786b5d6b8f..b4fa9abc07 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 59, "metadata": { "pycharm": { "name": "#%%\n" @@ -61,22 +61,8 @@ }, { "cell_type": "code", - "execution_count": 48, - "metadata": { - "pycharm": { - "is_executing": true, - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Results folder is: c:\\users\\jdbb1\\pycharmprojects\\tlomodel\\outputs\\bshe@ic.ac.uk\\10_year_scale_run-2023-08-01T145207Z_draw0\n" - ] - } - ], + "execution_count": null, + "outputs": [], "source": [ "# Declare the name of the file that specified the scenarios used in this run.\n", "scenario_filename = '10_year_scale_run.py'\n", @@ -97,28 +83,190 @@ "# Extract results\n", "hsi = pd.read_csv(results_folder / 'hsi_count_by_treatment_appt_level.csv')\n", "\n", - "# todo: Format data\n", + "# Format data\n", "hsi = hsi[[\"Appt_Type_Code\", \"Facility_Level\", 
\"Treatment_ID\"]].drop_duplicates().reset_index(drop=True)\n", "hsi['Facility_Level'] = 'Facility_Level_' + hsi['Facility_Level'].astype(str)\n", "hsi['source'] = 'Appt_Type_Code'\n", "hsi['target'] = 'Treatment_ID'\n", - "hsi['value'] = 1.0" - ] + "hsi['value'] = 1.0\n", + "\n", + "# Format data alternatively\n", + "hsi_no_level = hsi.drop(columns='Facility_Level').drop_duplicates().reset_index(drop=True)\n", + "hsi_example = hsi[(hsi.Appt_Type_Code == 'Over5OPD') |\n", + " (hsi.Appt_Type_Code == 'VCTNegative') |\n", + " (hsi.Appt_Type_Code == 'VCTPositive')].reset_index(drop=True)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, + "outputs": [], + "source": [ + "# Format the flow\n", + "size = dict(width=1000, height=1600, margins=dict(left=150, right=520))\n", + "\n", + "# Nodes in alphabetic order\n", + "partition_appt_type = Partition.Simple('Appt_Type_Code', pd.array([\n", + " 'ConWithDCSA',\n", + " 'Under5OPD', 'Over5OPD', 'IPAdmission', 'InpatientDays',\n", + " 'AntenatalFirst', 'ANCSubsequent', 'FamPlan', 'EPI', \n", + " 'CompDelivery', 'NormalDelivery', 'Csection',\n", + " 'AccidentsandEmerg', 'MajorSurg', 'MinorSurg',\n", + " 'U5Malnutr',\n", + " 'MentOPD',\n", + " 'Mammography', 'DiagRadio', 'Tomography', 'LabMolec', 'LabTBMicro',\n", + " 'MaleCirc', 'Peds', 'VCTNegative', 'VCTPositive', 'NewAdult', 'EstNonCom',\n", + " 'TBNew', 'TBFollowUp']))\n", + "partition_treatment_id = Partition.Simple('Treatment_ID', np.unique(hsi['Treatment_ID']))\n", + "partition_facility_level = Partition.Simple('Facility_Level',np.unique(hsi['Facility_Level']))\n", + "\n", + "nodes = {\n", + " 'Appt': ProcessGroup(['Appt_Type_Code'], partition_appt_type),\n", + " 'HSI': ProcessGroup(['Treatment_ID'], partition_treatment_id),\n", + "}\n", + "\n", + "# Add nodes Waypoint\n", + "nodes['waypoint'] = Waypoint(partition_facility_level)\n", + "\n", + "bundles = [\n", + 
" Bundle('Appt', 'HSI', waypoints=['waypoint']),\n", + "]\n", + "\n", + "ordering = [\n", + " ['Appt'], # left\n", + " ['waypoint'], # middle\n", + " ['HSI'], # right\n", + " ]\n", + "\n", + "\n", + "# Set the color for each appt type\n", + "palette = {'Under5OPD': 'lightpink', 'Over5OPD': 'lightpink',\n", + " 'IPAdmission': 'palevioletred', 'InpatientDays': 'mediumvioletred',\n", + " \n", + " 'AntenatalFirst': 'green', 'ANCSubsequent': 'green',\n", + " 'FamPlan': 'darkseagreen', 'EPI': 'paleturquoise', \n", + " 'CompDelivery': 'limegreen', 'NormalDelivery': 'limegreen', 'Csection': 'springgreen',\n", + " \n", + " 'AccidentsandEmerg': 'darkorange', 'MajorSurg': 'orange', 'MinorSurg': 'gold',\n", + " \n", + " 'ConWithDCSA': 'violet',\n", + " \n", + " 'U5Malnutr': 'orchid',\n", + " \n", + " 'MentOPD': 'darkgrey',\n", + " \n", + " 'Mammography': 'lightgrey', 'DiagRadio': 'lightgrey', 'Tomography': 'lightgrey', \n", + " 'LabMolec': 'gainsboro', 'LabTBMicro': 'gainsboro',\n", + " \n", + " 'MaleCirc': 'mediumslateblue', 'Peds': 'lightskyblue', \n", + " 'VCTNegative': 'lightsteelblue', 'VCTPositive': 'lightsteelblue', \n", + " 'NewAdult': 'cornflowerblue', 'EstNonCom': 'royalblue',\n", + " \n", + " 'TBNew': 'yellow', 'TBFollowUp': 'yellow'}\n", + "\n", + "# Sankey diagram definition (SDD)\n", + "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_appt_type)\n", + "\n", + "# Generate and save Sankey\n", + "sankey_appt_level_hsi = weave(sdd, hsi, palette=palette, measures='value').to_widget(**size)\n", + "\n", + "sankey_appt_level_hsi.auto_save_png(make_graph_file_name('Sankey_appt_level_hsi'))" + ], "metadata": { + "collapsed": false, "pycharm": { - "is_executing": true, "name": "#%%\n" } - }, + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Format the flow alt 1\n", + "# Format the flow\n", + "size = dict(width=1000, height=800, margins=dict(left=150, right=520))\n", + "\n", + "# Nodes in alphabetic 
order\n", + "partition_appt_type = Partition.Simple('Appt_Type_Code', pd.array(['Over5OPD', 'VCTNegative', 'VCTPositive']))\n", + "partition_treatment_id = Partition.Simple('Treatment_ID', np.unique(hsi_example['Treatment_ID']))\n", + "partition_facility_level = Partition.Simple('Facility_Level',np.unique(hsi_example['Facility_Level']))\n", + "\n", + "nodes = {\n", + " 'Appt': ProcessGroup(['Appt_Type_Code'], partition_appt_type),\n", + " 'HSI': ProcessGroup(['Treatment_ID'], partition_treatment_id),\n", + "}\n", + "\n", + "# Add nodes Waypoint\n", + "nodes['waypoint'] = Waypoint(partition_facility_level)\n", + "\n", + "bundles = [\n", + " Bundle('Appt', 'HSI', waypoints=['waypoint']),\n", + "]\n", + "\n", + "ordering = [\n", + " ['Appt'], # left\n", + " ['waypoint'], # middle\n", + " ['HSI'], # right\n", + " ]\n", + "\n", + "\n", + "# Set the color for each appt type\n", + "palette = {'Under5OPD': 'lightpink', 'Over5OPD': 'lightpink',\n", + " 'IPAdmission': 'palevioletred', 'InpatientDays': 'mediumvioletred',\n", + " \n", + " 'AntenatalFirst': 'green', 'ANCSubsequent': 'green',\n", + " 'FamPlan': 'darkseagreen', 'EPI': 'paleturquoise', \n", + " 'CompDelivery': 'limegreen', 'NormalDelivery': 'limegreen', 'Csection': 'springgreen',\n", + " \n", + " 'AccidentsandEmerg': 'darkorange', 'MajorSurg': 'orange', 'MinorSurg': 'gold',\n", + " \n", + " 'ConWithDCSA': 'violet',\n", + " \n", + " 'U5Malnutr': 'orchid',\n", + " \n", + " 'MentOPD': 'darkgrey',\n", + " \n", + " 'Mammography': 'lightgrey', 'DiagRadio': 'lightgrey', 'Tomography': 'lightgrey', \n", + " 'LabMolec': 'gainsboro', 'LabTBMicro': 'gainsboro',\n", + " \n", + " 'MaleCirc': 'mediumslateblue', 'Peds': 'lightskyblue', \n", + " 'VCTNegative': 'lightsteelblue', 'VCTPositive': 'lightsteelblue', \n", + " 'NewAdult': 'cornflowerblue', 'EstNonCom': 'royalblue',\n", + " \n", + " 'TBNew': 'yellow', 'TBFollowUp': 'yellow'}\n", + "\n", + "# Sankey diagram definition (SDD)\n", + "sdd = SankeyDefinition(nodes, bundles, 
ordering, flow_partition=partition_appt_type)\n", + "\n", + "# Generate and save Sankey\n", + "sankey_appt_level_hsi = weave(sdd, hsi_example, palette=palette, measures='value').to_widget(**size)\n", + "\n", + "sankey_appt_level_hsi.auto_save_png(make_graph_file_name('Sankey_appt_level_hsi_example'))\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "cd57dcf703ee438099eaac317b281520", + "model_id": "37c5a98f99e845628d06fd58c58b58b5", "version_major": 2, "version_minor": 0 }, @@ -126,13 +274,13 @@ "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ConWithDCSA', 'Appt^Under5…" ] }, - "execution_count": 49, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Format the flow\n", + "# Format the flow alt 2\n", "size = dict(width=1000, height=1600, margins=dict(left=150, right=520))\n", "\n", "# Nodes in alphabetic order\n", @@ -147,24 +295,20 @@ " 'Mammography', 'DiagRadio', 'Tomography', 'LabMolec', 'LabTBMicro',\n", " 'MaleCirc', 'Peds', 'VCTNegative', 'VCTPositive', 'NewAdult', 'EstNonCom',\n", " 'TBNew', 'TBFollowUp']))\n", - "partition_treatment_id = Partition.Simple('Treatment_ID', np.unique(hsi['Treatment_ID']))\n", - "partition_facility_level = Partition.Simple('Facility_Level',np.unique(hsi['Facility_Level']))\n", + "partition_treatment_id = Partition.Simple('Treatment_ID', np.unique(hsi_no_level['Treatment_ID']))\n", "\n", "nodes = {\n", " 'Appt': ProcessGroup(['Appt_Type_Code'], partition_appt_type),\n", " 'HSI': ProcessGroup(['Treatment_ID'], partition_treatment_id),\n", "}\n", "\n", - "# Add nodes Waypoint\n", - "nodes['waypoint'] = Waypoint(partition_facility_level)\n", "\n", "bundles = [\n", - " Bundle('Appt', 'HSI', waypoints=['waypoint']),\n", + " Bundle('Appt', 'HSI'),\n", "]\n", 
"\n", "ordering = [\n", " ['Appt'], # left\n", - " ['waypoint'], # middle\n", " ['HSI'], # right\n", " ]\n", "\n", @@ -198,9 +342,9 @@ "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_appt_type)\n", "\n", "# Generate and save Sankey\n", - "sankey_appt_level_hsi = weave(sdd, hsi, palette=palette, measures='value').to_widget(**size)\n", + "sankey_appt_level_hsi = weave(sdd, hsi_no_level, palette=palette, measures='value').to_widget(**size)\n", "\n", - "sankey_appt_level_hsi.auto_save_png(make_graph_file_name('Sankey_appt_level_hsi'))" + "sankey_appt_level_hsi.auto_save_png(make_graph_file_name('Sankey_appt_hsi'))" ] }, { From 65441c7086148b057b904b48b309d508317a5444 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 23 Aug 2023 12:23:18 +0100 Subject: [PATCH 123/131] update sankey --- .../analysis_sankey_hcwtime_appt_hsi.ipynb | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb index 2ef713f5f0..4609694f42 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_hcwtime_appt_hsi.ipynb @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 13, "metadata": { "pycharm": { "name": "#%%\n" @@ -64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 14, "metadata": { "pycharm": { "name": "#%%\n" @@ -75,13 +75,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Results folder is: c:\\users\\jdbb1\\pycharmprojects\\tlomodel\\outputs\\bshe@ic.ac.uk\\10_year_scale_run-2023-08-01T145207Z_draw0\n" + "Results folder is: c:\\users\\jdbb1\\pycharmprojects\\tlomodel\\outputs\\bshe@ic.ac.uk\\scale_run_for_hcw_analysis-2023-08-20T102040Z_draw2\n" ] }, { "data": { 
"application/vnd.jupyter.widget-view+json": { - "model_id": "752fc8430270487fba9537301a8ef8c0", + "model_id": "a0ac92b252f14078b47b92cdbb790d4e", "version_major": 2, "version_minor": 0 }, @@ -89,14 +89,14 @@ "SankeyWidget(groups=[{'id': 'Officer', 'type': 'process', 'title': '', 'nodes': ['Officer^DCSA', 'Officer^Clin…" ] }, - "execution_count": 39, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Declare the name of the file that specified the scenarios used in this run.\n", - "scenario_filename = '10_year_scale_run.py'\n", + "scenario_filename = 'scale_run_for_hcw_analysis.py' # i.e., the 10_year_scale_run.py\n", "\n", "# Declare usual paths:\n", "# Get the tlo path\n", @@ -104,7 +104,7 @@ "outputspath = tlopath / Path('outputs/bshe@ic.ac.uk')\n", "\n", "# Find results folder (most recent run generated using that scenario_filename)\n", - "f = -4 # -4: Actual + Default health care seeking, -2: Actual + Maximal health care seeking\n", + "f = -2 # -4: Actual + Default health care seeking, -2: Actual + Maximal health care seeking\n", "results_folder = get_scenario_outputs(scenario_filename, outputspath)[f]\n", "print(f\"Results folder is: {results_folder}\")\n", "\n", @@ -112,7 +112,7 @@ "if (f == -4) or (f == -3):\n", " sankey_scale = 1\n", "elif (f == -2) or (f == -1):\n", - " sankey_scale = 7186802559.49815 / 2253171916.6190553 # the total working time of file -2 / ... of file -4\n", + " sankey_scale = 7144638826.033691 / 2206856635.0910654 # the total working time of file -2 / ... 
of file -4\n", " \n", "# Declare path for output graphs from this script\n", "make_graph_file_name = lambda stub: results_folder / f\"{stub}.png\" # noqa: E731\n", @@ -167,7 +167,7 @@ " }\n", "\n", "# Set the size for the Sankey\n", - "size = dict(width=800, height=560*sankey_scale, margins=dict(left=180, right=180))\n", + "size = dict(width=800, height=550*sankey_scale, margins=dict(left=180, right=180))\n", "\n", "# Sankey diagram definition (SDD)\n", "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_officer_cat)\n", From 3a8df13bef5d3a0bd8d3ac769604f8a3f5c9ede5 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 23 Aug 2023 18:27:54 +0100 Subject: [PATCH 124/131] update sankey --- .../analysis_sankey_appt_level_hsi.ipynb | 82 ++++++++++--------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb index b4fa9abc07..5b0f60980d 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 140, "metadata": { "pycharm": { "name": "#%%\n" @@ -65,7 +65,7 @@ "outputs": [], "source": [ "# Declare the name of the file that specified the scenarios used in this run.\n", - "scenario_filename = '10_year_scale_run.py'\n", + "scenario_filename = 'scale_run_for_hcw_analysis.py' # i.e., the 10_year_scale_run.py\n", "\n", "# Declare usual paths:\n", "# Get the tlo path\n", @@ -73,7 +73,7 @@ "outputspath = tlopath / Path('outputs/bshe@ic.ac.uk')\n", "\n", "# Find results folder (most recent run generated using that scenario_filename)\n", - "f = -4 # -4: Actual + Default health care seeking, -2: Actual + Maximal health care seeking\n", + "f = -4 # -4: Actual + Default health care 
seeking\n", "results_folder = get_scenario_outputs(scenario_filename, outputspath)[f]\n", "print(f\"Results folder is: {results_folder}\")\n", "\n", @@ -84,17 +84,26 @@ "hsi = pd.read_csv(results_folder / 'hsi_count_by_treatment_appt_level.csv')\n", "\n", "# Format data\n", - "hsi = hsi[[\"Appt_Type_Code\", \"Facility_Level\", \"Treatment_ID\"]].drop_duplicates().reset_index(drop=True)\n", + "hsi = hsi[[\"Appt_Type_Code\", \"Facility_Level\", \"Treatment_ID\", \"Count\"]].drop_duplicates().reset_index(drop=True)\n", "hsi['Facility_Level'] = 'Facility_Level_' + hsi['Facility_Level'].astype(str)\n", "hsi['source'] = 'Appt_Type_Code'\n", "hsi['target'] = 'Treatment_ID'\n", - "hsi['value'] = 1.0\n", + "hsi['value'] = hsi['Count']\n", "\n", "# Format data alternatively\n", - "hsi_no_level = hsi.drop(columns='Facility_Level').drop_duplicates().reset_index(drop=True)\n", - "hsi_example = hsi[(hsi.Appt_Type_Code == 'Over5OPD') |\n", - " (hsi.Appt_Type_Code == 'VCTNegative') |\n", - " (hsi.Appt_Type_Code == 'VCTPositive')].reset_index(drop=True)" + "hsi_def = hsi.copy()\n", + "hsi_def['value'] = 1.0 # only show hsi definitions re. 
appt footprint and facility level, no hsi count\n", + "\n", + "hsi_all_levels = hsi.groupby(['Appt_Type_Code', 'Treatment_ID'])['Count'].sum().reset_index()\n", + "hsi_all_levels['source'] = 'Appt_Type_Code'\n", + "hsi_all_levels['target'] = 'Treatment_ID'\n", + "hsi_all_levels['value'] = hsi_all_levels['Count'] # hsi count per appt per level\n", + "\n", + "hsi_example = hsi[(hsi.Appt_Type_Code == 'Over5OPD')].reset_index(drop=True)\n", + "level_example = 'Facility_Level_1a'\n", + "appt_example = ['AntenatalFirst', 'FamPlan', 'IPAdmission', 'NormalDelivery', 'Over5OPD', 'VCTNegative', 'NewAdult', 'TBNew']\n", + "hsi_example = hsi[(hsi.Appt_Type_Code.isin(appt_example)) &\n", + " (hsi.Facility_Level == level_example) ].reset_index(drop=True)" ], "metadata": { "collapsed": false, @@ -174,7 +183,7 @@ "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_appt_type)\n", "\n", "# Generate and save Sankey\n", - "sankey_appt_level_hsi = weave(sdd, hsi, palette=palette, measures='value').to_widget(**size)\n", + "sankey_appt_level_hsi = weave(sdd, hsi_def, palette=palette, measures='value').to_widget(**size)\n", "\n", "sankey_appt_level_hsi.auto_save_png(make_graph_file_name('Sankey_appt_level_hsi'))" ], @@ -192,11 +201,22 @@ "source": [ "# Format the flow alt 1\n", "# Format the flow\n", - "size = dict(width=1000, height=800, margins=dict(left=150, right=520))\n", + "size = dict(width=1000, height=1000, margins=dict(left=150, right=520))\n", "\n", "# Nodes in alphabetic order\n", - "partition_appt_type = Partition.Simple('Appt_Type_Code', pd.array(['Over5OPD', 'VCTNegative', 'VCTPositive']))\n", - "partition_treatment_id = Partition.Simple('Treatment_ID', np.unique(hsi_example['Treatment_ID']))\n", + "partition_appt_type = Partition.Simple('Appt_Type_Code', pd.array([\n", + " 'AntenatalFirst', 'FamPlan', \n", + " 'VCTNegative', 'NewAdult', 'Over5OPD', 'IPAdmission', 'NormalDelivery', 'TBNew']))\n", + "partition_treatment_id = 
Partition.Simple('Treatment_ID', pd.array(\n", + " ['AntenatalCare_Outpatient', 'Contraception_Routine', \n", + " 'Hiv_Prevention_Infant', 'Hiv_Prevention_Prep', 'Hiv_Test', 'Hiv_Treatment',\n", + " 'AntenatalCare_FollowUp', \n", + " 'CardioMetabolicDisorders_Prevention_CommunityTestingForHypertension', 'Depression_Treatment',\n", + " 'Malaria_Prevention_Iptp', 'Malaria_Test', 'Malaria_Treatment',\n", + " 'Measles_Treatment', 'PostnatalCare_Maternal', 'Schisto_Treatment',\n", + " 'Tb_Prevention_Ipt', 'Tb_Test_Screening', \n", + " 'Alri_Pneumonia_Treatment_Inpatient_Followup', 'AntenatalCare_Inpatient', 'Diarrhoea_Treatment_Inpatient',\n", + " 'DeliveryCare_Basic', 'Tb_Treatment']))\n", "partition_facility_level = Partition.Simple('Facility_Level',np.unique(hsi_example['Facility_Level']))\n", "\n", "nodes = {\n", @@ -219,29 +239,11 @@ "\n", "\n", "# Set the color for each appt type\n", - "palette = {'Under5OPD': 'lightpink', 'Over5OPD': 'lightpink',\n", - " 'IPAdmission': 'palevioletred', 'InpatientDays': 'mediumvioletred',\n", - " \n", - " 'AntenatalFirst': 'green', 'ANCSubsequent': 'green',\n", - " 'FamPlan': 'darkseagreen', 'EPI': 'paleturquoise', \n", - " 'CompDelivery': 'limegreen', 'NormalDelivery': 'limegreen', 'Csection': 'springgreen',\n", - " \n", - " 'AccidentsandEmerg': 'darkorange', 'MajorSurg': 'orange', 'MinorSurg': 'gold',\n", - " \n", - " 'ConWithDCSA': 'violet',\n", - " \n", - " 'U5Malnutr': 'orchid',\n", - " \n", - " 'MentOPD': 'darkgrey',\n", - " \n", - " 'Mammography': 'lightgrey', 'DiagRadio': 'lightgrey', 'Tomography': 'lightgrey', \n", - " 'LabMolec': 'gainsboro', 'LabTBMicro': 'gainsboro',\n", - " \n", - " 'MaleCirc': 'mediumslateblue', 'Peds': 'lightskyblue', \n", - " 'VCTNegative': 'lightsteelblue', 'VCTPositive': 'lightsteelblue', \n", - " 'NewAdult': 'cornflowerblue', 'EstNonCom': 'royalblue',\n", - " \n", - " 'TBNew': 'yellow', 'TBFollowUp': 'yellow'}\n", + "# Set the color for each appt type\n", + "palette = {'AntenatalFirst': 'green', 
'FamPlan': 'darkseagreen',\n", + " 'VCTNegative': 'lightsteelblue', 'NewAdult': 'cornflowerblue',\n", + " 'Over5OPD': 'lightpink','IPAdmission': 'palevioletred',\n", + " 'NormalDelivery': 'limegreen', 'TBNew': 'yellow'}\n", "\n", "# Sankey diagram definition (SDD)\n", "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_appt_type)\n", @@ -260,13 +262,13 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 144, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "37c5a98f99e845628d06fd58c58b58b5", + "model_id": "756a39d036c54489a3854e9e50444544", "version_major": 2, "version_minor": 0 }, @@ -274,7 +276,7 @@ "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ConWithDCSA', 'Appt^Under5…" ] }, - "execution_count": 63, + "execution_count": 144, "metadata": {}, "output_type": "execute_result" } @@ -342,7 +344,7 @@ "sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=partition_appt_type)\n", "\n", "# Generate and save Sankey\n", - "sankey_appt_level_hsi = weave(sdd, hsi_no_level, palette=palette, measures='value').to_widget(**size)\n", + "sankey_appt_level_hsi = weave(sdd, hsi_all_levels, palette=palette, measures='value').to_widget(**size)\n", "\n", "sankey_appt_level_hsi.auto_save_png(make_graph_file_name('Sankey_appt_hsi'))" ] From a26bab21aa8498ce622df0cb9b97fba14a5d369b Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 23 Aug 2023 18:32:46 +0100 Subject: [PATCH 125/131] fix typo --- .../analysis_sankey_appt_level_hsi.ipynb | 103 ++++++++++++------ 1 file changed, 72 insertions(+), 31 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb index 5b0f60980d..618ec331f7 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb +++ 
b/src/scripts/healthsystem/hsi_in_typical_run/analysis_sankey_appt_level_hsi.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": 16, "metadata": { "pycharm": { "name": "#%%\n" @@ -61,8 +61,21 @@ }, { "cell_type": "code", - "execution_count": null, - "outputs": [], + "execution_count": 17, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Results folder is: c:\\users\\jdbb1\\pycharmprojects\\tlomodel\\outputs\\bshe@ic.ac.uk\\scale_run_for_hcw_analysis-2023-08-20T102040Z_draw0\n" + ] + } + ], "source": [ "# Declare the name of the file that specified the scenarios used in this run.\n", "scenario_filename = 'scale_run_for_hcw_analysis.py' # i.e., the 10_year_scale_run.py\n", @@ -104,18 +117,33 @@ "appt_example = ['AntenatalFirst', 'FamPlan', 'IPAdmission', 'NormalDelivery', 'Over5OPD', 'VCTNegative', 'NewAdult', 'TBNew']\n", "hsi_example = hsi[(hsi.Appt_Type_Code.isin(appt_example)) &\n", " (hsi.Facility_Level == level_example) ].reset_index(drop=True)" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 18, "metadata": { - "collapsed": false, "pycharm": { "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "281afa32e1d940968322b5be1a998cb5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ConWithDCSA', 'Appt^Under5…" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Format the flow\n", "size = dict(width=1000, height=1600, margins=dict(left=150, right=520))\n", @@ -186,18 +214,33 @@ "sankey_appt_level_hsi = weave(sdd, hsi_def, palette=palette, measures='value').to_widget(**size)\n", "\n", 
"sankey_appt_level_hsi.auto_save_png(make_graph_file_name('Sankey_appt_level_hsi'))" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 19, "metadata": { - "collapsed": false, "pycharm": { "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "00de864cca414bf1b9af1e91f324e834", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^AntenatalFirst', 'Appt^Fam…" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Format the flow alt 1\n", "# Format the flow\n", @@ -252,23 +295,21 @@ "sankey_appt_level_hsi = weave(sdd, hsi_example, palette=palette, measures='value').to_widget(**size)\n", "\n", "sankey_appt_level_hsi.auto_save_png(make_graph_file_name('Sankey_appt_level_hsi_example'))\n" - ], + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": { - "collapsed": false, "pycharm": { "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": 144, - "metadata": {}, + }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "756a39d036c54489a3854e9e50444544", + "model_id": "31629b7c852e4ab5a2862085ff8f6212", "version_major": 2, "version_minor": 0 }, @@ -276,14 +317,14 @@ "SankeyWidget(groups=[{'id': 'Appt', 'type': 'process', 'title': '', 'nodes': ['Appt^ConWithDCSA', 'Appt^Under5…" ] }, - "execution_count": 144, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Format the flow alt 2\n", - "size = dict(width=1000, height=1600, margins=dict(left=150, right=520))\n", + "size = dict(width=1000, height=2000, margins=dict(left=150, right=520))\n", "\n", "# Nodes in alphabetic order\n", "partition_appt_type = Partition.Simple('Appt_Type_Code', pd.array([\n", @@ -297,7 
+338,7 @@ " 'Mammography', 'DiagRadio', 'Tomography', 'LabMolec', 'LabTBMicro',\n", " 'MaleCirc', 'Peds', 'VCTNegative', 'VCTPositive', 'NewAdult', 'EstNonCom',\n", " 'TBNew', 'TBFollowUp']))\n", - "partition_treatment_id = Partition.Simple('Treatment_ID', np.unique(hsi_no_level['Treatment_ID']))\n", + "partition_treatment_id = Partition.Simple('Treatment_ID', np.unique(hsi_all_levels['Treatment_ID']))\n", "\n", "nodes = {\n", " 'Appt': ProcessGroup(['Appt_Type_Code'], partition_appt_type),\n", From 32d317d1dcd86d552c619ec58e1d105680e772f7 Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 24 Aug 2023 10:26:50 +0100 Subject: [PATCH 126/131] do not adjust Data MentalAll usage, considering very low reporting rates and better match with latest Model usage --- .../analysis_compare_appt_usage_real_and_simulation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py index 50b1065083..e1f93a2ef1 100644 --- a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py +++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py @@ -278,9 +278,10 @@ def get_real_usage(resourcefilepath, adjusted=True) -> pd.DataFrame: # get facility_level for each record real_usage = real_usage.merge(mfl[['Facility_ID', 'Facility_Level']], left_on='Facility_ID', right_on='Facility_ID') - # adjust annual MentalAll usage using annual reporting rates - if adjusted: - real_usage = adjust_real_usage_on_mentalall(real_usage) + # adjust annual MentalAll usage using annual reporting rates if needed + # for now not adjust it considering very low reporting rates and better match with Model usage + # if adjusted: + # real_usage = adjust_real_usage_on_mentalall(real_usage) # assign date to each 
record real_usage['date'] = pd.to_datetime({'year': real_usage['Year'], 'month': real_usage['Month'], 'day': 1}) From 72b081a165b576471244303e9ec6f4f4c5de1a4f Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 31 Aug 2023 21:14:18 +0100 Subject: [PATCH 127/131] update y scale for staffing plots --- .../analysis_describe_healthsystem_capabilities.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py index 2786d0af96..f252f8ce55 100644 --- a/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py +++ b/src/scripts/healthsystem/descriptions_of_input_data/analysis_describe_healthsystem_capabilities.py @@ -30,9 +30,10 @@ # MINUTES PER HEALTH OFFICER CATEGORY BY DISTRICT data_districts = data.dropna(inplace=False) dat = pd.DataFrame(data_districts.groupby(['District', 'Officer_Category'], as_index=False)['Total_Mins_Per_Day'].sum()) +dat['Total_Mins_Per_Day'] = dat['Total_Mins_Per_Day'] / 100000 tab = dat.pivot(index='District', columns='Officer_Category', values='Total_Mins_Per_Day') ax = tab.plot.bar(stacked=True, fontsize='medium') -plt.ylabel('Average Total Minutes per Day', fontsize='large') +plt.ylabel('Average Total Minutes per Day in 1e5', fontsize='large') plt.xlabel('District', fontsize='large') ax.legend(ncol=3, bbox_to_anchor=(0, 1), @@ -43,9 +44,10 @@ # STAFF COUNTS PER HEALTH OFFICER CATEGORY BY DISTRICT data_districts = data.dropna(inplace=False) dat = pd.DataFrame(data_districts.groupby(['District', 'Officer_Category'], as_index=False)['Staff_Count'].sum()) +dat['Staff_Count'] = dat['Staff_Count'] / 1000 tab = dat.pivot(index='District', columns='Officer_Category', values='Staff_Count') ax = tab.plot.bar(stacked=True, fontsize='medium') -plt.ylabel('Staff counts', fontsize='large') 
+plt.ylabel('Staff counts in 1e3', fontsize='large') plt.xlabel('District', fontsize='large') ax.legend(ncol=3, bbox_to_anchor=(0, 1), @@ -56,10 +58,11 @@ # MINUTES PER HEALTH OFFICER CATEGORY BY LEVEL dat = pd.DataFrame(data.groupby(['Facility_Level', 'Officer_Category'], as_index=False)['Total_Mins_Per_Day'].sum()) +dat['Total_Mins_Per_Day'] = dat['Total_Mins_Per_Day'] / 100000 tab = dat.pivot(index='Facility_Level', columns='Officer_Category', values='Total_Mins_Per_Day') ax = tab.plot.bar(stacked=True, fontsize='medium') # ax = tab.plot.bar(stacked=True, log=True) -plt.ylabel('Average Total Minutes per Day', fontsize='large') +plt.ylabel('Average Total Minutes per Day in 1e5', fontsize='large') plt.xlabel('Facility level', fontsize='large') ax.tick_params(axis='x', rotation=0) @@ -75,10 +78,11 @@ # STAFF COUNTS PER HEALTH OFFICER CATEGORY BY LEVEL dat = pd.DataFrame(data.groupby(['Facility_Level', 'Officer_Category'], as_index=False)['Staff_Count'].sum()) +dat['Staff_Count'] = dat['Staff_Count'] / 1000 tab = dat.pivot(index='Facility_Level', columns='Officer_Category', values='Staff_Count') ax = tab.plot.bar(stacked=True, fontsize='medium') # ax = tab.plot.bar(stacked=True, log=True) -plt.ylabel('Staff counts', fontsize='large') +plt.ylabel('Staff counts in 1e3', fontsize='large') plt.xlabel('Facility level', fontsize='large') ax.tick_params(axis='x', rotation=0) From ad2343b3b2b8ec2a668a5d1f14c346659b981da3 Mon Sep 17 00:00:00 2001 From: Bingling Date: Wed, 10 Jan 2024 19:57:26 +0000 Subject: [PATCH 128/131] update figure according to BMC HRH review --- .../analysis_compare_appt_usage_real_and_simulation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py index e1f93a2ef1..addd34c86b 100644 --- 
a/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py +++ b/src/scripts/calibration_analyses/analysis_scripts/analysis_compare_appt_usage_real_and_simulation.py @@ -416,10 +416,10 @@ def format_and_save(_fig, _ax, _name_of_plot): usage_all = usage_all / 1e6 # plot - name_of_plot = 'Model vs Data on average annual health service volume' + name_of_plot = 'Average annual health service volume on national level' fig, ax = plt.subplots() usage_all.plot(kind='bar', stacked=True, color=appt_color_dict, rot=0, ax=ax) - ax.set_ylabel('Health service volume in millions') + ax.set_ylabel('Number of visits') ax.set(xlabel=None) plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title='Appointment type', fontsize=9) plt.title(name_of_plot) From 7efa1cb1c731d6411bd23d3ef54ef0f9681db453 Mon Sep 17 00:00:00 2001 From: Bingling Date: Thu, 11 Jan 2024 11:24:37 +0000 Subject: [PATCH 129/131] try update figure according to BMC HRH review --- ...alysis_hcw_usage_by_appt_and_by_disease.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index b5c7135666..42aa21e470 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -211,7 +211,7 @@ def format_hcw_usage(hcwscenario='actual'): .clip(lower=0.1, upper=10.0) # reduce the mean ratio by 1.0, for the bar plot that starts from y=1.0 instead of y=0.0 - hcw_usage_ratio['mean'] = hcw_usage_ratio['mean'] - 1.0 + #hcw_usage_ratio['mean'] = hcw_usage_ratio['mean'] - 1.0 # rename cadre Nursing_and_Midwifery hcw_usage_ratio.rename(index={'Nursing_and_Midwifery': 'Nursing and Midwifery'}, inplace=True) @@ -224,22 +224,22 @@ def 
format_hcw_usage(hcwscenario='actual'): name_of_plot = 'Simulated average annual working time (95% CI) vs Capability per cadre' fig, ax = plt.subplots(figsize=(8, 5)) hcw_usage_ratio_establishment.plot(kind='bar', yerr=error_establishment, width=0.4, - ax=ax, position=0, bottom=1.0, + ax=ax, position=0, bottom=0.0, legend=False, color='c') hcw_usage_ratio_actual.plot(kind='bar', yerr=error_actual, width=0.4, - ax=ax, position=1, bottom=1.0, + ax=ax, position=1, bottom=0.0, legend=False, color='y') - ax.axhline(1.0, color='r') + #ax.axhline(1.0, color='r') ax.set_xlim(right=len(hcw_usage_ratio_establishment) - 0.3) - ax.set_yscale('log') - ax.set_ylim(1 / 20, 20) - ax.set_yticks([1 / 10, 1.0, 10]) - ax.set_yticklabels(("<= 1/10", "1.0", ">= 10")) + #ax.set_yscale('log') + #ax.set_ylim(1 / 20, 20) + #ax.set_yticks([1 / 10, 1.0, 10]) + #ax.set_yticklabels(("<= 1/10", "1.0", ">= 10")) ax.set_ylabel('Working time / Capability') ax.set_xlabel('Cadre Category') plt.xticks(rotation=60, ha='right') - ax.xaxis.grid(True, which='major', linestyle='--') - ax.yaxis.grid(True, which='both', linestyle='--') + #ax.xaxis.grid(True, which='major', linestyle='--') + #ax.yaxis.grid(True, which='both', linestyle='--') ax.set_title(name_of_plot) patch_establishment = matplotlib.patches.Patch(facecolor='c', label='Establishment capability') patch_actual = matplotlib.patches.Patch(facecolor='y', label='Actual capability') From b31b216c2bef4f57312e9669824cd14fe0655497 Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 15 Jan 2024 16:21:53 +0000 Subject: [PATCH 130/131] try update figure according to BMC HRH review --- .../analysis_hcw_usage_by_appt_and_by_disease.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 42aa21e470..03badafc54 100644 --- 
a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -221,7 +221,7 @@ def format_hcw_usage(hcwscenario='actual'): hcw_usage_ratio_actual, error_actual = format_hcw_usage(hcwscenario='actual') hcw_usage_ratio_establishment, error_establishment = format_hcw_usage(hcwscenario='funded_plus') - name_of_plot = 'Simulated average annual working time (95% CI) vs Capability per cadre' + name_of_plot = 'Simulated average annual working time (95% CI) vs Capability' fig, ax = plt.subplots(figsize=(8, 5)) hcw_usage_ratio_establishment.plot(kind='bar', yerr=error_establishment, width=0.4, ax=ax, position=0, bottom=0.0, @@ -229,13 +229,13 @@ def format_hcw_usage(hcwscenario='actual'): hcw_usage_ratio_actual.plot(kind='bar', yerr=error_actual, width=0.4, ax=ax, position=1, bottom=0.0, legend=False, color='y') - #ax.axhline(1.0, color='r') + ax.axhline(1.0, color='gray', linestyle='dashed') ax.set_xlim(right=len(hcw_usage_ratio_establishment) - 0.3) #ax.set_yscale('log') - #ax.set_ylim(1 / 20, 20) - #ax.set_yticks([1 / 10, 1.0, 10]) - #ax.set_yticklabels(("<= 1/10", "1.0", ">= 10")) - ax.set_ylabel('Working time / Capability') + ax.set_ylim(0, 11.5) + ax.set_yticks([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + ax.set_yticklabels(("0", "1", '2', '3', '4', '5', '6', '7', '8', '9', ">= 10")) + ax.set_ylabel('Simulated working time : Capability') ax.set_xlabel('Cadre Category') plt.xticks(rotation=60, ha='right') #ax.xaxis.grid(True, which='major', linestyle='--') From 1829ec65ea7367267030df6d23d8b230d57888aa Mon Sep 17 00:00:00 2001 From: Bingling Date: Mon, 12 Feb 2024 11:29:26 +0000 Subject: [PATCH 131/131] todo: drop facility level 5 and all relevant according to BMC HRH review --- .../analysis_hcw_usage_by_appt_and_by_disease.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py index 03badafc54..79aba48f72 100644 --- a/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py +++ b/src/scripts/healthsystem/hsi_in_typical_run/analysis_hcw_usage_by_appt_and_by_disease.py @@ -174,7 +174,7 @@ def get_hcw_capability(resourcefilepath, hcwscenario='actual') -> pd.DataFrame: resourcefilepath / 'healthsystem' / 'human_resources' / hcwscenario / 'ResourceFile_Daily_Capabilities.csv' ) hcw_capability = hcw_capability.groupby(['Facility_Level', 'Officer_Category'] - )['Total_Mins_Per_Day'].sum().reset_index() + )['Total_Mins_Per_Day'].sum().reset_index() # todo: drop facility level 5 hcw_capability['Total_Mins_Per_Year'] = hcw_capability['Total_Mins_Per_Day'] * 365.25 hcw_capability.drop(columns='Total_Mins_Per_Day', inplace=True)