update_hrh_data_script (#1516)
* update repo path to Imperial OneDrive

* modify functions that no longer work (see the migration sketch below the changed-files summary)

* modify remaining broken functions that no longer work

* run for funded_plus

* Revert "run for funded_plus"

This reverts commit 1284d9b.

* non-essential changes of data format, just to keep consistent with the TLO master resource files

* Revert "Revert "run for funded_plus""

This reverts commit e4cf31c.

* save new resource files with only data-format changes (such as int/float, one more decimal digit in the float data)

* restore the path used to read the fixed auxiliary file

* re-comment the lines that apply only to the funded_plus scenario

---------

Co-authored-by: Tim Hallett <39991060+tbhallett@users.noreply.github.com>
BinglingICL and tbhallett authored Dec 13, 2024
1 parent 018af3f commit 926d79f
Showing 5 changed files with 58 additions and 59 deletions.
Git LFS files not shown (4 files)
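The "functions that do not work now" in the message above are mostly pandas calls whose behaviour changed in pandas 2.x, chiefly DataFrame.append, which was removed in pandas 2.0 in favour of pd.concat. A minimal sketch of the recurring migration that the diff below applies many times (the table, record and values here are illustrative, not taken from the script):

import pandas as pd

table = pd.DataFrame({'District_Or_Hospital': ['Lilongwe'], 'M01': [10.0]})
record = pd.Series({'District_Or_Hospital': 'Likoma', 'M01': 2.0})

# pandas < 2.0 idiom, removed in pandas 2.0:
# table = table.append(record, ignore_index=True)

# pandas >= 2.0 replacement: turn the Series into a one-row frame and concatenate.
row = record.to_frame().T                      # the Series index becomes the columns
assert (row.columns == table.columns).all()    # same ordering check the script now makes
table = pd.concat([table, row], ignore_index=True)

The to_frame().T step is what lets a plain Series be appended as a single row with its labels aligned to the existing columns.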
@@ -44,18 +44,18 @@

resourcefilepath = Path('./resources')

-path_to_dropbox = Path(
-'/Users/jdbb1/Dropbox/Thanzi La Onse') # <-- point to the TLO dropbox locally
+path_to_onedrive = Path(
+'/Users/jdbb1/Imperial College London/TLOModel - WP - Documents') # <-- point to the TLO onedrive locally

-workingfile = (path_to_dropbox /
+workingfile = (path_to_onedrive /
'05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' /
'Malawi optimization model import_2022-02-11.xlsx')

-working_file_old = (path_to_dropbox /
+working_file_old = (path_to_onedrive /
'05 - Resources' / 'Module-healthsystem' / 'chai ehp resource use data' / 'ORIGINAL' /
'Optimization model import_Malawi_20180315 v10.xlsx')

-path_to_auxiliaryfiles = (path_to_dropbox /
+path_to_auxiliaryfiles = (path_to_onedrive /
'05 - Resources' /
'Module-healthsystem' /
'chai ehp resource use data' /
@@ -283,8 +283,9 @@

# --- Generate assumptions of established/funded staff distribution at facility levels 0&1a&1b&2
# Read 2018-03-09 Facility-level establishment MOH & CHAM from CHAI auxiliary datasets
-fund_staff_2018_raw = pd.read_excel(path_to_auxiliaryfiles / '2018-03-09 Facility-level establishment MOH & CHAM.xlsx',
-sheet_name='Establishment listing')
+fund_staff_2018_raw = pd.read_csv(path_to_auxiliaryfiles / '2018-03-09 Facility-level establishment MOH & CHAM.csv')
+fund_staff_2018_raw['Number of positions'] = fund_staff_2018_raw['Number of positions'].fillna(0)
+fund_staff_2018_raw['Number of positions'] = fund_staff_2018_raw['Number of positions'].astype(int)

# Get relevant columns
fund_staff_2018 = fund_staff_2018_raw[['Number of positions', 'Facility', 'Facility Type', 'WFOM Cadre']].copy()
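With the auxiliary establishment file now read from its CSV copy rather than the Excel sheet, blank position counts arrive as NaN and the column is parsed as float, which is presumably why the two added lines fill and re-cast it. A small sketch of that effect (toy data, not the CHAI file):

import io
import pandas as pd

# Toy CSV standing in for the establishment listing; the blank cell mimics a missing count.
csv_text = "Number of positions,Facility\n3,Facility A\n,Facility B\n"
raw = pd.read_csv(io.StringIO(csv_text))

print(raw['Number of positions'].dtype)     # float64, because of the blank cell
raw['Number of positions'] = raw['Number of positions'].fillna(0)
raw['Number of positions'] = raw['Number of positions'].astype(int)
print(raw['Number of positions'].tolist())  # [3, 0]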
@@ -556,7 +557,9 @@
'CenHos'].index, 'Facility_Level'] = 'Facility_Level_3'

# Group staff by levels
-immed_need_distribution = pd.DataFrame(immed_need_distribution.groupby(by=['Facility_Level'], sort=False).sum())
+immed_need_distribution = pd.DataFrame(
+immed_need_distribution.groupby(by=['Facility_Level'], sort=False).sum()
+).drop(columns=['FacilityType', 'FacilityName'])
# Drop level 3
immed_need_distribution.drop(index='Facility_Level_3', inplace=True)
# Reset index
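The extra .drop(columns=['FacilityType', 'FacilityName']) reflects another pandas change: older versions silently excluded non-numeric columns from groupby(...).sum(), whereas pandas 2.x keeps them (string columns end up concatenated group-wise), so columns that are meaningless to sum now have to be dropped explicitly or excluded with numeric_only=True. A minimal sketch with made-up columns:

import pandas as pd

df = pd.DataFrame({
    'Facility_Level': ['Facility_Level_0', 'Facility_Level_0'],
    'FacilityType': ['HP', 'HC'],   # a string column that is meaningless to sum
    'M01': [1.0, 2.0],
})

summed = df.groupby(by=['Facility_Level'], sort=False).sum()
# Under pandas >= 2.0 the string column survives (as 'HPHC' here) instead of being dropped silently.
summed = summed.drop(columns=['FacilityType'])

# An equivalent alternative would be .sum(numeric_only=True).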
@@ -773,7 +776,8 @@
# Group the referral hospitals QECH and ZCH as Referral Hospital_Southern
Is_DistrictLevel = fund_staffing_table['Is_DistrictLevel'].values # Save the column 'Is_DistrictLevel' first
fund_staffing_table = pd.DataFrame(
-fund_staffing_table.groupby(by=['District_Or_Hospital'], sort=False).sum()).reset_index()
+fund_staffing_table.groupby(by=['District_Or_Hospital'], sort=False).sum()
+).reset_index().drop(columns=['Is_DistrictLevel'])
fund_staffing_table.insert(1, 'Is_DistrictLevel', Is_DistrictLevel[:-1]) # Add the column 'Is_DistrictLevel'

# Check that in fund_staffing_table every staff count entry >= 0
@@ -809,7 +813,7 @@
record['Is_DistrictLevel'] = True

# get total staff level from the super districts
-cols = set(fund_staffing_table.columns).intersection(set(officer_types_table.Officer_Type_Code))
+cols = list(set(fund_staffing_table.columns).intersection(set(officer_types_table.Officer_Type_Code)))

total_staff = fund_staffing_table.loc[
fund_staffing_table['District_Or_Hospital'] == super_district, cols].values.squeeze()
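Wrapping the column intersection in list(...) matters because recent pandas versions reject a set as a .loc indexer, so the set produced by intersection() has to be materialised as a list before it is used for selection. A small illustration with a toy frame and hypothetical officer codes:

import pandas as pd

df = pd.DataFrame({'District_Or_Hospital': ['A'], 'M01': [1.0], 'N01': [2.0]})
officer_codes = {'M01', 'N01', 'R04'}   # hypothetical codes

cols = list(set(df.columns).intersection(officer_codes))
total_staff = df.loc[df['District_Or_Hospital'] == 'A', cols].values.squeeze()
# Passing the set itself, e.g. df.loc[:, {'M01', 'N01'}], raises a TypeError in recent pandas.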
@@ -823,7 +827,8 @@

# assign w * 100% staff to the new district
record.loc[cols] = w * total_staff
-fund_staffing_table = fund_staffing_table.append(record).reset_index(drop=True)
+assert (record.to_frame().T.columns == fund_staffing_table.columns).all()
+fund_staffing_table = pd.concat([fund_staffing_table, record.to_frame().T], axis=0).reset_index(drop=True)

# take staff away from the super district
fund_staffing_table.loc[fund_staffing_table['District_Or_Hospital'] == super_district, cols] = \
@@ -907,7 +912,7 @@
'Facility_Level_4']

# Check that in fund_staffing_table every staff count entry >= 0
-assert (fund_staffing_table.loc[:, 'M01':'R04'].values >= 0).all()
+assert (fund_staffing_table.loc[:, 'M01':'R04'] >= 0).all().all()
# fund_staffing_table ready!

# Save the table without column 'Is_DistrictLevel'; staff counts in floats
@@ -960,7 +965,8 @@
# Group the referral hospitals QECH and ZCH as Referral Hospital_Southern
Is_DistrictLevel = curr_staffing_table['Is_DistrictLevel'].values # Save the column 'Is_DistrictLevel' first
curr_staffing_table = pd.DataFrame(
-curr_staffing_table.groupby(by=['District_Or_Hospital'], sort=False).sum()).reset_index()
+curr_staffing_table.groupby(by=['District_Or_Hospital'], sort=False).sum()
+).reset_index().drop(columns='Is_DistrictLevel')
curr_staffing_table.insert(1, 'Is_DistrictLevel', Is_DistrictLevel[:-1]) # Add the column 'Is_DistrictLevel'

# No need to add a row for Zomba Mental Hospital, as the updated CHAI data has this row for ZMH.
@@ -993,7 +999,7 @@
record['Is_DistrictLevel'] = True

# get total staff level from the super districts
-cols = set(curr_staffing_table.columns).intersection(set(officer_types_table.Officer_Type_Code))
+cols = list(set(curr_staffing_table.columns).intersection(set(officer_types_table.Officer_Type_Code)))

total_staff = curr_staffing_table.loc[
curr_staffing_table['District_Or_Hospital'] == super_district, cols].values.squeeze()
@@ -1008,7 +1014,8 @@

# assign w * 100% staff to the new district
record.loc[cols] = w * total_staff
-curr_staffing_table = curr_staffing_table.append(record).reset_index(drop=True)
+assert (record.to_frame().T.columns == curr_staffing_table.columns).all()
+curr_staffing_table = pd.concat([curr_staffing_table, record.to_frame().T], axis=0).reset_index(drop=True)

# take staff away from the super district
curr_staffing_table.loc[curr_staffing_table['District_Or_Hospital'] == super_district, cols] = \
@@ -1105,23 +1112,23 @@
for d in pop_districts:
df = pd.DataFrame({'Facility_Level': Facility_Levels[0:4], 'District': d,
'Region': pop.loc[pop['District'] == d, 'Region'].values[0]})
-mfl = mfl.append(df, ignore_index=True, sort=True)
+mfl = pd.concat([mfl, df], ignore_index=True, sort=True)

# Add in the Referral Hospitals, one for each region
for r in pop_regions:
-mfl = mfl.append(pd.DataFrame({
-'Facility_Level': Facility_Levels[4], 'District': None, 'Region': r
-}, index=[0]), ignore_index=True, sort=True)
+df = pd.DataFrame({
+'Facility_Level': Facility_Levels[4], 'District': None, 'Region': r}, index=[0])
+mfl = pd.concat([mfl, df], ignore_index=True, sort=True)

# Add the ZMH
-mfl = mfl.append(pd.DataFrame({
-'Facility_Level': Facility_Levels[5], 'District': None, 'Region': None
-}, index=[0]), ignore_index=True, sort=True)
+df = pd.DataFrame({
+'Facility_Level': Facility_Levels[5], 'District': None, 'Region': None}, index=[0])
+mfl = pd.concat([mfl, df], ignore_index=True, sort=True)

# Add the HQ
-mfl = mfl.append(pd.DataFrame({
-'Facility_Level': Facility_Levels[6], 'District': None, 'Region': None
-}, index=[0]), ignore_index=True, sort=True)
+df = pd.DataFrame({
+'Facility_Level': Facility_Levels[6], 'District': None, 'Region': None}, index=[0])
+mfl = pd.concat([mfl, df], ignore_index=True, sort=True)

# Create the Facility_ID
mfl.loc[:, 'Facility_ID'] = mfl.index
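Each mfl.append(...) in this loop becomes a pd.concat([mfl, df], ...), which copies the accumulated frame on every iteration. An equally valid pattern, shown here only as an alternative sketch and not what the commit does, is to collect the pieces in a list and concatenate once at the end (district and level names below are illustrative):

import pandas as pd

pop_districts = ['Balaka', 'Blantyre']                      # illustrative names
facility_levels = ['Facility_Level_0', 'Facility_Level_1a']

pieces = []
for d in pop_districts:
    pieces.append(pd.DataFrame({'Facility_Level': facility_levels,
                                'District': d, 'Region': 'Southern'}))
pieces.append(pd.DataFrame({'Facility_Level': 'Facility_Level_5',
                            'District': None, 'Region': None}, index=[0]))

mfl = pd.concat(pieces, ignore_index=True, sort=True)
mfl.loc[:, 'Facility_ID'] = mfl.index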
@@ -1409,7 +1416,7 @@
# Generate appt_time_table_coarse with officer_category, instead of officer_type
appt_time_table_coarse = pd.DataFrame(
ApptTimeTable.groupby(['Appt_Type_Code', 'Facility_Level', 'Officer_Category']).sum()
-).reset_index()
+).reset_index().drop(columns=['Officer_Type', 'Officer_Type_Code'])

# Save
# ApptTimeTable.to_csv(
@@ -1475,19 +1482,14 @@

if len(block) == 0:
# no requirement expressed => The appt is not possible at this location
-Officers_Need_For_Appt = Officers_Need_For_Appt.append(
-{'Facility_Level': f,
-'Appt_Type_Code': a,
-'Officer_Type_Codes': False
-}, ignore_index=True)
+df = pd.DataFrame({'Facility_Level': f, 'Appt_Type_Code': a, 'Officer_Type_Codes': False}, index=[0])
+Officers_Need_For_Appt = pd.concat([Officers_Need_For_Appt, df], ignore_index=True)

else:
need_officer_types = list(block['Officer_Type_Code'])
-Officers_Need_For_Appt = Officers_Need_For_Appt.append(
-{'Facility_Level': f,
-'Appt_Type_Code': a,
-'Officer_Type_Codes': need_officer_types
-}, ignore_index=True)
+df = pd.DataFrame({'Facility_Level': f, 'Appt_Type_Code': a, 'Officer_Type_Codes': need_officer_types},
+index=range(len(block)))
+Officers_Need_For_Appt = pd.concat([Officers_Need_For_Appt, df], ignore_index=True)

# Turn this into the set of staff that are required for each type of appointment
FacLevel_By_Officer = pd.DataFrame(columns=Facility_Levels,
@@ -1675,7 +1677,8 @@
HosHC_patient_facing_time_old.iloc[:, 1:].values) /
HosHC_patient_facing_time_old.iloc[:, 1:].values
)
-HosHC_pft_diff = HosHC_pft_diff.append(HosHC_pft_diff.iloc[:, 1:].mean(axis=0), ignore_index=True)
+df = HosHC_pft_diff.iloc[:, 1:].mean(axis=0).to_frame().T
+HosHC_pft_diff = pd.concat([HosHC_pft_diff, df], ignore_index=True)

# save
# HosHC_pft_diff.to_csv(
@@ -1746,13 +1749,8 @@
funded_daily_capability.drop(columns='District_Or_Hospital', inplace=True)

# Add info from mfl: Region and Facility ID
-for i in funded_daily_capability.index:
-the_facility_name = funded_daily_capability.loc[i, 'Facility_Name']
-the_ID = mfl.loc[mfl['Facility_Name'] == the_facility_name, 'Facility_ID']
-the_region = mfl.loc[mfl['Facility_Name'] == the_facility_name, 'Region']
-
-funded_daily_capability.loc[i, 'Facility_ID'] = the_ID.values
-funded_daily_capability.loc[i, 'Region'] = the_region.values
+funded_daily_capability = funded_daily_capability.merge(
+mfl[['Facility_Name', 'Facility_ID', 'Region']], on='Facility_Name', how='left')

# Add 'officer_category' info
funded_daily_capability = funded_daily_capability.merge(officer_types_table, on='Officer_Type_Code', how='left')
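Replacing the row-by-row lookup loop with a single merge on Facility_Name attaches Facility_ID and Region from the master facility list in one vectorised join; how='left' keeps every capability row even if a name were missing from mfl. A minimal sketch with toy tables and hypothetical facility names:

import pandas as pd

capability = pd.DataFrame({'Facility_Name': ['Facility_A', 'Facility_B'],   # hypothetical names
                           'Total_Mins_Per_Day': [480.0, 960.0]})
mfl = pd.DataFrame({'Facility_Name': ['Facility_A', 'Facility_B'],
                    'Facility_ID': [0, 1],
                    'Region': ['Northern', 'Southern']})

capability = capability.merge(
    mfl[['Facility_Name', 'Facility_ID', 'Region']], on='Facility_Name', how='left')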
@@ -1763,6 +1761,9 @@
['Facility_ID', 'Facility_Name', 'Facility_Level', 'District', 'Region', 'Officer_Category'],
dropna=False)[['Total_Mins_Per_Day', 'Staff_Count']].sum()
).reset_index()
+# Non-essential changes of data format, just to keep consistent with the TLO master resource files
+funded_daily_capability_coarse['Staff_Count'] = funded_daily_capability_coarse['Staff_Count'].astype(float)
+funded_daily_capability_coarse['Facility_ID'] = funded_daily_capability_coarse['Facility_ID'].astype(float)

# Since not dropped zero-minute rows in lines 1717-1718,
# check that there are entries for all coarse cadres and all facility id
@@ -1825,13 +1826,8 @@
curr_daily_capability.drop(columns='District_Or_Hospital', inplace=True)

# Add info from mfl: Region and Facility ID
-for i in curr_daily_capability.index:
-the_facility_name = curr_daily_capability.loc[i, 'Facility_Name']
-the_ID = mfl.loc[mfl['Facility_Name'] == the_facility_name, 'Facility_ID']
-the_region = mfl.loc[mfl['Facility_Name'] == the_facility_name, 'Region']
-
-curr_daily_capability.loc[i, 'Facility_ID'] = the_ID.values
-curr_daily_capability.loc[i, 'Region'] = the_region.values
+curr_daily_capability = curr_daily_capability.merge(
+mfl[['Facility_Name', 'Facility_ID', 'Region']], on='Facility_Name', how='left')

# Add 'officer_category' info
curr_daily_capability = curr_daily_capability.merge(officer_types_table, on='Officer_Type_Code', how='left')
@@ -1842,6 +1838,9 @@
['Facility_ID', 'Facility_Name', 'Facility_Level', 'District', 'Region', 'Officer_Category'],
dropna=False)[['Total_Mins_Per_Day', 'Staff_Count']].sum()
).reset_index()
+# Non-essential changes of data format, just to keep consistent with the TLO master resource files
+curr_daily_capability_coarse['Staff_Count'] = curr_daily_capability_coarse['Staff_Count'].astype(float)
+curr_daily_capability_coarse['Facility_ID'] = curr_daily_capability_coarse['Facility_ID'].astype(float)

# Since not dropped zero-minute rows in lines 1797-1798,
# check that there are entries for all coarse cadres and all facility id
