Merge pull request #7 from PolicyEngine/healthcare-spending

Add healthcare spending calibration
PolicyEngine · Sep 6, 2024 · c6daaa9 · c6daaa9
2 parents 771bfd9 + bd8173d
commit c6daaa9
Show file tree

Hide file tree

Showing 11 changed files with 183 additions and 66 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,5 @@
 **/*.csv
 !uprating_factors.csv
 !uprating_growth_factors.csv
+!healthcare_spending.csv
+!spm_threshold_agi.csv
diff --git a/docs/pages/Benchmarks.py b/docs/pages/Benchmarks.py
@@ -31,11 +31,13 @@ def compare_datasets():
         comparison["Abs. Error %"] = (
             (comparison["Abs. Error"] / comparison["Actual"].abs())
             .replace([np.inf, -np.inf], np.nan)
-            .fillna(1)
+            .fillna(0)
         )
         comparison["Dataset"] = dataset.label
         comparison_combined = pd.concat([comparison_combined, comparison])
 
+    comparison_combined.to_csv("comparisons.csv", index=False)
+
     return comparison_combined
 
 

diff --git a/policyengine_us_data/data_storage/healthcare_spending.csv b/policyengine_us_data/data_storage/healthcare_spending.csv
@@ -0,0 +1,10 @@
+age_10_year_lower_bound,health_insurance_premiums_without_medicare_part_b,over_the_counter_health_expenses,other_medical_expenses,medicare_part_b_premiums,age,age_10_year_lower_bound
+0.0,0.0,3139958418.4186344,10581757838.419937,0.0,178907224.9393921,0.0
+10.0,1331908850.7960727,3899326144.79529,16161392678.387276,11198509.280605555,590464084.1854401,408983930.26535034
+20.0,32361931490.785435,6987581951.171959,23521586513.130955,71263601.57686229,1007420789.8561249,818687466.8069458
+30.0,71590362469.06906,9615087479.840654,36626364271.092926,125195737.01158798,1532737442.4855804,1335366065.3704834
+40.0,80661760961.92291,10104218376.054054,40816478783.58568,357437542.5703746,1801306563.9217224,1623998071.1169434
+50.0,81896637177.70609,11697789643.067505,45846296831.32144,1077750904.7396624,2230327049.8746796,2044125057.2036743
+60.0,64221993928.27972,12500423336.761482,49034771200.48631,33075117802.61769,2590424902.998871,2416114812.9629517
+70.0,29366086293.545853,8774761672.04829,32298721271.149677,54002252445.046425,2024842778.7351837,1917786555.2562714
+80.0,14151276913.693628,4168277057.180005,14749253153.81525,24692726700.12216,1129862392.4155273,1084021779.61792
diff --git a/policyengine_us_data/data_storage/spm_threshold_agi.csv b/policyengine_us_data/data_storage/spm_threshold_agi.csv
@@ -0,0 +1,11 @@
+decile,lower_spm_threshold,upper_spm_threshold,adjusted_gross_income,count
+1,13232.297,16084.154296875,601395945534.2526,14067622.329864502
+2,16084.154,18203.76171875,699173775752.6232,14069629.232849121
+3,18203.762,19966.45703125,1038943520151.5376,14065266.263519287
+4,19966.457,22151.69140625,1284310549943.0337,14072190.912216187
+5,22151.691,24156.166015625,1520507632014.9219,14067456.695106506
+6,24156.166,27384.509765625,1922194713836.21,14069159.144554138
+7,27384.51,32536.736328125,1988726862598.7808,14067486.546066284
+8,32536.736,37774.16015625,2236533834141.468,14068930.520233154
+9,37774.16,44738.5859375,2540801186386.7207,14069854.290618896
+10,44738.586,inf,2856965773326.8867,14069456.2210083
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -29,7 +29,6 @@ def generate(self):
             # Extrapolate from CPS 2022
 
             cps_2022 = CPS_2022(require=True)
-            print("Creating uprating factors table...")
             uprating = create_policyengine_uprating_factors_table()
             arrays = cps_2022.load_dataset()
             for variable in uprating.index.unique():
@@ -41,7 +40,6 @@ def generate(self):
                         2021
                     ].values[0]
                     growth = current_index / start_index
-                    print(f"Uprating {variable} by {growth-1:.1%}")
                     arrays[variable] = arrays[variable] * growth
 
             self.save_dataset(arrays)
@@ -61,11 +59,22 @@ def generate(self):
         add_previous_year_income(self, cps)
         add_spm_variables(cps, spm_unit)
         add_household_variables(cps, household)
+        add_rent(cps, person, household)
 
         raw_data.close()
         cps.close()
 
 
+def add_rent(cps: h5py.File, person: DataFrame, household: DataFrame):
+    """Write a flat ``pre_subsidy_rent`` value for each person record.
+
+    Each person row is matched to its household row (``person.PH_SEQ``
+    against ``household.H_SEQ``). People whose household has
+    ``H_TENURE == 2`` (treated here as renting) get ``AVERAGE_RENT``;
+    everyone else gets 0.
+
+    Args:
+        cps: Output store; ``cps["pre_subsidy_rent"]`` is assigned.
+        person: Person-level table; must have a ``PH_SEQ`` column.
+        household: Household-level table; must have ``H_SEQ`` and
+            ``H_TENURE`` columns.
+    """
+    # 1,300 times 12 -- presumably a monthly figure annualised; the
+    # source of the 1,300 is not given here.
+    AVERAGE_RENT = 1_300 * 12
+    # Look up each person's household tenure code.
+    # NOTE(review): the amount is set on every person in a renting
+    # household, not only the first one -- confirm this is intended.
+    person_is_renting = (
+        household.set_index("H_SEQ").loc[person.PH_SEQ].H_TENURE.values == 2
+    )
+    cps["pre_subsidy_rent"] = np.where(person_is_renting, AVERAGE_RENT, 0)
+
+
 def add_id_variables(
     cps: h5py.File,
     person: DataFrame,

diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -12,6 +12,7 @@
 from policyengine_us_data.datasets.cps.extended_cps import (
     ExtendedCPS_2024,
     CPS_2019,
+    CPS_2024,
 )
 import torch
 
@@ -47,10 +48,10 @@ def loss(weights):
         worst_val = rel_error[torch.argmax(rel_error)].item()
         return rel_error.mean(), worst_name, worst_val
 
-    optimizer = torch.optim.Adam([weights], lr=1)
+    optimizer = torch.optim.Adam([weights], lr=1e-2)
     from tqdm import trange
 
-    iterator = trange(1_000)
+    iterator = trange(10_000)
     for i in iterator:
         optimizer.zero_grad()
         l, worst_name, worst_val = loss(torch.exp(weights))
@@ -133,6 +134,35 @@ def generate(self):
         self.save_dataset(data)
 
 
+class ReweightedCPS_2024(Dataset):
+    """CPS 2024 dataset with household weights replaced by reweighted values.
+
+    ``generate`` loads ``CPS_2024`` through a ``Microsimulation``, passes
+    perturbed household weights to ``reweight`` together with the loss
+    matrix and targets from ``build_loss_matrix``, and saves the result.
+    """
+
+    data_format = Dataset.ARRAYS
+    file_path = STORAGE_FOLDER / "reweighted_cps_2024.h5"
+    name = "reweighted_cps_2024"
+    label = "Reweighted CPS 2024"
+    input_dataset = CPS_2024
+    time_period = 2024
+
+    def generate(self):
+        """Build and save the dataset with optimised ``household_weight``."""
+        # Imported locally rather than at module level.
+        from policyengine_us import Microsimulation
+
+        sim = Microsimulation(dataset=self.input_dataset)
+        data = sim.dataset.load_dataset()
+        original_weights = sim.calculate("household_weight")
+        # Add per-household random noise (normal, mean 1, s.d. 0.1) to the
+        # starting weights. No seed is set here, so results can differ
+        # between runs unless a seed is set elsewhere.
+        original_weights = original_weights.values + np.random.normal(
+            1, 0.1, len(original_weights)
+        )
+        # Only one year is listed; each pass overwrites
+        # data["household_weight"] with that year's optimised weights.
+        for year in [2024]:
+            loss_matrix, targets_array = build_loss_matrix(
+                self.input_dataset, year
+            )
+            optimised_weights = reweight(
+                original_weights, loss_matrix, targets_array
+            )
+            data["household_weight"] = optimised_weights
+
+        self.save_dataset(data)
+
+
 class EnhancedCPS_2024(EnhancedCPS):
     input_dataset = ExtendedCPS_2024
     start_year = 2024

diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -5,62 +5,67 @@
 from ..puf import *
 import pandas as pd
 
+# These are sorted by magnitude, largest first.
+# The first 15 account for 90% of the total.
+# The first 7 account for 75% of the total.
+# If you're debugging this part of the code and don't want to wait ages
+# to see whether something breaks, try limiting the list to those.
 IMPUTED_VARIABLES = [
-    "alimony_expense",
-    "alimony_income",
-    "american_opportunity_credit",
-    "amt_foreign_tax_credit",
-    "casualty_loss",
-    "cdcc_relevant_expenses",
-    "charitable_cash_donations",
-    "charitable_non_cash_donations",
-    "domestic_production_ald",
-    "early_withdrawal_penalty",
-    "educator_expense",
     "employment_income",
-    "energy_efficient_home_improvement_credit",
-    "estate_income",
-    "excess_withheld_payroll_tax",
-    "farm_income",
-    "farm_rent_income",
-    "foreign_tax_credit",
-    "general_business_credit",
-    "health_savings_account_ald",
+    "partnership_s_corp_income",
+    "social_security",
+    "taxable_pension_income",
     "interest_deduction",
-    "investment_income_elected_form_4952",
+    "tax_exempt_pension_income",
     "long_term_capital_gains",
-    "long_term_capital_gains_on_collectibles",
     "misc_deduction",
-    "miscellaneous_income",
-    "non_qualified_dividend_income",
-    "non_sch_d_capital_gains",
-    "other_credits",
-    "partnership_s_corp_income",
     "pre_tax_contributions",
-    "prior_year_minimum_tax_credit",
+    "taxable_ira_distributions",
+    "self_employment_income",
+    "w2_wages_from_qualified_business",
+    "short_term_capital_gains",
     "qualified_dividend_income",
-    "qualified_tuition_expenses",
+    "charitable_cash_donations",
+    "self_employed_pension_contribution_ald",
     "real_estate_taxes",
-    "recapture_of_investment_credit",
+    "unrecaptured_section_1250_gain",
+    "taxable_unemployment_compensation",
+    "taxable_interest_income",
+    "domestic_production_ald",
+    "self_employed_health_insurance_ald",
     "rental_income",
+    "non_qualified_dividend_income",
+    "cdcc_relevant_expenses",
+    "tax_exempt_interest_income",
     "salt_refund_income",
+    "foreign_tax_credit",
+    "estate_income",
+    "charitable_non_cash_donations",
+    "american_opportunity_credit",
+    "miscellaneous_income",
+    "alimony_expense",
+    "farm_income",
+    "alimony_income",
+    "health_savings_account_ald",
+    "non_sch_d_capital_gains",
+    "general_business_credit",
+    "energy_efficient_home_improvement_credit",
+    "traditional_ira_contributions",
+    "amt_foreign_tax_credit",
+    "excess_withheld_payroll_tax",
     "savers_credit",
-    "self_employed_health_insurance_ald",
-    "self_employed_pension_contribution_ald",
-    "self_employment_income",
-    "short_term_capital_gains",
-    "social_security",
     "student_loan_interest",
-    "tax_exempt_interest_income",
-    "tax_exempt_pension_income",
-    "taxable_interest_income",
-    "taxable_ira_distributions",
-    "taxable_pension_income",
-    "taxable_unemployment_compensation",
-    "traditional_ira_contributions",
-    "unrecaptured_section_1250_gain",
+    "investment_income_elected_form_4952",
+    "early_withdrawal_penalty",
+    "prior_year_minimum_tax_credit",
+    "farm_rent_income",
+    "qualified_tuition_expenses",
+    "educator_expense",
+    "long_term_capital_gains_on_collectibles",
+    "other_credits",
+    "casualty_loss",
     "unreported_payroll_tax",
-    "w2_wages_from_qualified_business",
+    "recapture_of_investment_credit",
 ]
 
 

diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
@@ -289,8 +289,6 @@ class PUF(Dataset):
     def generate(self):
         from policyengine_us.system import system
 
-        print("Importing PolicyEngine US variable metadata...")
-
         irs_puf = IRS_PUF_2015(require=True)
 
         puf = irs_puf.load("puf")
@@ -300,7 +298,6 @@ def generate(self):
             puf = uprate_puf(puf, 2015, self.time_period)
         elif self.time_period >= 2021:
             puf_2021 = PUF_2021(require=True)
-            print("Creating uprating factors table...")
             uprating = create_policyengine_uprating_factors_table()
             arrays = puf_2021.load_dataset()
             for variable in uprating:
@@ -312,19 +309,15 @@ def generate(self):
                         2021
                     ].values[0]
                     growth = current_index / start_index
-                    print(f"Uprating {variable} by {growth-1:.1%}")
                     arrays[variable] = arrays[variable] * growth
             self.save_dataset(arrays)
             return
 
         puf = puf[puf.MARS != 0]  # Remove aggregate records
 
-        print("Pre-processing PUF...")
         original_recid = puf.RECID.values.copy()
         puf = preprocess_puf(puf)
-        print("Imputing missing PUF demographics...")
         puf = impute_missing_demographics(puf, demographics)
-        print("Imputing PUF pension contributions...")
         puf["pre_tax_contributions"] = impute_pension_contributions_to_puf(
             puf[["employment_income"]]
         )

diff --git a/policyengine_us_data/datasets/puf/uprate_puf.py b/policyengine_us_data/datasets/puf/uprate_puf.py
@@ -137,7 +137,6 @@ def get_growth(variable, from_year, to_year):
 
 
 def uprate_puf(puf, from_year, to_year):
-    print(f"Uprating PUF from {from_year} to {to_year}...")
     puf = puf.copy()
     for variable in SOI_TO_PUF_STRAIGHT_RENAMES:
         growth = get_growth(variable, from_year, to_year)

diff --git a/policyengine_us_data/utils/github.py b/policyengine_us_data/utils/github.py
@@ -1,5 +1,6 @@
 import os
 import requests
+from tqdm import tqdm
 
 auth_headers = {
     "Authorization": f"token {os.environ.get('POLICYENGINE_US_DATA_GITHUB_TOKEN')}",
@@ -63,15 +64,27 @@ def upload(
 ) -> bytes:
     release_id = get_release_id(org, repo, release_tag)
     url = f"https://uploads.github.com/repos/{org}/{repo}/releases/{release_id}/assets?name={file_name}"
-    response = requests.post(
-        url,
-        headers={
-            "Accept": "application/vnd.github.v3+json",
-            "Content-Type": "application/octet-stream",
-            **auth_headers,
-        },
-        data=open(file_path, "rb").read(),
-    )
+
+    file_size = os.path.getsize(file_path)
+    headers = {
+        "Accept": "application/vnd.github.v3+json",
+        "Content-Type": "application/octet-stream",
+        **auth_headers,
+    }
+
+    with open(file_path, "rb") as f:
+        with tqdm(total=file_size, unit="B", unit_scale=True) as pbar:
+            response = requests.post(
+                url,
+                headers=headers,
+                data=f,
+                stream=True,
+                hooks=dict(
+                    response=lambda r, *args, **kwargs: pbar.update(
+                        len(r.content)
+                    )
+                ),
+            )
 
     if response.status_code != 201:
         raise ValueError(

diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
@@ -133,7 +133,6 @@ def build_loss_matrix(dataset: type, time_period):
     from policyengine_us import Microsimulation
 
     sim = Microsimulation(dataset=dataset)
-    sim.macro_cache_read = False
     hh_id = sim.calculate("household_id", map_to="person")
     tax_unit_hh_id = sim.map_result(
         hh_id, "person", "tax_unit", how="value_from_first_person"
@@ -214,6 +213,50 @@ def build_loss_matrix(dataset: type, time_period):
             raise ValueError(f"Missing values for {label}")
         targets_array.append(target)
 
+    # Healthcare spending by age
+
+    healthcare = pd.read_csv(STORAGE_FOLDER / "healthcare_spending.csv")
+
+    for _, row in healthcare.iterrows():
+        age_lower_bound = int(row["age_10_year_lower_bound"])
+        in_age_range = (age >= age_lower_bound) * (age < age_lower_bound + 10)
+        for expense_type in [
+            "health_insurance_premiums_without_medicare_part_b",
+            "over_the_counter_health_expenses",
+            "other_medical_expenses",
+            "medicare_part_b_premiums",
+        ]:
+            label = f"census/{expense_type}/age_{age_lower_bound}_to_{age_lower_bound+9}"
+            value = sim.calculate(expense_type).values
+            loss_matrix[label] = sim.map_result(
+                in_age_range * value, "person", "household"
+            )
+            targets_array.append(row[expense_type])
+
+    # AGI by SPM threshold totals
+
+    spm_threshold_agi = pd.read_csv(STORAGE_FOLDER / "spm_threshold_agi.csv")
+
+    for _, row in spm_threshold_agi.iterrows():
+        spm_unit_agi = sim.calculate(
+            "adjusted_gross_income", map_to="spm_unit"
+        ).values
+        spm_threshold = sim.calculate("spm_unit_spm_threshold").values
+        in_threshold_range = (spm_threshold >= row["lower_spm_threshold"]) * (
+            spm_threshold < row["upper_spm_threshold"]
+        )
+        label = f"census/agi_in_spm_threshold_decile_{int(row['decile'])}"
+        loss_matrix[label] = sim.map_result(
+            in_threshold_range * spm_unit_agi, "spm_unit", "household"
+        )
+        targets_array.append(row["adjusted_gross_income"])
+
+        label = f"census/count_in_spm_threshold_decile_{int(row['decile'])}"
+        loss_matrix[label] = sim.map_result(
+            in_threshold_range, "spm_unit", "household"
+        )
+        targets_array.append(row["count"])
+
     if any(loss_matrix.isna().sum() > 0):
         raise ValueError("Some targets are missing from the loss matrix")