Skip to content

Commit

Permalink
Merge pull request #7 from PolicyEngine/healthcare-spending
Browse files Browse the repository at this point in the history
Add healthcare spending calibration
  • Loading branch information
nikhilwoodruff authored Sep 6, 2024
2 parents 771bfd9 + bd8173d commit c6daaa9
Show file tree
Hide file tree
Showing 11 changed files with 183 additions and 66 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@
**/*.csv
!uprating_factors.csv
!uprating_growth_factors.csv
!healthcare_spending.csv
!spm_threshold_agi.csv
4 changes: 3 additions & 1 deletion docs/pages/Benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,13 @@ def compare_datasets():
comparison["Abs. Error %"] = (
(comparison["Abs. Error"] / comparison["Actual"].abs())
.replace([np.inf, -np.inf], np.nan)
.fillna(1)
.fillna(0)
)
comparison["Dataset"] = dataset.label
comparison_combined = pd.concat([comparison_combined, comparison])

comparison_combined.to_csv("comparisons.csv", index=False)

return comparison_combined


Expand Down
10 changes: 10 additions & 0 deletions policyengine_us_data/data_storage/healthcare_spending.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
age_10_year_lower_bound,health_insurance_premiums_without_medicare_part_b,over_the_counter_health_expenses,other_medical_expenses,medicare_part_b_premiums,age,age_10_year_lower_bound
0.0,0.0,3139958418.4186344,10581757838.419937,0.0,178907224.9393921,0.0
10.0,1331908850.7960727,3899326144.79529,16161392678.387276,11198509.280605555,590464084.1854401,408983930.26535034
20.0,32361931490.785435,6987581951.171959,23521586513.130955,71263601.57686229,1007420789.8561249,818687466.8069458
30.0,71590362469.06906,9615087479.840654,36626364271.092926,125195737.01158798,1532737442.4855804,1335366065.3704834
40.0,80661760961.92291,10104218376.054054,40816478783.58568,357437542.5703746,1801306563.9217224,1623998071.1169434
50.0,81896637177.70609,11697789643.067505,45846296831.32144,1077750904.7396624,2230327049.8746796,2044125057.2036743
60.0,64221993928.27972,12500423336.761482,49034771200.48631,33075117802.61769,2590424902.998871,2416114812.9629517
70.0,29366086293.545853,8774761672.04829,32298721271.149677,54002252445.046425,2024842778.7351837,1917786555.2562714
80.0,14151276913.693628,4168277057.180005,14749253153.81525,24692726700.12216,1129862392.4155273,1084021779.61792
11 changes: 11 additions & 0 deletions policyengine_us_data/data_storage/spm_threshold_agi.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
decile,lower_spm_threshold,upper_spm_threshold,adjusted_gross_income,count
1,13232.297,16084.154296875,601395945534.2526,14067622.329864502
2,16084.154,18203.76171875,699173775752.6232,14069629.232849121
3,18203.762,19966.45703125,1038943520151.5376,14065266.263519287
4,19966.457,22151.69140625,1284310549943.0337,14072190.912216187
5,22151.691,24156.166015625,1520507632014.9219,14067456.695106506
6,24156.166,27384.509765625,1922194713836.21,14069159.144554138
7,27384.51,32536.736328125,1988726862598.7808,14067486.546066284
8,32536.736,37774.16015625,2236533834141.468,14068930.520233154
9,37774.16,44738.5859375,2540801186386.7207,14069854.290618896
10,44738.586,inf,2856965773326.8867,14069456.2210083
13 changes: 11 additions & 2 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ def generate(self):
# Extrapolate from CPS 2022

cps_2022 = CPS_2022(require=True)
print("Creating uprating factors table...")
uprating = create_policyengine_uprating_factors_table()
arrays = cps_2022.load_dataset()
for variable in uprating.index.unique():
Expand All @@ -41,7 +40,6 @@ def generate(self):
2021
].values[0]
growth = current_index / start_index
print(f"Uprating {variable} by {growth-1:.1%}")
arrays[variable] = arrays[variable] * growth

self.save_dataset(arrays)
Expand All @@ -61,11 +59,22 @@ def generate(self):
add_previous_year_income(self, cps)
add_spm_variables(cps, spm_unit)
add_household_variables(cps, household)
add_rent(cps, person, household)

raw_data.close()
cps.close()


def add_rent(cps: h5py.File, person: DataFrame, household: DataFrame):
    """Write a flat placeholder ``pre_subsidy_rent`` for renting households.

    The rent is attributed once per household, to the first person row
    that appears for that household in ``person``; every other person
    gets 0. Previously every member of a renting household received the
    full amount, so any aggregation over people multiplied the rent by
    household size, contradicting the stated intent of projecting the
    value down to the first person only.

    Args:
        cps: Output store; ``cps["pre_subsidy_rent"]`` is assigned.
        person: Person-level table; ``PH_SEQ`` holds the household
            sequence number each person belongs to.
        household: Household-level table with ``H_SEQ`` (sequence number)
            and ``H_TENURE`` (tenure code; 2 is treated as renting).

    Raises:
        KeyError: If a ``PH_SEQ`` value has no matching ``H_SEQ``.
    """
    # Flat placeholder: 1,300 x 12 (presumably a monthly rent annualised).
    AVERAGE_RENT = 1_300 * 12

    # Look up each person's household tenure via the household sequence id.
    tenure_by_household = household.set_index("H_SEQ").H_TENURE
    person_is_renting = tenure_by_household.loc[person.PH_SEQ].values == 2

    # Only the first person row of each household carries the rent, so the
    # household's rent is counted exactly once.
    is_first_person = (~person.PH_SEQ.duplicated()).values

    cps["pre_subsidy_rent"] = np.where(
        person_is_renting & is_first_person, AVERAGE_RENT, 0
    )


def add_id_variables(
cps: h5py.File,
person: DataFrame,
Expand Down
34 changes: 32 additions & 2 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from policyengine_us_data.datasets.cps.extended_cps import (
ExtendedCPS_2024,
CPS_2019,
CPS_2024,
)
import torch

Expand Down Expand Up @@ -47,10 +48,10 @@ def loss(weights):
worst_val = rel_error[torch.argmax(rel_error)].item()
return rel_error.mean(), worst_name, worst_val

optimizer = torch.optim.Adam([weights], lr=1)
optimizer = torch.optim.Adam([weights], lr=1e-2)
from tqdm import trange

iterator = trange(1_000)
iterator = trange(10_000)
for i in iterator:
optimizer.zero_grad()
l, worst_name, worst_val = loss(torch.exp(weights))
Expand Down Expand Up @@ -133,6 +134,35 @@ def generate(self):
self.save_dataset(data)


class ReweightedCPS_2024(Dataset):
    """CPS 2024 dataset whose household weights are replaced by reweighted ones.

    The input dataset is loaded unchanged except for ``household_weight``,
    which is overwritten with the output of ``reweight`` run against the
    loss matrix and targets built for 2024.
    """

    name = "reweighted_cps_2024"
    label = "Reweighted CPS 2024"
    file_path = STORAGE_FOLDER / "reweighted_cps_2024.h5"
    data_format = Dataset.ARRAYS
    input_dataset = CPS_2024
    time_period = 2024

    def generate(self):
        """Build the dataset and save it via ``save_dataset``."""
        # Imported lazily, inside the method, as in the original code.
        from policyengine_us import Microsimulation

        simulation = Microsimulation(dataset=self.input_dataset)
        arrays = simulation.dataset.load_dataset()

        base_weights = simulation.calculate("household_weight")
        # Additive noise drawn from Normal(mean=1, sd=0.1), one draw per
        # household. NOTE(review): presumably this keeps starting weights
        # away from exactly zero for the optimiser - confirm against
        # ``reweight``. The draw is unseeded, so runs are not reproducible.
        starting_weights = base_weights.values + np.random.normal(
            1, 0.1, len(base_weights)
        )

        # NOTE(review): the calibration year is hard-coded and duplicates
        # ``time_period``; kept as a literal to preserve behaviour.
        calibration_year = 2024
        loss_matrix, targets_array = build_loss_matrix(
            self.input_dataset, calibration_year
        )
        arrays["household_weight"] = reweight(
            starting_weights, loss_matrix, targets_array
        )

        self.save_dataset(arrays)


class EnhancedCPS_2024(EnhancedCPS):
input_dataset = ExtendedCPS_2024
start_year = 2024
Expand Down
91 changes: 48 additions & 43 deletions policyengine_us_data/datasets/cps/extended_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,62 +5,67 @@
from ..puf import *
import pandas as pd

# These are sorted by magnitude.
# First 15 contain 90%.
# First 7 contain 75%.
# If you're trying to debug this part of the code and don't want to wait ages
# to see if something breaks, try limiting to those.
IMPUTED_VARIABLES = [
"alimony_expense",
"alimony_income",
"american_opportunity_credit",
"amt_foreign_tax_credit",
"casualty_loss",
"cdcc_relevant_expenses",
"charitable_cash_donations",
"charitable_non_cash_donations",
"domestic_production_ald",
"early_withdrawal_penalty",
"educator_expense",
"employment_income",
"energy_efficient_home_improvement_credit",
"estate_income",
"excess_withheld_payroll_tax",
"farm_income",
"farm_rent_income",
"foreign_tax_credit",
"general_business_credit",
"health_savings_account_ald",
"partnership_s_corp_income",
"social_security",
"taxable_pension_income",
"interest_deduction",
"investment_income_elected_form_4952",
"tax_exempt_pension_income",
"long_term_capital_gains",
"long_term_capital_gains_on_collectibles",
"misc_deduction",
"miscellaneous_income",
"non_qualified_dividend_income",
"non_sch_d_capital_gains",
"other_credits",
"partnership_s_corp_income",
"pre_tax_contributions",
"prior_year_minimum_tax_credit",
"taxable_ira_distributions",
"self_employment_income",
"w2_wages_from_qualified_business",
"short_term_capital_gains",
"qualified_dividend_income",
"qualified_tuition_expenses",
"charitable_cash_donations",
"self_employed_pension_contribution_ald",
"real_estate_taxes",
"recapture_of_investment_credit",
"unrecaptured_section_1250_gain",
"taxable_unemployment_compensation",
"taxable_interest_income",
"domestic_production_ald",
"self_employed_health_insurance_ald",
"rental_income",
"non_qualified_dividend_income",
"cdcc_relevant_expenses",
"tax_exempt_interest_income",
"salt_refund_income",
"foreign_tax_credit",
"estate_income",
"charitable_non_cash_donations",
"american_opportunity_credit",
"miscellaneous_income",
"alimony_expense",
"farm_income",
"alimony_income",
"health_savings_account_ald",
"non_sch_d_capital_gains",
"general_business_credit",
"energy_efficient_home_improvement_credit",
"traditional_ira_contributions",
"amt_foreign_tax_credit",
"excess_withheld_payroll_tax",
"savers_credit",
"self_employed_health_insurance_ald",
"self_employed_pension_contribution_ald",
"self_employment_income",
"short_term_capital_gains",
"social_security",
"student_loan_interest",
"tax_exempt_interest_income",
"tax_exempt_pension_income",
"taxable_interest_income",
"taxable_ira_distributions",
"taxable_pension_income",
"taxable_unemployment_compensation",
"traditional_ira_contributions",
"unrecaptured_section_1250_gain",
"investment_income_elected_form_4952",
"early_withdrawal_penalty",
"prior_year_minimum_tax_credit",
"farm_rent_income",
"qualified_tuition_expenses",
"educator_expense",
"long_term_capital_gains_on_collectibles",
"other_credits",
"casualty_loss",
"unreported_payroll_tax",
"w2_wages_from_qualified_business",
"recapture_of_investment_credit",
]


Expand Down
7 changes: 0 additions & 7 deletions policyengine_us_data/datasets/puf/puf.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,6 @@ class PUF(Dataset):
def generate(self):
from policyengine_us.system import system

print("Importing PolicyEngine US variable metadata...")

irs_puf = IRS_PUF_2015(require=True)

puf = irs_puf.load("puf")
Expand All @@ -300,7 +298,6 @@ def generate(self):
puf = uprate_puf(puf, 2015, self.time_period)
elif self.time_period >= 2021:
puf_2021 = PUF_2021(require=True)
print("Creating uprating factors table...")
uprating = create_policyengine_uprating_factors_table()
arrays = puf_2021.load_dataset()
for variable in uprating:
Expand All @@ -312,19 +309,15 @@ def generate(self):
2021
].values[0]
growth = current_index / start_index
print(f"Uprating {variable} by {growth-1:.1%}")
arrays[variable] = arrays[variable] * growth
self.save_dataset(arrays)
return

puf = puf[puf.MARS != 0] # Remove aggregate records

print("Pre-processing PUF...")
original_recid = puf.RECID.values.copy()
puf = preprocess_puf(puf)
print("Imputing missing PUF demographics...")
puf = impute_missing_demographics(puf, demographics)
print("Imputing PUF pension contributions...")
puf["pre_tax_contributions"] = impute_pension_contributions_to_puf(
puf[["employment_income"]]
)
Expand Down
1 change: 0 additions & 1 deletion policyengine_us_data/datasets/puf/uprate_puf.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,6 @@ def get_growth(variable, from_year, to_year):


def uprate_puf(puf, from_year, to_year):
print(f"Uprating PUF from {from_year} to {to_year}...")
puf = puf.copy()
for variable in SOI_TO_PUF_STRAIGHT_RENAMES:
growth = get_growth(variable, from_year, to_year)
Expand Down
31 changes: 22 additions & 9 deletions policyengine_us_data/utils/github.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import requests
from tqdm import tqdm

auth_headers = {
"Authorization": f"token {os.environ.get('POLICYENGINE_US_DATA_GITHUB_TOKEN')}",
Expand Down Expand Up @@ -63,15 +64,27 @@ def upload(
) -> bytes:
release_id = get_release_id(org, repo, release_tag)
url = f"https://uploads.github.com/repos/{org}/{repo}/releases/{release_id}/assets?name={file_name}"
response = requests.post(
url,
headers={
"Accept": "application/vnd.github.v3+json",
"Content-Type": "application/octet-stream",
**auth_headers,
},
data=open(file_path, "rb").read(),
)

file_size = os.path.getsize(file_path)
headers = {
"Accept": "application/vnd.github.v3+json",
"Content-Type": "application/octet-stream",
**auth_headers,
}

with open(file_path, "rb") as f:
with tqdm(total=file_size, unit="B", unit_scale=True) as pbar:
response = requests.post(
url,
headers=headers,
data=f,
stream=True,
hooks=dict(
response=lambda r, *args, **kwargs: pbar.update(
len(r.content)
)
),
)

if response.status_code != 201:
raise ValueError(
Expand Down
45 changes: 44 additions & 1 deletion policyengine_us_data/utils/loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ def build_loss_matrix(dataset: type, time_period):
from policyengine_us import Microsimulation

sim = Microsimulation(dataset=dataset)
sim.macro_cache_read = False
hh_id = sim.calculate("household_id", map_to="person")
tax_unit_hh_id = sim.map_result(
hh_id, "person", "tax_unit", how="value_from_first_person"
Expand Down Expand Up @@ -214,6 +213,50 @@ def build_loss_matrix(dataset: type, time_period):
raise ValueError(f"Missing values for {label}")
targets_array.append(target)

# Healthcare spending by age

healthcare = pd.read_csv(STORAGE_FOLDER / "healthcare_spending.csv")

for _, row in healthcare.iterrows():
age_lower_bound = int(row["age_10_year_lower_bound"])
in_age_range = (age >= age_lower_bound) * (age < age_lower_bound + 10)
for expense_type in [
"health_insurance_premiums_without_medicare_part_b",
"over_the_counter_health_expenses",
"other_medical_expenses",
"medicare_part_b_premiums",
]:
label = f"census/{expense_type}/age_{age_lower_bound}_to_{age_lower_bound+9}"
value = sim.calculate(expense_type).values
loss_matrix[label] = sim.map_result(
in_age_range * value, "person", "household"
)
targets_array.append(row[expense_type])

# AGI by SPM threshold totals

spm_threshold_agi = pd.read_csv(STORAGE_FOLDER / "spm_threshold_agi.csv")

for _, row in spm_threshold_agi.iterrows():
spm_unit_agi = sim.calculate(
"adjusted_gross_income", map_to="spm_unit"
).values
spm_threshold = sim.calculate("spm_unit_spm_threshold").values
in_threshold_range = (spm_threshold >= row["lower_spm_threshold"]) * (
spm_threshold < row["upper_spm_threshold"]
)
label = f"census/agi_in_spm_threshold_decile_{int(row['decile'])}"
loss_matrix[label] = sim.map_result(
in_threshold_range * spm_unit_agi, "spm_unit", "household"
)
targets_array.append(row["adjusted_gross_income"])

label = f"census/count_in_spm_threshold_decile_{int(row['decile'])}"
loss_matrix[label] = sim.map_result(
in_threshold_range, "spm_unit", "household"
)
targets_array.append(row["count"])

if any(loss_matrix.isna().sum() > 0):
raise ValueError("Some targets are missing from the loss matrix")

Expand Down

0 comments on commit c6daaa9

Please sign in to comment.