Add ACS, rent and property taxes and 3-year CPS #35

Merged
merged 46 commits on Sep 23, 2024
Changes from 28 commits
Commits (46)
d9146dd
Migrate ACS from policyengine-us
PavelMakarchuk Sep 9, 2024
7fadd29
Merge branch 'main' of https://github.com/PavelMakarchuk/policyengine…
PavelMakarchuk Sep 11, 2024
5065de3
populate acs
PavelMakarchuk Sep 12, 2024
b9e0f22
Update PolicyEngine US data
PavelMakarchuk Sep 12, 2024
68e2e98
Merge branch 'main' of https://github.com/PolicyEngine/policyengine-u…
PavelMakarchuk Sep 12, 2024
74b53bf
format
PavelMakarchuk Sep 12, 2024
0ba70a6
data fix
PavelMakarchuk Sep 13, 2024
380d6b2
test
PavelMakarchuk Sep 13, 2024
c72d813
changelog
PavelMakarchuk Sep 13, 2024
9d2e340
Update PolicyEngine US data
PavelMakarchuk Sep 13, 2024
040ea97
remove extra
PavelMakarchuk Sep 14, 2024
2390120
changelog
PavelMakarchuk Sep 14, 2024
9c8ecd5
Update PolicyEngine US data
PavelMakarchuk Sep 14, 2024
a292087
readme file
PavelMakarchuk Sep 15, 2024
8a8c93f
Merge branch 'main' of https://github.com/PavelMakarchuk/policyengine…
PavelMakarchuk Sep 17, 2024
553f63f
property tax
PavelMakarchuk Sep 17, 2024
96013e9
changelog
PavelMakarchuk Sep 17, 2024
ed627e8
Update PolicyEngine US data
PavelMakarchuk Sep 17, 2024
cd66e84
Merge branch 'main' of https://github.com/PolicyEngine/policyengine-u…
PavelMakarchuk Sep 17, 2024
6d48d19
format
PavelMakarchuk Sep 17, 2024
317de21
changelog
PavelMakarchuk Sep 17, 2024
8914b9e
Pool 3 CPS years
nikhilwoodruff Sep 19, 2024
43e3bb7
Upload ECPS result in PRs
nikhilwoodruff Sep 19, 2024
247230c
Feed into ECPS
nikhilwoodruff Sep 19, 2024
84ac325
Bump version and ECPS file
nikhilwoodruff Sep 19, 2024
824bf8e
Merge branch 'main' of https://github.com/PavelMakarchuk/policyengine…
PavelMakarchuk Sep 19, 2024
0338cb9
changelog
PavelMakarchuk Sep 19, 2024
42fdd24
Move back to old ECPS
nikhilwoodruff Sep 19, 2024
d619ef0
Merge branch 'main' of https://github.com/PolicyEngine/policyengine-u…
PavelMakarchuk Sep 19, 2024
abf512e
init
PavelMakarchuk Sep 19, 2024
33251a9
storage
PavelMakarchuk Sep 19, 2024
8af92c3
Fix imports
nikhilwoodruff Sep 19, 2024
80be6b9
Move versioning back
nikhilwoodruff Sep 20, 2024
1526b47
Merge branch 'main' of https://github.com/PolicyEngine/policyengine-u…
nikhilwoodruff Sep 20, 2024
7edbccc
Add URL for ACS 2022
nikhilwoodruff Sep 20, 2024
95d7980
Add QRF rewrite and full imputations
nikhilwoodruff Sep 22, 2024
9330572
Merge branch 'nikhilwoodruff/issue66' of https://github.com/PolicyEng…
nikhilwoodruff Sep 22, 2024
c35a21c
Add calibration
nikhilwoodruff Sep 22, 2024
5a3f94d
Shift to branch of US
nikhilwoodruff Sep 22, 2024
a23329b
Make optional install
nikhilwoodruff Sep 22, 2024
dcda8bd
Generate ACS before CPS
nikhilwoodruff Sep 22, 2024
502d8c9
What a silly error
nikhilwoodruff Sep 22, 2024
c8e2710
Minor improvements
nikhilwoodruff Sep 23, 2024
7024666
Fix bugs
nikhilwoodruff Sep 23, 2024
54449a2
Adjust QRF to enable single-output predictions
nikhilwoodruff Sep 23, 2024
b67a64f
Fix bug in QRF
nikhilwoodruff Sep 23, 2024
4 changes: 4 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,4 @@
- bump: minor
  changes:
    added:
      - Migrate the ACS from the US-repository.
3 changes: 2 additions & 1 deletion policyengine_us_data/datasets/__init__.py
@@ -15,5 +15,6 @@
     ReweightedCPS_2024,
 )
 from .puf import PUF_2015, PUF_2021, PUF_2024, IRS_PUF_2015
+from .acs import ACS_2022

-DATASETS = [CPS_2022, PUF_2021, CPS_2024, EnhancedCPS_2024]
+DATASETS = [CPS_2022, PUF_2021, CPS_2024, EnhancedCPS_2024, ACS_2022]
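For reviewers trying this branch locally, a minimal usage sketch of the registry (not part of the diff; it assumes each entry in DATASETS is a Dataset subclass whose generate() writes its .h5 file, as the classes in this PR do — note some entries, such as the PUF, need restricted input files and will fail without them):

# Sketch only (hypothetical driver script): builds each registered dataset in turn,
# assuming each class exposes label, file_path and generate() as defined in this repo.
from policyengine_us_data.datasets import DATASETS

for dataset_class in DATASETS:
    dataset = dataset_class()
    print(f"Generating {dataset.label} -> {dataset.file_path}")
    dataset.generate()
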
6 changes: 6 additions & 0 deletions policyengine_us_data/datasets/acs/README.md
@@ -0,0 +1,6 @@
2022 ACS 1 Year Data Dictionary:
https://www2.census.gov/programs-surveys/acs/tech_docs/pums/data_dict/PUMS_Data_Dictionary_2022.pdf
User Guide:
https://www2.census.gov/programs-surveys/acs/tech_docs/pums/2022ACS_PUMS_User_Guide.pdf
PUMS Documentation:
https://www.census.gov/programs-surveys/acs/microdata/documentation.html
2 changes: 2 additions & 0 deletions policyengine_us_data/datasets/acs/__init__.py
@@ -0,0 +1,2 @@
from .acs import *
from .raw_acs import *
101 changes: 101 additions & 0 deletions policyengine_us_data/datasets/acs/acs.py
@@ -0,0 +1,101 @@
import logging
from policyengine_core.data import Dataset
import h5py
from policyengine_us_data.datasets.acs.raw_acs import RawACS
from policyengine_us_data.storage import STORAGE_FOLDER
from pandas import DataFrame
import numpy as np
import os


class ACS(Dataset):
    name = "acs"
    label = "ACS"
    data_format = Dataset.ARRAYS
    time_period = None

    def __init__(self):
        super().__init__()
        self.raw_acs = RawACS()

    def generate(self) -> None:
        """Generates the ACS dataset."""
        if self.time_period is None:
            raise ValueError("time_period must be set in child classes")

        if os.path.exists(self.file_path):
            os.remove(self.file_path)

        if self.time_period not in self.raw_acs.years:
            self.raw_acs.generate(self.time_period)

        raw_data = self.raw_acs.load(self.time_period)
        acs = h5py.File(self.file_path, mode="w")
        person, spm_unit, household = [
            raw_data[entity] for entity in ("person", "spm_unit", "household")
        ]

        self.add_id_variables(acs, person, spm_unit, household)
        self.add_person_variables(acs, person)
        self.add_spm_variables(acs, spm_unit)
        self.add_household_variables(acs, household)

        acs.close()

    @staticmethod
    def add_id_variables(
        acs: h5py.File,
        person: DataFrame,
        spm_unit: DataFrame,
        household: DataFrame,
    ) -> None:
        # Create numeric IDs based on SERIALNO
        person["numeric_id"] = person["SERIALNO"].astype("category").cat.codes
        household["numeric_id"] = (
            household["SERIALNO"].astype("category").cat.codes
        )

        acs["person_id"] = person["numeric_id"] * 100 + person.SPORDER.astype(
            int
        )
        acs["person_spm_unit_id"] = person.SPM_ID
        acs["spm_unit_id"] = spm_unit.index
        acs["tax_unit_id"] = (
            spm_unit.index
        )  # Using SPM unit as proxy for tax unit
        acs["family_id"] = spm_unit.index  # Using SPM unit as proxy for family
        acs["person_household_id"] = person["numeric_id"]
        acs["person_tax_unit_id"] = person.SPM_ID
        acs["person_family_id"] = person.SPM_ID
        acs["household_id"] = household["numeric_id"]
        acs["person_marital_unit_id"] = person["numeric_id"]
        acs["marital_unit_id"] = np.unique(person["numeric_id"])
        acs["person_weight"] = person.PWGTP
        acs["household_weight"] = household.WGTP

    @staticmethod
    def add_person_variables(acs: h5py.File, person: DataFrame) -> None:
        acs["age"] = person.AGEP
        acs["employment_income"] = person.WAGP
        acs["self_employment_income"] = person.SEMP
        acs["total_income"] = person.PINCP

    @staticmethod
    def add_spm_variables(acs: h5py.File, spm_unit: DataFrame) -> None:
        acs["spm_unit_net_income_reported"] = spm_unit.SPM_RESOURCES
        acs["spm_unit_spm_threshold"] = spm_unit.SPM_POVTHRESHOLD

    @staticmethod
    def add_household_variables(acs: h5py.File, household: DataFrame) -> None:
        acs["household_vehicles_owned"] = household.VEH
        acs["state_fips"] = acs["household_state_fips"] = household.ST.astype(
            int
        )


class ACS_2022(ACS):
    name = "acs_2022"
    label = "ACS 2022"
    time_period = 2022
    file_path = STORAGE_FOLDER / "acs_2022.h5"
    url = "release://PolicyEngine/policyengine-us-data/release/acs_2022.h5"
250 changes: 250 additions & 0 deletions policyengine_us_data/datasets/acs/raw_acs.py
@@ -0,0 +1,250 @@
from io import BytesIO
import logging
from typing import List
from zipfile import ZipFile
import pandas as pd
from policyengine_core.data import Dataset
import requests
from tqdm import tqdm
from policyengine_us_data.storage import STORAGE_FOLDER

logging.getLogger().setLevel(logging.INFO)

PERSON_COLUMNS = [
    "SERIALNO",  # Household ID
    "SPORDER",  # Person number within household
    "PWGTP",  # Person weight
    "AGEP",  # Age
    "CIT",  # Citizenship
    "MAR",  # Marital status
    "WAGP",  # Wage/salary
    "SSP",  # Social security income
    "SSIP",  # Supplemental security income
    "SEX",  # Sex
    "SEMP",  # Self-employment income
    "SCHL",  # Educational attainment
    "RETP",  # Retirement income
    "PAP",  # Public assistance income
    "OIP",  # Other income
    "PERNP",  # Total earnings
    "PINCP",  # Total income
    "POVPIP",  # Income-to-poverty line percentage
    "RAC1P",  # Race
]

HOUSEHOLD_COLUMNS = [
    "SERIALNO",  # Household ID
    "PUMA",  # PUMA area code
    "ST",  # State code
    "ADJHSG",  # Adjustment factor for housing dollar amounts
    "ADJINC",  # Adjustment factor for income
    "WGTP",  # Household weight
    "NP",  # Number of persons in household
    "BDSP",  # Number of bedrooms
    "ELEP",  # Electricity monthly cost
    "FULP",  # Fuel monthly cost
    "GASP",  # Gas monthly cost
    "RMSP",  # Number of rooms
    "RNTP",  # Monthly rent
    "TEN",  # Tenure
    "VEH",  # Number of vehicles
    "FINCP",  # Total income
    "GRNTP",  # Gross rent
    "TAXAMT",  # Property taxes
]


class RawACS(Dataset):
    name = "raw_acs"
    label = "Raw ACS"
    data_format = Dataset.TABLES
    years = []  # This will be populated as datasets are generated
    file_path = STORAGE_FOLDER / "raw_acs_{year}.h5"

    @staticmethod
    def file(year: int):
        return STORAGE_FOLDER / f"raw_acs_{year}.h5"

    def generate(self, year: int) -> None:
        year = int(year)
        if year in self.years:
            self.remove(year)

        spm_url = f"https://www2.census.gov/programs-surveys/supplemental-poverty-measure/datasets/spm/spm_{year}_pu.dta"
        person_url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_pus.zip"
        household_url = f"https://www2.census.gov/programs-surveys/acs/data/pums/{year}/1-Year/csv_hus.zip"

        try:
            with pd.HDFStore(self.file(year)) as storage:
                logging.info(f"Downloading household file")
                household = self.process_household_data(
                    household_url, "psam_hus", HOUSEHOLD_COLUMNS
                )
                storage["household"] = household

                logging.info(f"Downloading person file")
                person = self.process_person_data(
                    person_url, "psam_pus", PERSON_COLUMNS
                )
                storage["person"] = person

                logging.info(f"Downloading SPM unit file")
                spm_person = pd.read_stata(spm_url).fillna(0)
                spm_person.columns = spm_person.columns.str.upper()
                self.create_spm_unit_table(storage, spm_person)

            self.years.append(year)
            logging.info(f"Successfully generated Raw ACS data for {year}")
        except Exception as e:
            self.remove(year)
            logging.error(f"Error generating Raw ACS data for {year}: {e}")
            raise e

    @staticmethod
    def process_household_data(
        url: str, prefix: str, columns: List[str]
    ) -> pd.DataFrame:
        req = requests.get(url, stream=True)
        with BytesIO() as f:
            pbar = tqdm()
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    pbar.update(len(chunk))
                    f.write(chunk)
            f.seek(0)
            zf = ZipFile(f)
            logging.info(f"Loading the first half of the household dataset")
            a = pd.read_csv(
                zf.open(prefix + "a.csv"),
                usecols=columns,
                dtype={"SERIALNO": str},
            )
            logging.info(f"Loading the second half of the household dataset")
            b = pd.read_csv(
                zf.open(prefix + "b.csv"),
                usecols=columns,
                dtype={"SERIALNO": str},
            )
            logging.info(f"Concatenating household datasets")
            res = pd.concat([a, b]).fillna(0)
            res.columns = res.columns.str.upper()

            # Ensure correct data types
            res["ST"] = res["ST"].astype(int)

            return res

    @staticmethod
    def process_person_data(
        url: str, prefix: str, columns: List[str]
    ) -> pd.DataFrame:
        req = requests.get(url, stream=True)
        with BytesIO() as f:
            pbar = tqdm()
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    pbar.update(len(chunk))
                    f.write(chunk)
            f.seek(0)
            zf = ZipFile(f)
            logging.info(f"Loading the first half of the person dataset")
            a = pd.read_csv(
                zf.open(prefix + "a.csv"),
                usecols=columns,
                dtype={"SERIALNO": str},
            )
            logging.info(f"Loading the second half of the person dataset")
            b = pd.read_csv(
                zf.open(prefix + "b.csv"),
                usecols=columns,
                dtype={"SERIALNO": str},
            )
            logging.info(f"Concatenating person datasets")
            res = pd.concat([a, b]).fillna(0)
            res.columns = res.columns.str.upper()

            # Ensure correct data types
            res["SPORDER"] = res["SPORDER"].astype(int)

            return res

    @staticmethod
    def create_spm_unit_table(
        storage: pd.HDFStore, person: pd.DataFrame
    ) -> None:
        SPM_UNIT_COLUMNS = [
            "CAPHOUSESUB",
            "CAPWKCCXPNS",
            "CHILDCAREXPNS",
            "EITC",
            "ENGVAL",
            "EQUIVSCALE",
            "FEDTAX",
            "FEDTAXBC",
            "FICA",
            "GEOADJ",
            "MEDXPNS",
            "NUMADULTS",
            "NUMKIDS",
            "NUMPER",
            "POOR",
            "POVTHRESHOLD",
            "RESOURCES",
            "SCHLUNCH",
            "SNAPSUB",
            "STTAX",
            "TENMORTSTATUS",
            "TOTVAL",
            "WCOHABIT",
            "WICVAL",
            "WKXPNS",
            "WUI_LT15",
            "ID",
        ]
        spm_table = (
            person[["SPM_" + column for column in SPM_UNIT_COLUMNS]]
            .groupby(person.SPM_ID)
            .first()
        )

        original_person_table = storage["person"]

        # Ensure SERIALNO is treated as string
        JOIN_COLUMNS = ["SERIALNO", "SPORDER"]
        original_person_table["SERIALNO"] = original_person_table[
            "SERIALNO"
        ].astype(str)
        original_person_table["SPORDER"] = original_person_table[
            "SPORDER"
        ].astype(int)
        person["SERIALNO"] = person["SERIALNO"].astype(str)
        person["SPORDER"] = person["SPORDER"].astype(int)

        # Add SPM_ID from the SPM person table to the original person table.
        combined_person_table = pd.merge(
            original_person_table,
            person[JOIN_COLUMNS + ["SPM_ID"]],
            on=JOIN_COLUMNS,
        )

        storage["person"] = combined_person_table
        storage["spm_unit"] = spm_table

    def load(self, year: int) -> dict:
        if not self.file(year).exists():
            raise FileNotFoundError(
                f"Raw ACS data for {year} not found. Please generate it first."
            )

        with pd.HDFStore(self.file(year), mode="r") as store:
            return {
                "person": store["person"],
                "household": store["household"],
                "spm_unit": store["spm_unit"],
            }

    def remove(self, year: int) -> None:
        if self.file(year).exists():
            self.file(year).unlink()
        if year in self.years:
            self.years.remove(year)
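
For completeness, a hedged sketch of driving RawACS directly (illustrative only; the ACS class above already calls it whenever the requested year is missing). The Census downloads are large, so the first run is slow:

# Sketch only: assumes the RawACS interface defined in this PR and network access
# to www2.census.gov for the 1-Year PUMS zips and the SPM research file.
from policyengine_us_data.datasets.acs.raw_acs import RawACS

raw = RawACS()
raw.generate(2022)  # writes raw_acs_2022.h5 with person, household and spm_unit tables
tables = raw.load(2022)  # dict of DataFrames keyed by "person", "household", "spm_unit"
print(tables["household"][["ST", "RNTP", "TAXAMT"]].head())
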
8 changes: 8 additions & 0 deletions policyengine_us_data/storage/upload_completed_datasets.py
@@ -26,3 +26,11 @@
"puf_2024.h5",
FOLDER / "puf_2024.h5",
)

upload(
"PolicyEngine",
"policyengine-us-data",
"release",
"acs_2022.h5",
FOLDER / "acs_2022.h5",
)