Skip to content

Commit

Permalink
Fix indices
Browse files Browse the repository at this point in the history
  • Loading branch information
nikhilwoodruff committed Aug 31, 2024
1 parent 96e209c commit c0df878
Show file tree
Hide file tree
Showing 7 changed files with 26 additions and 28 deletions.
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ download:
python policyengine_us_data/data_storage/download_public_prerequisites.py
python policyengine_us_data/data_storage/download_private_prerequisites.py

upload:
python policyengine_us_data/data_storage/upload_completed_datasets.py

docker:
docker buildx build --platform linux/amd64 . -t policyengine-us-data:latest

Expand Down
4 changes: 4 additions & 0 deletions docs/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
FROM python:latest
COPY . .
# Install
RUN make download
RUN make install
RUN make data
RUN make test
RUN make upload
17 changes: 0 additions & 17 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,23 +97,6 @@ def add_id_variables(
cps["person_household_id"] = person.PH_SEQ
cps["person_family_id"] = person.PH_SEQ * 10 + person.PF_SEQ

# Add weights
# Raw CPS weights are stored multiplied by 100 to avoid decimals, so divide by 100 to recover them
cps["person_weight"] = person.A_FNLWGT / 1e2
cps["family_weight"] = family.FSUP_WGT / 1e2

# Tax unit weight is the weight of the containing family.
family_weight = Series(
cps["family_weight"][...], index=cps["family_id"][...]
)
person_family_id = cps["person_family_id"][...]
persons_family_weight = Series(family_weight[person_family_id])
cps["tax_unit_weight"] = persons_family_weight.groupby(
cps["person_tax_unit_id"][...]
).first()

cps["spm_unit_weight"] = spm_unit.SPM_WEIGHT / 1e2

cps["household_weight"] = household.HSUP_WGT / 1e2

# Marital units
Expand Down
5 changes: 3 additions & 2 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ def generate(self):
data = {}

for column in df.columns:
variable_name, time_period = column.split("__")
variable_name = column.split("__")[0]
time_period = int(column.split("__")[1])
data[variable_name] = data.get(variable_name, {})
data[variable_name][time_period] = df[column].values

Expand All @@ -140,7 +141,7 @@ def generate(self):
class EnhancedCPS_2024(EnhancedCPS):
input_dataset = ExtendedCPS_2024
start_year = 2024
end_year = 2034
end_year = 2024
name = "enhanced_cps_2024"
label = "Enhanced CPS 2024"
file_path = STORAGE_FOLDER / "enhanced_cps_2024.h5"
Expand Down
23 changes: 15 additions & 8 deletions policyengine_us_data/datasets/cps/extended_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"self_employment_income",
"short_term_capital_gains",
"social_security",
"state_and_local_sales_or_income_tax",
# "state_and_local_sales_or_income_tax", # Don't impute SALT, or it'll override the computed state taxes.
"student_loan_interest",
"tax_exempt_interest_income",
"tax_exempt_pension_income",
Expand All @@ -65,11 +65,15 @@
"w2_wages_from_qualified_business",
]

IMPUTED_VARIABLES = [
"employment_income",
]


class ExtendedCPS(Dataset):
cps: Type[CPS]
puf: Type[PUF]
data_format = Dataset.ARRAYS
data_format = Dataset.TIME_PERIOD_ARRAYS

def generate(self):
from policyengine_us import Microsimulation
Expand Down Expand Up @@ -111,7 +115,7 @@ def generate(self):
for variable in IMPUTED_VARIABLES:
imputed_dataset[f"{variable}__{self.time_period}"] = y[variable]

ENTITIES = ("person", "tax_unit", "family", "spm_unit", "household")
ENTITIES = ("person", "tax_unit", "marital_unit", "family", "spm_unit", "household")
for entity in ENTITIES:
for id_name in [
f"{entity}_id__{self.time_period}",
Expand All @@ -126,17 +130,20 @@ def generate(self):
for variable in imputed_dataset.columns:
if "_weight" in variable:
imputed_dataset[variable] = 0
original_dataset["data_source"] = "cps"
imputed_dataset["data_source"] = "puf_imputed"

original_dataset["data_source__2024"] = "cps"
imputed_dataset["data_source__2024"] = "puf_imputed"
combined = pd.concat([original_dataset, imputed_dataset]).fillna(0)
# Sort columns in alphabetical order
combined = combined.reindex(sorted(combined.columns), axis=1)

data = {}

for column in combined:
variable_name, time_period = column.split("__")
data[variable_name] = combined[column].values
for column in combined.columns:
variable_name = column.split("__")[0]
time_period = int(column.split("__")[1])
data[variable_name] = data.get(variable_name, {})
data[variable_name][time_period] = combined[column].values

self.save_dataset(data)

Expand Down
1 change: 0 additions & 1 deletion policyengine_us_data/utils/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
auth_headers = {
"Authorization": f"token {os.environ.get('POLICYENGINE_US_DATA_GITHUB_TOKEN')}",
}
print(auth_headers["Authorization"][:13] + "***")


def get_asset_url(
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"black",
"pytest",
"policyengine_us==1.65",
"streamlit",
],
},
)

0 comments on commit c0df878

Please sign in to comment.