Skip to content

Commit

Permalink
Merge pull request #2 from PolicyEngine/ecps
Browse files Browse the repository at this point in the history
Add the Enhanced CPS
  • Loading branch information
nikhilwoodruff authored Sep 2, 2024
2 parents 3b1f4ab + 1637b8e commit 75a7868
Show file tree
Hide file tree
Showing 42 changed files with 2,281 additions and 187 deletions.
21 changes: 0 additions & 21 deletions .github/review_pull_request.py

This file was deleted.

13 changes: 0 additions & 13 deletions .github/upload_evaluation.py

This file was deleted.

35 changes: 20 additions & 15 deletions .github/workflows/pull_request.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,26 +7,31 @@ on:

jobs:
build:
name: Build and test
name: Test
runs-on: ubuntu-latest
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

steps:
- name: Checkout code
uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v2

- name: Install dependencies
uses: actions/setup-python@v5
with:
python-version: 3.12
- name: Install package
run: make install

- name: Download data inputs
run: make download
env:
POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
- name: Build datasets
run: make data
- name: Run tests
run: make test

- name: Run evaluation
run: make evaluate

- name: Add review comment
run: python .github/review_pull_request.py
lint:
runs-on: ubuntu-latest
name: Lint
steps:
- uses: actions/checkout@v4
- name: Check formatting
uses: "lgeiger/black-action@master"
with:
args: ". -l 79 --check"
60 changes: 45 additions & 15 deletions .github/workflows/push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,26 +7,56 @@ on:

jobs:
build:
name: Build and test
name: Test
runs-on: ubuntu-latest
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

steps:
- name: Checkout code
uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v2

- name: Install dependencies
uses: actions/setup-python@v5
with:
python-version: 3.12
- name: Install package
run: make install

- name: Download data inputs
run: make download
env:
POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
- name: Build datasets
run: make data
- name: Run tests
run: make test

- name: Run evaluation
run: make evaluate

- name: Upload evaluation
run: python .github/upload_evaluation.py
- name: Upload completed datasets
run: make upload
env:
POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
lint:
runs-on: ubuntu-latest
name: Lint
steps:
- uses: actions/checkout@v4
- name: Check formatting
uses: "lgeiger/black-action@master"
with:
args: ". -l 79 --check"
publish:
runs-on: ubuntu-latest
name: Publish
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: 3.12
- name: Install package
run: make install
- name: Build package
run: make build
- name: Publish a Python distribution to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
password: ${{ secrets.PYPI }}
skip-existing: true

2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@
**/*.h5
*.ipynb
**/*.csv
!uprating_factors.csv
!uprating_growth_factors.csv
3 changes: 1 addition & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,4 @@ FROM python:latest
COPY . .
# Install
RUN make install
# Run tests
CMD ["make", "test"]
RUN ["make", "data"]
22 changes: 20 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
all: data test

format:
black . -l 79

Expand All @@ -7,8 +9,24 @@ test:
install:
pip install -e .[dev]

download:
python policyengine_us_data/data_storage/download_public_prerequisites.py
python policyengine_us_data/data_storage/download_private_prerequisites.py

upload:
python policyengine_us_data/data_storage/upload_completed_datasets.py

docker:
docker buildx build --platform linux/amd64 . -t policyengine-us-data:latest

evaluate:
python policyengine_us_data/evaluation/summary.py
documentation:
streamlit run docs/Home.py

data:
python policyengine_us_data/datasets/cps/enhanced_cps.py

clean:
rm policyengine_us_data/data_storage/puf_2015.csv
rm policyengine_us_data/data_storage/demographics_2015.csv
build:
python setup.py sdist bdist_wheel
8 changes: 8 additions & 0 deletions docs/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Image for the Streamlit documentation app (entry point: docs/Home.py).
# NOTE(review): the base image tag is unpinned, whereas the CI workflows in
# this repository set up Python 3.12 — confirm whether this should be pinned.
FROM python:latest
COPY . .
# Fetch the prerequisite data files via the Makefile `download` target.
# NOTE(review): the CI workflows run `make install` before `make download`;
# here the order is reversed — confirm the download scripts do not need the
# installed package.
RUN make download
# Install the package (Makefile `install` target).
RUN make install
# Fetch the release datasets listed in docs/download.py.
RUN python docs/download.py
# The port matches the --server.port passed to Streamlit below.
EXPOSE 8080
ENTRYPOINT ["streamlit", "run", "docs/Home.py", "--server.port=8080", "--server.address=0.0.0.0"]
37 changes: 37 additions & 0 deletions docs/Home.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import streamlit as st

# Landing page of the documentation app: title, overview text, and the
# lead-in sentence for the sample-household table rendered further down.
st.title("PolicyEngine-US-Data")

# One-sentence description of the package.
st.write(
"""PolicyEngine-US-Data is a package to create representative microdata for the US, designed for input in the PolicyEngine tax-benefit microsimulation model."""
)

st.subheader("What does this repo do?")

# Summary of the data sources that are combined into the dataset.
st.write(
"""Principally, this package creates a (partly synthetic) dataset of households (with incomes, demographics and more) that describes the U.S. household sector. This dataset synthesises multiple sources of data (the Current Population Survey, the IRS Public Use File, and administrative statistics) to improve upon the accuracy of **any** of them."""
)

st.subheader("What does this dataset look like?")

# Caption for the table displayed at the bottom of this page.
st.write(
"The below table shows an extract of the person records in one household in the dataset."
)


@st.cache_data
def sample_household(person_index: int = 10):
    """Return the person records of one example household.

    Builds a PolicyEngine US ``Microsimulation`` on the Enhanced CPS 2024
    dataset, exports its input dataframe, and keeps the rows whose
    ``person_household_id__2024`` equals that of the row at position
    ``person_index``.

    Args:
        person_index: Zero-based position of the row whose household is
            selected. Defaults to 10, the value that used to be hard-coded.

    Returns:
        The rows of the input dataframe belonging to the selected household.

    Raises:
        IndexError: If ``person_index`` is out of range for the dataframe.
    """
    # Imports are local to the function, so they execute only when the body
    # actually runs rather than at page import time.
    from policyengine_us_data.datasets import EnhancedCPS_2024
    from policyengine_us import Microsimulation

    df = Microsimulation(dataset=EnhancedCPS_2024).to_input_dataframe()

    household_ids = df["person_household_id__2024"]
    household_id = household_ids.values[person_index]
    return df[household_ids == household_id]


# Load the example household's rows (the function is wrapped in
# st.cache_data).
people_in_household = sample_household()

# Display the frame transposed (.T), stretched to the container width.
st.dataframe(people_in_household.T, use_container_width=True)
26 changes: 26 additions & 0 deletions docs/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from policyengine_us_data.utils.github import download
from policyengine_us_data.data_storage import STORAGE_FOLDER

# Files the documentation app needs, as (repository, file name) pairs.
# Each is saved under STORAGE_FOLDER using the same file name.
# NOTE(review): the positional arguments to `download` look like
# (organisation, repository, release tag, file name, local path) — confirm
# against policyengine_us_data.utils.github.
_DOCS_DATASETS = (
    ("policyengine-us-data", "enhanced_cps_2024.h5"),
    ("policyengine-us-data", "cps_2024.h5"),
    ("irs-soi-puf", "puf_2024.h5"),
)

for _repo, _file_name in _DOCS_DATASETS:
    download(
        "PolicyEngine",
        _repo,
        "release",
        _file_name,
        STORAGE_FOLDER / _file_name,
    )
43 changes: 43 additions & 0 deletions docs/pages/Aggregates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import streamlit as st

# Page heading and the caption for the aggregates table rendered below.
st.title("Aggregates")

st.write(
"""The table below shows the totals for calendar year 2024 for the Enhanced CPS dataset variables."""
)


@st.cache_data
def sample_household():
    """Build the table of Enhanced CPS 2024 totals, in billions.

    For every name in ``IMPUTED_VARIABLES`` (from the extended CPS module),
    the variable is calculated with ``map_to="household"``, summed, divided
    by 1e9 and rounded to one decimal place.

    Returns:
        A dataframe indexed by ``"Variable"`` with one column,
        ``"Total ($bn)"``, sorted from largest to smallest total.
    """
    from policyengine_us import Microsimulation
    from policyengine_us_data import EnhancedCPS_2024
    from policyengine_us_data.datasets.cps.extended_cps import (
        IMPUTED_VARIABLES as FINANCE_VARIABLES,
    )
    import pandas as pd

    sim = Microsimulation(dataset=EnhancedCPS_2024)

    # One rounded total per variable, in the same order as the variable list.
    totals_bn = []
    for name in FINANCE_VARIABLES:
        household_sum = sim.calculate(name, map_to="household").sum()
        totals_bn.append(round(household_sum / 1e9, 1))

    table = pd.DataFrame(
        {
            "Variable": FINANCE_VARIABLES,
            "Total ($bn)": totals_bn,
        }
    )
    ranked = table.sort_values("Total ($bn)", ascending=False)
    return ranked.set_index("Variable")


# Build the totals table (the function is wrapped in st.cache_data).
df = sample_household()

# Display the table stretched to the container width.
st.dataframe(df, use_container_width=True)
Loading

0 comments on commit 75a7868

Please sign in to comment.