Merge pull request #2 from PolicyEngine/ecps

Add the Enhanced CPS
PolicyEngine · Sep 2, 2024 · 75a7868 · 75a7868
2 parents 3b1f4ab + 1637b8e
commit 75a7868
Show file tree

Hide file tree

Showing 42 changed files with 2,281 additions and 187 deletions.
diff --git a/.github/review_pull_request.py b/.github/review_pull_request.py
diff --git a/.github/upload_evaluation.py b/.github/upload_evaluation.py
diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml
@@ -7,26 +7,34 @@ on:
 
 jobs:
   build:
-    name: Build and test
+    name: Test
     runs-on: ubuntu-latest
-    env:
-      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
     steps:
       - name: Checkout code
-        uses: actions/checkout@v2
-
+        uses: actions/checkout@v4
       - name: Set up Python
-        uses: actions/setup-python@v2
-
-      - name: Install dependencies
+        uses: actions/setup-python@v5
+        with:
+          # Quoted so YAML reads the version as a string, not a float.
+          python-version: "3.12"
+      - name: Install package
         run: make install
-
+      - name: Download data inputs
+        run: make download
+        # The token is exposed to this step only, not to the whole job.
+        env:
+          POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
+      - name: Build datasets
+        run: make data
       - name: Run tests
         run: make test
-
-      - name: Run evaluation
-        run: make evaluate
-
-      - name: Add review comment
-        run: python .github/review_pull_request.py
+  # Separate job: black in --check mode (79-column limit); files are not modified.
+  lint:
+    runs-on: ubuntu-latest
+    name: Lint
+    steps:
+      - uses: actions/checkout@v4
+      - name: Check formatting
+        uses: "lgeiger/black-action@master"
+        with:
+          args: ". -l 79 --check"
diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml
@@ -7,26 +7,59 @@ on:
 
 jobs:
   build:
-    name: Build and test
+    name: Test
     runs-on: ubuntu-latest
-    env:
-      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
     steps:
       - name: Checkout code
-        uses: actions/checkout@v2
-
+        uses: actions/checkout@v4
       - name: Set up Python
-        uses: actions/setup-python@v2
-
-      - name: Install dependencies
+        uses: actions/setup-python@v5
+        with:
+          # Quoted so YAML reads the version as a string, not a float.
+          python-version: "3.12"
+      - name: Install package
         run: make install
-
+      - name: Download data inputs
+        run: make download
+        env:
+          POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
+      - name: Build datasets
+        run: make data
       - name: Run tests
         run: make test
-
-      - name: Run evaluation
-        run: make evaluate
-
-      - name: Upload evaluation
-        run: python .github/upload_evaluation.py
+      - name: Upload completed datasets
+        run: make upload
+        env:
+          POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
+  lint:
+    runs-on: ubuntu-latest
+    name: Lint
+    steps:
+      - uses: actions/checkout@v4
+      - name: Check formatting
+        uses: "lgeiger/black-action@master"
+        with:
+          args: ". -l 79 --check"
+  # NOTE(review): this job has no `needs:`, so it publishes in parallel with
+  # `build` and `lint` rather than after them - confirm that is intended.
+  publish:
+    runs-on: ubuntu-latest
+    name: Publish
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install package
+        run: make install
+      - name: Build package
+        run: make build
+      - name: Publish a Python distribution to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI }}
+          # Do not fail when this version is already on PyPI.
+          skip-existing: true
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,5 @@
 **/*.h5
 *.ipynb
 **/*.csv
+!uprating_factors.csv
+!uprating_growth_factors.csv
diff --git a/Dockerfile b/Dockerfile
@@ -2,5 +2,6 @@ FROM python:latest
 COPY . .
 # Install
 RUN make install
-# Run tests
-CMD ["make", "test"]
+# Build the datasets while the image is built (`make data`).
+# NOTE(review): no `make download` runs before this - confirm inputs are available.
+RUN make data
diff --git a/Makefile b/Makefile
@@ -1,3 +1,10 @@
+# None of these targets produce a file of the same name. Declaring them phony
+# keeps make from skipping one when such a path exists (setup.py leaves a
+# build/ directory behind, which would otherwise make `build` "up to date").
+.PHONY: all format test install download upload docker documentation data clean build
+
+all: data test
+
 format:
 	black . -l 79
 
@@ -7,8 +14,26 @@ test:
 install:
 	pip install -e .[dev]
 
+download:
+	python policyengine_us_data/data_storage/download_public_prerequisites.py
+	python policyengine_us_data/data_storage/download_private_prerequisites.py
+
+upload:
+	python policyengine_us_data/data_storage/upload_completed_datasets.py
+
 docker:
 	docker buildx build --platform linux/amd64 . -t policyengine-us-data:latest
 
-evaluate:
-	python policyengine_us_data/evaluation/summary.py
+documentation:
+	streamlit run docs/Home.py
+
+data:
+	python policyengine_us_data/datasets/cps/enhanced_cps.py
+
+# -f: succeed even when a file has already been removed.
+clean:
+	rm -f policyengine_us_data/data_storage/puf_2015.csv
+	rm -f policyengine_us_data/data_storage/demographics_2015.csv
+
+build:
+	python setup.py sdist bdist_wheel
diff --git a/docs/Dockerfile b/docs/Dockerfile
@@ -0,0 +1,9 @@
+FROM python:latest
+COPY . .
+# Install the package before downloading, the same order the CI workflows use;
+# the download scripts live inside the package being installed.
+RUN make install
+RUN make download
+RUN python docs/download.py
+EXPOSE 8080
+ENTRYPOINT ["streamlit", "run", "docs/Home.py", "--server.port=8080", "--server.address=0.0.0.0"]
diff --git a/docs/Home.py b/docs/Home.py
@@ -0,0 +1,42 @@
+import streamlit as st
+
+st.title("PolicyEngine-US-Data")
+
+st.write(
+    """PolicyEngine-US-Data is a package to create representative microdata for the US, designed for input in the PolicyEngine tax-benefit microsimulation model."""
+)
+
+st.subheader("What does this repo do?")
+
+st.write(
+    """Principally, this package creates a (partly synthetic) dataset of households (with incomes, demographics and more) that describes the U.S. household sector. This dataset synthesises multiple sources of data (the Current Population Survey, the IRS Public Use File, and administrative statistics) to improve upon the accuracy of **any** of them."""
+)
+
+st.subheader("What does this dataset look like?")
+
+st.write(
+    "The below table shows an extract of the person records in one household in the dataset."
+)
+
+
+@st.cache_data
+def sample_household():
+    """Return the rows of one household from the Enhanced CPS input data.
+
+    Builds the 2024 Enhanced CPS input dataframe and keeps every row that
+    shares the household ID found at row position 10. Decorated with
+    ``st.cache_data`` so Streamlit can reuse the result across reruns.
+    """
+    from policyengine_us_data.datasets import EnhancedCPS_2024
+    from policyengine_us import Microsimulation
+
+    df = Microsimulation(dataset=EnhancedCPS_2024).to_input_dataframe()
+
+    household_id = df.person_household_id__2024.values[10]
+    people_in_household = df[df.person_household_id__2024 == household_id]
+    return people_in_household
+
+
+people_in_household = sample_household()
+
+st.dataframe(people_in_household.T, use_container_width=True)
diff --git a/docs/download.py b/docs/download.py
@@ -0,0 +1,20 @@
+from policyengine_us_data.utils.github import download
+from policyengine_us_data.data_storage import STORAGE_FOLDER
+
+# One entry per file to fetch; each is saved under the same file name inside
+# STORAGE_FOLDER. Presumably (GitHub organisation, repository, file name) -
+# confirm against policyengine_us_data.utils.github.download.
+RELEASE_FILES = [
+    ("PolicyEngine", "policyengine-us-data", "enhanced_cps_2024.h5"),
+    ("PolicyEngine", "policyengine-us-data", "cps_2024.h5"),
+    ("PolicyEngine", "irs-soi-puf", "puf_2024.h5"),
+]
+
+for organisation, repository, file_name in RELEASE_FILES:
+    download(
+        organisation,
+        repository,
+        "release",
+        file_name,
+        STORAGE_FOLDER / file_name,
+    )
diff --git a/docs/pages/Aggregates.py b/docs/pages/Aggregates.py
@@ -0,0 +1,40 @@
+import streamlit as st
+
+st.title("Aggregates")
+
+st.write(
+    """The table below shows the totals for calendar year 2024 for the Enhanced CPS dataset variables."""
+)
+
+
+@st.cache_data
+def variable_totals():
+    """Total each imputed Enhanced CPS variable, expressed in billions.
+
+    Returns a dataframe indexed by variable name with a single
+    "Total ($bn)" column, sorted from the largest total to the smallest.
+    """
+    from policyengine_us import Microsimulation
+    from policyengine_us_data import EnhancedCPS_2024
+    from policyengine_us_data.datasets.cps.extended_cps import (
+        IMPUTED_VARIABLES as FINANCE_VARIABLES,
+    )
+    import pandas as pd
+
+    sim = Microsimulation(dataset=EnhancedCPS_2024)
+
+    # Household-mapped sum of each variable, divided by 1e9, to 1 decimal.
+    totals_bn = [
+        round(sim.calculate(variable, map_to="household").sum() / 1e9, 1)
+        for variable in FINANCE_VARIABLES
+    ]
+    table = pd.DataFrame(
+        {"Variable": FINANCE_VARIABLES, "Total ($bn)": totals_bn}
+    )
+    ranked = table.sort_values("Total ($bn)", ascending=False)
+    return ranked.set_index("Variable")
+
+
+df = variable_totals()
+
+st.dataframe(df, use_container_width=True)