Fix indices

PolicyEngine · Aug 31, 2024 · c0df878
1 parent 96e209c
commit c0df878
Show file tree

Hide file tree

Showing 7 changed files with 26 additions and 28 deletions.
diff --git a/Makefile b/Makefile
@@ -13,6 +13,9 @@ download:
 	python policyengine_us_data/data_storage/download_public_prerequisites.py
 	python policyengine_us_data/data_storage/download_private_prerequisites.py
 
+upload:
+	python policyengine_us_data/data_storage/upload_completed_datasets.py
+
 docker:
 	docker buildx build --platform linux/amd64 . -t policyengine-us-data:latest
 

diff --git a/docs/Dockerfile b/docs/Dockerfile
@@ -1,4 +1,8 @@
 FROM python:latest
 COPY . .
 # Install
+RUN make download
 RUN make install
+RUN make data
+RUN make test
+RUN make upload
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -97,23 +97,6 @@ def add_id_variables(
     cps["person_household_id"] = person.PH_SEQ
     cps["person_family_id"] = person.PH_SEQ * 10 + person.PF_SEQ
 
-    # Add weights
-    # Weights are multiplied by 100 to avoid decimals
-    cps["person_weight"] = person.A_FNLWGT / 1e2
-    cps["family_weight"] = family.FSUP_WGT / 1e2
-
-    # Tax unit weight is the weight of the containing family.
-    family_weight = Series(
-        cps["family_weight"][...], index=cps["family_id"][...]
-    )
-    person_family_id = cps["person_family_id"][...]
-    persons_family_weight = Series(family_weight[person_family_id])
-    cps["tax_unit_weight"] = persons_family_weight.groupby(
-        cps["person_tax_unit_id"][...]
-    ).first()
-
-    cps["spm_unit_weight"] = spm_unit.SPM_WEIGHT / 1e2
-
     cps["household_weight"] = household.HSUP_WGT / 1e2
 
     # Marital units

diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -130,7 +130,8 @@ def generate(self):
         data = {}
 
         for column in df.columns:
-            variable_name, time_period = column.split("__")
+            variable_name = column.split("__")[0]
+            time_period = int(column.split("__")[1])
             data[variable_name] = data.get(variable_name, {})
             data[variable_name][time_period] = df[column].values
 
@@ -140,7 +141,7 @@ def generate(self):
 class EnhancedCPS_2024(EnhancedCPS):
     input_dataset = ExtendedCPS_2024
     start_year = 2024
-    end_year = 2034
+    end_year = 2024
     name = "enhanced_cps_2024"
     label = "Enhanced CPS 2024"
     file_path = STORAGE_FOLDER / "enhanced_cps_2024.h5"

diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -51,7 +51,7 @@
     "self_employment_income",
     "short_term_capital_gains",
     "social_security",
-    "state_and_local_sales_or_income_tax",
+    # "state_and_local_sales_or_income_tax", # Don't impute SALT, or it'll override the computed state taxes.
     "student_loan_interest",
     "tax_exempt_interest_income",
     "tax_exempt_pension_income",
@@ -65,11 +65,15 @@
     "w2_wages_from_qualified_business",
 ]
 
+IMPUTED_VARIABLES = [
+    "employment_income",
+]
+
 
 class ExtendedCPS(Dataset):
     cps: Type[CPS]
     puf: Type[PUF]
-    data_format = Dataset.ARRAYS
+    data_format = Dataset.TIME_PERIOD_ARRAYS
 
     def generate(self):
         from policyengine_us import Microsimulation
@@ -111,7 +115,7 @@ def generate(self):
         for variable in IMPUTED_VARIABLES:
             imputed_dataset[f"{variable}__{self.time_period}"] = y[variable]
 
-        ENTITIES = ("person", "tax_unit", "family", "spm_unit", "household")
+        ENTITIES = ("person", "tax_unit", "marital_unit", "family", "spm_unit", "household")
         for entity in ENTITIES:
             for id_name in [
                 f"{entity}_id__{self.time_period}",
@@ -126,17 +130,20 @@ def generate(self):
         for variable in imputed_dataset.columns:
             if "_weight" in variable:
                 imputed_dataset[variable] = 0
-        original_dataset["data_source"] = "cps"
-        imputed_dataset["data_source"] = "puf_imputed"
+
+        original_dataset["data_source__2024"] = "cps"
+        imputed_dataset["data_source__2024"] = "puf_imputed"
         combined = pd.concat([original_dataset, imputed_dataset]).fillna(0)
         # Sort columns in alphabetical order
         combined = combined.reindex(sorted(combined.columns), axis=1)
 
         data = {}
 
-        for column in combined:
-            variable_name, time_period = column.split("__")
-            data[variable_name] = combined[column].values
+        for column in combined.columns:
+            variable_name = column.split("__")[0]
+            time_period = int(column.split("__")[1])
+            data[variable_name] = data.get(variable_name, {})
+            data[variable_name][time_period] = combined[column].values
 
         self.save_dataset(data)
 

diff --git a/policyengine_us_data/utils/github.py b/policyengine_us_data/utils/github.py
@@ -4,7 +4,6 @@
 auth_headers = {
     "Authorization": f"token {os.environ.get('POLICYENGINE_US_DATA_GITHUB_TOKEN')}",
 }
-print(auth_headers["Authorization"][:13] + "***")
 
 
 def get_asset_url(

diff --git a/setup.py b/setup.py
@@ -23,6 +23,7 @@
             "black",
             "pytest",
             "policyengine_us==1.65",
+            "streamlit",
         ],
     },
 )