Merge pull request #351 from PSLmodels/resampling
Add tmd/examination/2022/sampling_variability.py logic
martinholmer authored Jan 19, 2025
2 parents 2672efe + e2e9857 commit ab77294
Showing 6 changed files with 47 additions and 108 deletions.
9 changes: 7 additions & 2 deletions README.md
@@ -29,9 +29,14 @@ TMD files with Tax-Calculator.
## Examination results

To assess, review the data examination results that compare federal
agency tax estimates with those generated using the national microdata
files created in each project phase:
agency tax microsimulation estimates for 2023 and 2026 with those
generated using the national microdata files created in each project
phase:
* [phase 1 results](./tmd/examination/results1.md)
* [phase 2 results](./tmd/examination/results2.md)
* [phase 3 results](./tmd/examination/results3.md)
* [phase 4+ results](./tmd/examination/results4.md)

2022 TMD estimates have also been compared with IRS/SOI data on actual
income tax returns for 2022 in [this
document](./tmd/examination/2022/results.md).
6 changes: 3 additions & 3 deletions tmd/examination/2022/bootstrap_sampling.py
@@ -8,7 +8,7 @@

USAGE = "USAGE: python bootstrap_sampling.py tc_dump_output_csv_file_name\n"

SS_FRAC = 1.0
SS_FRAC = 1.00 # 0.1
SS_RNSEED = 902345678

BS_SAMPLES = 1000
@@ -54,7 +54,7 @@ def bootstrap_sampling(outfile):
wght_cv = wght_stdv / wght_mean
print(
f"BS:wght num,mean,stdev,cv(%) = {BS_SAMPLES:4d} "
f"{wght_mean:9.3f} {wght_stdv:7.3f} {100 * wght_cv:6.2f}"
f"{wght_mean:9.3f} {wght_stdv:7.3f} {100 * wght_cv:8.4f}"
)
if BS_CI and BS_SAMPLES == 1000:
print(f"BS:wght median = {wght[499]:9.3f}")
@@ -64,7 +64,7 @@ def bootstrap_sampling(outfile):
itax_cv = itax_stdv / itax_mean
print(
f"BS:itax num,mean,stdev,cv(%) = {BS_SAMPLES:4d} "
f"{itax_mean:9.3f} {itax_stdv:7.3f} {100 * itax_cv:6.2f}"
f"{itax_mean:9.3f} {itax_stdv:7.3f} {100 * itax_cv:8.4f}"
)
if BS_CI and BS_SAMPLES == 1000:
print(f"BS:itax median = {itax[499]:9.3f}")
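For orientation, the statistics printed by bootstrap_sampling.py (bootstrap mean, standard deviation, CV, median, and 95% confidence interval over the sorted replicates) can be sketched as follows. This is a minimal illustration, not the repository script: the dump column names s006 (tabulation weight) and iitax (income tax liability), the scaling to billions, and the bootstrap seed value are assumptions made for this example.

```python
import numpy as np
import pandas as pd

SS_FRAC = 1.00          # subsample fraction of the dump (1.00 = use all rows)
SS_RNSEED = 902345678   # seed used when SS_FRAC < 1.0
BS_SAMPLES = 1000       # number of bootstrap replicates
BS_RNSEED = 123456789   # hypothetical bootstrap seed for this sketch


def bootstrap_itax_stats(dumpfile):
    """Print bootstrap mean, stdev, CV, median, and 95% CI of total itax."""
    vdf = pd.read_csv(dumpfile)
    if SS_FRAC < 1.0:
        vdf = vdf.sample(frac=SS_FRAC, random_state=SS_RNSEED)
    rng = np.random.default_rng(seed=BS_RNSEED)
    itax = np.empty(BS_SAMPLES)
    for bss in range(BS_SAMPLES):
        # draw a same-size resample of dump rows with replacement
        idx = rng.integers(0, len(vdf), size=len(vdf))
        bsdf = vdf.iloc[idx]
        # weighted income-tax total in $ billions (assumed column names)
        itax[bss] = (bsdf["s006"] * bsdf["iitax"]).sum() * 1e-9
    itax = np.sort(itax)
    itax_mean = itax.mean()
    itax_stdv = itax.std()
    itax_cv = itax_stdv / itax_mean
    print(
        f"BS:itax num,mean,stdev,cv(%) = {BS_SAMPLES:4d} "
        f"{itax_mean:9.3f} {itax_stdv:7.3f} {100 * itax_cv:8.4f}"
    )
    if BS_SAMPLES == 1000:
        print(f"BS:itax median = {itax[499]:9.3f}")
        print(f"BS:itax 95%_ci = {itax[24]:9.3f} , {itax[974]:9.3f}")
```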
79 changes: 0 additions & 79 deletions tmd/examination/2022/generate_tmd_results.sh
@@ -41,82 +41,3 @@ rm -f ./*tmd*csv*
rm -f ./tmd*-22-*
rm -f ./*.log
exit 0


# WE HAVE THESE RESULTS ON 2025-01-16:
#
# (taxcalc-dev) 2022% time ./generate_tmd_results.sh
# Generating weights for ak ...
# ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 401.1
# DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=146):
# with REGULARIZATION_DELTA= 1.000000e-09
# low bin ratio high bin ratio bin # cum # bin % cum %
# >= 0.996000, < 1.004000: 146 146 100.00% 100.00%
# MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.998
# MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.001
# Generating weights for mn ...
# ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 240.2
# DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=147):
# with REGULARIZATION_DELTA= 1.000000e-09
# low bin ratio high bin ratio bin # cum # bin % cum %
# >= 0.996000, < 1.004000: 147 147 100.00% 100.00%
# MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.999
# MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.001
# Generating weights for nj ...
# ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 180.3
# DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=147):
# with REGULARIZATION_DELTA= 1.000000e-09
# low bin ratio high bin ratio bin # cum # bin % cum %
# >= 0.996000, < 1.004000: 147 147 100.00% 100.00%
# MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.999
# MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.000
# Generating weights for nm ...
# ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 168.4
# DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=147):
# with REGULARIZATION_DELTA= 1.000000e-09
# low bin ratio high bin ratio bin # cum # bin % cum %
# >= 0.996000, < 1.004000: 147 147 100.00% 100.00%
# MINIMUM VALUE OF TARGET ACT/EXP RATIO = 1.000
# MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.001
# Generating weights for sc ...
# ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 231.2
# DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=147):
# with REGULARIZATION_DELTA= 1.000000e-09
# low bin ratio high bin ratio bin # cum # bin % cum %
# >= 0.996000, < 1.004000: 147 147 100.00% 100.00%
# MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.999
# MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.000
# Generating weights for va ...
# ::loop,delta,misses,exectime(secs): 1 1.000000e-09 2 456.7
# DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=147):
# with REGULARIZATION_DELTA= 1.000000e-09
# low bin ratio high bin ratio bin # cum # bin % cum %
# >= 0.900000, < 0.990000: 1 1 0.68% 0.68%
# >= 0.990000, < 0.996000: 0 1 0.00% 0.68%
# >= 0.996000, < 1.004000: 145 146 98.64% 99.32%
# >= 1.004000, < 1.010000: 0 146 0.00% 99.32%
# >= 1.010000, < 1.100000: 1 147 0.68% 100.00%
# MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.948
# MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.050
# Generating results for US ...
# 9654.475|420.299|396.303|1114.474|29.909|473.755|14851.081|11842.505|116.717
# 2289.792
# Generating results for ak ...
# 19.843|0.554|0.607|1.911|0.109|0.906|29.261|22.975|0.217
# 3.796
# Generating results for mn ...
# 179.963|6.321|6.362|20.908|0.71|9.72|267.728|212.373|1.392
# 38.722
# Generating results for nj ...
# 346.77|16.023|10.934|32.693|1.049|16.227|506.593|415.249|2.642
# 82.935
# Generating results for nm ...
# 41.203|1.354|2.04|3.184|0.199|2.932|63.316|46.637|0.947
# 7.416
# Generating results for sc ...
# 119.387|3.966|4.309|14.67|0.429|8.407|185.522|141.508|2.133
# 24.706
# Generating results for va ...
# 276.779|10.304|8.814|28.721|0.767|13.378|408.918|327.695|2.597
# 61.405
# ./generate_tmd_results.sh 6396.15s user 93.15s system 310% cpu 34:52.81 total
1 change: 0 additions & 1 deletion tmd/examination/2022/precision.sh
@@ -10,7 +10,6 @@ TMD=../..
cp $TMD/storage/output/tmd*csv* .
gunzip -f tmd.csv.gz
STATES="ak mn nj nm sc va"
STATES=""

# === WEIGHTS ===
for S in $STATES; do
2 changes: 1 addition & 1 deletion tmd/examination/2022/results.md
@@ -26,7 +26,7 @@ details on the TMD and IRS/SOI estimates.

**Sources**:

The 2022 TMD estimates are generated by the
The 2022 TMD estimates are generated using Tax-Calculator by the
[`generate_tmd_results.sh`](./generate_tmd_results.sh) script.

The SOI estimates are taken directly from the "All returns" "Total tax
58 changes: 36 additions & 22 deletions tmd/examination/2022/sampling_variability.py
@@ -1,11 +1,11 @@
"""
Calculate hypothetical TMD-vs-SOI itax percentage differences
using bootstrap sampling methods.
using resampling methods.
"""

import sys
import yaml
import numpy as np
import pandas as pd

BS_SAMPLES = 1000
BS_RNSEED = 192837465
@@ -14,33 +14,47 @@
TMD_CV = 0.0034
SOI_CV = 0.0102

USAGE = "USAGE: python sampling_variability.py ASSUMPTIONS_YAML_FILE_NAME\n"

def sampling_variability():

def sampling_variability(yamlfilename):
"""
High-level logic of the script.
"""
#
with open(yamlfilename, "r", encoding="utf-8") as yamlfile:
assumptions = yaml.safe_load(yamlfile)
assert isinstance(assumptions, dict)

# specify rng and draw samples
rng = np.random.default_rng(seed=BS_RNSEED)
tmd = rng.normal(ITX_MEAN, TMD_CV * ITX_MEAN, BS_SAMPLES)
soi = rng.normal(ITX_MEAN, SOI_CV * ITX_MEAN, BS_SAMPLES)
pctdiff = 100 * (tmd / soi - 1)

# show results
print(f"ITX_MEAN,TMD_CV,SOI_CV = {ITX_MEAN:.1f} {TMD_CV:.4f} {SOI_CV:.4f}")
pd_mean = pctdiff.mean()
pd_stdv = pctdiff.std()
pd_cv = pd_stdv / pd_mean
print(
f"BS:pctdiff num,mean,stdev,cv(%) = {BS_SAMPLES:4d} "
f"{pd_mean:9.3f} {pd_stdv:7.3f} {100 * pd_cv:6.2f}"
)
if BS_SAMPLES == 1000:
pdiff = np.sort(pctdiff)
print(f"BS:pctdiff median = {pdiff[499]:9.3f}")
print(f"BS:pctdiff 95%_ci = {pdiff[24]:9.3f} , {pdiff[974]:9.3f}")
for area, asmp in assumptions.items():
print(f"Generating results for {area} ...")
rng = np.random.default_rng(seed=BS_RNSEED)
mean = asmp["mean"]
cv_tmd = asmp["cv_tmd"]
cv_soi = asmp["cv_soi"]
tmd = rng.normal(mean, cv_tmd * mean, BS_SAMPLES)
soi = rng.normal(mean, cv_soi * mean, BS_SAMPLES)
del rng
pctdiff = 100 * (tmd / soi - 1)
print(f"mean,cv_tmd,cv_soi = {mean:.3f} {cv_tmd:.6f} {cv_soi:.6f}")
pd_mean = pctdiff.mean()
pd_stdv = pctdiff.std()
print(
f"BS:pctdiff num,mean,stdev = {BS_SAMPLES:4d} "
f"{pd_mean:9.3f} {pd_stdv:7.3f}"
)
if BS_SAMPLES == 1000:
pdiff = np.sort(pctdiff)
print(f"BS:pctdiff median = {pdiff[499]:9.3f}")
print(f"BS:pctdiff 95%_ci = {pdiff[24]:9.3f} , {pdiff[974]:9.3f}")

return 0


if __name__ == "__main__":
sys.exit(sampling_variability())
if len(sys.argv) - 1 != 1:
sys.stderr.write("ERROR: one command-line argument not specified\n")
sys.stderr.write(USAGE)
sys.exit(1)
sys.exit(sampling_variability(sys.argv[1]))

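The rewritten sampling_variability.py reads its per-area assumptions from the YAML file named as its only command-line argument. The sketch below shows a hypothetical assumptions file of the shape the loop above expects (a mapping from area code to mean, cv_tmd, and cv_soi entries); the area codes and numeric values are illustrative, not taken from the repository.

```python
# Hypothetical assumptions for sampling_variability.py; every value below is
# illustrative.  The script expects a YAML mapping from area code to a dict
# with "mean", "cv_tmd", and "cv_soi" entries, matching the asmp[...] lookups.
import yaml

assumptions = {
    "us": {"mean": 2290.0, "cv_tmd": 0.0034, "cv_soi": 0.0102},
    "mn": {"mean": 38.7, "cv_tmd": 0.0050, "cv_soi": 0.0150},
}
with open("assumptions.yaml", "w", encoding="utf-8") as yamlfile:
    yaml.safe_dump(assumptions, yamlfile)
# run the script with:  python sampling_variability.py assumptions.yaml
```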