Merge pull request #351 from PSLmodels/resampling
Add tmd/examination/2022/sampling_variability.py logic
martinholmer authored Jan 19, 2025
2 parents 2672efe + e2e9857 commit ab77294
Showing 6 changed files with 47 additions and 108 deletions.
9 changes: 7 additions & 2 deletions README.md
@@ -29,9 +29,14 @@ TMD files with Tax-Calculator.
## Examination results

To assess, review the data examination results that compare federal
agency tax estimates with those generated using the national microdata
files created in each project phase:
agency tax microsimulation estimates for 2023 and 2026 with those
generated using the national microdata files created in each project
phase:
* [phase 1 results](./tmd/examination/results1.md)
* [phase 2 results](./tmd/examination/results2.md)
* [phase 3 results](./tmd/examination/results3.md)
* [phase 4+ results](./tmd/examination/results4.md)

2022 TMD estimates have also been compared with IRS/SOI data on actual
income tax returns for 2022 in [this
document](./tmd/examination/2022/results.md).
6 changes: 3 additions & 3 deletions tmd/examination/2022/bootstrap_sampling.py
@@ -8,7 +8,7 @@

USAGE = "USAGE: python bootstrap_sampling.py tc_dump_output_csv_file_name\n"

SS_FRAC = 1.0
SS_FRAC = 1.00 # 0.1
SS_RNSEED = 902345678

BS_SAMPLES = 1000
@@ -54,7 +54,7 @@ def bootstrap_sampling(outfile):
wght_cv = wght_stdv / wght_mean
print(
f"BS:wght num,mean,stdev,cv(%) = {BS_SAMPLES:4d} "
f"{wght_mean:9.3f} {wght_stdv:7.3f} {100 * wght_cv:6.2f}"
f"{wght_mean:9.3f} {wght_stdv:7.3f} {100 * wght_cv:8.4f}"
)
if BS_CI and BS_SAMPLES == 1000:
print(f"BS:wght median = {wght[499]:9.3f}")
@@ -64,7 +64,7 @@ def bootstrap_sampling(outfile):
itax_cv = itax_stdv / itax_mean
print(
f"BS:itax num,mean,stdev,cv(%) = {BS_SAMPLES:4d} "
f"{itax_mean:9.3f} {itax_stdv:7.3f} {100 * itax_cv:6.2f}"
f"{itax_mean:9.3f} {itax_stdv:7.3f} {100 * itax_cv:8.4f}"
)
if BS_CI and BS_SAMPLES == 1000:
print(f"BS:itax median = {itax[499]:9.3f}")
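For orientation, the statistics printed by bootstrap_sampling.py (bootstrap mean, standard deviation, CV, median, and 95% confidence interval over the sorted replicates) can be sketched as follows. This is a minimal illustration, not the repository script: the dump column names s006 (tabulation weight) and iitax (income tax liability), the scaling to billions, and the bootstrap seed value are assumptions made for this example.

```python
import numpy as np
import pandas as pd

SS_FRAC = 1.00          # subsample fraction of the dump (1.00 = use all rows)
SS_RNSEED = 902345678   # seed used when SS_FRAC < 1.0
BS_SAMPLES = 1000       # number of bootstrap replicates
BS_RNSEED = 123456789   # hypothetical bootstrap seed for this sketch


def bootstrap_itax_stats(dumpfile):
    """Print bootstrap mean, stdev, CV, median, and 95% CI of total itax."""
    vdf = pd.read_csv(dumpfile)
    if SS_FRAC < 1.0:
        vdf = vdf.sample(frac=SS_FRAC, random_state=SS_RNSEED)
    rng = np.random.default_rng(seed=BS_RNSEED)
    itax = np.empty(BS_SAMPLES)
    for bss in range(BS_SAMPLES):
        # draw a same-size resample of dump rows with replacement
        idx = rng.integers(0, len(vdf), size=len(vdf))
        bsdf = vdf.iloc[idx]
        # weighted income-tax total in $ billions (assumed column names)
        itax[bss] = (bsdf["s006"] * bsdf["iitax"]).sum() * 1e-9
    itax = np.sort(itax)
    itax_mean = itax.mean()
    itax_stdv = itax.std()
    itax_cv = itax_stdv / itax_mean
    print(
        f"BS:itax num,mean,stdev,cv(%) = {BS_SAMPLES:4d} "
        f"{itax_mean:9.3f} {itax_stdv:7.3f} {100 * itax_cv:8.4f}"
    )
    if BS_SAMPLES == 1000:
        print(f"BS:itax median = {itax[499]:9.3f}")
        print(f"BS:itax 95%_ci = {itax[24]:9.3f} , {itax[974]:9.3f}")
```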
79 changes: 0 additions & 79 deletions tmd/examination/2022/generate_tmd_results.sh
@@ -41,82 +41,3 @@ rm -f ./*tmd*csv*
rm -f ./tmd*-22-*
rm -f ./*.log
exit 0


# WE HAVE THESE RESULTS ON 2025-01-16:
#
# (taxcalc-dev) 2022% time ./generate_tmd_results.sh
# Generating weights for ak ...
# ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 401.1
# DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=146):
# with REGULARIZATION_DELTA= 1.000000e-09
# low bin ratio high bin ratio bin # cum # bin % cum %
# >= 0.996000, < 1.004000: 146 146 100.00% 100.00%
# MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.998
# MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.001
# Generating weights for mn ...
# ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 240.2
# DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=147):
# with REGULARIZATION_DELTA= 1.000000e-09
# low bin ratio high bin ratio bin # cum # bin % cum %
# >= 0.996000, < 1.004000: 147 147 100.00% 100.00%
# MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.999
# MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.001
# Generating weights for nj ...
# ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 180.3
# DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=147):
# with REGULARIZATION_DELTA= 1.000000e-09
# low bin ratio high bin ratio bin # cum # bin % cum %
# >= 0.996000, < 1.004000: 147 147 100.00% 100.00%
# MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.999
# MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.000
# Generating weights for nm ...
# ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 168.4
# DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=147):
# with REGULARIZATION_DELTA= 1.000000e-09
# low bin ratio high bin ratio bin # cum # bin % cum %
# >= 0.996000, < 1.004000: 147 147 100.00% 100.00%
# MINIMUM VALUE OF TARGET ACT/EXP RATIO = 1.000
# MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.001
# Generating weights for sc ...
# ::loop,delta,misses,exectime(secs): 1 1.000000e-09 0 231.2
# DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=147):
# with REGULARIZATION_DELTA= 1.000000e-09
# low bin ratio high bin ratio bin # cum # bin % cum %
# >= 0.996000, < 1.004000: 147 147 100.00% 100.00%
# MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.999
# MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.000
# Generating weights for va ...
# ::loop,delta,misses,exectime(secs): 1 1.000000e-09 2 456.7
# DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=147):
# with REGULARIZATION_DELTA= 1.000000e-09
# low bin ratio high bin ratio bin # cum # bin % cum %
# >= 0.900000, < 0.990000: 1 1 0.68% 0.68%
# >= 0.990000, < 0.996000: 0 1 0.00% 0.68%
# >= 0.996000, < 1.004000: 145 146 98.64% 99.32%
# >= 1.004000, < 1.010000: 0 146 0.00% 99.32%
# >= 1.010000, < 1.100000: 1 147 0.68% 100.00%
# MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.948
# MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.050
# Generating results for US ...
# 9654.475|420.299|396.303|1114.474|29.909|473.755|14851.081|11842.505|116.717
# 2289.792
# Generating results for ak ...
# 19.843|0.554|0.607|1.911|0.109|0.906|29.261|22.975|0.217
# 3.796
# Generating results for mn ...
# 179.963|6.321|6.362|20.908|0.71|9.72|267.728|212.373|1.392
# 38.722
# Generating results for nj ...
# 346.77|16.023|10.934|32.693|1.049|16.227|506.593|415.249|2.642
# 82.935
# Generating results for nm ...
# 41.203|1.354|2.04|3.184|0.199|2.932|63.316|46.637|0.947
# 7.416
# Generating results for sc ...
# 119.387|3.966|4.309|14.67|0.429|8.407|185.522|141.508|2.133
# 24.706
# Generating results for va ...
# 276.779|10.304|8.814|28.721|0.767|13.378|408.918|327.695|2.597
# 61.405
# ./generate_tmd_results.sh 6396.15s user 93.15s system 310% cpu 34:52.81 total
1 change: 0 additions & 1 deletion tmd/examination/2022/precision.sh
@@ -10,7 +10,6 @@ TMD=../..
cp $TMD/storage/output/tmd*csv* .
gunzip -f tmd.csv.gz
STATES="ak mn nj nm sc va"
STATES=""

# === WEIGHTS ===
for S in $STATES; do
2 changes: 1 addition & 1 deletion tmd/examination/2022/results.md
@@ -26,7 +26,7 @@ details on the TMD and IRS/SOI estimates.

**Sources**:

The 2022 TMD estimates are generated by the
The 2022 TMD estimates are generated using Tax-Calculator by the
[`generate_tmd_results.sh`](./generate_tmd_results.sh) script.

The SOI estimates are taken directly from the "All returns" "Total tax
58 changes: 36 additions & 22 deletions tmd/examination/2022/sampling_variability.py
@@ -1,11 +1,11 @@
"""
Calculate hypothetical TMD-vs-SOI itax percentage differences
using bootstrap sampling methods.
using resampling methods.
"""

import sys
import yaml
import numpy as np
import pandas as pd

BS_SAMPLES = 1000
BS_RNSEED = 192837465
@@ -14,33 +14,47 @@
TMD_CV = 0.0034
SOI_CV = 0.0102

USAGE = "USAGE: python sampling_variability.py ASSUMPTIONS_YAML_FILE_NAME\n"

def sampling_variability():

def sampling_variability(yamlfilename):
"""
High-level logic of the script.
"""
#
with open(yamlfilename, "r", encoding="utf-8") as yamlfile:
assumptions = yaml.safe_load(yamlfile)
assert isinstance(assumptions, dict)

# specify rng and draw samples
rng = np.random.default_rng(seed=BS_RNSEED)
tmd = rng.normal(ITX_MEAN, TMD_CV * ITX_MEAN, BS_SAMPLES)
soi = rng.normal(ITX_MEAN, SOI_CV * ITX_MEAN, BS_SAMPLES)
pctdiff = 100 * (tmd / soi - 1)

# show results
print(f"ITX_MEAN,TMD_CV,SOI_CV = {ITX_MEAN:.1f} {TMD_CV:.4f} {SOI_CV:.4f}")
pd_mean = pctdiff.mean()
pd_stdv = pctdiff.std()
pd_cv = pd_stdv / pd_mean
print(
f"BS:pctdiff num,mean,stdev,cv(%) = {BS_SAMPLES:4d} "
f"{pd_mean:9.3f} {pd_stdv:7.3f} {100 * pd_cv:6.2f}"
)
if BS_SAMPLES == 1000:
pdiff = np.sort(pctdiff)
print(f"BS:pctdiff median = {pdiff[499]:9.3f}")
print(f"BS:pctdiff 95%_ci = {pdiff[24]:9.3f} , {pdiff[974]:9.3f}")
for area, asmp in assumptions.items():
print(f"Generating results for {area} ...")
rng = np.random.default_rng(seed=BS_RNSEED)
mean = asmp["mean"]
cv_tmd = asmp["cv_tmd"]
cv_soi = asmp["cv_soi"]
tmd = rng.normal(mean, cv_tmd * mean, BS_SAMPLES)
soi = rng.normal(mean, cv_soi * mean, BS_SAMPLES)
del rng
pctdiff = 100 * (tmd / soi - 1)
print(f"mean,cv_tmd,cv_soi = {mean:.3f} {cv_tmd:.6f} {cv_soi:.6f}")
pd_mean = pctdiff.mean()
pd_stdv = pctdiff.std()
print(
f"BS:pctdiff num,mean,stdev = {BS_SAMPLES:4d} "
f"{pd_mean:9.3f} {pd_stdv:7.3f}"
)
if BS_SAMPLES == 1000:
pdiff = np.sort(pctdiff)
print(f"BS:pctdiff median = {pdiff[499]:9.3f}")
print(f"BS:pctdiff 95%_ci = {pdiff[24]:9.3f} , {pdiff[974]:9.3f}")

return 0


if __name__ == "__main__":
sys.exit(sampling_variability())
if len(sys.argv) - 1 != 1:
sys.stderr.write("ERROR: one command-line argument not specified\n")
sys.stderr.write(USAGE)
sys.exit(1)
sys.exit(sampling_variability(sys.argv[1]))

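The rewritten sampling_variability.py reads its per-area assumptions from the YAML file named as its only command-line argument. The sketch below shows a hypothetical assumptions file of the shape the loop above expects (a mapping from area code to mean, cv_tmd, and cv_soi entries); the area codes and numeric values are illustrative, not taken from the repository.

```python
# Hypothetical assumptions for sampling_variability.py; every value below is
# illustrative.  The script expects a YAML mapping from area code to a dict
# with "mean", "cv_tmd", and "cv_soi" entries, matching the asmp[...] lookups.
import yaml

assumptions = {
    "us": {"mean": 2290.0, "cv_tmd": 0.0034, "cv_soi": 0.0102},
    "mn": {"mean": 38.7, "cv_tmd": 0.0050, "cv_soi": 0.0150},
}
with open("assumptions.yaml", "w", encoding="utf-8") as yamlfile:
    yaml.safe_dump(assumptions, yamlfile)
# run the script with:  python sampling_variability.py assumptions.yaml
```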