diff --git a/.coveragerc b/.coveragerc index eb6d5b460..3eb7ba134 100644 --- a/.coveragerc +++ b/.coveragerc @@ -4,6 +4,6 @@ omit = taxcalc/functions.py taxcalc/*.json taxcalc/cli/* - taxcalc/dropq/* + taxcalc/tbi/* taxcalc/tests/* taxcalc/validation/* diff --git a/RELEASES.md b/RELEASES.md index abd9df8e2..ce20d76a5 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -16,6 +16,9 @@ Release 0.12.0 on 2017-??-?? - Remove arrays_not_lists argument from read_json_param_objects [[#1568](https://github.com/open-source-economics/Tax-Calculator/pull/1568) by Martin Holmer] +- Rename dropq as tbi (taxbrain interface) and refactor run_nth_year_*_model functions + [[#1577](https://github.com/open-source-economics/Tax-Calculator/pull/1577) + by Martin Holmer] **New Features** - Add Calculator.reform_documentation that generates plain text documentation of a reform diff --git a/read-the-docs/source/public_api.rst b/read-the-docs/source/public_api.rst index 2144f88e5..972690213 100644 --- a/read-the-docs/source/public_api.rst +++ b/read-the-docs/source/public_api.rst @@ -52,12 +52,6 @@ taxcalc.decorators .. automodule:: taxcalc.decorators :members: -taxcalc.dropq.dropq -------------------- - -.. automodule:: taxcalc.dropq.dropq - :members: - taxcalc.functions ----------------- @@ -108,6 +102,12 @@ taxcalc.TaxCalcIO .. autoclass:: taxcalc.TaxCalcIO :members: +taxcalc.tbi.tbi +------------------- + +.. automodule:: taxcalc.tbi.tbi + :members: + taxcalc.utils ------------- diff --git a/setup.py b/setup.py index 1b1195c8a..a075625f1 100755 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ 'cmdclass': cmdclass, 'license': 'MIT', 'packages': ['taxcalc', 'taxcalc.filings', 'taxcalc.filings.forms', - 'taxcalc.dropq', 'taxcalc.cli'], + 'taxcalc.tbi', 'taxcalc.cli'], 'include_package_data': True, 'name': 'taxcalc', 'install_requires': ['numpy', 'pandas'], diff --git a/taxcalc/__init__.py b/taxcalc/__init__.py index 7791680bc..f27d23013 100755 --- a/taxcalc/__init__.py +++ b/taxcalc/__init__.py @@ -9,7 +9,7 @@ from taxcalc.taxcalcio import * from taxcalc.utils import * from taxcalc.macro_elasticity import * -from taxcalc.dropq import * +from taxcalc.tbi import * from taxcalc.cli import * from taxcalc._version import get_versions diff --git a/taxcalc/dropq/__init__.py b/taxcalc/dropq/__init__.py deleted file mode 100644 index f116dc920..000000000 --- a/taxcalc/dropq/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from taxcalc.dropq.dropq import (run_nth_year_tax_calc_model, - run_nth_year_gdp_elast_model, - create_json_table, - reform_warnings_errors) diff --git a/taxcalc/macro_elasticity.py b/taxcalc/macro_elasticity.py index 547cac120..8959a0937 100644 --- a/taxcalc/macro_elasticity.py +++ b/taxcalc/macro_elasticity.py @@ -6,7 +6,7 @@ # pylint --disable=locally-disabled macro_elasticity.py -def proportional_change_gdp(calc1, calc2, elasticity=0.0): +def proportional_change_gdp(calc1, calc2, elasticity): ''' This function harnesses econometric estimates of the historic relationship between tax policy and the macroeconomy to predict the effect of tax diff --git a/taxcalc/records.py b/taxcalc/records.py index d406f0b78..b5c8ffbaf 100644 --- a/taxcalc/records.py +++ b/taxcalc/records.py @@ -173,7 +173,7 @@ def __init__(self, @staticmethod def cps_constructor(data=None, exact_calculations=False, - growfactors=Growfactors()): + gfactors=Growfactors()): """ Static method returns a Records object instantiated with CPS input data. 
This works in a analogous way to Records(), which @@ -188,7 +188,7 @@ def cps_constructor(data=None, data = os.path.join(Records.CUR_PATH, 'cps.csv.gz') return Records(data=data, exact_calculations=exact_calculations, - gfactors=growfactors, + gfactors=gfactors, weights=Records.CPS_WEIGHTS_FILENAME, adjust_ratios=Records.CPS_RATIOS_FILENAME, start_year=CPSCSV_YEAR) @@ -213,7 +213,7 @@ def increment_year(self): Also, does extrapolation, reweighting, adjusting for new current year. """ self._current_year += 1 - # apply variable extrapolation growfactors + # apply variable extrapolation grow factors if self.gfactors is not None: self._blowup(self.current_year) # apply variable adjustment ratios diff --git a/taxcalc/taxcalcio.py b/taxcalc/taxcalcio.py index c0c8ef3c7..f985f9b02 100644 --- a/taxcalc/taxcalcio.py +++ b/taxcalc/taxcalcio.py @@ -228,11 +228,11 @@ def init(self, input_data, tax_year, reform, assump, if aging_input_data: if self.cps_input_data: recs = Records.cps_constructor( - growfactors=gfactors_ref, + gfactors=gfactors_ref, exact_calculations=exact_calculations ) recs_clp = Records.cps_constructor( - growfactors=gfactors_clp, + gfactors=gfactors_clp, exact_calculations=exact_calculations ) else: # if not cps_input_data diff --git a/taxcalc/tbi/__init__.py b/taxcalc/tbi/__init__.py new file mode 100644 index 000000000..b46660300 --- /dev/null +++ b/taxcalc/tbi/__init__.py @@ -0,0 +1,4 @@ +from taxcalc.tbi.tbi import (run_nth_year_tax_calc_model, + run_nth_year_gdp_elast_model, + create_dict_table, + reform_warnings_errors) diff --git a/taxcalc/dropq/dropq.py b/taxcalc/tbi/tbi.py similarity index 70% rename from taxcalc/dropq/dropq.py rename to taxcalc/tbi/tbi.py index a3816a6b4..31422e491 100644 --- a/taxcalc/dropq/dropq.py +++ b/taxcalc/tbi/tbi.py @@ -1,24 +1,25 @@ """ -The dropq functions are used by TaxBrain to call Tax-Calculator in order -to maintain the privacy of the IRS-SOI PUF data being used by TaxBrain. -This is done by "fuzzing" reform results for several randomly selected +The tbi functions are used by TaxBrain to call Tax-Calculator in order +to do distributed processing of TaxBrain runs and in order to maintain +the privacy of the IRS-SOI PUF data being used by TaxBrain. Maintaining +privacy is done by "fuzzing" reform results for several randomly selected filing units in each table cell. The filing units randomly selected differ for each policy reform and the "fuzzing" involves replacing the post-reform tax results for the selected units with their pre-reform tax results. """ # CODING-STYLE CHECKS: -# pep8 --ignore=E402 dropq.py -# pylint --disable=locally-disabled dropq.py +# pep8 --ignore=E402 tbi.py +# pylint --disable=locally-disabled tbi.py from __future__ import print_function import time import numpy as np import pandas as pd -from taxcalc.dropq.dropq_utils import (dropq_calculate, - random_seed, - dropq_summary, - AGGR_ROW_NAMES) +from taxcalc.tbi.tbi_utils import (calculate, + random_seed, + summary, + AGGR_ROW_NAMES) from taxcalc import (results, DIST_TABLE_LABELS, proportional_change_gdp, Growdiff, Growfactors, Policy) @@ -45,7 +46,7 @@ def reform_warnings_errors(user_mods): """ The reform_warnings_errors function assumes user_mods is a dictionary - returned by the Calculator.read_json_parameter_files() function. + returned by the Calculator.read_json_param_objects() function. 
     This function returns a dictionary containing two STR:STR pairs:
     {'warnings': '', 'errors': ''}
@@ -81,22 +82,27 @@ def reform_warnings_errors(user_mods):
 def run_nth_year_tax_calc_model(year_n, start_year,
-                                taxrec_df, user_mods,
-                                return_json=True):
+                                use_puf_not_cps,
+                                use_full_sample,
+                                user_mods,
+                                return_dict=True):
     """
-    The run_nth_year_tax_calc_model function assumes user_mods is a
-    dictionary returned by the Calculator.read_json_parameter_files()
-    function with an extra key:value pair that is specified as
-    'gdp_elasticity': {'value': }.
+    The run_nth_year_tax_calc_model function assumes user_mods is a dictionary
+    returned by the Calculator.read_json_param_objects() function.
+    Setting use_puf_not_cps=True implies use puf.csv input file;
+    otherwise, use cps.csv input file.
+    Setting use_full_sample=False implies use sub-sample of input file;
+    otherwise, use the complete sample.
     """
-    # pylint: disable=too-many-locals
+    # pylint: disable=too-many-arguments,too-many-locals
+    start_time = time.time()
     # create calc1 and calc2 calculated for year_n and mask
-    (calc1, calc2, mask) = dropq_calculate(year_n, start_year,
-                                           taxrec_df, user_mods,
-                                           behavior_allowed=True,
-                                           mask_computed=True)
+    (calc1, calc2, mask) = calculate(year_n, start_year,
+                                     use_puf_not_cps, use_full_sample,
+                                     user_mods,
+                                     behavior_allowed=True)
     # extract raw results from calc1 and calc2
     rawres1 = results(calc1.records)
@@ -107,11 +113,8 @@ def run_nth_year_tax_calc_model(year_n, start_year,
     print('seed={}'.format(seed))
     np.random.seed(seed)  # pylint: disable=no-member
-    # construct dropq summary results from raw results
-    summ = dropq_summary(rawres1, rawres2, mask)
-
-    elapsed_time = time.time() - start_time
-    print('elapsed time for this run: ', elapsed_time)
+    # construct TaxBrain summary results from raw results
+    summ = summary(rawres1, rawres2, mask)
     def append_year(pdf):
         """
@@ -121,10 +124,12 @@ def append_year(pdf):
         return pdf
     # optionally return non-JSON results
-    if not return_json:
+    if not return_dict:
         res = dict()
         for tbl in summ:
             res[tbl] = append_year(summ[tbl])
+        elapsed_time = time.time() - start_time
+        print('elapsed time for this run: {:.1f}'.format(elapsed_time))
         return res
     # optionally construct JSON results tables for year n
@@ -147,41 +152,50 @@ def append_year(pdf):
     res = dict()
     for tbl in summ:
         if 'aggr' in tbl:
-            res_table = create_json_table(summ[tbl],
+            res_table = create_dict_table(summ[tbl],
                                           row_names=info[tbl]['row_names'])
             res[tbl] = dict((k, v[0]) for k, v in res_table.items())
         else:
-            res[tbl] = create_json_table(summ[tbl],
+            res[tbl] = create_dict_table(summ[tbl],
                                          row_names=info[tbl]['row_names'],
                                          column_types=info[tbl]['col_types'])
+    elapsed_time = time.time() - start_time
+    print('elapsed time for this run: {:.1f}'.format(elapsed_time))
     return res
 def run_nth_year_gdp_elast_model(year_n, start_year,
-                                 taxrec_df, user_mods,
-                                 return_json=True):
+                                 use_puf_not_cps,
+                                 use_full_sample,
+                                 user_mods,
+                                 gdp_elasticity,
+                                 return_dict=True):
     """
-    The run_nth_year_gdp_elast_model function assumes user_mods is a
-    dictionary returned by the Calculator.read_json_parameter_files()
-    function with an extra key:value pair that is specified as
-    'gdp_elasticity': {'value': }.
+    The run_nth_year_gdp_elast_model function assumes user_mods is a dictionary
+    returned by the Calculator.read_json_param_objects() function.
+    Setting use_puf_not_cps=True implies use puf.csv input file;
+    otherwise, use cps.csv input file.
+    Setting use_full_sample=False implies use sub-sample of input file;
+    otherwise, use the complete sample.
     """
+    # pylint: disable=too-many-arguments
+
     # create calc1 and calc2 calculated for year_n
-    (calc1, calc2, _) = dropq_calculate(year_n, start_year,
-                                        taxrec_df, user_mods,
-                                        behavior_allowed=False,
-                                        mask_computed=False)
+    (calc1, calc2, _) = calculate(year_n, start_year,
+                                  use_puf_not_cps,
+                                  use_full_sample,
+                                  user_mods,
+                                  behavior_allowed=False)
-    # compute GDP effect given assumed gdp elasticity
-    gdp_elasticity = user_mods['gdp_elasticity']['value']
+    # compute GDP effect given specified gdp_elasticity
     gdp_effect = proportional_change_gdp(calc1, calc2, gdp_elasticity)
     # return gdp_effect results
-    if return_json:
+    if return_dict:
         gdp_df = pd.DataFrame(data=[gdp_effect], columns=['col0'])
         gdp_elast_names_n = [x + '_' + str(year_n)
                              for x in GDP_ELAST_ROW_NAMES]
-        gdp_elast_total = create_json_table(gdp_df,
+        gdp_elast_total = create_dict_table(gdp_df,
                                             row_names=gdp_elast_names_n,
                                             num_decimals=5)
         gdp_elast_total = dict((k, v[0]) for k, v in gdp_elast_total.items())
@@ -190,10 +204,10 @@ def run_nth_year_gdp_elast_model(year_n, start_year,
     return gdp_effect
-def create_json_table(dframe, row_names=None, column_types=None,
+def create_dict_table(dframe, row_names=None, column_types=None,
                       num_decimals=2):
     """
-    Create and return dictionary with JSON-like contents from specified dframe.
+    Create and return dictionary with JSON-like content from specified dframe.
     """
     # embedded formatted_string function
     def formatted_string(val, _type, num_decimals):
@@ -216,7 +230,7 @@ def formatted_string(val, _type, num_decimals):
         except ValueError:
             # try making it a string - good luck!
             return str(val)
-    # high-level create_json_table function logic
+    # high-level create_dict_table function logic
     out = dict()
     if row_names is None:
         row_names = [str(x) for x in list(dframe.index)]
diff --git a/taxcalc/dropq/dropq_utils.py b/taxcalc/tbi/tbi_utils.py
similarity index 76%
rename from taxcalc/dropq/dropq_utils.py
rename to taxcalc/tbi/tbi_utils.py
index 4ceed6ab0..306611e32 100644
--- a/taxcalc/dropq/dropq_utils.py
+++ b/taxcalc/tbi/tbi_utils.py
@@ -1,10 +1,13 @@
 """
-Private utility functions used only by public functions in the dropq.py file.
+Private utility functions used only by public functions in the tbi.py file.
""" # CODING-STYLE CHECKS: -# pep8 --ignore=E402 dropq_utils.py -# pylint --disable=locally-disabled dropq_utils.py +# pep8 --ignore=E402 tbi_utils.py +# pylint --disable=locally-disabled tbi_utils.py +from __future__ import print_function +import os +import time import copy import hashlib import numpy as np @@ -14,7 +17,7 @@ from taxcalc.utils import (add_income_bins, add_quantile_bins, results, create_difference_table, create_distribution_table, STATS_COLUMNS, DIST_TABLE_COLUMNS, - WEBAPP_INCOME_BINS) + WEBAPP_INCOME_BINS, read_egg_csv) def check_years(start_year, year_n): @@ -41,30 +44,31 @@ def check_user_mods(user_mods): raise ValueError('user_mods is not a dictionary') actual_keys = set(list(user_mods.keys())) expected_keys = set(['policy', 'consumption', 'behavior', - 'growdiff_baseline', 'growdiff_response', - 'gdp_elasticity']) - missing_keys = expected_keys - actual_keys - if len(missing_keys) > 0: - raise ValueError('user_mods has missing keys: {}'.format(missing_keys)) - extra_keys = actual_keys - expected_keys - if len(extra_keys) > 0: - raise ValueError('user_mods has extra keys: {}'.format(extra_keys)) - - -def dropq_calculate(year_n, start_year, - taxrec_df, user_mods, - behavior_allowed, mask_computed): + 'growdiff_baseline', 'growdiff_response']) + if actual_keys != expected_keys: + msg = 'actual user_mod keys not equal to expected keys\n' + msg += ' actual: {}\n'.format(actual_keys) + msg += ' expect: {}'.format(expected_keys) + raise ValueError(msg) + + +def calculate(year_n, start_year, + use_puf_not_cps, + use_full_sample, + user_mods, + behavior_allowed): """ - The dropq_calculate function assumes specified user_mods is - a dictionary returned by the Calculator.read_json_parameter_files() - function with an extra key:value pair that is specified as - 'gdp_elasticity': {'value': }. + The calculate function assumes the specified user_mods is a dictionary + returned by the Calculator.read_json_param_objects() function. The function returns (calc1, calc2, mask) where calc1 is pre-reform Calculator object calculated for year_n, calc2 is post-reform Calculator object calculated for year_n, and - mask is boolean array if compute_mask=True or None otherwise + mask is boolean array marking records with reform-induced iitax diffs + Set behavior_allowed to False when generating static results or + set behavior_allowed to True when generating dynamic results. 
""" - # pylint: disable=too-many-arguments,too-many-locals,too-many-statements + # pylint: disable=too-many-arguments,too-many-locals + # pylint: disable=too-many-branches,too-many-statements check_years(start_year, year_n) check_user_mods(user_mods) @@ -89,9 +93,47 @@ def dropq_calculate(year_n, start_year, growdiff_baseline.apply_to(growfactors_post) growdiff_response.apply_to(growfactors_post) - # create pre-reform Calculator instance using PUF input data & weights - recs1 = Records(data=copy.deepcopy(taxrec_df), - gfactors=growfactors_pre) + # create sample pd.DataFrame from specified input file and sampling scheme + stime = time.time() + tbi_path = os.path.abspath(os.path.dirname(__file__)) + if use_puf_not_cps: + # first try TaxBrain deployment path + input_path = 'puf.csv.gz' + if not os.path.isfile(input_path): + # otherwise try local Tax-Calculator deployment path + input_path = os.path.join(tbi_path, '..', '..', 'puf.csv') + sampling_frac = 0.05 + sampling_seed = 180 + else: # if using cps input not puf input + # first try Tax-Calculator code path + input_path = os.path.join(tbi_path, '..', 'cps.csv.gz') + if not os.path.isfile(input_path): + # otherwise try taxcalc package path + input_path = None + full_sample = read_egg_csv('cps.csv.gz') # pragma: no cover + sampling_frac = 0.05 # TODO: using same as for puf for now + sampling_seed = 180 # TODO: using same as for puf for now + if input_path: + full_sample = pd.read_csv(input_path) + if use_full_sample: + sample = full_sample + else: + sample = full_sample.sample( # pylint: disable=no-member + frac=sampling_frac, + random_state=sampling_seed + ) + if use_puf_not_cps: + print('puf-read-time= {:.1f}'.format(time.time() - stime)) + else: + print('cps-read-time= {:.1f}'.format(time.time() - stime)) + + # create pre-reform Calculator instance + if use_puf_not_cps: + recs1 = Records(data=copy.deepcopy(sample), + gfactors=growfactors_pre) + else: + recs1 = Records.cps_constructor(data=copy.deepcopy(sample), + gfactors=growfactors_pre) policy1 = Policy(gfactors=growfactors_pre) calc1 = Calculator(policy=policy1, records=recs1, consumption=consump) while calc1.current_year < start_year: @@ -99,11 +141,11 @@ def dropq_calculate(year_n, start_year, calc1.calc_all() assert calc1.current_year == start_year - # optionally compute mask - if mask_computed: - # create pre-reform Calculator instance with extra income using - # PUF input data & weights - recs1p = Records(data=copy.deepcopy(taxrec_df), + # compute mask array + res1 = results(calc1.records) + if use_puf_not_cps: + # create pre-reform Calculator instance with extra income + recs1p = Records(data=copy.deepcopy(sample), gfactors=growfactors_pre) # add one dollar to the income of each filing unit to determine # which filing units undergo a resulting change in tax liability @@ -120,13 +162,14 @@ def dropq_calculate(year_n, start_year, # compute mask showing which of the calc1 and calc1p results differ; # mask is true if a filing unit's income tax liability changed after # a dollar was added to the filing unit's wage and salary income - res1 = results(calc1.records) res1p = results(calc1p.records) mask = np.logical_not( # pylint: disable=no-member np.isclose(res1.iitax, res1p.iitax, atol=0.001, rtol=0.0) ) - else: - mask = None + assert np.any(mask) + else: # if use_cps_not_cps is False + # indicate that no fuzzing of reform results is required + mask = np.zeros(res1.shape[0], dtype=np.int8) # specify Behavior instance behv = Behavior() @@ -143,9 +186,13 @@ def dropq_calculate(year_n, 
start_year, msg = 'A behavior RESPONSE IS NOT ALLOWED' raise ValueError(msg) - # create post-reform Calculator instance using PUF input data & weights - recs2 = Records(data=copy.deepcopy(taxrec_df), - gfactors=growfactors_post) + # create post-reform Calculator instance + if use_puf_not_cps: + recs2 = Records(data=copy.deepcopy(sample), + gfactors=growfactors_post) + else: + recs2 = Records.cps_constructor(data=copy.deepcopy(sample), + gfactors=growfactors_post) policy2 = Policy(gfactors=growfactors_post) policy_reform = user_mods['policy'] policy2.implement_reform(policy_reform) @@ -206,7 +253,7 @@ def random_seed_from_subdict(subdict): return seed % np.iinfo(np.uint32).max # pylint: disable=no-member -NUM_TO_FUZZ = 3 +NUM_TO_FUZZ = 3 # when using dropq algorithm on puf.csv results def chooser(agg): @@ -227,16 +274,17 @@ def chooser(agg): msg = ('Not enough differences in income tax when adding ' 'one dollar for chunk with name: {}') raise ValueError(msg.format(agg.name)) - # mark the records chosen to be fuzzed + # mark the records chosen to be fuzzed (ans=0) ans = [1] * len(agg) for idx in choices: ans[idx] = 0 return ans -def fuzz_df2_records(df1, df2, mask): +def create_results_columns(df1, df2, mask): """ - Modify df2 by adding random fuzz for data privacy. + Create columns in df2 results dataframe and possibly + modify df2 results by adding random fuzz for data privacy. Parameters ---------- @@ -247,24 +295,27 @@ def fuzz_df2_records(df1, df2, mask): contains results for the reform plan mask: boolean numpy array - contains info about whether or not each row might be fuzzed + contains info about whether or not units have reform-induced tax diffs + (if mask contains all False values, then no results fuzzing is done) Returns ------- - fuzzed df2: Pandas DataFrame + expanded and possibly fuzzed df2: Pandas DataFrame Notes ----- - This function groups both DataFrames based on the web application's + When doing the fuzzing for puf.csv results, this + function groups both DataFrames based on the web application's income groupings (both decile and income bins), and then randomly selects NUM_TO_FUZZ records to fuzz within each bin. The fuzzing involves overwriting df2 columns in cols_to_fuzz with df1 values. """ # nested function that does the fuzzing - def fuzz(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz): + def create(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz, do_fuzzing): """ - Fuzz some df2 records in each bin defined by bin_type and imeasure. - The fuzzed records have their post-reform tax results (in df2) + Create additional df2 columns. If do_fuzzing is True, also + fuzz some df2 records in each bin defined by bin_type and imeasure + with the fuzzed records having their post-reform tax results (in df2) set to their pre-reform tax results (in df1). 
""" # pylint: disable=too-many-arguments @@ -276,41 +327,51 @@ def fuzz(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz): else: df2 = add_quantile_bins(df2, imeasure, 1) gdf2 = df2.groupby('bins') - df2['nofuzz'] = gdf2['mask'].transform(chooser) + if do_fuzzing: + df2['nofuzz'] = gdf2['mask'].transform(chooser) + else: # never do any results fuzzing + df2['nofuzz'] = np.ones(df2.shape[0], dtype=np.int8) for col in cols_to_fuzz: df2[col + suffix] = (df2[col] * df2['nofuzz'] - df1[col] * df2['nofuzz'] + df1[col]) - # main logic of fuzz_df2_records + # main logic of create_results_columns function skips = set(['num_returns_ItemDed', 'num_returns_StandardDed', 'num_returns_AMT', 's006']) - columns_to_fuzz = (set(DIST_TABLE_COLUMNS) | set(STATS_COLUMNS)) - skips - df2['mask'] = mask - # always use expanded income in df1 baseline to groupby into bins + columns_to_create = (set(DIST_TABLE_COLUMNS) | + set(STATS_COLUMNS)) - skips + do_fuzzing = np.any(mask) + if do_fuzzing: + df2['mask'] = mask df2['expanded_income_baseline'] = df1['expanded_income'] - fuzz(df1, df2, 'dec', 'expanded_income_baseline', '_xdec', columns_to_fuzz) - fuzz(df1, df2, 'bin', 'expanded_income_baseline', '_xbin', columns_to_fuzz) - fuzz(df1, df2, 'agg', 'expanded_income_baseline', '_agg', columns_to_fuzz) + create(df1, df2, 'dec', 'expanded_income_baseline', '_xdec', + columns_to_create, do_fuzzing) + create(df1, df2, 'bin', 'expanded_income_baseline', '_xbin', + columns_to_create, do_fuzzing) + create(df1, df2, 'agg', 'expanded_income_baseline', '_agg', + columns_to_create, do_fuzzing) df2['c00100_baseline'] = df1['c00100'] # c00100 is AGI - fuzz(df1, df2, 'dec', 'c00100_baseline', '_adec', columns_to_fuzz) - fuzz(df1, df2, 'bin', 'c00100_baseline', '_abin', columns_to_fuzz) + create(df1, df2, 'dec', 'c00100_baseline', '_adec', + columns_to_create, do_fuzzing) + create(df1, df2, 'bin', 'c00100_baseline', '_abin', + columns_to_create, do_fuzzing) return df2 AGGR_ROW_NAMES = ['ind_tax', 'payroll_tax', 'combined_tax'] -def dropq_summary(df1, df2, mask): +def summary(df1, df2, mask): """ df1 contains raw results for baseline plan df2 contains raw results for reform plan - mask is the boolean array specifying which records might be fuzzed + mask is the boolean array specifying records with reform-induced tax diffs returns dictionary of summary results DataFrames """ # pylint: disable=too-many-statements,too-many-locals - df2 = fuzz_df2_records(df1, df2, mask) + df2 = create_results_columns(df1, df2, mask) summ = dict() diff --git a/taxcalc/tests/test_macro_elasticity.py b/taxcalc/tests/test_macro_elasticity.py index b5dd930da..b4b15ed28 100644 --- a/taxcalc/tests/test_macro_elasticity.py +++ b/taxcalc/tests/test_macro_elasticity.py @@ -9,5 +9,9 @@ def test_proportional_change_gdp(cps_subsample): reform = {2013: {'_II_em': [0.0]}} # reform increases taxes and MTRs pol2.implement_reform(reform) calc2 = Calculator(policy=pol2, records=rec2) - gdp_diff = proportional_change_gdp(calc1, calc2, elasticity=0.36) - assert gdp_diff < 0. 
# higher MTRs imply negative GDP effect + calc1.advance_to_year(2014) + calc2.advance_to_year(2014) + gdp_pchg = 100.0 * proportional_change_gdp(calc1, calc2, elasticity=0.36) + exp_pchg = -0.6 # higher MTRs imply negative expected GDP percent change + abs_diff_pchg = abs(gdp_pchg - exp_pchg) + assert abs_diff_pchg < 0.05 diff --git a/taxcalc/tests/test_dropq.py b/taxcalc/tests/test_tbi.py similarity index 71% rename from taxcalc/tests/test_dropq.py rename to taxcalc/tests/test_tbi.py index 85b696d4d..c84be8360 100644 --- a/taxcalc/tests/test_dropq.py +++ b/taxcalc/tests/test_tbi.py @@ -1,12 +1,11 @@ """ -test_dropq.py uses only PUF input data because the dropq algorithm -is designed to work exclusively with private IRS-SOI PUF input data. +Test functions in taxcalc/tbi directory using both puf.csv and cps.csv input. """ import numpy as np import pandas as pd import pytest -from taxcalc.dropq.dropq_utils import * -from taxcalc.dropq import * +from taxcalc.tbi.tbi_utils import * +from taxcalc.tbi import * from taxcalc import (Policy, Records, Calculator, multiyear_diagnostic_table, results) @@ -28,8 +27,6 @@ }, 'growdiff_response': { }, - 'gdp_elasticity': { - } } @@ -61,30 +58,45 @@ def test_check_user_mods_errors(): @pytest.mark.requires_pufcsv -def test_run_nth_year_value_errors(puf_subsample): +def test_run_nth_year_value_errors(): usermods = USER_MODS + # test for growdiff_response not allowed error usermods['growdiff_response'] = {2018: {'_AINTS': [0.02]}} with pytest.raises(ValueError): - run_nth_year_gdp_elast_model(1, 2013, puf_subsample, usermods, False) + run_nth_year_gdp_elast_model(1, 2013, + use_puf_not_cps=True, + use_full_sample=False, + user_mods=usermods, + gdp_elasticity=0.36, + return_dict=False) usermods['growdiff_response'] = dict() + # test for behavior not allowed error with pytest.raises(ValueError): - run_nth_year_gdp_elast_model(1, 2013, puf_subsample, usermods, False) + run_nth_year_gdp_elast_model(1, 2013, + use_puf_not_cps=True, + use_full_sample=False, + user_mods=usermods, + gdp_elasticity=0.36, + return_dict=False) @pytest.mark.requires_pufcsv -@pytest.mark.parametrize('resjson', [True, False]) -def test_run_tax_calc_model(puf_subsample, resjson): - res = run_nth_year_tax_calc_model(2, 2016, puf_subsample, USER_MODS, - return_json=resjson) +@pytest.mark.parametrize('resdict', [True, False]) +def test_run_tax_calc_model(resdict): + res = run_nth_year_tax_calc_model(2, 2016, + use_puf_not_cps=resdict, + use_full_sample=False, + user_mods=USER_MODS, + return_dict=resdict) assert isinstance(res, dict) dump = False # set to True in order to dump returned results and fail test for tbl in sorted(res.keys()): - if resjson: + if resdict: assert isinstance(res[tbl], dict) else: assert isinstance(res[tbl], pd.DataFrame) if dump: - if resjson: + if resdict: cols = sorted(res[tbl].keys()) else: cols = sorted(list(res[tbl])) @@ -95,14 +107,17 @@ def test_run_tax_calc_model(puf_subsample, resjson): @pytest.mark.requires_pufcsv -@pytest.mark.parametrize('resjson', [True, False]) -def test_run_gdp_elast_model(puf_subsample, resjson): +@pytest.mark.parametrize('resdict', [True, False]) +def test_run_gdp_elast_model(resdict): usermods = USER_MODS usermods['behavior'] = dict() - usermods['gdp_elasticity'] = {'value': 0.36} - res = run_nth_year_gdp_elast_model(2, 2016, puf_subsample, usermods, - return_json=resjson) - if resjson: + res = run_nth_year_gdp_elast_model(2, 2016, + use_puf_not_cps=True, + use_full_sample=False, + user_mods=usermods, + gdp_elasticity=0.36, + 
return_dict=resdict) + if resdict: assert isinstance(res, dict) else: assert isinstance(res, float) @@ -134,11 +149,11 @@ def test_chooser_error(): chooser(dframe['zeros']) -def test_create_json_table(): +def test_create_dict_table(): # test correct usage dframe = pd.DataFrame(data=[[1., 2, 3], [4, 5, 6], [7, 8, 9]], columns=['a', 'b', 'c']) - ans = create_json_table(dframe) + ans = create_dict_table(dframe) exp = {'0': ['1.00', '2', '3'], '1': ['4.00', '5', '6'], '2': ['7.00', '8', '9']} @@ -147,7 +162,7 @@ def test_create_json_table(): dframe = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=['a', 'b', 'c'], dtype='i2') with pytest.raises(NotImplementedError): - create_json_table(dframe) + create_dict_table(dframe) @pytest.mark.requires_pufcsv @@ -166,7 +181,6 @@ def test_with_pufcsv(puf_fullsample): usermods['behavior'] = {} usermods['growdiff_baseline'] = {} usermods['growdiff_response'] = {} - usermods['gdp_elasticity'] = {} seed = random_seed(usermods) assert seed == 1574318062 # create a Policy object (pol) containing reform policy parameters @@ -183,19 +197,19 @@ def test_with_pufcsv(puf_fullsample): taxes_fullsample = adt.loc["Combined Liability ($b)"] assert taxes_fullsample is not None fulls_reform_revenue = float(taxes_fullsample.loc[analysis_year]) - # create a Public Use File object - tax_data = puf_fullsample # call run_nth_year_tax_calc_model function resdict = run_nth_year_tax_calc_model(year_n, start_year, - tax_data, usermods, - return_json=True) + use_puf_not_cps=True, + use_full_sample=True, + user_mods=usermods, + return_dict=True) total = resdict['aggr_2'] - dropq_reform_revenue = float(total['combined_tax_9']) * 1e-9 - # assert that dropq revenue is similar to the fullsample calculation - diff = abs(fulls_reform_revenue - dropq_reform_revenue) + tbi_reform_revenue = float(total['combined_tax_9']) * 1e-9 + # assert that tbi revenue is similar to the fullsample calculation + diff = abs(fulls_reform_revenue - tbi_reform_revenue) proportional_diff = diff / fulls_reform_revenue frmt = 'f,d,adiff,pdiff= {:.4f} {:.4f} {:.4f} {}' - print(frmt.format(fulls_reform_revenue, dropq_reform_revenue, + print(frmt.format(fulls_reform_revenue, tbi_reform_revenue, diff, proportional_diff)) assert proportional_diff < 0.0001 # one-hundredth of one percent # assert 1 == 2 # uncomment to force test failure with above print out diff --git a/taxcalc/tests/test_utils.py b/taxcalc/tests/test_utils.py index e5f5d0897..d930a6173 100644 --- a/taxcalc/tests/test_utils.py +++ b/taxcalc/tests/test_utils.py @@ -256,7 +256,7 @@ def test_diff_count_precision(): """ Estimate bootstrap standard error and confidence interval for count statistics ('tax_cut' and 'tax_inc') in difference table generated - using puf.csv input data taking no account of dropq fuzzing and + using puf.csv input data taking no account of tbi privacy fuzzing and assuming all filing units in each bin have the same weight. These assumptions imply that the estimates produced here are likely to over-estimate the precision of the count statistics.
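
Below is a minimal usage sketch of the renamed tbi entry points after this change. The reform/assumption file names and the parameter values (year, elasticity, sampling choices) are illustrative assumptions, not part of the patch itself; the signatures follow the new run_nth_year_*_model functions shown above.

# illustrative only: file names and values below are hypothetical
from taxcalc import (Calculator, run_nth_year_tax_calc_model,
                     run_nth_year_gdp_elast_model)

# user_mods must contain exactly the policy, consumption, behavior,
# growdiff_baseline, and growdiff_response keys (see check_user_mods)
user_mods = Calculator.read_json_param_objects('reform.json', 'assump.json')

# static tables for the second year after 2016 using a cps.csv sub-sample
res = run_nth_year_tax_calc_model(2, 2016,
                                  use_puf_not_cps=False,
                                  use_full_sample=False,
                                  user_mods=user_mods,
                                  return_dict=True)

# GDP effect of the same reform; gdp_elasticity is now passed explicitly
# instead of via a user_mods['gdp_elasticity'] entry, and the behavior
# dictionary in user_mods must be empty because behavior_allowed is False
gdp_effect = run_nth_year_gdp_elast_model(2, 2016,
                                          use_puf_not_cps=False,
                                          use_full_sample=False,
                                          user_mods=user_mods,
                                          gdp_elasticity=0.36,
                                          return_dict=False)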