From ad6301eb42155c968f20b2c7e071cbec039acc03 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Thu, 4 Aug 2022 21:47:20 +1000 Subject: [PATCH] Add black (#50) * Add black to the test dependency * Run black * Ignore docs directory * Fix stickler errors * Fix stickler errors * Rerun black * Move pd_read/pd_write to utils to stop a circular dependency * Needed to import pd_read from utils --- .gitignore | 4 +- .stickler.yml | 5 +- Makefile | 4 + aneris/__init__.py | 4 +- aneris/_io.py | 74 ++--- aneris/cli.py | 83 +++--- aneris/harmonize.py | 402 +++++++++++++++------------- aneris/methods.py | 102 +++---- aneris/tutorial.py | 36 +-- aneris/utils.py | 357 +++++++++++++----------- pyproject.toml | 10 + setup.cfg | 2 +- setup.py | 70 ++--- tests/ci/download_data.py | 19 +- tests/test_default_decision_tree.py | 136 ++++------ tests/test_harmonize.py | 196 ++++++++------ tests/test_io.py | 36 +-- tests/test_regression.py | 83 +++--- tests/test_tutorials.py | 36 ++- tests/test_utils.py | 293 ++++++++++---------- 20 files changed, 1036 insertions(+), 916 deletions(-) create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore index 46273c2..9775ca5 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,6 @@ build dist *.egg-info .cache -.* \ No newline at end of file +.* + +venv \ No newline at end of file diff --git a/.stickler.yml b/.stickler.yml index 6e08f83..410a11a 100644 --- a/.stickler.yml +++ b/.stickler.yml @@ -3,10 +3,13 @@ linters: python: 3 max-line-length: 88 fixer: false - ignore: I002, F403, E402, E731, E203 + ignore: I002, F403, E402, E731, E203, W503 # stickler doesn't support 'exclude' for flake8 properly, so we disable it # below with files.ignore: # https://github.com/markstory/lint-review/issues/184 + black: + config: ./pyproject.toml + fixer: false files: ignore: - doc/**/*.py diff --git a/Makefile b/Makefile index d474f0b..7d9c8a1 100644 --- a/Makefile +++ b/Makefile @@ -68,6 +68,10 @@ publish-on-pypi: $(VENV_DIR) ## publish release on PyPI echo run git status --porcelain to find dirty files >&2; \ fi; +.PHONY: black +black: $(VENV_DIR) + black . 
+ .PHONY: ci_dl ci_dl: $(VENV_DIR) ## run all the tests cd tests/ci; python download_data.py diff --git a/aneris/__init__.py b/aneris/__init__.py index 2e8a927..fa86a3e 100644 --- a/aneris/__init__.py +++ b/aneris/__init__.py @@ -1,8 +1,8 @@ - from aneris._io import * from aneris.harmonize import * from aneris.utils import * from ._version import get_versions -__version__ = get_versions()['version'] + +__version__ = get_versions()["version"] del get_versions diff --git a/aneris/_io.py b/aneris/_io.py index 7e723f0..5e535b1 100644 --- a/aneris/_io.py +++ b/aneris/_io.py @@ -8,7 +8,7 @@ import pandas as pd -from aneris.utils import isstr, isnum, iamc_idx +from aneris.utils import isstr, isnum, iamc_idx, pd_read RC_DEFAULTS = """ config: @@ -26,7 +26,7 @@ def _read_data(indfs): - datakeys = sorted([x for x in indfs if x.startswith('data')]) + datakeys = sorted([x for x in indfs if x.startswith("data")]) df = pd.concat([indfs[k] for k in datakeys]) # don't know why reading from excel changes dtype and column types # but I have to reset them manually @@ -50,46 +50,6 @@ def _recursive_update(d, u): return d -def pd_read(f, str_cols=False, *args, **kwargs): - """Try to read a file with pandas, supports CSV and XLSX - - Parameters - ---------- - f : string - the file to read in - str_cols : bool, optional - turn all columns into strings (numerical column names are sometimes - read in as numerical dtypes) - args, kwargs : sent directly to the Pandas read function - - Returns - ------- - df : pd.DataFrame - """ - if f.endswith('csv'): - df = pd.read_csv(f, *args, **kwargs) - else: - df = pd.read_excel(f, *args, **kwargs) - - if str_cols: - df.columns = [str(x) for x in df.columns] - - return df - - -def pd_write(df, f, *args, **kwargs): - """Try to write a file with pandas, supports CSV and XLSX""" - # guess whether to use index, unless we're told otherwise - index = kwargs.pop('index', isinstance(df.index, pd.MultiIndex)) - - if f.endswith('csv'): - df.to_csv(f, index=index, *args, **kwargs) - else: - writer = pd.ExcelWriter(f) - df.to_excel(writer, index=index, *args, **kwargs) - writer.save() - - def read_excel(f): """Read an excel-based input file for harmonization. 
@@ -111,20 +71,23 @@ def read_excel(f): model = _read_data(indfs) # make an empty df which will be caught later - overrides = indfs['harmonization'] if 'harmonization' in indfs \ - else pd.DataFrame([], columns=iamc_idx + ['Unit']) + overrides = ( + indfs["harmonization"] + if "harmonization" in indfs + else pd.DataFrame([], columns=iamc_idx + ["Unit"]) + ) # get run control config = {} - if'Configuration' in overrides: - config = overrides[['Configuration', 'Value']].dropna() - config = config.set_index('Configuration').to_dict()['Value'] - overrides = overrides.drop(['Configuration', 'Value'], axis=1) + if "Configuration" in overrides: + config = overrides[["Configuration", "Value"]].dropna() + config = config.set_index("Configuration").to_dict()["Value"] + overrides = overrides.drop(["Configuration", "Value"], axis=1) # a single row of nans implies only configs provided, # if so, only return the empty df if len(overrides) == 1 and overrides.isnull().values.all(): - overrides = pd.DataFrame([], columns=iamc_idx + ['Unit']) + overrides = pd.DataFrame([], columns=iamc_idx + ["Unit"]) return model, overrides, config @@ -140,10 +103,10 @@ def __init__(self, rc=None, defaults=None): Parameters ---------- rc : string, file, dictionary, optional - a path to a YAML file, a file handle for a YAML file, or a + a path to a YAML file, a file handle for a YAML file, or a dictionary describing run control configuration defaults : string, file, dictionary, optional - a path to a YAML file, a file handle for a YAML file, or a + a path to a YAML file, a file handle for a YAML file, or a dictionary describing **default** run control configuration """ rc = rc or {} @@ -171,14 +134,15 @@ def _get_path(self, key, fyaml, fname): _fname = os.path.join(os.path.dirname(fyaml), fname) if not os.path.exists(_fname): - msg = "YAML key '{}' in {}: {} is not a valid relative " + \ - "or absolute path" + msg = ( + "YAML key '{}' in {}: {} is not a valid relative " + "or absolute path" + ) raise IOError(msg.format(key, fyaml, fname)) return _fname def _fill_relative_paths(self, fyaml, d): file_keys = [ - 'exogenous', + "exogenous", ] for k in file_keys: if k in d: @@ -186,7 +150,7 @@ def _fill_relative_paths(self, fyaml, d): def _load_yaml(self, obj): check_rel_paths = False - if hasattr(obj, 'read'): # it's a file + if hasattr(obj, "read"): # it's a file obj = obj.read() if isstr(obj) and os.path.exists(obj): check_rel_paths = True diff --git a/aneris/cli.py b/aneris/cli.py index ef95289..075fbe5 100644 --- a/aneris/cli.py +++ b/aneris/cli.py @@ -18,46 +18,54 @@ def read_args(): aneris input.xlsx --history history.csv --regions regions.csv """ parser = argparse.ArgumentParser( - description=descr, - formatter_class=argparse.RawDescriptionHelpFormatter + description=descr, formatter_class=argparse.RawDescriptionHelpFormatter ) - input_file = 'Input data file.' - parser.add_argument('input_file', help=input_file) - history = 'Historical emissions in the base year.' - parser.add_argument('--history', help=history, - default=hist_path('history.csv')) - regions = 'Mapping of country iso-codes to native regions.' - parser.add_argument('--regions', help=regions, - default=region_path('message.csv')) - rc = 'Runcontrol YAML file (see http://mattgidden.com/aneris/config.html for examples).' - parser.add_argument('--rc', help=rc, default=None) - output_path = 'Path to use for output file names.' - parser.add_argument('--output_path', help=output_path, default='.') - output_prefix = 'Prefix to use for output file names.' 
- parser.add_argument('--output_prefix', help=output_prefix, default=None) + input_file = "Input data file." + parser.add_argument("input_file", help=input_file) + history = "Historical emissions in the base year." + parser.add_argument("--history", help=history, default=hist_path("history.csv")) + regions = "Mapping of country iso-codes to native regions." + parser.add_argument("--regions", help=regions, default=region_path("message.csv")) + rc = ( + "Runcontrol YAML file " + "(see http://mattgidden.com/aneris/config.html for examples)." + ) + parser.add_argument("--rc", help=rc, default=None) + output_path = "Path to use for output file names." + parser.add_argument("--output_path", help=output_path, default=".") + output_prefix = "Prefix to use for output file names." + parser.add_argument("--output_prefix", help=output_prefix, default=None) args = parser.parse_args() return args -def harmonize(inf, history, regions, rc, output_path, output_prefix, - return_result=False, write_output=True): +def harmonize( + inf, + history, + regions, + rc, + output_path, + output_prefix, + return_result=False, + write_output=True, +): # check files exist check = [inf, history, regions, rc] for f in check: if f and not os.path.exists(f): - raise IOError('{} does not exist on the filesystem.'.format(f)) + raise IOError("{} does not exist on the filesystem.".format(f)) # read input hist = aneris.pd_read(history, str_cols=True) if hist.empty: - raise ValueError('History file is empty') + raise ValueError("History file is empty") regions = aneris.pd_read(regions, str_cols=True) if regions.empty: - raise ValueError('Region definition is empty') + raise ValueError("Region definition is empty") model, overrides, config = aneris.read_excel(inf) rc = aneris.RunControl(rc=rc) - rc.recursive_update('config', config) + rc.recursive_update("config", config) # do core harmonization driver = aneris.HarmonizationDriver(rc, hist, model, overrides, regions) @@ -67,37 +75,40 @@ def harmonize(inf, history, regions, rc, output_path, output_prefix, if write_output: # write to excel - prefix = output_prefix or inf.split('.')[0] - fname = os.path.join(output_path, '{}_harmonized.xlsx'.format(prefix)) - logger().info('Writing result to: {}'.format(fname)) - aneris.pd_write(model, fname, sheet_name='data') + prefix = output_prefix or inf.split(".")[0] + fname = os.path.join(output_path, "{}_harmonized.xlsx".format(prefix)) + logger().info("Writing result to: {}".format(fname)) + aneris.pd_write(model, fname, sheet_name="data") # save data about harmonization - fname = os.path.join(output_path, '{}_metadata.xlsx'.format(prefix)) - logger().info('Writing metadata to: {}'.format(fname)) + fname = os.path.join(output_path, "{}_metadata.xlsx".format(prefix)) + logger().info("Writing metadata to: {}".format(fname)) aneris.pd_write(metadata, fname) # save data about harmonization if not diagnostics.empty: - fname = os.path.join(output_path, - '{}_diagnostics.xlsx'.format(prefix)) - logger().info('Writing diagnostics to: {}'.format(fname)) + fname = os.path.join(output_path, "{}_diagnostics.xlsx".format(prefix)) + logger().info("Writing diagnostics to: {}".format(fname)) aneris.pd_write(diagnostics, fname) if return_result: return model, metadata, diagnostics - - def main(): # parse cli args = read_args() # run program - harmonize(args.input_file, args.history, args.regions, - args.rc, args.output_path, args.output_prefix) + harmonize( + args.input_file, + args.history, + args.regions, + args.rc, + args.output_path, + 
args.output_prefix, + ) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/aneris/harmonize.py b/aneris/harmonize.py index 6dfd5ce..d4f9eb4 100644 --- a/aneris/harmonize.py +++ b/aneris/harmonize.py @@ -6,11 +6,20 @@ from functools import partial from aneris import utils -from aneris import pd_read -from aneris.utils import isin -from aneris.methods import harmonize_factors, constant_offset, reduce_offset, \ - constant_ratio, reduce_ratio, linear_interpolate, model_zero, hist_zero, \ - budget, coeff_of_var, default_methods +from aneris.utils import isin, pd_read +from aneris.methods import ( + harmonize_factors, + constant_offset, + reduce_offset, + constant_ratio, + reduce_ratio, + linear_interpolate, + model_zero, + hist_zero, + budget, + coeff_of_var, + default_methods, +) def _log(msg, *args, **kwargs): @@ -25,19 +34,20 @@ class Harmonizer(object): """A class used to harmonize model data to historical data in the standard calculation format """ + _methods = { - 'model_zero': model_zero, - 'hist_zero': hist_zero, - 'budget': budget, - 'constant_ratio': constant_ratio, - 'constant_offset': constant_offset, - 'reduce_offset_2150_cov': partial(reduce_offset, final_year='2150'), - 'reduce_ratio_2150_cov': partial(reduce_ratio, final_year='2150'), + "model_zero": model_zero, + "hist_zero": hist_zero, + "budget": budget, + "constant_ratio": constant_ratio, + "constant_offset": constant_offset, + "reduce_offset_2150_cov": partial(reduce_offset, final_year="2150"), + "reduce_ratio_2150_cov": partial(reduce_ratio, final_year="2150"), **{ - f'{method.__name__}_{year}': partial(method, final_year=str(year)) + f"{method.__name__}_{year}": partial(method, final_year=str(year)) for year in chain(range(2020, 2101, 10), [2150]) for method in (reduce_offset, reduce_ratio, linear_interpolate) - } + }, } def __init__( @@ -57,77 +67,83 @@ def __init__( different """ if not isinstance(data.index, pd.MultiIndex): - raise ValueError('Data must use utils.df_idx') + raise ValueError("Data must use utils.df_idx") if not isinstance(history.index, pd.MultiIndex): - raise ValueError('History must use utils.df_idx') + raise ValueError("History must use utils.df_idx") if verify_indicies and not data.index.equals(history.index): idx = history.index.difference(data.index) - msg = 'More history than model reports, adding 0 values {}' + msg = "More history than model reports, adding 0 values {}" _warn(msg.format(idx.to_series().head())) df = pd.DataFrame(0, columns=data.columns, index=idx) data = pd.concat([data, df]).sort_index().loc[history.index] assert data.index.equals(history.index) - key = 'harmonize_year' + key = "harmonize_year" # TODO type - self.base_year = str(config[key]) if key in config else '2015' + self.base_year = str(config[key]) if key in config else "2015" self.data = data[utils.numcols(data)] - self.model = pd.Series(index=self.data.index, - name=self.base_year, - dtype=float).to_frame() + self.model = pd.Series( + index=self.data.index, name=self.base_year, dtype=float + ).to_frame() self.history = history self.methods_used = None self.offsets, self.ratios = harmonize_factors( - self.data, self.history, self.base_year) + self.data, self.history, self.base_year + ) self.method_choice = method_choice # get default methods to use in decision tree - self.ratio_method = config.get('default_ratio_method') - self.offset_method = config.get('default_offset_method') - self.luc_method = config.get('default_luc_method') - self.luc_cov_threshold = config.get('luc_cov_threshold') + 
self.ratio_method = config.get("default_ratio_method") + self.offset_method = config.get("default_offset_method") + self.luc_method = config.get("default_luc_method") + self.luc_cov_threshold = config.get("luc_cov_threshold") def metadata(self): """Return pd.DataFrame of method choice metadata""" methods = self.methods_used if isinstance(methods, pd.Series): # only defaults used methods = methods.to_frame() - methods['default'] = methods['method'] - methods['override'] = '' - - meta = pd.concat([ - methods['method'], - methods['default'], - methods['override'], - self.offsets, - self.ratios, - self.history[self.base_year], - self.history.apply(coeff_of_var, axis=1), - self.data[self.base_year], - self.model[self.base_year], - ], axis=1) + methods["default"] = methods["method"] + methods["override"] = "" + + meta = pd.concat( + [ + methods["method"], + methods["default"], + methods["override"], + self.offsets, + self.ratios, + self.history[self.base_year], + self.history.apply(coeff_of_var, axis=1), + self.data[self.base_year], + self.model[self.base_year], + ], + axis=1, + ) meta.columns = [ - 'method', - 'default', - 'override', - 'offset', - 'ratio', - 'history', - 'cov', - 'unharmonized', - 'harmonized', + "method", + "default", + "override", + "offset", + "ratio", + "history", + "cov", + "unharmonized", + "harmonized", ] return meta def _default_methods(self): methods, diagnostics = default_methods( - self.history, self.data, self.base_year, + self.history, + self.data, + self.base_year, method_choice=self.method_choice, ratio_method=self.ratio_method, offset_method=self.offset_method, luc_method=self.luc_method, - luc_cov_threshold=self.luc_cov_threshold + luc_cov_threshold=self.luc_cov_threshold, ) return methods @@ -138,25 +154,23 @@ def _harmonize(self, method, idx, check_len): offsets = self.offsets.loc[idx] ratios = self.ratios.loc[idx] # get delta - delta = hist if method == 'budget' else ratios if 'ratio' in method else offsets + delta = hist if method == "budget" else ratios if "ratio" in method else offsets # checks - assert(not model.isnull().values.any()) - assert(not hist.isnull().values.any()) - assert(not delta.isnull().values.any()) + assert not model.isnull().values.any() + assert not hist.isnull().values.any() + assert not delta.isnull().values.any() if check_len: - assert((len(model) < len(self.data)) & (len(hist) < len(self.history))) + assert (len(model) < len(self.data)) & (len(hist) < len(self.history)) # harmonize model = Harmonizer._methods[method](model, delta, harmonize_year=self.base_year) y = str(self.base_year) if model.isnull().values.any(): - msg = '{} method produced NaNs: {}, {}' + msg = "{} method produced NaNs: {}, {}" where = model.isnull().any(axis=1) - raise ValueError(msg.format(method, - model.loc[where, y], - delta.loc[where])) + raise ValueError(msg.format(method, model.loc[where, y], delta.loc[where])) # construct the full df of history and future return model @@ -172,24 +186,24 @@ def methods(self, overrides=None): oidx = overrides.index # remove duplicate values - dup = oidx.duplicated(keep='last') + dup = oidx.duplicated(keep="last") if dup.any(): - msg = 'Removing duplicated override entries found: {}\n' + msg = "Removing duplicated override entries found: {}\n" _warn(msg.format(overrides.loc[dup])) overrides = overrides.loc[~dup] # get subset of overrides which are in model outidx = oidx.difference(midx) if outidx.size > 0: - msg = 'Removing override methods not in processed model output:\n{}' + msg = "Removing override methods not in 
processed model output:\n{}" _warn(msg.format(overrides.loc[outidx])) inidx = oidx.intersection(midx) overrides = overrides.loc[inidx] # overwrite defaults with overrides final_methods = overrides.combine_first(methods).to_frame() - final_methods['default'] = methods - final_methods['override'] = overrides + final_methods["default"] = methods + final_methods["override"] = overrides methods = final_methods return methods @@ -204,25 +218,25 @@ def harmonize(self, overrides=None): # save for future inspection self.methods_used = methods if isinstance(methods, pd.DataFrame): - methods = methods['method'] # drop default and override info - if (methods == 'unicorn').any(): + methods = methods["method"] # drop default and override info + if (methods == "unicorn").any(): msg = """Values found where model has positive and negative values and is zero in base year. Unsure how to proceed:\n{}\n{}""" - cols = ['history', 'unharmonized'] - df1 = self.metadata().loc[methods == 'unicorn', cols] - df2 = self.data.loc[methods == 'unicorn'] + cols = ["history", "unharmonized"] + df1 = self.metadata().loc[methods == "unicorn", cols] + df2 = self.data.loc[methods == "unicorn"] raise ValueError(msg.format(df1.reset_index(), df2.reset_index())) dfs = [] y = str(self.base_year) for method in methods.unique(): - _log('Harmonizing with {}'.format(method)) + _log("Harmonizing with {}".format(method)) # get subset indicies idx = methods[methods == method].index check_len = len(methods.unique()) > 1 # harmonize df = self._harmonize(method, idx, check_len) - if method not in ['model_zero', 'hist_zero']: + if method not in ["model_zero", "hist_zero"]: close = (df[y] - self.history.loc[df.index, y]).abs() < 1e-5 if not close.all(): report = df[~close][y].reset_index() @@ -256,8 +270,8 @@ def _downselect_scen(self, scenario): def _downselect_var(self): # separate data - select = '|'.join([self.prefix, self.suffix]) - _log('Downselecting {} variables'.format(select)) + select = "|".join([self.prefix, self.suffix]) + _log("Downselecting {} variables".format(select)) hasprefix = lambda df: df.Variable.str.startswith(self.prefix) hassuffix = lambda df: df.Variable.str.endswith(self.suffix) @@ -268,12 +282,12 @@ def _downselect_var(self): self.overrides = subset(self.overrides) if len(self.model) == 0: - msg = 'No Variables found for harmonization. Searched for {}.' + msg = "No Variables found for harmonization. Searched for {}." 
raise ValueError(msg.format(select)) - assert(len(self.hist) > 0) + assert len(self.hist) > 0 def _to_std(self): - _log('Translating to standard format') + _log("Translating to standard format") xlator = utils.FormatTranslator() self.model = ( @@ -288,28 +302,31 @@ def _to_std(self): .sort_index() ) # override with special cases if more are found in history - self.hist = self.hist[~self.hist.index.duplicated(keep='last')] + self.hist = self.hist[~self.hist.index.duplicated(keep="last")] # hackery required because unit needed for df_idx if self.overrides.empty: self.overrides = None else: - self.overrides['Unit'] = 'kt' + self.overrides["Unit"] = "kt" self.overrides = ( xlator.to_std(df=self.overrides.copy(), set_metadata=False) .set_index(utils.df_idx) .sort_index() ) self.overrides.columns = self.overrides.columns.str.lower() - self.overrides = self.overrides['method'] + self.overrides = self.overrides["method"] def _agg_hist(self): # aggregate and clean hist - _log('Aggregating historical values to native regions') + _log("Aggregating historical values to native regions") # must set verify to false for now because some isos aren't included! self.hist = utils.agg_regions( - self.hist, verify=False, mapping=self.regions, - rfrom='ISO Code', rto='Native Region Code' + self.hist, + verify=False, + mapping=self.regions, + rfrom="ISO Code", + rto="Native Region Code", ) def _fill_model_trajectories(self): @@ -317,7 +334,7 @@ def _fill_model_trajectories(self): idx = self.hist.index notin = ~idx.isin(self.model.index) if notin.any(): - msg = 'Not all of self.history is covered by self.model: \n{}' + msg = "Not all of self.history is covered by self.model: \n{}" _df = self.hist.loc[notin].reset_index()[utils.df_idx] _warn(msg.format(_df.head())) zeros = pd.DataFrame(0, index=idx, columns=self.model.columns) @@ -337,8 +354,7 @@ def results(self): class HarmonizationDriver(object): - """A helper class to harmonize all scenarios for a model. 
- """ + """A helper class to harmonize all scenarios for a model.""" def __init__(self, rc, hist, model, overrides, regions): """Parameters @@ -353,69 +369,70 @@ def __init__(self, rc, hist, model, overrides, regions): regions : pd.DataFrame regional aggregation mapping (ISO -> model regions) """ - self.prefix = rc['prefix'] - self.suffix = rc['suffix'] - self.config = rc['config'] - self.add_5regions = rc['add_5regions'] - self.exog_files = rc['exogenous'] if 'exogenous' in rc else [] + self.prefix = rc["prefix"] + self.suffix = rc["suffix"] + self.config = rc["config"] + self.add_5regions = rc["add_5regions"] + self.exog_files = rc["exogenous"] if "exogenous" in rc else [] self.model = model self.hist = hist self.overrides = overrides self.regions = regions - if not self.regions['ISO Code'].isin(['World']).any(): + if not self.regions["ISO Code"].isin(["World"]).any(): glb = { - 'ISO Code': 'World', - 'Country': 'World', - 'Native Region Code': 'World', + "ISO Code": "World", + "Country": "World", + "Native Region Code": "World", } - _log('Manually adding global regional definition: {}'.format(glb)) + _log("Manually adding global regional definition: {}".format(glb)) self.regions = self.regions.append(glb, ignore_index=True) model_names = self.model.Model.unique() if len(model_names) > 1: - raise ValueError('Can not have more than one model to harmonize') + raise ValueError("Can not have more than one model to harmonize") self.model_name = model_names[0] - self._xlator = utils.FormatTranslator(prefix=self.prefix, - suffix=self.suffix) + self._xlator = utils.FormatTranslator(prefix=self.prefix, suffix=self.suffix) self._model_dfs = [] self._metadata_dfs = [] self._diagnostic_dfs = [] self.exogenous_trajectories = self._exogenous_trajectories() # TODO better type checking? - self.config['harmonize_year'] = str(self.config['harmonize_year']) - y = self.config['harmonize_year'] + self.config["harmonize_year"] = str(self.config["harmonize_year"]) + y = self.config["harmonize_year"] if y not in model.columns: - msg = 'Base year {} not found in model data. Existing columns are {}.' + msg = "Base year {} not found in model data. Existing columns are {}." raise ValueError(msg.format(y, model.columns)) if y not in hist.columns: - msg = 'Base year {} not found in hist data. Existing columns are {}.' + msg = "Base year {} not found in hist data. Existing columns are {}." 
raise ValueError(msg.format(y, hist.columns)) def _exogenous_trajectories(self): # add exogenous variables dfs = [] for fname in self.exog_files: - exog = pd_read(fname, sheet_name='data') + exog = pd_read(fname, sheet_name="data") exog.columns = [str(x) for x in exog.columns] - exog['Model'] = self.model_name + exog["Model"] = self.model_name dfs.append(exog) if len(dfs) == 0: # add empty df if none were provided dfs.append(pd.DataFrame(columns=self.model.columns)) return pd.concat(dfs) def _postprocess_trajectories(self, scenario): - _log('Translating to IAMC template') + _log("Translating to IAMC template") # update variable name self._model = self._model.reset_index() self._model.sector = self._model.sector.str.replace( - self.suffix, self.config['replace_suffix']) + self.suffix, self.config["replace_suffix"] + ) self._model = self._model.set_index(utils.df_idx) # from native to iamc format self._model = ( - self._xlator.to_template(self._model, model=self.model_name, - scenario=scenario) + self._xlator.to_template( + self._model, model=self.model_name, scenario=scenario + ) .sort_index() .reset_index() ) @@ -423,7 +440,7 @@ def _postprocess_trajectories(self, scenario): # add exogenous trajectories exog = self.exogenous_trajectories.copy() if not exog.empty: - exog['Scenario'] = scenario + exog["Scenario"] = scenario cols = [c for c in self._model.columns if c in exog.columns] exog = exog[cols] self._model = pd.concat([self._model, exog]) @@ -446,22 +463,31 @@ def harmonize(self, scenario, diagnostic_config=None): self._regions = self.regions.copy() # preprocess - pp = _TrajectoryPreprocessor(self._hist, self._model, self._overrides, - self._regions, self.prefix, self.suffix) + pp = _TrajectoryPreprocessor( + self._hist, + self._model, + self._overrides, + self._regions, + self.prefix, + self.suffix, + ) # TODO, preprocess in init, just process here - self._hist, self._model, self._overrides = pp.process( - scenario).results() + self._hist, self._model, self._overrides = pp.process(scenario).results() unharmonized = self._model.copy() # flag if this run will be with only global trajectories. if so, then # only global totals are harmonized, rest is skipped. 
- global_harmonization_only = self.config['global_harmonization_only'] + global_harmonization_only = self.config["global_harmonization_only"] # global only gases self._glb_model, self._glb_meta = _harmonize_global_total( - self.config, self.prefix, self.suffix, - self._hist, self._model.copy(), self._overrides, + self.config, + self.prefix, + self.suffix, + self._hist, + self._model.copy(), + self._overrides, default_global_gases=not global_harmonization_only, ) @@ -471,9 +497,15 @@ def harmonize(self, scenario, diagnostic_config=None): else: # regional gases self._model, self._meta = _harmonize_regions( - self.config, self.prefix, self.suffix, self._regions, - self._hist, self._model.copy(), self._overrides, - self.config['harmonize_year'], self.add_5regions + self.config, + self.prefix, + self.suffix, + self._regions, + self._hist, + self._model.copy(), + self._overrides, + self.config["harmonize_year"], + self.add_5regions, ) # combine special case results with harmonized results @@ -483,13 +515,14 @@ def harmonize(self, scenario, diagnostic_config=None): # perform any automated diagnostics/analysis self._diag = diagnostics( - unharmonized, self._model, self._meta, config=diagnostic_config) + unharmonized, self._model, self._meta, config=diagnostic_config + ) # collect metadata self._meta = self._meta.reset_index() - self._meta['model'] = self.model_name - self._meta['scenario'] = scenario - self._meta = self._meta.set_index(['model', 'scenario']) + self._meta["model"] = self.model_name + self._meta["scenario"] = scenario + self._meta = self._meta.set_index(["model", "scenario"]) self._postprocess_trajectories(scenario) # store results @@ -499,7 +532,7 @@ def harmonize(self, scenario, diagnostic_config=None): def scenarios(self): """Return all known scenarios""" - return self.model['Scenario'].unique() + return self.model["Scenario"].unique() def harmonized_results(self): """Return 3-tuple of (pd.DataFrame of harmonized trajectories, @@ -522,18 +555,19 @@ def _get_global_overrides(overrides, gases, sector): return o if not o.empty else None -def _harmonize_global_total(config, prefix, suffix, hist, model, overrides, - default_global_gases=True): - all_gases = list(model.index.get_level_values('gas').unique()) +def _harmonize_global_total( + config, prefix, suffix, hist, model, overrides, default_global_gases=True +): + all_gases = list(model.index.get_level_values("gas").unique()) gases = utils.harmonize_total_gases if default_global_gases else all_gases - sector = '|'.join([prefix, suffix]) + sector = "|".join([prefix, suffix]) idx = isin(region="World", gas=gases, sector=sector) h = hist.loc[idx].copy() try: m = model.loc[idx].copy() except TypeError: - _warn('Non-history gases not found in model') + _warn("Non-history gases not found in model") return None, None if m.empty: @@ -542,53 +576,53 @@ def _harmonize_global_total(config, prefix, suffix, hist, model, overrides, # match override methods with global gases, None if no match o = _get_global_overrides(overrides, gases, sector) - utils.check_null(m, 'model') - utils.check_null(h, 'hist', fail=True) + utils.check_null(m, "model") + utils.check_null(h, "hist", fail=True) harmonizer = Harmonizer(m, h, config=config) - _log('Harmonizing (with example methods):') + _log("Harmonizing (with example methods):") _log(harmonizer.methods(overrides=o).head()) if o is not None: - _log('and override methods:') + _log("and override methods:") _log(o.head()) m = harmonizer.harmonize(overrides=o) - utils.check_null(m, 'model') + 
utils.check_null(m, "model") metadata = harmonizer.metadata() return m, metadata -def _harmonize_regions(config, prefix, suffix, regions, hist, model, overrides, - base_year, add_5regions): +def _harmonize_regions( + config, prefix, suffix, regions, hist, model, overrides, base_year, add_5regions +): # clean model - model = utils.subtract_regions_from_world(model, 'model', base_year) + model = utils.subtract_regions_from_world(model, "model", base_year) model = utils.remove_recalculated_sectors(model, prefix, suffix) # remove rows with all 0s model = model[(model.T > 0).any()] # clean hist - hist = utils.subtract_regions_from_world(hist, 'hist', base_year) + hist = utils.subtract_regions_from_world(hist, "hist", base_year) hist = utils.remove_recalculated_sectors(hist, prefix, suffix) # remove rows with all 0s hist = hist[(hist.T > 0).any()] if model.empty: - raise RuntimeError( - 'Model is empty after downselecting regional values') + raise RuntimeError("Model is empty after downselecting regional values") # harmonize - utils.check_null(model, 'model') - utils.check_null(hist, 'hist', fail=True) + utils.check_null(model, "model") + utils.check_null(hist, "hist", fail=True) harmonizer = Harmonizer(model, hist, config=config) - _log('Harmonizing (with example methods):') + _log("Harmonizing (with example methods):") _log(harmonizer.methods(overrides=overrides).head()) if overrides is not None: - _log('and override methods:') + _log("and override methods:") _log(overrides.head()) model = harmonizer.harmonize(overrides=overrides) - utils.check_null(model, 'model') + utils.check_null(model, "model") metadata = harmonizer.metadata() # add aggregate variables. this works in three steps: @@ -596,46 +630,45 @@ def _harmonize_regions(config, prefix, suffix, regions, hist, model, overrides, # be recalculated idx = utils.recalculated_row_idx(model, prefix, suffix) if idx.any(): - msg = 'Removing sector aggregates. Recalculating with harmonized totals.' + msg = "Removing sector aggregates. Recalculating with harmonized totals." 
_warn(msg) model = model[~idx] - totals = '|'.join([prefix, suffix]) + totals = "|".join([prefix, suffix]) sector_total_idx = isin(model, sector=totals) subsector_idx = ~sector_total_idx # step 2: on the "clean" df, recalculate those totals subsectors_with_total_df = ( utils.EmissionsAggregator(model[subsector_idx]) .add_variables(totals=totals, aggregates=False) - .df - .set_index(utils.df_idx) + .df.set_index(utils.df_idx) ) # step 3: recombine with model data that was sector total only sector_total_df = model[sector_total_idx] model = pd.concat([sector_total_df, subsectors_with_total_df]) - utils.check_null(model, 'model') + utils.check_null(model, "model") # combine regional values to send back into template form model.reset_index(inplace=True) model = model.set_index(utils.df_idx).sort_index() - glb = utils.combine_rows(model, 'region', 'World', - sumall=False, rowsonly=True) + glb = utils.combine_rows(model, "region", "World", sumall=False, rowsonly=True) model = glb.combine_first(model) # add 5regions if add_5regions: - _log('Adding 5region values') + _log("Adding 5region values") # explicitly don't add World, it already exists from aggregation - mapping = regions[regions['Native Region Code'] != 'World'].copy() - aggdf = utils.agg_regions(model, mapping=mapping, - rfrom='Native Region Code', rto='5_region') + mapping = regions[regions["Native Region Code"] != "World"].copy() + aggdf = utils.agg_regions( + model, mapping=mapping, rfrom="Native Region Code", rto="5_region" + ) model = pd.concat([model, aggdf]) - assert(not model.isnull().values.any()) + assert not model.isnull().values.any() # duplicates come in from World and World being translated - duplicates = model.index.duplicated(keep='first') + duplicates = model.index.duplicated(keep="first") if duplicates.any(): - regions = model[duplicates].index.get_level_values('region').unique() - msg = 'Dropping duplicate rows found for regions: {}'.format(regions) + regions = model[duplicates].index.get_level_values("region").unique() + msg = "Dropping duplicate rows found for regions: {}".format(regions) _warn(msg) model = model[~duplicates] @@ -664,30 +697,28 @@ def diagnostics(unharmonized, model, metadata, config=None): config : dictionary, optional ratio values to use in diagnostics, key options include 'mid' and 'end'. 
""" - config = config or {'mid': 4.0, 'end': 2.0} + config = config or {"mid": 4.0, "end": 2.0} # # Detect Large Missing Values # - num = metadata['history'] - denom = metadata['history'].groupby(level=['region', 'gas']).sum() + num = metadata["history"] + denom = metadata["history"].groupby(level=["region", "gas"]).sum() # special merge because you can't do operations on multiindex - ratio = pd.merge(num.reset_index(), - denom.reset_index(), - on=['region', 'gas']) - ratio = ratio['history_x'] / ratio['history_y'] + ratio = pd.merge(num.reset_index(), denom.reset_index(), on=["region", "gas"]) + ratio = ratio["history_x"] / ratio["history_y"] ratio.index = num.index - ratio.name = 'fraction' + ratio.name = "fraction" # downselect big = ratio[ratio > 0.2] - bigmethods = metadata.loc[big.index, 'method'] - bad = bigmethods[bigmethods == 'model_zero'] + bigmethods = metadata.loc[big.index, "method"] + bad = bigmethods[bigmethods == "model_zero"] report = big.loc[bad.index].reset_index() if not report.empty: - _warn('LARGE MISSING Values Found!!:\n {}'.format(report)) + _warn("LARGE MISSING Values Found!!:\n {}".format(report)) # # report on large medium an dlong-term differences @@ -696,31 +727,30 @@ def diagnostics(unharmonized, model, metadata, config=None): report = model.copy() mid, end = cols[len(cols) // 2 - 1], cols[-1] - if 'mid' in config: + if "mid" in config: bigmid = np.abs(model[mid] - unharmonized[mid]) / unharmonized[mid] - bigmid = bigmid[bigmid > config['mid']] - report['{}_diff'.format(mid)] = bigmid + bigmid = bigmid[bigmid > config["mid"]] + report["{}_diff".format(mid)] = bigmid - if 'end' in config: + if "end" in config: bigend = np.abs(model[end] - unharmonized[end]) / unharmonized[end] - bigend = bigend[bigend > config['end']] - report['{}_diff'.format(end)] = bigend + bigend = bigend[bigend > config["end"]] + report["{}_diff".format(end)] = bigend - report = report.drop(cols, axis=1).dropna(how='all') + report = report.drop(cols, axis=1).dropna(how="all") idx = metadata.index.intersection(report.index) - report['method'] = metadata.loc[idx, 'method'] - report = report[~report['method'].isin(['model_zero', np.nan])] + report["method"] = metadata.loc[idx, "method"] + report = report[~report["method"].isin(["model_zero", np.nan])] # # Detect non-negative CO2 emissions # m = model.reset_index() - m = m[m.gas != 'CO2'] + m = m[m.gas != "CO2"] neg = m[(m[utils.numcols(m)].T < 0).any()] if not neg.empty: - _warn( - 'Negative Emissions found for non-CO2 gases:\n {}'.format(neg)) - raise ValueError('Harmonization failed due to negative non-CO2 gases') + _warn("Negative Emissions found for non-CO2 gases:\n {}".format(neg)) + raise ValueError("Harmonization failed due to negative non-CO2 gases") return report diff --git a/aneris/methods.py b/aneris/methods.py index 8d9243b..7f88b92 100644 --- a/aneris/methods.py +++ b/aneris/methods.py @@ -11,7 +11,7 @@ from aneris import utils -def harmonize_factors(df, hist, harmonize_year='2015'): +def harmonize_factors(df, hist, harmonize_year="2015"): """Calculate offset and ratio values between data and history Parameters @@ -32,13 +32,13 @@ def harmonize_factors(df, hist, harmonize_year='2015'): """ c, m = hist[harmonize_year], df[harmonize_year] offset = (c - m).fillna(0) - offset.name = 'offset' + offset.name = "offset" ratios = (c / m).replace(np.inf, np.nan).fillna(0) - ratios.name = 'ratio' + ratios.name = "ratio" return offset, ratios -def constant_offset(df, offset, harmonize_year='2015'): +def constant_offset(df, offset, 
harmonize_year="2015"): """Calculate constant offset harmonized trajectory Parameters @@ -62,7 +62,7 @@ def constant_offset(df, offset, harmonize_year='2015'): return df -def constant_ratio(df, ratios, harmonize_year='2015'): +def constant_ratio(df, ratios, harmonize_year="2015"): """Calculate constant ratio harmonized trajectory Parameters @@ -86,7 +86,7 @@ def constant_ratio(df, ratios, harmonize_year='2015'): return df -def linear_interpolate(df, offset, final_year='2050', harmonize_year='2015'): +def linear_interpolate(df, offset, final_year="2050", harmonize_year="2015"): """Calculate linearly interpolated convergence harmonized trajectory Parameters @@ -117,7 +117,7 @@ def linear_interpolate(df, offset, final_year='2050', harmonize_year='2015'): return df -def reduce_offset(df, offset, final_year='2050', harmonize_year='2015'): +def reduce_offset(df, offset, final_year="2050", harmonize_year="2015"): """Calculate offset convergence harmonized trajectory Parameters @@ -144,13 +144,14 @@ def reduce_offset(df, offset, final_year='2050', harmonize_year='2015'): f = lambda year: -(year - yi) / float(yf - yi) + 1 factors = [f(year) if year <= yf else 0.0 for year in numcols_int] # add existing values to offset time series - offsets = pd.DataFrame(np.outer(offset, factors), - columns=numcols, index=offset.index) + offsets = pd.DataFrame( + np.outer(offset, factors), columns=numcols, index=offset.index + ) df[numcols] = df[numcols] + offsets return df -def reduce_ratio(df, ratios, final_year='2050', harmonize_year='2015'): +def reduce_ratio(df, ratios, final_year="2050", harmonize_year="2015"): """Calculate ratio convergence harmonized trajectory Parameters @@ -176,20 +177,20 @@ def reduce_ratio(df, ratios, final_year='2050', harmonize_year='2015'): # get factors that reduce from 1 to 0, but replace with 1s in years prior # to harmonization f = lambda year: -(year - yi) / float(yf - yi) + 1 - prefactors = [f(yi) - for year in numcols_int if year < yi] - postfactors = [f(year) if year <= yf else 0.0 - for year in numcols_int if year >= yi] + prefactors = [f(yi) for year in numcols_int if year < yi] + postfactors = [f(year) if year <= yf else 0.0 for year in numcols_int if year >= yi] factors = prefactors + postfactors # multiply existing values by ratio time series - ratios = pd.DataFrame(np.outer(ratios - 1, factors), - columns=numcols, index=ratios.index) + 1 + ratios = ( + pd.DataFrame(np.outer(ratios - 1, factors), columns=numcols, index=ratios.index) + + 1 + ) df[numcols] = df[numcols] * ratios return df -def budget(df, df_hist, harmonize_year='2015'): +def budget(df, df_hist, harmonize_year="2015"): r"""Calculate budget harmonized trajectory Parameters @@ -244,8 +245,8 @@ def budget(df, df_hist, harmonize_year='2015'): harmonize_year = int(harmonize_year) - df = df.set_axis(df.columns.astype(int), axis='columns') - df_hist = df_hist.set_axis(df_hist.columns.astype(int), axis='columns') + df = df.set_axis(df.columns.astype(int), axis="columns") + df_hist = df_hist.set_axis(df_hist.columns.astype(int), axis="columns") data_years = df.columns hist_years = df_hist.columns @@ -254,10 +255,8 @@ def budget(df, df_hist, harmonize_year='2015'): if data_years[0] not in hist_years: hist_years = hist_years.insert(bisect(hist_years, data_years[0]), data_years[0]) - df_hist = ( - df_hist - .reindex(columns=hist_years) - .interpolate(method='slinear', axis=1) + df_hist = df_hist.reindex(columns=hist_years).interpolate( + method="slinear", axis=1 ) def carbon_budget(years, emissions): @@ -343,7 +342,7 
@@ def l2_norm(): return df_harm -def model_zero(df, offset, harmonize_year='2015'): +def model_zero(df, offset, harmonize_year="2015"): """Returns result of aneris.methods.constant_offset()""" # current decision is to return a simple offset, this will be a straight # line for all time periods. previous behavior was to set df[numcols] = 0, @@ -388,13 +387,13 @@ def default_method_choice( """ # special cases if row.h == 0: - return 'hist_zero' + return "hist_zero" if row.zero_m: - return 'model_zero' + return "model_zero" if np.isinf(row.f) and row.neg_m and row.pos_m: # model == 0 in base year, and model goes negative # and positive - return 'unicorn' # this shouldn't exist! + return "unicorn" # this shouldn't exist! # model 0 in base year? if np.isclose(row.m, 0): @@ -402,15 +401,15 @@ def default_method_choice( if row.neg_m: return offset_method else: - return 'constant_offset' + return "constant_offset" else: # is this co2? # ZN: This gas dependence isn't documented in the default # decision tree - if hasattr(row, "gas") and row.gas == 'CO2': + if hasattr(row, "gas") and row.gas == "CO2": return ratio_method # is cov big? - if np.isfinite(row['cov']) and row['cov'] > luc_cov_threshold: + if np.isfinite(row["cov"]) and row["cov"] > luc_cov_threshold: return luc_method else: # dH small? @@ -419,9 +418,9 @@ def default_method_choice( else: # goes negative? if row.neg_m: - return 'reduce_ratio_2100' + return "reduce_ratio_2100" else: - return 'constant_ratio' + return "constant_ratio" def default_methods(hist, model, base_year, method_choice=None, **kwargs): @@ -464,14 +463,14 @@ def default_methods(hist, model, base_year, method_choice=None, **kwargs): `default_method_choice` """ - if kwargs.get('ratio_method') is None: - kwargs['ratio_method'] = 'reduce_ratio_2080' - if kwargs.get('offset_method') is None: - kwargs['offset_method'] = 'reduce_offset_2080' - if kwargs.get('luc_method') is None: - kwargs['luc_method'] = 'reduce_offset_2150_cov' - if kwargs.get('luc_cov_threshold') is None: - kwargs['luc_cov_threshold'] = 10 + if kwargs.get("ratio_method") is None: + kwargs["ratio_method"] = "reduce_ratio_2080" + if kwargs.get("offset_method") is None: + kwargs["offset_method"] = "reduce_offset_2080" + if kwargs.get("luc_method") is None: + kwargs["luc_method"] = "reduce_offset_2150_cov" + if kwargs.get("luc_cov_threshold") is None: + kwargs["luc_cov_threshold"] = 10 y = str(base_year) try: @@ -489,17 +488,24 @@ def default_methods(hist, model, base_year, method_choice=None, **kwargs): go_neg = ((model.min(axis=1) - h) < 0).any() cov = hist.apply(coeff_of_var, axis=1) - df = pd.DataFrame({ - 'dH': dH, 'f': f, 'dM': dM, - 'neg_m': neg_m, 'pos_m': pos_m, - 'zero_m': zero_m, 'go_neg': go_neg, - 'cov': cov, - 'h': h, 'm': m, - }).join(model.index.to_frame()) + df = pd.DataFrame( + { + "dH": dH, + "f": f, + "dM": dM, + "neg_m": neg_m, + "pos_m": pos_m, + "zero_m": zero_m, + "go_neg": go_neg, + "cov": cov, + "h": h, + "m": m, + } + ).join(model.index.to_frame()) if method_choice is None: method_choice = default_method_choice ret = df.apply(method_choice, axis=1, **kwargs) - ret.name = 'method' + ret.name = "method" return ret, df diff --git a/aneris/tutorial.py b/aneris/tutorial.py index 6be2628..c24b2ab 100644 --- a/aneris/tutorial.py +++ b/aneris/tutorial.py @@ -7,12 +7,15 @@ import aneris -_default_cache_dir = os.path.join('~', '.aneris_tutorial_data') +_default_cache_dir = os.path.join("~", ".aneris_tutorial_data") # idea borrowed from Seaborn -def load_data(cache_dir=_default_cache_dir, 
cache=True, - github_url='https://github.com/iiasa/aneris'): +def load_data( + cache_dir=_default_cache_dir, + cache=True, + github_url="https://github.com/iiasa/aneris", +): """ Load a dataset from the online repository (requires internet). @@ -32,32 +35,31 @@ def load_data(cache_dir=_default_cache_dir, cache=True, os.mkdir(longdir) files = { - 'rc': 'aneris_regions_sectors.yaml', - 'hist': 'history_regions_sectors.xls', - 'model': 'model_regions_sectors.xls', - 'regions': 'regions_regions_sectors.csv', + "rc": "aneris_regions_sectors.yaml", + "hist": "history_regions_sectors.xls", + "model": "model_regions_sectors.xls", + "regions": "regions_regions_sectors.csv", } files = {k: os.path.join(longdir, f) for k, f in files.items()} for localfile in files.values(): if not os.path.exists(localfile): fname = os.path.basename(localfile) - url = '/'.join((github_url, 'raw', 'master', - 'tests', 'test_data', fname)) + url = "/".join((github_url, "raw", "master", "tests", "test_data", fname)) urlretrieve(url, localfile) # read input - hist = aneris.pd_read(files['hist']) + hist = aneris.pd_read(files["hist"]) if hist.empty: - raise ValueError('History file is empty') + raise ValueError("History file is empty") hist.columns = hist.columns.astype(str) # make sure they're all strings - regions = aneris.pd_read(files['regions']) + regions = aneris.pd_read(files["regions"]) if regions.empty: - raise ValueError('Region definition is empty') - model, overrides, config = aneris.read_excel(files['model']) + raise ValueError("Region definition is empty") + model, overrides, config = aneris.read_excel(files["model"]) model.columns = model.columns.astype(str) # make sure they're all strings - rc = aneris.RunControl(rc=files['rc']) - rc.recursive_update('config', config) + rc = aneris.RunControl(rc=files["rc"]) + rc.recursive_update("config", config) # get driver driver = aneris.HarmonizationDriver(rc, hist, model, overrides, regions) @@ -69,7 +71,7 @@ def load_data(cache_dir=_default_cache_dir, cache=True, return model, hist, driver -if __name__ == '__main__': +if __name__ == "__main__": model, hist, driver = load_data(cache=False) for scenario in driver.scenarios(): driver.harmonize(scenario) diff --git a/aneris/utils.py b/aneris/utils.py index 6bcd3c9..7150b39 100644 --- a/aneris/utils.py +++ b/aneris/utils.py @@ -7,66 +7,77 @@ import numpy as np import pandas as pd + # Index for iamc -iamc_idx = ['Model', 'Scenario', 'Region', 'Variable'] +iamc_idx = ["Model", "Scenario", "Region", "Variable"] # default dataframe index -df_idx = ['region', 'gas', 'sector', 'units'] +df_idx = ["region", "gas", "sector", "units"] # paths to data dependencies here = os.path.join(os.path.dirname(os.path.realpath(__file__))) -hist_path = lambda f: os.path.join(here, 'historical', f) -iamc_path = lambda f: os.path.join(here, 'iamc_template', f) -region_path = lambda f: os.path.join(here, 'regional_definitions', f) +hist_path = lambda f: os.path.join(here, "historical", f) +iamc_path = lambda f: os.path.join(here, "iamc_template", f) +region_path = lambda f: os.path.join(here, "regional_definitions", f) # gases reported in kt of species kt_gases = [ - 'N2O', - 'SF6', - 'CF4', # explicit species of PFC - 'C2F6', # explicit species of PFC + "N2O", + "SF6", + "CF4", # explicit species of PFC + "C2F6", # explicit species of PFC # individual f gases removed for now # # hfcs # 'HFC23', 'HFC32', 'HFC43-10', 'HFC125', 'HFC134a', 'HFC143a', 'HFC227ea', 'HFC245fa', # CFCs - 'CFC-11', - 'CFC-12', - 'CFC-113', - 'CFC-114', - 'CFC-115', - 
'CH3CCl3', - 'CCl4', - 'HCFC-22', - 'HCFC-141b', - 'HCFC-142b', - 'Halon1211', - 'Halon1301', - 'Halon2402', - 'Halon1202', - 'CH3Br', - 'CH3Cl', + "CFC-11", + "CFC-12", + "CFC-113", + "CFC-114", + "CFC-115", + "CH3CCl3", + "CCl4", + "HCFC-22", + "HCFC-141b", + "HCFC-142b", + "Halon1211", + "Halon1301", + "Halon2402", + "Halon1202", + "CH3Br", + "CH3Cl", ] # gases reported in co2-equiv co2_eq_gases = [ - 'HFC', + "HFC", ] # gases reported in Mt of species mt_gases = [ # IAMC names - 'BC', 'CH4', 'CO2', 'CO', 'NOx', 'OC', 'Sulfur', 'NH3', 'VOC', + "BC", + "CH4", + "CO2", + "CO", + "NOx", + "OC", + "Sulfur", + "NH3", + "VOC", # non-IAMC names - 'SO2', 'NOX', 'NMVOC', + "SO2", + "NOX", + "NMVOC", ] all_gases = sorted(kt_gases + co2_eq_gases + mt_gases) # gases for which only sectoral totals are reported -total_gases = ['SF6', 'CF4', 'C2F6'] + co2_eq_gases +total_gases = ["SF6", "CF4", "C2F6"] + co2_eq_gases # gases for which only sectoral totals are harmonized -harmonize_total_gases = ['N2O'] + total_gases +harmonize_total_gases = ["N2O"] + total_gases # gases for which full sectoral breakdown is reported sector_gases = sorted(set(all_gases) - set(total_gases)) @@ -75,19 +86,19 @@ # TODO: can we remove this? # TODO: should probably be a dictionary.. std_to_iamc_gases = [ - ('SO2', 'Sulfur'), - ('NOX', 'NOx'), - ('NMVOC', 'VOC'), + ("SO2", "Sulfur"), + ("NOX", "NOx"), + ("NMVOC", "VOC"), ] # mapping from gas name to name to use in units unit_gas_names = { - 'Sulfur': 'SO2', - 'Kyoto Gases': 'CO2-equiv', - 'F-Gases': 'CO2-equiv', - 'HFC': 'CO2-equiv', - 'PFC': 'CO2-equiv', - 'CFC': 'CO2-equiv', + "Sulfur": "SO2", + "Kyoto Gases": "CO2-equiv", + "F-Gases": "CO2-equiv", + "HFC": "CO2-equiv", + "PFC": "CO2-equiv", + "CFC": "CO2-equiv", } _logger = None @@ -99,7 +110,7 @@ def logger(): if _logger is None: logging.basicConfig() _logger = logging.getLogger() - _logger.setLevel('INFO') + _logger.setLevel("INFO") return _logger @@ -123,7 +134,7 @@ def isnum(s): def numcols(df): """Returns all columns in df that have data types of floats or ints""" dtypes = df.dtypes - return [i for i in dtypes.index if dtypes.loc[i].name.startswith(('float', 'int'))] + return [i for i in dtypes.index if dtypes.loc[i].name.startswith(("float", "int"))] def check_null(df, name=None, fail=False): @@ -139,9 +150,9 @@ def check_null(df, name=None, fail=False): """ anynull = df.isnull().values.any() if fail: - assert(not anynull) + assert not anynull if anynull: - msg = 'Null (missing) values found for {} indicies: \n{}' + msg = "Null (missing) values found for {} indicies: \n{}" _df = df[df.isnull().any(axis=1)].reset_index()[df_idx] logger().warning(msg.format(name, _df)) df.dropna(inplace=True, axis=1) @@ -149,8 +160,8 @@ def check_null(df, name=None, fail=False): def gases(var_col): """The gas associated with each variable""" - gasidx = lambda x: x.split('|').index('Emissions') + 1 - return var_col.apply(lambda x: x.split('|')[gasidx(x)]) + gasidx = lambda x: x.split("|").index("Emissions") + 1 + return var_col.apply(lambda x: x.split("|")[gasidx(x)]) def units(var_col): @@ -163,44 +174,42 @@ def units(var_col): gas_col = gas_col.apply(replace) return gas_col.apply( - lambda gas: '{} {}/yr'.format('kt' if gas in kt_gases else 'Mt', gas)) + lambda gas: "{} {}/yr".format("kt" if gas in kt_gases else "Mt", gas) + ) -def remove_emissions_prefix(x, gas='XXX'): +def remove_emissions_prefix(x, gas="XXX"): """Return x with emissions prefix removed, e.g., Emissions|XXX|foo|bar -> foo|bar """ - return 
re.sub(r'^Emissions\|{}\|'.format(gas), '', x) + return re.sub(r"^Emissions\|{}\|".format(gas), "", x) -def recalculated_row_idx(df, prefix='', suffix=''): +def recalculated_row_idx(df, prefix="", suffix=""): """Return a boolean array with rows that need to be recalculated. - These are rows with total values for a gas species which is a sum of - subsectors. - During harmonization, subsector totals change, thus this summation must - be recalculated. + These are rows with total values for a gas species which is a sum of + subsectors. + During harmonization, subsector totals change, thus this summation must + be recalculated. """ df = df.reset_index() - gas_sec_pairs = df[['gas', 'sector']].drop_duplicates() - total_sector = '|'.join([prefix, suffix]) + gas_sec_pairs = df[["gas", "sector"]].drop_duplicates() + total_sector = "|".join([prefix, suffix]) gases_with_subsectors = df.gas.isin( - gas_sec_pairs[gas_sec_pairs.sector != total_sector] - .gas - .unique() + gas_sec_pairs[gas_sec_pairs.sector != total_sector].gas.unique() ) is_sector_total = df.sector == total_sector return np.array(gases_with_subsectors & is_sector_total) -def remove_recalculated_sectors(df, prefix='', suffix=''): - """Return df with Total gas (sum of all sectors) removed - """ - idx = recalculated_row_idx(df, prefix='', suffix='') +def remove_recalculated_sectors(df, prefix="", suffix=""): + """Return df with Total gas (sum of all sectors) removed""" + idx = recalculated_row_idx(df, prefix="", suffix="") return df[~idx] -def subtract_regions_from_world(df, name=None, base_year='2015', threshold=5e-2): +def subtract_regions_from_world(df, name=None, base_year="2015", threshold=5e-2): """Subtract the sum of regional results in each variable from the World total. If the result is a World total below a threshold, set those values to 0. @@ -216,31 +225,39 @@ def subtract_regions_from_world(df, name=None, base_year='2015', threshold=5e-2) """ # make global only global (not global + sum of regions) check_null(df, name) - if (df.loc['World'][base_year] == 0).all(): + if (df.loc["World"][base_year] == 0).all(): # some models (gcam) are not reporting any values in World # without this, you get `0 - sum(other regions)` - logger().warning('Empty global region found in ' + name) + logger().warning("Empty global region found in " + name) return df # sum all rows where region == World - total = combine_rows(df, 'region', 'World', sumall=True, - others=[], rowsonly=True) + total = combine_rows(df, "region", "World", sumall=True, others=[], rowsonly=True) # sum all rows where region != World - nonglb = combine_rows(df, 'region', 'World', sumall=False, - others=None, rowsonly=True) + nonglb = combine_rows( + df, "region", "World", sumall=False, others=None, rowsonly=True + ) glb = total.subtract(nonglb, fill_value=0) # pick up some precision issues # TODO: this precision is large because I have seen model results # be reported with this large of difference due to round off and values # approaching 0 - glb[(glb / total).abs() < threshold] = 0. + glb[(glb / total).abs() < threshold] = 0.0 df = glb.combine_first(df) check_null(df, name) return df -def combine_rows(df, level, main, others=None, sumall=True, dropothers=True, - rowsonly=False, newlabel=None): +def combine_rows( + df, + level, + main, + others=None, + sumall=True, + dropothers=True, + rowsonly=False, + newlabel=None, +): """Combine rows (add values) in a dataframe. 
Rows corresponding to the main and other values in a given level (or column) are added together and reattached taking the main value in the new column. @@ -280,8 +297,7 @@ def combine_rows(df, level, main, others=None, sumall=True, dropothers=True, lvl_values = df[level].unique() # if others is none, then its everything other than the primary - others = others if others is not None else \ - list(set(lvl_values) - set([main])) + others = others if others is not None else list(set(lvl_values) - set([main])) # set up df idx for operations grp_idx = [x for x in df_idx if x != level] @@ -289,27 +305,14 @@ def combine_rows(df, level, main, others=None, sumall=True, dropothers=True, # generate new rows which are summation of subset of old rows sum_subset = [main] + others if sumall else others - rows = ( - df.loc[sum_subset] - .groupby(level=grp_idx) - .sum() - ) + rows = df.loc[sum_subset].groupby(level=grp_idx).sum() rows[level] = newlabel - rows = ( - rows - .set_index(level, append=True) - .reorder_levels(df_idx) - .sort_index() - ) + rows = rows.set_index(level, append=True).reorder_levels(df_idx).sort_index() # get rid of rows that aren't needed in final dataframe drop = [main] + others if dropothers else [main] drop = list(set(drop) & set(lvl_values)) - df = ( - df.drop(drop) - .reset_index() - .set_index(df_idx) - ) + df = df.drop(drop).reset_index().set_index(df_idx) # construct final dataframe df = rows if rowsonly else pd.concat([df, rows]).sort_index() @@ -320,8 +323,9 @@ def combine_rows(df, level, main, others=None, sumall=True, dropothers=True, return df -def agg_regions(df, rfrom='ISO Code', rto='Native Region Code', mapping=None, - verify=True): +def agg_regions( + df, rfrom="ISO Code", rto="Native Region Code", mapping=None, verify=True +): """Aggregate values in a dataframe to a new regional composition Parameters @@ -340,11 +344,11 @@ def agg_regions(df, rfrom='ISO Code', rto='Native Region Code', mapping=None, ------- df : pd.DataFrame """ - mapping = mapping if mapping is not None else \ - pd.read_csv(region_path('message.csv')) + mapping = ( + mapping if mapping is not None else pd.read_csv(region_path("message.csv")) + ) mapping[rfrom] = mapping[rfrom].str.upper() - case_map = pd.Series(mapping[rto].unique(), - index=mapping[rto].str.upper().unique()) + case_map = pd.Series(mapping[rto].unique(), index=mapping[rto].str.upper().unique()) mapping[rto] = mapping[rto].str.upper() mapping = mapping[[rfrom, rto]].drop_duplicates().dropna() @@ -358,17 +362,17 @@ def agg_regions(df, rfrom='ISO Code', rto='Native Region Code', mapping=None, check = mapping[rfrom] notin = list(set(df.region) - set(check)) if len(notin) > 0: - logger().warning( - 'Removing regions without direct mapping: {}'.format(notin)) + logger().warning("Removing regions without direct mapping: {}".format(notin)) df = df[df.region.isin(check)] # map and sum dfto = ( - df - .merge(mapping, left_on='region', right_on=rfrom, how='outer') - .drop([rfrom, 'region'], axis=1) - .rename(columns={rto: 'region'}) - .groupby(df_idx).sum().reset_index() + df.merge(mapping, left_on="region", right_on=rfrom, how="outer") + .drop([rfrom, "region"], axis=1) + .rename(columns={rto: "region"}) + .groupby(df_idx) + .sum() + .reset_index() ) dfto.region = dfto.region.map(case_map) dfto = dfto.set_index(df_idx).sort_index() @@ -379,8 +383,8 @@ def agg_regions(df, rfrom='ISO Code', rto='Native Region Code', mapping=None, end = dfto[numcols(dfto)].values.sum() diff = abs(start - end) if np.isnan(diff) or diff / start > 1e-6: - msg = 
-        raise(ValueError(msg.format(diff)))
+        msg = "Difference between before and after is large: {}"
+        raise ValueError(msg.format(diff))
 
     # revert form if needed
     if not multi_idx:
@@ -407,7 +411,7 @@ def __init__(self, df, model=None, scenario=None):
         self.df = df
         self.model = model
         self.scenario = scenario
-        assert((self.df.units == 'kt').all())
+        assert (self.df.units == "kt").all()
 
     def add_variables(self, totals=None, aggregates=True):
         """Add aggregates and variables with direct mappings.
@@ -433,26 +437,26 @@ def to_template(self, **kwargs):
            first_year: optional, the first year to report values for
         """
         self.df = FormatTranslator(self.df).to_template(
-            model=self.model, scenario=self.scenario, **kwargs)
+            model=self.model, scenario=self.scenario, **kwargs
+        )
         return self.df
 
     def _add_totals(self, totals):
-        assert(not (self.df.sector == totals).any())
-        grp_idx = [x for x in df_idx if x != 'sector']
+        assert not (self.df.sector == totals).any()
+        grp_idx = [x for x in df_idx if x != "sector"]
         rows = self.df.groupby(grp_idx).sum().reset_index()
-        rows['sector'] = totals
+        rows["sector"] = totals
         self.df = pd.concat([self.df, rows])
 
     def _add_aggregates(self):
-        mapping = pd_read(iamc_path('sector_mapping.xlsx'),
-                          sheet_name='Aggregates')
+        mapping = pd_read(iamc_path("sector_mapping.xlsx"), sheet_name="Aggregates")
         mapping = mapping.applymap(remove_emissions_prefix)
 
         rows = []
-        for sector in mapping['IAMC Parent'].unique():
+        for sector in mapping["IAMC Parent"].unique():
             # mapping for aggregate sector for all gases
-            _map = mapping[mapping['IAMC Parent'] == sector]
-            _map = _map.set_index('IAMC Child')['IAMC Parent']
+            _map = mapping[mapping["IAMC Parent"] == sector]
+            _map = _map.set_index("IAMC Child")["IAMC Parent"]
 
             # rename variable column for subset of rows
             subset = self.df[self.df.sector.isin(_map.index)].copy()
@@ -468,7 +472,7 @@ class FormatTranslator(object):
     """Helper class to translate between IAMC and calculation formats"""
 
-    def __init__(self, df=None, prefix='', suffix=''):
+    def __init__(self, df=None, prefix="", suffix=""):
         self.df = df if df is None else df.copy()
         self.model = None
         self.scenario = None
@@ -490,26 +494,25 @@ def to_std(self, df=None, set_metadata=True):
         df.reset_index(inplace=True)
 
         if len(set(iamc_idx) - set(df.columns)):
-            msg = 'Columns do not conform with IAMC index: {}'
+            msg = "Columns do not conform with IAMC index: {}"
             raise ValueError(msg.format(df.columns))
 
         # make sure we're working with good data
-        if len(df['Model'].unique()) > 1:
-            raise ValueError(
-                'Model not unique: {}'.format(df['Model'].unique()))
-        assert(len(df['Scenario'].unique()) <= 1)
-        assert(df['Variable'].apply(lambda x: 'Emissions' in x).all())
+        if len(df["Model"].unique()) > 1:
+            raise ValueError("Model not unique: {}".format(df["Model"].unique()))
+        assert len(df["Scenario"].unique()) <= 1
+        assert df["Variable"].apply(lambda x: "Emissions" in x).all()
 
         # save data
         if set_metadata:
-            self.model = df['Model'].iloc[0]
-            self.scenario = df['Scenario'].iloc[0]
+            self.model = df["Model"].iloc[0]
+            self.scenario = df["Scenario"].iloc[0]
 
         # add std columns needed for conversions
-        df['region'] = df['Region']
-        df['gas'] = gases(df['Variable'])
-        df['units'] = df['Unit'].apply(lambda x: x.split()[0])
-        df['sector'] = df['Variable']
+        df["region"] = df["Region"]
+        df["gas"] = gases(df["Variable"])
+        df["units"] = df["Unit"].apply(lambda x: x.split()[0])
+        df["sector"] = df["Variable"]
 
         # convert gas names
self._convert_gases(df, tostd=True) @@ -519,15 +522,16 @@ def to_std(self, df=None, set_metadata=True): # remove emissions prefix def update_sector(row): - sectors = row.sector.split('|') - idx = sectors.index('Emissions') + sectors = row.sector.split("|") + idx = sectors.index("Emissions") sectors.pop(idx) # emissions sectors.pop(idx) # gas - return '|'.join(sectors).strip('|') + return "|".join(sectors).strip("|") + if not df.empty: - df['sector'] = df.apply(update_sector, axis=1) + df["sector"] = df.apply(update_sector, axis=1) # drop old columns - df.drop(iamc_idx + ['Unit'], axis=1, inplace=True) + df.drop(iamc_idx + ["Unit"], axis=1, inplace=True) # set up index and column order df.set_index(df_idx, inplace=True) @@ -538,8 +542,7 @@ def update_sector(row): return df - def to_template(self, df=None, model=None, scenario=None, - column_style=None): + def to_template(self, df=None, model=None, scenario=None, column_style=None): """Translate a dataframe from standard calculation format to IAMC Parameters @@ -560,7 +563,7 @@ def to_template(self, df=None, model=None, scenario=None, scenario = scenario or self.scenario if set(df.columns) != set(df_idx + numcols(df)): - msg = 'Columns do not conform with standard index: {}' + msg = "Columns do not conform with standard index: {}" raise ValueError(msg.format(df.columns)) # convert gas names @@ -571,31 +574,32 @@ def to_template(self, df=None, model=None, scenario=None, # inject emissions prefix def update_sector(row): - sectors = row.sector.split('|') - idx = self.prefix.count('|') + 1 - sectors.insert(idx, 'Emissions') + sectors = row.sector.split("|") + idx = self.prefix.count("|") + 1 + sectors.insert(idx, "Emissions") sectors.insert(idx + 1, row.gas) - return '|'.join(sectors).strip('|') - df['sector'] = df.apply(update_sector, axis=1) + return "|".join(sectors).strip("|") + + df["sector"] = df.apply(update_sector, axis=1) # write units correctly - df['units'] = units(df.sector) + df["units"] = units(df.sector) # add new columns, remove old - df['Model'] = model - df['Scenario'] = scenario - df['Variable'] = df.sector - df['Region'] = df.region - df['Unit'] = df.units + df["Model"] = model + df["Scenario"] = scenario + df["Variable"] = df.sector + df["Region"] = df.region + df["Unit"] = df.units df.drop(df_idx, axis=1, inplace=True) # unit magic to make it always first, would be easier if it was in idx. 
- hold = df['Unit'] - df.drop('Unit', axis=1, inplace=True) - df.insert(0, 'Unit', hold) + hold = df["Unit"] + df.drop("Unit", axis=1, inplace=True) + df.insert(0, "Unit", hold) # set up index and column order idx = iamc_idx - if column_style == 'upper': + if column_style == "upper": df.columns = df.columns.str.upper() idx = [x.upper() for x in idx] df.set_index(idx, inplace=True) @@ -614,19 +618,19 @@ def _convert_gases(self, df, tostd=True): # from, to for f, t in convert: - for col in ['gas', 'sector']: + for col in ["gas", "sector"]: df[col] = df[col].replace(f, t) def _convert_units(self, df, tostd=True): where = ~df.gas.isin(kt_gases) if tostd: df.loc[where, numcols(df)] *= 1e3 - df.loc[where, 'units'] = 'kt' - assert((df.units == 'kt').all()) + df.loc[where, "units"] = "kt" + assert (df.units == "kt").all() else: - assert((df.units == 'kt').all()) + assert (df.units == "kt").all() df.loc[where, numcols(df)] /= 1e3 - df.loc[where, 'units'] = 'Mt' + df.loc[where, "units"] = "Mt" def isin(df=None, **filters): @@ -638,8 +642,49 @@ def isin(df=None, **filters): or with explicit df to get boolean mask > isin(df, region="World", gas=["CO2", "N2O"]) """ + def tester(df): tests = (df.index.isin(np.atleast_1d(v), level=k) for k, v in filters.items()) return reduce(and_, tests, next(tests)) return tester if df is None else tester(df) + + +def pd_read(f, str_cols=False, *args, **kwargs): + """Try to read a file with pandas, supports CSV and XLSX + + Parameters + ---------- + f : string + the file to read in + str_cols : bool, optional + turn all columns into strings (numerical column names are sometimes + read in as numerical dtypes) + args, kwargs : sent directly to the Pandas read function + + Returns + ------- + df : pd.DataFrame + """ + if f.endswith("csv"): + df = pd.read_csv(f, *args, **kwargs) + else: + df = pd.read_excel(f, *args, **kwargs) + + if str_cols: + df.columns = [str(x) for x in df.columns] + + return df + + +def pd_write(df, f, *args, **kwargs): + """Try to write a file with pandas, supports CSV and XLSX""" + # guess whether to use index, unless we're told otherwise + index = kwargs.pop("index", isinstance(df.index, pd.MultiIndex)) + + if f.endswith("csv"): + df.to_csv(f, index=index, *args, **kwargs) + else: + writer = pd.ExcelWriter(f) + df.to_excel(writer, index=index, *args, **kwargs) + writer.save() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c59836a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,10 @@ +[tool.black] +line-length = 88 +target-version = ['py39'] +extend-exclude = ''' +( + _version.py | + versioneer.py | + ^/doc +) +''' \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 145e53f..0f41d6d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,7 +10,7 @@ tag_prefix = v parentdir_prefix = aneris- [flake8] -ignore = I002, F403, E402, E731, E203 +ignore = I002, F403, E402, E731, E203, W503 max-line-length = 88 exclude = doc diff --git a/setup.py b/setup.py index 9abad7b..6967954 100755 --- a/setup.py +++ b/setup.py @@ -1,4 +1,3 @@ - #!/usr/bin/env python from __future__ import print_function @@ -22,26 +21,27 @@ """ REQUIREMENTS = [ - 'numpy', - 'pandas>=1.1', - 'PyYAML', - 'xlrd>=2.0', - 'openpyxl', - 'matplotlib', - 'pyomo>=5' + "numpy", + "pandas>=1.1", + "PyYAML", + "xlrd>=2.0", + "openpyxl", + "matplotlib", + "pyomo>=5", ] EXTRA_REQUIREMENTS = { - 'tests': ['pytest', 'coverage', 'coveralls', 'pytest', 'pytest-cov'], - 'deploy': ['twine', 'setuptools', 'wheel'], - 'units': ['openscm-units'] + "tests": ["pytest", 
"coverage", "coveralls", "pytest", "pytest-cov", "black"], + "deploy": ["twine", "setuptools", "wheel"], + "units": ["openscm-units"], } # thank you https://stormpath.com/blog/building-simple-cli-interfaces-in-python class RunTests(Command): """Run all tests.""" - description = 'run tests' + + description = "run tests" user_options = [] def initialize_options(self): @@ -52,29 +52,29 @@ def finalize_options(self): def run(self): """Run all tests!""" - errno = call(['py.test', '--cov=skele', '--cov-report=term-missing']) + errno = call(["py.test", "--cov=skele", "--cov-report=term-missing"]) raise SystemExit(errno) CMDCLASS = versioneer.get_cmdclass() -CMDCLASS.update({'test': RunTests}) +CMDCLASS.update({"test": RunTests}) def main(): print(logo) classifiers = [ - 'License :: OSI Approved :: Apache Software License', + "License :: OSI Approved :: Apache Software License", ] packages = [ - 'aneris', + "aneris", ] pack_dir = { - 'aneris': 'aneris', + "aneris": "aneris", } entry_points = { - 'console_scripts': [ + "console_scripts": [ # list CLIs here - 'aneris=aneris.cli:main', + "aneris=aneris.cli:main", ], } package_data = { @@ -85,22 +85,22 @@ def main(): extra_requirements = EXTRA_REQUIREMENTS setup_kwargs = { "name": "aneris-iamc", - 'version': versioneer.get_version(), - "description": 'Harmonize Integrated Assessment Model Emissions ' - 'Trajectories', - "author": 'Matthew Gidden', - "author_email": 'matthew.gidden@gmail.com', - "url": 'http://github.com/iiasa/aneris', - 'cmdclass': CMDCLASS, - 'classifiers': classifiers, - 'license': 'Apache License 2.0', - 'packages': packages, - 'package_dir': pack_dir, - 'entry_points': entry_points, - 'package_data': package_data, - 'python_requires': '>=3.6', - 'install_requires': install_requirements, - 'extras_require': extra_requirements, + "version": versioneer.get_version(), + "description": "Harmonize Integrated Assessment Model Emissions " + "Trajectories", + "author": "Matthew Gidden", + "author_email": "matthew.gidden@gmail.com", + "url": "http://github.com/iiasa/aneris", + "cmdclass": CMDCLASS, + "classifiers": classifiers, + "license": "Apache License 2.0", + "packages": packages, + "package_dir": pack_dir, + "entry_points": entry_points, + "package_data": package_data, + "python_requires": ">=3.6", + "install_requires": install_requirements, + "extras_require": extra_requirements, } rtn = setup(**setup_kwargs) diff --git a/tests/ci/download_data.py b/tests/ci/download_data.py index 320b676..67ad0bc 100644 --- a/tests/ci/download_data.py +++ b/tests/ci/download_data.py @@ -2,30 +2,31 @@ import requests import tarfile -username = os.environ['ANERIS_CI_USER'] -password = os.environ['ANERIS_CI_PW'] +username = os.environ["ANERIS_CI_USER"] +password = os.environ["ANERIS_CI_PW"] -url = 'https://data.ene.iiasa.ac.at/continuous_integration/aneris/' +url = "https://data.ene.iiasa.ac.at/continuous_integration/aneris/" def download(filename): r = requests.get(url + filename, auth=(username, password)) if r.status_code == 200: - print('Downloading {} from {}'.format(filename, url)) - with open(filename, 'wb') as out: + print("Downloading {} from {}".format(filename, url)) + with open(filename, "wb") as out: for bits in r.iter_content(): out.write(bits) assert os.path.exists(filename) - print('Untarring {}'.format(filename)) + print("Untarring {}".format(filename)) tar = tarfile.open(filename, "r:gz") tar.extractall() tar.close() os.remove(filename) else: raise IOError( - 'Failed download with user/pass: {}/{}'.format(username, password)) + 
"Failed download with user/pass: {}/{}".format(username, password) + ) -download('data.tar.gz') -download('output.tar.gz') +download("data.tar.gz") +download("output.tar.gz") diff --git a/tests/test_default_decision_tree.py b/tests/test_default_decision_tree.py index 796c7ed..fde47e1 100644 --- a/tests/test_default_decision_tree.py +++ b/tests/test_default_decision_tree.py @@ -6,10 +6,10 @@ import pandas.testing as pdt -def make_index(length, gas='CH4', sector='Energy'): +def make_index(length, gas="CH4", sector="Energy"): return pd.MultiIndex.from_product( [["region_{i}" for i in range(length)], [gas], [sector]], - names=["region", "gas", "sector"] + names=["region", "gas", "sector"], ) @@ -20,153 +20,135 @@ def index1(): @pytest.fixture def index1_co2(): - return make_index(1, gas='CO2') + return make_index(1, gas="CO2") def test_hist_zero(index1): - hist = pd.DataFrame({'2015': [0]}, index1) - df = pd.DataFrame({'2015': [1.]}, index1) + hist = pd.DataFrame({"2015": [0]}, index1) + df = pd.DataFrame({"2015": [1.0]}, index1) - obs, diags = harmonize.default_methods(hist, df, '2015') + obs, diags = harmonize.default_methods(hist, df, "2015") - exp = pd.Series(['hist_zero'], index1, name='methods') + exp = pd.Series(["hist_zero"], index1, name="methods") pdt.assert_series_equal(exp, obs, check_names=False) def test_model_zero(index1): - hist = pd.DataFrame({'2015': [1.]}, index1) - df = pd.DataFrame({'2015': [0.]}, index1) + hist = pd.DataFrame({"2015": [1.0]}, index1) + df = pd.DataFrame({"2015": [0.0]}, index1) - obs, diags = harmonize.default_methods(hist, df, '2015') + obs, diags = harmonize.default_methods(hist, df, "2015") - exp = pd.Series(['model_zero'], index1, name='methods') + exp = pd.Series(["model_zero"], index1, name="methods") pdt.assert_series_equal(exp, obs, check_names=False) def test_branch1(index1): - hist = pd.DataFrame({'2015': [1.]}, index1) - df = pd.DataFrame( - {'2015': [0.], '2020': [-1.]}, - index1 - ) + hist = pd.DataFrame({"2015": [1.0]}, index1) + df = pd.DataFrame({"2015": [0.0], "2020": [-1.0]}, index1) - obs, diags = harmonize.default_methods(hist, df, '2015') - exp = pd.Series(['reduce_offset_2080'], index1, name='methods') + obs, diags = harmonize.default_methods(hist, df, "2015") + exp = pd.Series(["reduce_offset_2080"], index1, name="methods") pdt.assert_series_equal(exp, obs, check_names=False) - obs, diags = harmonize.default_methods(hist, df, '2015', - offset_method='reduce_offset_2050') - exp = pd.Series(['reduce_offset_2050'], index1, name='methods') + obs, diags = harmonize.default_methods( + hist, df, "2015", offset_method="reduce_offset_2050" + ) + exp = pd.Series(["reduce_offset_2050"], index1, name="methods") pdt.assert_series_equal(exp, obs, check_names=False) def test_branch2(index1): - hist = pd.DataFrame({'2015': [1.]}, index1) - df = pd.DataFrame( - {'2015': [0.], '2020': [1.]}, - index1 - ) + hist = pd.DataFrame({"2015": [1.0]}, index1) + df = pd.DataFrame({"2015": [0.0], "2020": [1.0]}, index1) - obs, diags = harmonize.default_methods(hist, df, '2015') - exp = pd.Series(['constant_offset'], index1, name='methods') + obs, diags = harmonize.default_methods(hist, df, "2015") + exp = pd.Series(["constant_offset"], index1, name="methods") pdt.assert_series_equal(exp, obs, check_names=False) def test_branch3(index1): - hist = pd.DataFrame( - {'2015': [1.]}, - index1 - ) - df = pd.DataFrame( - {'2015': [1.001], '2020': [-1.001]}, - index1 - ) + hist = pd.DataFrame({"2015": [1.0]}, index1) + df = pd.DataFrame({"2015": [1.001], "2020": 
[-1.001]}, index1) - obs, diags = harmonize.default_methods(hist, df, '2015') - exp = pd.Series(['reduce_ratio_2080'], index1, name='methods') + obs, diags = harmonize.default_methods(hist, df, "2015") + exp = pd.Series(["reduce_ratio_2080"], index1, name="methods") pdt.assert_series_equal(exp, obs, check_names=False) - obs, diags = harmonize.default_methods(hist, df, '2015', - ratio_method='reduce_ratio_2050') - exp = pd.Series(['reduce_ratio_2050'], index1, name='methods') + obs, diags = harmonize.default_methods( + hist, df, "2015", ratio_method="reduce_ratio_2050" + ) + exp = pd.Series(["reduce_ratio_2050"], index1, name="methods") pdt.assert_series_equal(exp, obs, check_names=False) def test_branch4(index1): - hist = pd.DataFrame({'2015': [1.]}, index1) - df = pd.DataFrame( - {'2015': [5.001], '2020': [-1.]}, - index1 - ) + hist = pd.DataFrame({"2015": [1.0]}, index1) + df = pd.DataFrame({"2015": [5.001], "2020": [-1.0]}, index1) - obs, diags = harmonize.default_methods(hist, df, '2015') + obs, diags = harmonize.default_methods(hist, df, "2015") - exp = pd.Series(['reduce_ratio_2100'], index1, name='methods') + exp = pd.Series(["reduce_ratio_2100"], index1, name="methods") pdt.assert_series_equal(exp, obs, check_names=False) def test_branch5(index1): - hist = pd.DataFrame({'2015': [1.]}, index1) - df = pd.DataFrame( - {'2015': [5.001], '2020': [1.]}, - index1 - ) + hist = pd.DataFrame({"2015": [1.0]}, index1) + df = pd.DataFrame({"2015": [5.001], "2020": [1.0]}, index1) - obs, diags = harmonize.default_methods(hist, df, '2015') + obs, diags = harmonize.default_methods(hist, df, "2015") - exp = pd.Series(['constant_ratio'], index1, name='methods') + exp = pd.Series(["constant_ratio"], index1, name="methods") pdt.assert_series_equal(exp, obs, check_names=False) def test_branch6(index1): hist = pd.DataFrame( { - '2000': [1.], - '2005': [1000.], - '2010': [1.], - '2015': [100.], + "2000": [1.0], + "2005": [1000.0], + "2010": [1.0], + "2015": [100.0], }, - index1 + index1, ) df = pd.DataFrame( { - '2015': [5.001], - '2020': [1.], + "2015": [5.001], + "2020": [1.0], }, - index1 + index1, ) - obs, diags = harmonize.default_methods(hist, df, '2015') + obs, diags = harmonize.default_methods(hist, df, "2015") print(diags) - exp = pd.Series(['reduce_offset_2150_cov'], index1, name='methods') + exp = pd.Series(["reduce_offset_2150_cov"], index1, name="methods") pdt.assert_series_equal(exp, obs, check_names=False) def test_custom_method_choice(index1, index1_co2): - def method_choice( - row, ratio_method, offset_method, luc_method, luc_cov_threshold - ): - return 'budget' if row.gas == 'CO2' else ratio_method + def method_choice(row, ratio_method, offset_method, luc_method, luc_cov_threshold): + return "budget" if row.gas == "CO2" else ratio_method # CH4 - hist_ch4 = pd.DataFrame({'2015': [1.]}, index1) - df_ch4 = pd.DataFrame({'2015': [1.]}, index1) + hist_ch4 = pd.DataFrame({"2015": [1.0]}, index1) + df_ch4 = pd.DataFrame({"2015": [1.0]}, index1) obs_ch4, _ = harmonize.default_methods( - hist_ch4, df_ch4, '2015', method_choice=method_choice + hist_ch4, df_ch4, "2015", method_choice=method_choice ) - exp_ch4 = pd.Series(['reduce_ratio_2080'], index1, name='methods') + exp_ch4 = pd.Series(["reduce_ratio_2080"], index1, name="methods") pdt.assert_series_equal(exp_ch4, obs_ch4, check_names=False) # CO2 - hist_co2 = pd.DataFrame({'2015': [1.]}, index1_co2) - df_co2 = pd.DataFrame({'2015': [1.]}, index1_co2) + hist_co2 = pd.DataFrame({"2015": [1.0]}, index1_co2) + df_co2 = pd.DataFrame({"2015": 
[1.0]}, index1_co2) obs_co2, _ = harmonize.default_methods( - hist_co2, df_co2, '2015', method_choice=method_choice + hist_co2, df_co2, "2015", method_choice=method_choice ) - exp_co2 = pd.Series(['budget'], index1_co2, name='methods') + exp_co2 = pd.Series(["budget"], index1_co2, name="methods") pdt.assert_series_equal(exp_co2, obs_co2, check_names=False) diff --git a/tests/test_harmonize.py b/tests/test_harmonize.py index a7b35ce..476a2b6 100644 --- a/tests/test_harmonize.py +++ b/tests/test_harmonize.py @@ -10,35 +10,53 @@ nvals = 6 -_df = pd.DataFrame({ - 'gas': ['BC'] * nvals, - 'region': ['a'] * nvals, - 'units': ['Mt'] * nvals, - 'sector': ['bar', 'foo'] + [str(x) for x in range(nvals - 2)], - '2010': [2, 1, 9000, 9000, 9000, 9000], - '2015': [3, 2, 0.51, 9000, 9000, -90], - '2040': [4.5, 1.5, 9000, 9000, 9000, 9000], - '2060': [6, 1, 9000, 9000, 9000, 9000], -}).set_index(utils.df_idx).sort_index() +_df = ( + pd.DataFrame( + { + "gas": ["BC"] * nvals, + "region": ["a"] * nvals, + "units": ["Mt"] * nvals, + "sector": ["bar", "foo"] + [str(x) for x in range(nvals - 2)], + "2010": [2, 1, 9000, 9000, 9000, 9000], + "2015": [3, 2, 0.51, 9000, 9000, -90], + "2040": [4.5, 1.5, 9000, 9000, 9000, 9000], + "2060": [6, 1, 9000, 9000, 9000, 9000], + } + ) + .set_index(utils.df_idx) + .sort_index() +) _t_frac = lambda tf: (2040 - 2015) / float(tf - 2015) -_hist = pd.DataFrame({ - 'gas': ['BC'] * nvals, - 'region': ['a'] * nvals, - 'units': ['Mt'] * nvals, - 'sector': ['bar', 'foo'] + [str(x) for x in range(nvals - 2)], - '2010': [1., 0.34, 9000, 9000, 9000, 9000], - '2015': [0.01, 1., 0.5, 2 * 8999. / 9, 3 * 8999., 8999.], -}).set_index(utils.df_idx).sort_index() - -_methods = pd.DataFrame({ - 'gas': _df.index.get_level_values('gas'), - 'sector': _df.index.get_level_values('sector'), - 'region': ['a'] * nvals, - 'units': ['Mt'] * nvals, - 'method': ['constant_offset'] * nvals, -}).set_index(utils.df_idx).sort_index() +_hist = ( + pd.DataFrame( + { + "gas": ["BC"] * nvals, + "region": ["a"] * nvals, + "units": ["Mt"] * nvals, + "sector": ["bar", "foo"] + [str(x) for x in range(nvals - 2)], + "2010": [1.0, 0.34, 9000, 9000, 9000, 9000], + "2015": [0.01, 1.0, 0.5, 2 * 8999.0 / 9, 3 * 8999.0, 8999.0], + } + ) + .set_index(utils.df_idx) + .sort_index() +) + +_methods = ( + pd.DataFrame( + { + "gas": _df.index.get_level_values("gas"), + "sector": _df.index.get_level_values("sector"), + "region": ["a"] * nvals, + "units": ["Mt"] * nvals, + "method": ["constant_offset"] * nvals, + } + ) + .set_index(utils.df_idx) + .sort_index() +) def test_factors(): @@ -46,7 +64,7 @@ def test_factors(): hist = _hist.copy() obsoffset, obsratio = harmonize.harmonize_factors(df.copy(), hist.copy()) # im lazy; test initially written when these were of length 2 - exp = np.array([0.01 - 3, -1.]) + exp = np.array([0.01 - 3, -1.0]) npt.assert_array_almost_equal(exp, obsoffset[-2:]) exp = np.array([0.01 / 3, 0.5]) npt.assert_array_almost_equal(exp, obsratio[-2:]) @@ -57,22 +75,22 @@ def test_harmonize_constant_offset(): hist = _hist.copy() methods = _methods.copy() h = harmonize.Harmonizer(df, hist) - res = h.harmonize(overrides=methods['method']) + res = h.harmonize(overrides=methods["method"]) # base year - obs = res['2015'] - exp = _hist['2015'] + obs = res["2015"] + exp = _hist["2015"] npt.assert_array_almost_equal(obs, exp) # future year - obs = res['2060'] - exp = _df['2060'] + (_hist['2015'] - _df['2015']) + obs = res["2060"] + exp = _df["2060"] + (_hist["2015"] - _df["2015"]) npt.assert_array_almost_equal(obs, exp) 
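# The base-year and future-year expectations above (and in the constant_ratio
# test further below) encode the core harmonization arithmetic directly. A
# minimal standalone sketch of it, with illustrative numbers mirroring the
# "foo" row of the fixtures (only pandas assumed, nothing from this module):
#
#     import pandas as pd
#
#     model = pd.Series({"2015": 2.0, "2060": 1.0})  # model trajectory
#     hist_base = 1.0                                # history in the base year
#
#     offset = hist_base - model["2015"]  # constant_offset factor (-1.0)
#     ratio = hist_base / model["2015"]   # constant_ratio factor (0.5)
#
#     # both methods reproduce history exactly in the base year:
#     assert (model + offset)["2015"] == hist_base  # shifts the trajectory
#     assert (model * ratio)["2015"] == hist_base   # scales the trajectory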
def test_no_model(): - df = pd.DataFrame({'2015': [0]}) - hist = pd.DataFrame({'2015': [1.5]}) + df = pd.DataFrame({"2015": [0]}) + hist = pd.DataFrame({"2015": [1.5]}) obsoffset, obsratio = harmonize.harmonize_factors(df.copy(), hist.copy()) exp = np.array([1.5]) npt.assert_array_almost_equal(exp, obsoffset) @@ -85,17 +103,17 @@ def test_harmonize_constant_ratio(): hist = _hist.copy() methods = _methods.copy() h = harmonize.Harmonizer(df, hist) - methods['method'] = ['constant_ratio'] * nvals - res = h.harmonize(overrides=methods['method']) + methods["method"] = ["constant_ratio"] * nvals + res = h.harmonize(overrides=methods["method"]) # base year - obs = res['2015'] - exp = _hist['2015'] + obs = res["2015"] + exp = _hist["2015"] npt.assert_array_almost_equal(obs, exp) # future year - obs = res['2060'] - exp = _df['2060'] * (_hist['2015'] / _df['2015']) + obs = res["2060"] + exp = _df["2060"] * (_hist["2015"] / _df["2015"]) npt.assert_array_almost_equal(obs, exp) @@ -108,24 +126,24 @@ def test_harmonize_reduce_offset(): # this is bad, there should be a test for each case for tf in [2050, 2100, 2150]: print(tf) - method = 'reduce_offset_{}'.format(tf) - methods['method'] = [method] * nvals - res = h.harmonize(overrides=methods['method']) + method = "reduce_offset_{}".format(tf) + methods["method"] = [method] * nvals + res = h.harmonize(overrides=methods["method"]) # base year - obs = res['2015'] - exp = _hist['2015'] + obs = res["2015"] + exp = _hist["2015"] npt.assert_array_almost_equal(obs, exp) # future year - obs = res['2040'] - exp = _df['2040'] + (1 - _t_frac(tf)) * (_hist['2015'] - _df['2015']) + obs = res["2040"] + exp = _df["2040"] + (1 - _t_frac(tf)) * (_hist["2015"] - _df["2015"]) npt.assert_array_almost_equal(obs, exp) # future year if tf < 2060: - obs = res['2060'] - exp = _df['2060'] + obs = res["2060"] + exp = _df["2060"] npt.assert_array_almost_equal(obs, exp) @@ -138,25 +156,25 @@ def test_harmonize_reduce_ratio(): # this is bad, there should be a test for each case for tf in [2050, 2100, 2150]: print(tf) - method = 'reduce_ratio_{}'.format(tf) - methods['method'] = [method] * nvals - res = h.harmonize(overrides=methods['method']) + method = "reduce_ratio_{}".format(tf) + methods["method"] = [method] * nvals + res = h.harmonize(overrides=methods["method"]) # base year - obs = res['2015'] - exp = _hist['2015'] + obs = res["2015"] + exp = _hist["2015"] npt.assert_array_almost_equal(obs, exp) # future year - obs = res['2040'] - ratio = _hist['2015'] / _df['2015'] - exp = _df['2040'] * (ratio + _t_frac(tf) * (1 - ratio)) + obs = res["2040"] + ratio = _hist["2015"] / _df["2015"] + exp = _df["2040"] * (ratio + _t_frac(tf) * (1 - ratio)) npt.assert_array_almost_equal(obs, exp) # future year if tf < 2060: - obs = res['2060'] - exp = _df['2060'] + obs = res["2060"] + exp = _df["2060"] npt.assert_array_almost_equal(obs, exp) @@ -172,13 +190,13 @@ def test_harmonize_reduce_ratio_different_units(): tf = 2050 - method = 'reduce_ratio_{}'.format(tf) - methods['method'] = [method] * nvals - res = h.harmonize(overrides=methods['method']) + method = "reduce_ratio_{}".format(tf) + methods["method"] = [method] * nvals + res = h.harmonize(overrides=methods["method"]) # base year - obs = res['2015'] - exp = hist['2015'] + obs = res["2015"] + exp = hist["2015"] # should come back with input units obs_units = obs.index.get_level_values("units") df_units = df.index.get_level_values("units") @@ -186,15 +204,15 @@ def test_harmonize_reduce_ratio_different_units(): 
npt.assert_array_almost_equal(obs, exp) # future year - obs = res['2040'] - ratio = _hist['2015'] / _df['2015'] - exp = _df['2040'] * (ratio + _t_frac(tf) * (1 - ratio)) + obs = res["2040"] + ratio = _hist["2015"] / _df["2015"] + exp = _df["2040"] * (ratio + _t_frac(tf) * (1 - ratio)) npt.assert_array_almost_equal(obs, exp) # future year if tf < 2060: - obs = res['2060'] - exp = _df['2060'] + obs = res["2060"] + exp = _df["2060"] npt.assert_array_almost_equal(obs, exp) @@ -203,18 +221,20 @@ def test_harmonize_mix(): hist = _hist.copy() methods = _methods.copy() h = harmonize.Harmonizer(df, hist) - methods['method'] = ['constant_offset'] * nvals - res = h.harmonize(overrides=methods['method']) + methods["method"] = ["constant_offset"] * nvals + res = h.harmonize(overrides=methods["method"]) # base year - obs = res['2015'] - exp = _hist['2015'] + obs = res["2015"] + exp = _hist["2015"] npt.assert_array_almost_equal(obs, exp) # future year - obs = res['2060'][:2] - exp = [_df['2060'][0] + (_hist['2015'][0] - _df['2015'][0]), - _df['2060'][1] * (_hist['2015'][1] / _df['2015'][1])] + obs = res["2060"][:2] + exp = [ + _df["2060"][0] + (_hist["2015"][0] - _df["2015"][0]), + _df["2060"][1] * (_hist["2015"][1] / _df["2015"][1]), + ] npt.assert_array_almost_equal(obs, exp) @@ -223,16 +243,16 @@ def test_harmonize_linear_interpolation(): hist = _hist.copy() methods = _methods.copy() h = harmonize.Harmonizer(df, hist) - methods['method'] = ['linear_interpolate_2060'] * nvals - res = h.harmonize(overrides=methods['method']) + methods["method"] = ["linear_interpolate_2060"] * nvals + res = h.harmonize(overrides=methods["method"]) # base year - obs = res['2015'] - exp = _hist['2015'] + obs = res["2015"] + exp = _hist["2015"] npt.assert_array_almost_equal(obs, exp) # future year - x1, x2, x = '2015', '2060', '2040' + x1, x2, x = "2015", "2060", "2040" y1, y2 = _hist[x1], _df[x2] m = (y2 - y1) / (float(x2) - float(x1)) b = y1 - m * float(x1) @@ -241,8 +261,8 @@ def test_harmonize_linear_interpolation(): npt.assert_array_almost_equal(obs, exp) # year after interp - obs = res['2060'] - exp = _df['2060'] + obs = res["2060"] + exp = _df["2060"] npt.assert_array_almost_equal(obs, exp) @@ -252,12 +272,12 @@ def test_harmonize_budget(): methods = _methods.copy() h = harmonize.Harmonizer(df, hist) - methods['method'] = 'budget' - res = h.harmonize(overrides=methods['method']) + methods["method"] = "budget" + res = h.harmonize(overrides=methods["method"]) # base year - obs = res['2015'] - exp = _hist['2015'] + obs = res["2015"] + exp = _hist["2015"] npt.assert_array_almost_equal(obs, exp) # carbon budget conserved @@ -272,5 +292,5 @@ def _carbon_budget(emissions): npt.assert_array_almost_equal( _carbon_budget(res), - _carbon_budget(df) - _carbon_budget(hist.loc[:, '2010':'2015']), + _carbon_budget(df) - _carbon_budget(hist.loc[:, "2010":"2015"]), ) diff --git a/tests/test_io.py b/tests/test_io.py index 568a88e..62134e4 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -4,18 +4,18 @@ from aneris import _io _defaults = { - 'config': { - 'default_luc_method': 'reduce_ratio_2150_cov', - 'default_offset_method': 'reduce_offset_2080', - 'default_ratio_method': 'reduce_ratio_2080', - 'cov_threshold': 20, - 'harmonize_year': 2015, - 'global_harmonization_only': False, - 'replace_suffix': 'Harmonized-DB', + "config": { + "default_luc_method": "reduce_ratio_2150_cov", + "default_offset_method": "reduce_offset_2080", + "default_ratio_method": "reduce_ratio_2080", + "cov_threshold": 20, + "harmonize_year": 2015, + 
"global_harmonization_only": False, + "replace_suffix": "Harmonized-DB", }, - 'prefix': 'CEDS+|9+ Sectors', - 'suffix': 'Unharmonized', - 'add_5regions': True, + "prefix": "CEDS+|9+ Sectors", + "suffix": "Unharmonized", + "add_5regions": True, } @@ -28,7 +28,7 @@ def test_default_rc(): def test_mutable(): obs = _io.RunControl() with pytest.raises(TypeError): - obs['foo'] = 'bar' + obs["foo"] = "bar" def test_nondefault_rc(): @@ -39,7 +39,7 @@ def test_nondefault_rc(): obs = _io.RunControl(rcstr) exp = _defaults - exp['config']['cov_threshold'] = 42 + exp["config"]["cov_threshold"] = 42 assert exp == obs @@ -53,18 +53,18 @@ def test_nondefault_rc_file_read(): f.flush() obs = _io.RunControl(f.name) exp = _defaults - exp['config']['cov_threshold'] = 42 + exp["config"]["cov_threshold"] = 42 assert exp == obs def test_recursive_update(): update = { - 'foo': 'bar', - 'cov_threshold': 42, + "foo": "bar", + "cov_threshold": 42, } exp = _defaults - exp['config'].update(update) + exp["config"].update(update) obs = _io.RunControl() - obs.recursive_update('config', update) + obs.recursive_update("config", update) assert obs == exp diff --git a/tests/test_regression.py b/tests/test_regression.py index 65d71f6..1dc83cf 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -18,29 +18,28 @@ # worry about this again. here = join(os.path.dirname(os.path.realpath(__file__))) -ci_path = join(here, 'ci') +ci_path = join(here, "ci") # check variables for if we are on CI (will then run regression tests) -ON_CI_REASON = 'No access to regression test credentials' +ON_CI_REASON = "No access to regression test credentials" try: - os.environ['ANERIS_CI_USER'] + os.environ["ANERIS_CI_USER"] ON_CI = True except KeyError: ON_CI = False FILE_SUFFIXES = [ - 'global_only', - 'regions_sectors', - 'global_sectors', - 'mock_pipeline_prototype', - 'pipeline_progress', - 'full_ar6', - 'global_ar6', + "global_only", + "regions_sectors", + "global_sectors", + "mock_pipeline_prototype", + "pipeline_progress", + "full_ar6", + "global_ar6", ] -class TestHarmonizeRegression(): - +class TestHarmonizeRegression: def _run(self, inf, checkf, hist, reg, rc, prefix, name): # path setup prefix = join(here, prefix) @@ -48,9 +47,9 @@ def _run(self, inf, checkf, hist, reg, rc, prefix, name): reg = join(prefix, reg) rc = join(prefix, rc) inf = join(prefix, inf) - outf = join(prefix, '{}_harmonized.xlsx'.format(name)) - outf_meta = join(prefix, '{}_metadata.xlsx'.format(name)) - outf_diag = join(prefix, '{}_diagnostics.xlsx'.format(name)) + outf = join(prefix, "{}_harmonized.xlsx".format(name)) + outf_meta = join(prefix, "{}_metadata.xlsx".format(name)) + outf_diag = join(prefix, "{}_diagnostics.xlsx".format(name)) clean = [outf, outf_meta, outf_diag] # make sure we're fresh @@ -61,21 +60,31 @@ def _run(self, inf, checkf, hist, reg, rc, prefix, name): # run print(inf, hist, reg, rc, name) cli.harmonize( - inf, hist, reg, rc, prefix, name, return_result=False, + inf, + hist, + reg, + rc, + prefix, + name, + return_result=False, ) # test ncols = 5 expfile = join(prefix, checkf) - exp = pd.read_excel(expfile, sheet_name='data', - index_col=list(range(ncols)), - engine='openpyxl', - ).sort_index() + exp = pd.read_excel( + expfile, + sheet_name="data", + index_col=list(range(ncols)), + engine="openpyxl", + ).sort_index() exp.columns = exp.columns.astype(str) - obs = pd.read_excel(outf, sheet_name='data', - index_col=list(range(ncols)), - engine='openpyxl', - ).sort_index() + obs = pd.read_excel( + outf, + sheet_name="data", + 
index_col=list(range(ncols)),
+            engine="openpyxl",
+        ).sort_index()
 
         assert_frame_equal(exp, obs, check_dtype=False)
 
         # tidy up after
@@ -86,25 +95,25 @@
     @pytest.mark.parametrize("file_suffix", FILE_SUFFIXES)
     def test_basic_run(self, file_suffix):
         # this is run no matter what
-        prefix = 'test_data'
-        checkf = 'test_{}.xlsx'.format(file_suffix)
-        hist = 'history_{}.xls'.format(file_suffix)
-        reg = 'regions_{}.csv'.format(file_suffix)
-        inf = 'model_{}.xls'.format(file_suffix)
-        rc = 'aneris_{}.yaml'.format(file_suffix)
+        prefix = "test_data"
+        checkf = "test_{}.xlsx".format(file_suffix)
+        hist = "history_{}.xls".format(file_suffix)
+        reg = "regions_{}.csv".format(file_suffix)
+        inf = "model_{}.xls".format(file_suffix)
+        rc = "aneris_{}.yaml".format(file_suffix)
 
         # get all arguments
         self._run(inf, checkf, hist, reg, rc, prefix, file_suffix)
 
     @pytest.mark.skipif(not ON_CI, reason=ON_CI_REASON)
-    @pytest.mark.parametrize("name", ['msg', 'gcam'])
+    @pytest.mark.parametrize("name", ["msg", "gcam"])
     def test_regression_ci(self, name):
-        prefix = join(ci_path, 'test-{}'.format(name))
-        checkf = '{}_harmonized.xlsx'.format(name)
-        hist = 'history.csv'
-        reg = 'regiondef.xlsx'
-        rc = 'rc.yaml'
-        inf = 'inputfile.xlsx'
+        prefix = join(ci_path, "test-{}".format(name))
+        checkf = "{}_harmonized.xlsx".format(name)
+        hist = "history.csv"
+        reg = "regiondef.xlsx"
+        rc = "rc.yaml"
+        inf = "inputfile.xlsx"
 
         # copy needed files
         for fname in [hist, rc, checkf]:
diff --git a/tests/test_tutorials.py b/tests/test_tutorials.py
index 1155d06..d6cb3c3 100644
--- a/tests/test_tutorials.py
+++ b/tests/test_tutorials.py
@@ -8,7 +8,7 @@
 import jupyter
 
 here = os.path.dirname(os.path.realpath(__file__))
-tut_path = os.path.join(here, '..', 'doc', 'source')
+tut_path = os.path.join(here, "..", "doc", "source")
 
 # taken from the excellent example here:
 # https://blog.thedataincubator.com/2016/06/testing-jupyter-notebooks/
@@ -20,27 +20,35 @@ def _notebook_run(path, kernel=None, capsys=None):
     """
     assert os.path.exists(path)
     major_version = sys.version_info[0]
-    kernel = kernel or 'python{}'.format(major_version)
+    kernel = kernel or "python{}".format(major_version)
     if capsys is not None:
         with capsys.disabled():
-            print('using py version {} with kernel {}'.format(
-                major_version, kernel))
+            print("using py version {} with kernel {}".format(major_version, kernel))
 
     dirname, __ = os.path.split(path)
     os.chdir(dirname)
 
     fname = os.path.join(here, 'test.ipynb')
     args = [
-        'jupyter', 'nbconvert', '--to', 'notebook', '--execute',
-        '--ExecutePreprocessor.timeout=60',
-        '--ExecutePreprocessor.kernel_name={}'.format(kernel),
-        "--output", fname, path]
+        "jupyter",
+        "nbconvert",
+        "--to",
+        "notebook",
+        "--execute",
+        "--ExecutePreprocessor.timeout=60",
+        "--ExecutePreprocessor.kernel_name={}".format(kernel),
+        "--output",
+        fname,
+        path,
+    ]
     subprocess.check_call(args)
 
-    nb = nbformat.read(io.open(fname, encoding='utf-8'),
-                       nbformat.current_nbformat)
+    nb = nbformat.read(io.open(fname, encoding="utf-8"), nbformat.current_nbformat)
     errors = [
-        output for cell in nb.cells if "outputs" in cell
-        for output in cell["outputs"] if output.output_type == "error"
+        output
+        for cell in nb.cells
+        if "outputs" in cell
+        for output in cell["outputs"]
+        if output.output_type == "error"
     ]
 
     os.remove(fname)
@@ -49,6 +57,6 @@
 
 def test_tutorial(capsys):
-    fname = os.path.join(tut_path, 
'tutorial.ipynb') + fname = os.path.join(tut_path, "tutorial.ipynb") nb, errors = _notebook_run(fname, capsys=capsys) assert errors == [] diff --git a/tests/test_utils.py b/tests/test_utils.py index 97e7a47..2fb8896 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,32 +7,41 @@ def test_remove_emissions_prefix(): - assert 'foo' == utils.remove_emissions_prefix('foo') - assert 'foo' == utils.remove_emissions_prefix('Emissions|XXX|foo') - assert 'Emissions|bar|foo' == \ - utils.remove_emissions_prefix('Emissions|bar|foo') - assert 'foo' == \ - utils.remove_emissions_prefix('Emissions|bar|foo', gas='bar') + assert "foo" == utils.remove_emissions_prefix("foo") + assert "foo" == utils.remove_emissions_prefix("Emissions|XXX|foo") + assert "Emissions|bar|foo" == utils.remove_emissions_prefix("Emissions|bar|foo") + assert "foo" == utils.remove_emissions_prefix("Emissions|bar|foo", gas="bar") def test_region_agg_funky_name(): - df = pd.DataFrame({ - 'sector': ['foo', 'foo'], - 'region': ['a', 'b'], - '2010': [1.0, 4.0], - 'units': ['Mt'] * 2, - 'gas': ['BC'] * 2, - }).set_index(utils.df_idx).sort_index() - mapping = pd.DataFrame( - [['fOO_Bar', 'a'], ['fOO_Bar', 'b']], columns=['x', 'y']) - exp = pd.DataFrame({ - 'sector': ['foo'], - 'region': ['fOO_Bar'], - '2010': [5.0], - 'units': ['Mt'], - 'gas': ['BC'], - }).set_index(utils.df_idx).sort_index() - obs = utils.agg_regions(df, rfrom='y', rto='x', mapping=mapping) + df = ( + pd.DataFrame( + { + "sector": ["foo", "foo"], + "region": ["a", "b"], + "2010": [1.0, 4.0], + "units": ["Mt"] * 2, + "gas": ["BC"] * 2, + } + ) + .set_index(utils.df_idx) + .sort_index() + ) + mapping = pd.DataFrame([["fOO_Bar", "a"], ["fOO_Bar", "b"]], columns=["x", "y"]) + exp = ( + pd.DataFrame( + { + "sector": ["foo"], + "region": ["fOO_Bar"], + "2010": [5.0], + "units": ["Mt"], + "gas": ["BC"], + } + ) + .set_index(utils.df_idx) + .sort_index() + ) + obs = utils.agg_regions(df, rfrom="y", rto="x", mapping=mapping) pdt.assert_frame_equal(obs, exp) @@ -42,65 +51,69 @@ def test_no_repeat_gases(): def test_gases(): - var_col = pd.Series(['foo|Emissions|CH4|bar', 'Emissions|N2O|baz|zing']) - exp = pd.Series(['CH4', 'N2O']) + var_col = pd.Series(["foo|Emissions|CH4|bar", "Emissions|N2O|baz|zing"]) + exp = pd.Series(["CH4", "N2O"]) obs = utils.gases(var_col) pdt.assert_series_equal(obs, exp) def test_units(): - var_col = pd.Series(['foo|Emissions|CH4|bar', 'Emissions|N2O|baz|zing']) - exp = pd.Series(['Mt CH4/yr', 'kt N2O/yr']) + var_col = pd.Series(["foo|Emissions|CH4|bar", "Emissions|N2O|baz|zing"]) + exp = pd.Series(["Mt CH4/yr", "kt N2O/yr"]) obs = utils.units(var_col) pdt.assert_series_equal(obs, exp) def test_formatter_to_std(): - df = pd.DataFrame({ - 'Variable': [ - 'CEDS+|9+ Sectors|Emissions|BC|foo|Unharmonized', - 'Emissions|BC|bar|baz', - ], - 'Region': ['a', 'b'], - '2010': [5.0, 2.0], - '2020': [-1.0, 3.0], - 'Unit': ['Mt foo/yr'] * 2, - 'Model': ['foo'] * 2, - 'Scenario': ['foo'] * 2, - }) + df = pd.DataFrame( + { + "Variable": [ + "CEDS+|9+ Sectors|Emissions|BC|foo|Unharmonized", + "Emissions|BC|bar|baz", + ], + "Region": ["a", "b"], + "2010": [5.0, 2.0], + "2020": [-1.0, 3.0], + "Unit": ["Mt foo/yr"] * 2, + "Model": ["foo"] * 2, + "Scenario": ["foo"] * 2, + } + ) fmt = utils.FormatTranslator(df.copy()) obs = fmt.to_std() - exp = pd.DataFrame({ - 'sector': [ - 'CEDS+|9+ Sectors|foo|Unharmonized', - 'bar|baz', - ], - 'region': ['a', 'b'], - '2010': [5000.0, 2000.0], - '2020': [-1000.0, 3000.0], - 'units': ['kt'] * 2, - 'gas': ['BC'] * 2, - }) - 
pdt.assert_frame_equal(obs.set_index(utils.df_idx), - exp.set_index(utils.df_idx)) + exp = pd.DataFrame( + { + "sector": [ + "CEDS+|9+ Sectors|foo|Unharmonized", + "bar|baz", + ], + "region": ["a", "b"], + "2010": [5000.0, 2000.0], + "2020": [-1000.0, 3000.0], + "units": ["kt"] * 2, + "gas": ["BC"] * 2, + } + ) + pdt.assert_frame_equal(obs.set_index(utils.df_idx), exp.set_index(utils.df_idx)) def test_formatter_to_template(): - df = pd.DataFrame({ - 'Variable': [ - 'CEDS+|9+ Sectors|Emissions|BC|foo|Unharmonized', - 'CEDS+|9+ Sectors|Emissions|BC|bar|Unharmonized', - ], - 'Region': ['a', 'b'], - '2010': [5.0, 2.0], - '2020': [-1.0, 3.0], - 'Unit': ['Mt BC/yr'] * 2, - 'Model': ['foo'] * 2, - 'Scenario': ['foo'] * 2, - }).set_index(utils.iamc_idx) - fmt = utils.FormatTranslator(df, prefix='CEDS+|9+ Sectors', - suffix='Unharmonized') + df = pd.DataFrame( + { + "Variable": [ + "CEDS+|9+ Sectors|Emissions|BC|foo|Unharmonized", + "CEDS+|9+ Sectors|Emissions|BC|bar|Unharmonized", + ], + "Region": ["a", "b"], + "2010": [5.0, 2.0], + "2020": [-1.0, 3.0], + "Unit": ["Mt BC/yr"] * 2, + "Model": ["foo"] * 2, + "Scenario": ["foo"] * 2, + } + ).set_index(utils.iamc_idx) + fmt = utils.FormatTranslator(df, prefix="CEDS+|9+ Sectors", suffix="Unharmonized") fmt.to_std() obs = fmt.to_template() exp = df.reindex(columns=obs.columns) @@ -108,39 +121,43 @@ def test_formatter_to_template(): def combine_rows_df(): - df = pd.DataFrame({ - 'sector': [ - 'sector1', - 'sector2', - 'sector1', - 'extra_b', - 'sector1', - ], - 'region': ['a', 'a', 'b', 'b', 'c'], - '2010': [1.0, 4.0, 2.0, 21, 42], - 'foo': [-1.0, -4.0, 2.0, 21, 42], - 'units': ['Mt'] * 5, - 'gas': ['BC'] * 5, - }).set_index(utils.df_idx) + df = pd.DataFrame( + { + "sector": [ + "sector1", + "sector2", + "sector1", + "extra_b", + "sector1", + ], + "region": ["a", "a", "b", "b", "c"], + "2010": [1.0, 4.0, 2.0, 21, 42], + "foo": [-1.0, -4.0, 2.0, 21, 42], + "units": ["Mt"] * 5, + "gas": ["BC"] * 5, + } + ).set_index(utils.df_idx) return df def test_combine_rows_default(): df = combine_rows_df() - exp = pd.DataFrame({ - 'sector': [ - 'sector1', - 'sector2', - 'extra_b', - 'sector1', - ], - 'region': ['a', 'a', 'a', 'c'], - '2010': [3.0, 4.0, 21, 42], - 'foo': [1.0, -4.0, 21, 42], - 'units': ['Mt'] * 4, - 'gas': ['BC'] * 4, - }).set_index(utils.df_idx) - obs = utils.combine_rows(df, 'region', 'a', ['b']) + exp = pd.DataFrame( + { + "sector": [ + "sector1", + "sector2", + "extra_b", + "sector1", + ], + "region": ["a", "a", "a", "c"], + "2010": [3.0, 4.0, 21, 42], + "foo": [1.0, -4.0, 21, 42], + "units": ["Mt"] * 4, + "gas": ["BC"] * 4, + } + ).set_index(utils.df_idx) + obs = utils.combine_rows(df, "region", "a", ["b"]) exp = exp.reindex(columns=obs.columns) clean = lambda df: df.sort_index().reset_index() @@ -149,22 +166,24 @@ def test_combine_rows_default(): def test_combine_rows_dropothers(): df = combine_rows_df() - exp = pd.DataFrame({ - 'sector': [ - 'sector1', - 'sector2', - 'extra_b', - 'sector1', - 'extra_b', - 'sector1', - ], - 'region': ['a', 'a', 'a', 'b', 'b', 'c'], - '2010': [3.0, 4.0, 21, 2.0, 21, 42], - 'foo': [1.0, -4.0, 21, 2.0, 21, 42], - 'units': ['Mt'] * 6, - 'gas': ['BC'] * 6, - }).set_index(utils.df_idx) - obs = utils.combine_rows(df, 'region', 'a', ['b'], dropothers=False) + exp = pd.DataFrame( + { + "sector": [ + "sector1", + "sector2", + "extra_b", + "sector1", + "extra_b", + "sector1", + ], + "region": ["a", "a", "a", "b", "b", "c"], + "2010": [3.0, 4.0, 21, 2.0, 21, 42], + "foo": [1.0, -4.0, 21, 2.0, 21, 42], + "units": ["Mt"] * 6, 
+ "gas": ["BC"] * 6, + } + ).set_index(utils.df_idx) + obs = utils.combine_rows(df, "region", "a", ["b"], dropothers=False) exp = exp.reindex(columns=obs.columns) clean = lambda df: df.sort_index().reset_index() @@ -173,19 +192,21 @@ def test_combine_rows_dropothers(): def test_combine_rows_sumall(): df = combine_rows_df() - exp = pd.DataFrame({ - 'sector': [ - 'sector1', - 'extra_b', - 'sector1', - ], - 'region': ['a', 'a', 'c'], - '2010': [2.0, 21, 42], - 'foo': [2.0, 21, 42], - 'units': ['Mt'] * 3, - 'gas': ['BC'] * 3, - }).set_index(utils.df_idx) - obs = utils.combine_rows(df, 'region', 'a', ['b'], sumall=False) + exp = pd.DataFrame( + { + "sector": [ + "sector1", + "extra_b", + "sector1", + ], + "region": ["a", "a", "c"], + "2010": [2.0, 21, 42], + "foo": [2.0, 21, 42], + "units": ["Mt"] * 3, + "gas": ["BC"] * 3, + } + ).set_index(utils.df_idx) + obs = utils.combine_rows(df, "region", "a", ["b"], sumall=False) exp = exp.reindex(columns=obs.columns) clean = lambda df: df.sort_index().reset_index() @@ -194,18 +215,20 @@ def test_combine_rows_sumall(): def test_isin(): df = combine_rows_df() - exp = pd.DataFrame({ - 'sector': [ - 'sector1', - 'sector2', - 'sector1', - ], - 'region': ['a', 'a', 'b'], - '2010': [1.0, 4.0, 2.0], - 'foo': [-1.0, -4.0, 2.0], - 'units': ['Mt'] * 3, - 'gas': ['BC'] * 3, - }).set_index(utils.df_idx) + exp = pd.DataFrame( + { + "sector": [ + "sector1", + "sector2", + "sector1", + ], + "region": ["a", "a", "b"], + "2010": [1.0, 4.0, 2.0], + "foo": [-1.0, -4.0, 2.0], + "units": ["Mt"] * 3, + "gas": ["BC"] * 3, + } + ).set_index(utils.df_idx) obs = exp.loc[ utils.isin(sector=["sector1", "sector2"], region=["a", "b", "non-existent"]) ]