From 381ec6f1a20d8af4f3c0f84448bf70b78895e5b9 Mon Sep 17 00:00:00 2001
From: Manuel Schlund
Date: Tue, 15 Oct 2024 17:47:38 +0200
Subject: [PATCH] Remove recipe filler

---
 doc/sphinx/source/utils.rst             |  57 --
 esmvaltool/utils/recipe_filler.py       | 914 ------------------------
 setup.py                                |   2 -
 tests/integration/test_recipe_filler.py | 211 ------
 4 files changed, 1184 deletions(-)
 delete mode 100755 esmvaltool/utils/recipe_filler.py
 delete mode 100644 tests/integration/test_recipe_filler.py

diff --git a/doc/sphinx/source/utils.rst b/doc/sphinx/source/utils.rst
index 71de0e01f6..659701d42f 100644
--- a/doc/sphinx/source/utils.rst
+++ b/doc/sphinx/source/utils.rst
@@ -382,63 +382,6 @@ klaus.zimmermann@smhi.se
 
 .. _pygithub: https://pygithub.readthedocs.io/en/latest/introduction.html
 
-Recipe filler
-=============
-
-If you need to fill in a blank recipe with additional datasets, you can do
-that with the command `recipe_filler`. Given a blank recipe, this tool looks
-up a set of additional datasets, and you can specify an arbitrary number of
-data parameters. The blank recipe should contain, at the very least, a list
-of diagnostics, each with their variable(s).
-Example of running the tool:
-
-.. code-block:: bash
-
-    recipe_filler recipe.yml
-
-where `recipe.yml` is the recipe that needs to be filled with additional
-datasets; a minimal example of this recipe could be:
-
-.. code-block:: yaml
-
-    diagnostics:
-      diagnostic:
-        variables:
-          ta:
-            mip: Amon  # required
-            start_year: 1850  # required
-            end_year: 1900  # required
-
-
-Key features
-------------
-
-- you can add as many variable parameters as are needed; if not added, the
-  tool will use the ``"*"`` wildcard and find all available combinations;
-- you can restrict the number of datasets to be looked for with the
-  ``dataset:`` key for each variable by passing a list of datasets as its
-  value, e.g. ``dataset: [MPI-ESM1-2-LR, MPI-ESM-LR]``;
-- you can specify a pair of experiments, e.g. ``exp: [historical, rcp85]``,
-  for each variable; this will look for each available dataset per
-  experiment and assemble an aggregated data stretch from each experiment to
-  cover the total data length specified by ``start_year`` and ``end_year``;
-  this is equivalent to ESMValTool's syntax for multiple experiments; this
-  option needs an ensemble to be declared explicitly and will return no
-  entry if there are gaps in the data (see the example recipe below the
-  caveats);
-- ``start_year`` and ``end_year`` are required and are used to filter out
-  the datasets that don't have data in the interval; as noted above, the
-  tool will not return datasets with partial coverage from ``start_year`` to
-  ``end_year``; if you want all possible years, and hence no filtering on
-  years, just use ``"*"`` for the start and end years;
-- ``config-user: rootpath: CMIPX`` may be a list; rootpath lists are
-  supported;
-- all major DRS paths (including ``default``, ``BADC``, ``ETHZ``, etc.) are
-  supported;
-- speedup is achieved through a CMIP mip tables lookup, so ``mip`` is
-  required in the recipe.
-
-Caveats
--------
-
-- the tool doesn't yet work with derived variables and will not return any
-  available datasets for them;
-- operation is restricted to CMIP data only; OBS lookup is not available
-  yet.
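-
-For example, a pilot recipe combining the dataset list and experiment pair
-options described above could look like this (the dataset, ensemble, and
-year values are purely illustrative):
-
-.. code-block:: yaml
-
-    diagnostics:
-      diagnostic:
-        variables:
-          ta:
-            mip: Amon
-            start_year: 1850
-            end_year: 2000
-            dataset: [MPI-ESM-LR]
-            exp: [historical, rcp85]
-            ensemble: r1i1p1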
-
-
 Extracting a list of input files from the provenance
 ====================================================
 
diff --git a/esmvaltool/utils/recipe_filler.py b/esmvaltool/utils/recipe_filler.py
deleted file mode 100755
index 40f637c6d5..0000000000
--- a/esmvaltool/utils/recipe_filler.py
+++ /dev/null
@@ -1,914 +0,0 @@
-"""
-Fill in a blank recipe with additional datasets.
-
-Tool to obtain a set of additional datasets when given a blank recipe.
-The blank recipe should contain, at the very least, a list of diagnostics,
-each with their variable(s). Example of minimum settings:
-
-diagnostics:
-  diagnostic:
-    variables:
-      ta:
-        mip: Amon
-        start_year: 1850
-        end_year: 1900
-
-Note that the tool will exit if any of these minimum settings are missing!
-
-Key features:
-
-- you can add as many variable parameters as are needed; if not added, the
-  tool will use the "*" wildcard and find all available combinations;
-- you can restrict the number of datasets to be looked for with the
-  `dataset:` key for each variable by passing a list of datasets as its
-  value, e.g. `dataset: [MPI-ESM1-2-LR, MPI-ESM-LR]`;
-- you can specify a pair of experiments, e.g. `exp: [rcp26, rcp85]`,
-  for each variable; this will look for each available dataset per
-  experiment and assemble an aggregated data stretch from each experiment;
-  this is equivalent to ESMValTool's syntax for multiple experiments; this
-  option needs an ensemble to be declared explicitly and will return no
-  entry if there are gaps in the data;
-- `start_year` and `end_year` are mandatory and are used to filter out the
-  datasets that don't have data in the interval; if you want all possible
-  years, and hence no filtering on years, just use "*" for the start and
-  end years;
-- `config-user: rootpath: CMIPX` may be a list; rootpath lists are
-  supported.
-
-Caveats:
-
-- the tool doesn't yet work for derived variables;
-- operation is restricted to CMIP data.
-
-Have fun!
-"""
-import argparse
-import datetime
-import itertools
-import logging
-import logging.config
-import os
-import shutil
-import time
-from glob import glob
-from pathlib import Path
-
-import esmvalcore
-import yaml
-
-from esmvalcore import __version__ as core_ver
-from esmvalcore.cmor.table import CMOR_TABLES, read_cmor_tables
-from packaging import version as pkg_version
-from ruamel.yaml import YAML
-
-logger = logging.getLogger(__name__)
-
-CFG = {}
-
-
-def _purge_file_handlers(cfg: dict) -> None:
-    """Remove handlers with filename set.
-
-    This is used to remove file handlers which require an output
-    directory to be set.
- """ - cfg['handlers'] = { - name: handler - for name, handler in cfg['handlers'].items() - if 'filename' not in handler - } - prev_root = cfg['root']['handlers'] - cfg['root']['handlers'] = [ - name for name in prev_root if name in cfg['handlers'] - ] - - -def _update_stream_level(cfg: dict, level=None): - """Update the log level for the stream handlers.""" - handlers = cfg['handlers'] - - for handler in handlers.values(): - if level is not None and 'stream' in handler: - if handler['stream'] in ('ext://sys.stdout', 'ext://sys.stderr'): - handler['level'] = level.upper() - - -def _get_log_files(cfg: dict, output_dir: str = None) -> list: - """Initialize log files for the file handlers.""" - log_files = [] - - handlers = cfg['handlers'] - - for handler in handlers.values(): - filename = handler.get('filename', None) - - if filename: - if not os.path.isabs(filename): - handler['filename'] = os.path.join(output_dir, filename) - log_files.append(handler['filename']) - - return log_files - - -def configure_logging(cfg_file: str = None, - output_dir: str = None, - console_log_level: str = None) -> list: - """Configure logging. - - Parameters - ---------- - cfg_file : str, optional - Logging config file. If `None`, defaults to `configure-logging.yml` - output_dir : str, optional - Output directory for the log files. If `None`, log only to the console. - console_log_level : str, optional - If `None`, use the default (INFO). - - Returns - ------- - log_files : list - Filenames that will be logged to. - """ - if cfg_file is None: - cfg_loc = Path(esmvalcore.__file__ + "esmvalcore") - if pkg_version.parse(core_ver) < pkg_version.parse('2.8.0'): - cfg_file = cfg_loc.parents[0] / '_config' / 'config-logging.yml' - else: - cfg_file = cfg_loc.parents[0] / 'config' / 'config-logging.yml' - - cfg_file = Path(cfg_file).absolute() - - with open(cfg_file) as file_handler: - cfg = yaml.safe_load(file_handler) - - if output_dir is None: - _purge_file_handlers(cfg) - - log_files = _get_log_files(cfg, output_dir=output_dir) - _update_stream_level(cfg, level=console_log_level) - - logging.config.dictConfig(cfg) - logging.Formatter.converter = time.gmtime - logging.captureWarnings(True) - - return log_files - - -def read_config_developer_file(cfg_file=None): - """Read the developer's configuration file.""" - if cfg_file is None: - cfg_loc = Path(esmvalcore.__file__ + "esmvalcore") - cfg_file = cfg_loc.parents[0] / 'config-developer.yml' - - with open(cfg_file, 'r') as file: - cfg = yaml.safe_load(file) - - return cfg - - -def _normalize_path(path): - """Normalize paths. - - Expand ~ character and environment variables and convert path to absolute. 
-
-    Parameters
-    ----------
-    path: str
-        Original path
-
-    Returns
-    -------
-    str:
-        Normalized path
-    """
-    if path is None:
-        return None
-    return os.path.abspath(os.path.expanduser(os.path.expandvars(path)))
-
-
-def read_config_user_file(config_file, folder_name, options=None):
-    """Read config user file and store settings in a dictionary."""
-    if not config_file:
-        config_file = '~/.esmvaltool/config-user.yml'
-    config_file = os.path.abspath(
-        os.path.expandvars(os.path.expanduser(config_file)))
-    # Read user config file
-    if not os.path.exists(config_file):
-        print(f"ERROR: Config file {config_file} does not exist")
-
-    with open(config_file, 'r') as file:
-        cfg = yaml.safe_load(file)
-
-    if options is None:
-        options = dict()
-    for key, value in options.items():
-        cfg[key] = value
-
-    # set defaults
-    defaults = {
-        'compress_netcdf': False,
-        'exit_on_warning': False,
-        'output_file_type': 'png',
-        'output_dir': 'esmvaltool_output',
-        'auxiliary_data_dir': 'auxiliary_data',
-        'save_intermediary_cubes': False,
-        'remove_preproc_dir': True,
-        'max_parallel_tasks': None,
-        'run_diagnostic': True,
-        'profile_diagnostic': False,
-        'config_developer_file': None,
-        'drs': {},
-    }
-
-    for key in defaults:
-        if key not in cfg:
-            logger.info(
-                "No %s specification in config file, "
-                "defaulting to %s", key, defaults[key])
-            cfg[key] = defaults[key]
-
-    cfg['output_dir'] = _normalize_path(cfg['output_dir'])
-    cfg['auxiliary_data_dir'] = _normalize_path(cfg['auxiliary_data_dir'])
-
-    cfg['config_developer_file'] = _normalize_path(
-        cfg['config_developer_file'])
-
-    for key in cfg['rootpath']:
-        root = cfg['rootpath'][key]
-        if isinstance(root, str):
-            cfg['rootpath'][key] = [_normalize_path(root)]
-        else:
-            cfg['rootpath'][key] = [_normalize_path(path) for path in root]
-
-    # insert a date_time_recipe_usertag directory in the output paths
-    now = datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S")
-    new_subdir = '_'.join((folder_name, now))
-    cfg['output_dir'] = os.path.join(cfg['output_dir'], new_subdir)
-
-    # create subdirectories
-    cfg['preproc_dir'] = os.path.join(cfg['output_dir'], 'preproc')
-    cfg['work_dir'] = os.path.join(cfg['output_dir'], 'work')
-    cfg['plot_dir'] = os.path.join(cfg['output_dir'], 'plots')
-    cfg['run_dir'] = os.path.join(cfg['output_dir'], 'run')
-
-    # Read developer configuration file
-    read_cmor_tables(cfg['config_developer_file'])
-
-    return cfg
-
-
-HEADER = r"""
-______________________________________________________________________
-  _____ ____  __  ____     __    _ _____           _
- | ____/ ___||  \/  \ \   / /_ _| |_   _|__   ___ | |
- |  _| \___ \| |\/| |\ \ / / _` | | | |/ _ \ / _ \| |
- | |___ ___) | |  | | \ V / (_| | | | | (_) | (_) | |
- |_____|____/|_|  |_|  \_/ \__,_|_| |_|\___/ \___/|_|
-______________________________________________________________________
-
-""" + __doc__
-
-dataset_order = [
-    'dataset', 'project', 'exp', 'mip', 'ensemble', 'grid', 'start_year',
-    'end_year'
-]
-
-# CMIP eras
-cmip_eras = ["CMIP5", "CMIP6"]
-
-# The base dictionary (all wildcards):
-base_dict = {
-    'institute': '*',
-    'dataset': '*',
-    'project': '*',
-    'exp': '*',
-    'frequency': '*',
-    'ensemble': '*',
-    'mip': '*',
-    'modeling_realm': '*',
-    'short_name': '*',
-    'grid': '*',
-    'start_year': '*',
-    'end_year': '*',
-    'activity': '*',
-}
-
-
-def _get_download_dir(yamlconf, cmip_era):
-    """Get the download directory from the user config file."""
-    if 'download_dir' in yamlconf:
-        return os.path.join(yamlconf['download_dir'], cmip_era)
-    return False
-
-
-def _get_site_rootpath(cmip_era):
- """Get site (drs) from config-user.yml.""" - config_yml = get_args().config_file - with open(config_yml, 'r') as yamf: - yamlconf = yaml.safe_load(yamf) - drs = yamlconf['drs'][cmip_era] - - download_dir = _get_download_dir(yamlconf, cmip_era) - rootdir = [yamlconf['rootpath'][cmip_era], ] - - if download_dir: - rootdir.append(download_dir) - logger.debug("%s root directory %s", cmip_era, rootdir) - if drs == 'default' and 'default' in yamlconf['rootpath']: - rootdir = [yamlconf['rootpath']['default'], ] - if download_dir: - rootdir.append(download_dir) - - logger.debug("Using drs default and " - "default: %s data directory", rootdir) - - return drs, rootdir - - -def _get_input_dir(cmip_era): - """Get input_dir from config-developer.yml.""" - site = _get_site_rootpath(cmip_era)[0] - yamlconf = read_config_developer_file() - - return yamlconf[cmip_era]['input_dir'][site] - - -def _get_input_file(cmip_era): - """Get input_file from config-developer.yml.""" - yamlconf = read_config_developer_file() - return yamlconf[cmip_era]['input_file'] - - -def _determine_basepath(cmip_era): - """Determine a basepath.""" - if isinstance(_get_site_rootpath(cmip_era)[1], list): - rootpaths = _get_site_rootpath(cmip_era)[1] - else: - rootpaths = [_get_site_rootpath(cmip_era)[1]] - - basepaths = [] - for rootpath in rootpaths: - if _get_input_dir(cmip_era) != os.path.sep: - basepath = os.path.join(rootpath, _get_input_dir(cmip_era), - _get_input_file(cmip_era)) - else: - basepath = os.path.join(rootpath, _get_input_file(cmip_era)) - basepath = basepath.replace('//', '/') - basepaths.append(basepath) - logger.debug("We will look for files of patterns %s", basepaths) - - return basepaths - - -def _overlapping_datasets(files, all_years, start_year, end_year): - """Process overlapping datasets and check for avail data in time range.""" - valid_files = [] - ay_sorted = sorted(all_years) - if ay_sorted[0] <= start_year and ay_sorted[-1] >= end_year: - yr_pairs = sorted( - [all_years[i:i + 2] for i in range(0, len(all_years), 2)]) - yr_pairs = list(k for k, _ in itertools.groupby(yr_pairs)) - d_y = [ - yr_pairs[j][1] - yr_pairs[j + 1][0] - for j in range(len(yr_pairs) - 1) - ] - gaps = [c for c in d_y if c < -1] - if not gaps: - valid_files = files - logger.info("Contiguous data from multiple experiments.") - else: - logger.warning("Data from multiple exps has >1 year gaps! ") - logger.debug("Start %s/end %s requested - " - "files covering %s found.", - start_year, end_year, yr_pairs) - - return valid_files - - -def filter_years(files, start_year, end_year, overlap=False): - """ - Filter out files that are outside requested time range. - - Nifty function that takes a list of files and two years - as arguments; it will build a series of filter dictionaries - and check if data is available for the entire interval; - it will return a single file per dataset, the first file - in the list of files that cover the specified interval; - optional argument `overlap` used if multiple experiments are - used and overlap between datasets is present. - - Parameters - ---------- - files: list - A list of files that need filtering by requested time range. - - start_year: int - Integer start year of requested range. - - end_year: int - Integer end year of requested range. - - overlap: bool - Flag if datasets overlap; defaults to False. - - Returns - ------- - list - List of files which have been identified as falling in - the requested time range; if multiple files within time range - per dataset, the first file will be returned. 
- - """ - valid_files = [] - available_years = {} - - if start_year == "*" and end_year == "*": - return files - - if not files: - return valid_files - - all_files_roots = [("").join(fil.split("_")[0:-1]) for fil in files] - for fil in files: - available_years[("").join(fil.split("_")[0:-1])] = [] - for fil in files: - available_years[("").join(fil.split("_")[0:-1])].append( - fil.split("_")[-1].strip(".nc").split("-")) - - all_years = [] - for root, yr_list in available_years.items(): - actual_years = [] - yr_list = list(itertools.chain.from_iterable(yr_list)) - for year in yr_list: - if len(year) == 4: - actual_years.append(int(year)) - else: - actual_years.append(int(year[0:4])) - actual_years = sorted(actual_years) - all_years.extend(actual_years) - if not overlap: - actual_years = sorted(list(set(actual_years))) - if actual_years[0] <= start_year and actual_years[-1] >= end_year: - idx = all_files_roots.index(root) - valid_files.append(files[idx]) - - # multiple experiments to complete each other - if overlap: - valid_files = _overlapping_datasets(files, all_years, start_year, - end_year) - - if not valid_files: - logger.warning("No data found to fully cover start " - "%s / end %s as requested!", start_year, end_year) - - return valid_files - - -def _resolve_latestversion(dirname_template): - """Resolve the 'latestversion' tag.""" - for version_separator in ['{latestversion}', '{version}']: - if version_separator in dirname_template: - break - else: - return dirname_template - - # Find latest version - part1, part2 = dirname_template.split(version_separator) - part2 = part2.lstrip(os.sep) - part1_contents = glob(part1) - if part1_contents: - versions = os.listdir(part1_contents[0]) - versions.sort(reverse=True) - for version in ['latest'] + versions: - dirname = os.path.join(part1, version, part2) - if glob(dirname): - return dirname - - return dirname_template - - -def list_all_files(file_dict, cmip_era): - """ - List all files that match the dataset dictionary. - - Function that returns all files that are determined by a - file_dict dictionary; file_dict is keyed on usual parameters - like `dataset`, `project`, `mip` etc; glob.glob is used - to find files; speedup is achieved by replacing wildcards - with values from CMOR tables. - - Parameters - ---------- - file_dict: dict - Dictionary to hold dataset specifications. - - cmip_era: str - Either CMIP5 or CMIP6. - - Returns - ------- - list: - List of found files. 
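-
-    Example
-    -------
-    With a file_dict of, e.g., {'short_name': 'ta', 'mip': 'Amon',
-    'dataset': 'MPI-ESM-LR', 'exp': 'historical'} (any keys not given
-    stay as the "*" wildcard) and cmip_era "CMIP5", the DRS path
-    template(s) are filled with these values and any remaining wildcards
-    are globbed to collect the matching netCDF files.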
-
-    """
-    mip = file_dict['mip']
-    short_name = file_dict['short_name']
-    try:
-        frequency = CMOR_TABLES[cmip_era].get_variable(mip,
-                                                       short_name).frequency
-        realms = CMOR_TABLES[cmip_era].get_variable(mip,
-                                                    short_name).modeling_realm
-    except AttributeError:
-        logger.warning("Could not find %s CMOR table "
-                       "for variable %s with mip %s",
-                       cmip_era, short_name, mip)
-        return []
-    file_dict['frequency'] = frequency
-
-    basepaths = _determine_basepath(cmip_era)
-    all_files = []
-
-    for basepath in basepaths:
-        new_path = basepath[:]
-
-        # could have multiple realms
-        for realm in realms:
-            file_dict['modeling_realm'] = realm
-
-            # load all the files in the custom dict
-            for key, value in file_dict.items():
-                new_path = new_path.replace('{' + key + '}', str(value))
-            new_path = _resolve_latestversion(new_path)
-            if new_path.startswith("~"):
-                new_path = os.path.expanduser(new_path)
-                if not new_path.startswith(os.sep):
-                    raise ValueError(
-                        "Could not expand ~ to user home dir "
-                        "please expand it in the config user file!")
-                logger.info("Expanding path to %s", new_path)
-
-            # Globs all the wildcards into a list of files.
-            files = glob(new_path)
-            all_files.extend(files)
-    if not all_files:
-        logger.warning("Could not find any file for data specifications.")
-
-    return all_files
-
-
-def _file_to_recipe_dataset(fn_path, cmip_era, file_dict):
-    """Convert a filename to a recipe-ready dataset."""
-    # Add the obvious ones - i.e., the ones you requested!
-    output_dataset = {}
-    output_dataset['project'] = cmip_era
-    for key, value in file_dict.items():
-        if value == '*':
-            continue
-        if key in dataset_order:
-            output_dataset[key] = value
-
-    # Split file name and base path into directory structure and filenames.
-    basefiles = _determine_basepath(cmip_era)
-    _, fnfile = os.path.split(fn_path)
-
-    for basefile in basefiles:
-        _, basefile = os.path.split(basefile)
-        # Some of the key words include the splitting character '_' !
-        basefile = basefile.replace('short_name', 'shortname')
-        basefile = basefile.replace('start_year', 'startyear')
-        basefile = basefile.replace('end_year', 'endyear')
-
-        # Assume filename is separated by '_'
-        basefile_split = [key.replace("{", "") for key in basefile.split('_')]
-        basefile_split = [key.replace("}", "") for key in basefile_split]
-        fnfile_split = fnfile.split('_')
-
-        # iterate through directory structure looking for useful bits.
-        for base_key, fn_key in zip(basefile_split, fnfile_split):
-            if base_key == '*.nc':
-                fn_key = fn_key.replace('.nc', '')
-                start_year, end_year = fn_key.split('-')
-                output_dataset['start_year'] = start_year
-                output_dataset['end_year'] = end_year
-            elif base_key == "ensemble*.nc":
-                output_dataset['ensemble'] = fn_key
-            elif base_key == "grid*.nc":
-                output_dataset['grid'] = fn_key
-            elif base_key == "shortname":
-                pass
-            else:
-                output_dataset[base_key] = fn_key
-    if "exp" in file_dict:
-        if isinstance(file_dict["exp"], list):
-            output_dataset["exp"] = file_dict["exp"]
-
-    return output_dataset
-
-
-def _remove_duplicates(add_datasets):
-    """
-    Remove accidental duplicates.
-
-    Close to 0% chances this will ever be used.
-    May be used when there are actual duplicates in data
-    storage; we've seen these before, but seldom.
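-
-    Example: two dataset dictionaries that are identical in all keys
-    (with `exp` compared via its string representation, since it may be
-    a list) are collapsed into a single entry.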
- """ - datasets = [] - seen = set() - - for dataset in add_datasets: - orig_exp = dataset["exp"] - dataset["exp"] = str(dataset["exp"]) - tup_dat = tuple(dataset.items()) - if tup_dat not in seen: - seen.add(tup_dat) - dataset["exp"] = orig_exp - datasets.append(dataset) - - return datasets - - -def _check_recipe(recipe_dict): - """Perform a quick recipe check for mandatory fields.""" - do_exit = False - if "diagnostics" not in recipe_dict: - logger.error("Recipe missing diagnostics section.") - do_exit = True - for diag_name, diag in recipe_dict["diagnostics"].items(): - if "variables" not in diag: - logger.error("Diagnostic %s missing variables.", diag_name) - do_exit = True - for var_name, var_pars in diag["variables"].items(): - if "mip" not in var_pars: - logger.error("Variable %s missing mip.", var_name) - do_exit = True - if "start_year" not in var_pars: - logger.error("Variable %s missing start_year.", var_name) - do_exit = True - if "end_year" not in var_pars: - logger.error("Variable %s missing end_year.", var_name) - do_exit = True - if "exp" in var_pars: - if isinstance(var_pars["exp"], - list) and "ensemble" not in var_pars: - logger.error("Asking for experiments list for ") - logger.error("variable %s - you need to ", var_name) - logger.error("define an ensemble for this case.") - do_exit = True - if do_exit: - raise ValueError("Please fix the issues in recipe and rerun") - - -def _check_config_file(user_config_file): - """Perform a quick recipe check for mandatory fields.""" - do_exit = False - if "rootpath" not in user_config_file: - logger.error("Config file missing rootpath section.") - do_exit = True - if "drs" not in user_config_file: - logger.error("Config file missing drs section.") - do_exit = True - for proj in cmip_eras: - if proj not in user_config_file["rootpath"].keys(): - logger.error("Config file missing rootpath for %s", proj) - do_exit = True - if proj not in user_config_file["drs"].keys(): - logger.error("Config file missing drs for %s", proj) - do_exit = True - if do_exit: - raise ValueError("Please fix issues in config file and rerun") - - -def _parse_recipe_to_dicts(yamlrecipe): - """Parse a recipe's variables into a dictionary of dictionairies.""" - output_dicts = {} - for diag in yamlrecipe['diagnostics']: - for variable, var_dict in yamlrecipe['diagnostics'][diag][ - 'variables'].items(): - new_dict = base_dict.copy() - for var_key, var_value in var_dict.items(): - if var_key in new_dict: - new_dict[var_key] = var_value - output_dicts[(diag, variable)] = new_dict - - return output_dicts - - -def _add_datasets_into_recipe(additional_datasets, output_recipe): - """Add the datasets into a new recipe.""" - yaml = YAML() - yaml.default_flow_style = False - with open(output_recipe, 'r') as yamlfile: - cur_yaml = yaml.load(yamlfile) - for diag_var, add_dat in additional_datasets.items(): - if add_dat: - if 'additional_datasets' in cur_yaml['diagnostics']: - cur_yaml['diagnostics'][diag_var[0]]['variables'][ - diag_var[1]]['additional_datasets'].extend(add_dat) - else: - cur_yaml['diagnostics'][diag_var[0]]['variables'][ - diag_var[1]]['additional_datasets'] = add_dat - if cur_yaml: - with open(output_recipe, 'w') as yamlfile: - yaml.dump(cur_yaml, yamlfile) - - -def _find_all_datasets(recipe_dict, cmip_eras): - """Find all datasets explicitly.""" - datasets = [] - for cmip_era in cmip_eras: - if cmip_era == "CMIP6": - activity = "CMIP" - else: - activity = "" - drs, site_path = _get_site_rootpath(cmip_era) - if drs in ["default", "SMHI"]: - logger.info("DRS 
is %s; filter on dataset disabled.", drs) - datasets = ["*"] - else: - if not isinstance(site_path, list): - site_path = [site_path] - for site_pth in site_path: - if drs in ["BADC", "DKRZ", "CP4CDS"]: - institutes_path = os.path.join(site_pth, activity) - elif drs in ["ETHZ", "RCAST"]: - exp = recipe_dict["exp"][0] - if exp == "*": - exp = "piControl" # all institutes have piControl - mip = recipe_dict["mip"] - var = recipe_dict["short_name"] - institutes_path = os.path.join(site_pth, exp, mip, var) - - if not os.path.isdir(institutes_path): - logger.warning("Path to data %s " - "does not exist; will look everywhere.", - institutes_path) - datasets = ["*"] - return datasets - - institutes = os.listdir(institutes_path) - if drs in ["BADC", "DKRZ", "CP4CDS"]: - for institute in institutes: - datasets.extend( - os.listdir(os.path.join(institutes_path, - institute))) - else: - datasets.extend(institutes) - - return datasets - - -def _get_exp(recipe_dict): - """Get the correct exp as list of single or multiple exps.""" - if isinstance(recipe_dict["exp"], list): - exps_list = recipe_dict["exp"] - logger.info("Multiple %s experiments requested", exps_list) - else: - exps_list = [recipe_dict["exp"]] - logger.info("Single %s experiment requested", exps_list) - - return exps_list - - -def _get_datasets(recipe_dict, cmip_eras): - """Get the correct datasets as list if needed.""" - if recipe_dict["dataset"] == "*": - datasets = _find_all_datasets(recipe_dict, cmip_eras) - return datasets - if isinstance(recipe_dict['dataset'], list): - datasets = recipe_dict['dataset'] - logger.info("Multiple %s datasets requested", datasets) - else: - datasets = [recipe_dict['dataset']] - logger.info("Single %s dataset requested", datasets) - - return datasets - - -def get_args(): - """Parse command line arguments.""" - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('recipe', help='Path/name of yaml pilot recipe file') - parser.add_argument('-c', - '--config-file', - default=os.path.join(os.environ["HOME"], '.esmvaltool', - 'config-user.yml'), - help='User configuration file') - - parser.add_argument('-o', - '--output', - default=os.path.join(os.getcwd(), - 'recipe_autofilled.yml'), - help='Output recipe, default recipe_autofilled.yml') - - args = parser.parse_args() - return args - - -def _get_timefiltered_files(recipe_dict, exps_list, cmip_era): - """Obtain all files that correspond to requested time range.""" - # multiple experiments allowed, complement data from each exp - if len(exps_list) > 1: - files = [] - for exp in exps_list: - recipe_dict["exp"] = exp - files.extend(list_all_files(recipe_dict, cmip_era)) - files = filter_years(files, - recipe_dict["start_year"], - recipe_dict["end_year"], - overlap=True) - recipe_dict["exp"] = exps_list - - else: - files = list_all_files(recipe_dict, cmip_era) - files = filter_years(files, recipe_dict["start_year"], - recipe_dict["end_year"]) - - return files - - -def run(): - """Run the `recipe_filler` tool. 
Help in __doc__ and via --help.""" - # Get arguments - args = get_args() - input_recipe = args.recipe - output_recipe = args.output - cmip_eras = ["CMIP5", "CMIP6"] - - # read the config file - config_user = read_config_user_file(args.config_file, - 'recipe_filler', - options={}) - - # configure logger - run_dir = os.path.join(config_user['output_dir'], 'recipe_filler') - if not os.path.isdir(run_dir): - os.makedirs(run_dir) - log_files = configure_logging(output_dir=run_dir, - console_log_level=config_user['log_level']) - logger.info(HEADER) - logger.info("Using user configuration file: %s", args.config_file) - logger.info("Using pilot recipe file: %s", input_recipe) - logger.info("Writing filled out recipe to: %s", output_recipe) - log_files = "\n".join(log_files) - logger.info("Writing program log files to:\n%s", log_files) - - # check config user file - _check_config_file(config_user) - - # parse recipe - with open(input_recipe, 'r') as yamlfile: - yamlrecipe = yaml.safe_load(yamlfile) - _check_recipe(yamlrecipe) - recipe_dicts = _parse_recipe_to_dicts(yamlrecipe) - - # Create a list of additional_datasets for each diagnostic/variable. - additional_datasets = {} - for (diag, variable), recipe_dict in recipe_dicts.items(): - logger.info("Looking for data for " - "variable %s in diagnostic %s", variable, diag) - new_datasets = [] - if "short_name" not in recipe_dict: - recipe_dict['short_name'] = variable - elif recipe_dict['short_name'] == "*": - recipe_dict['short_name'] = variable - - # adjust cmip era if needed - if recipe_dict['project'] != "*": - cmip_eras = [recipe_dict['project']] - - # get datasets depending on user request; always a list - datasets = _get_datasets(recipe_dict, cmip_eras) - - # get experiments depending on user request; always a list - exps_list = _get_exp(recipe_dict) - - # loop through datasets - for dataset in datasets: - recipe_dict['dataset'] = dataset - logger.info("Seeking data for dataset: %s", dataset) - for cmip_era in cmip_eras: - files = _get_timefiltered_files(recipe_dict, exps_list, - cmip_era) - - # assemble in new recipe - add_datasets = [] - for fn in sorted(files): - fn_dir = os.path.dirname(fn) - logger.info("Data directory: %s", fn_dir) - out = _file_to_recipe_dataset(fn, cmip_era, recipe_dict) - logger.info("New recipe entry: %s", out) - if out is None: - continue - add_datasets.append(out) - new_datasets.extend(add_datasets) - additional_datasets[(diag, variable, cmip_era)] = \ - _remove_duplicates(new_datasets) - - # add datasets to recipe as additional_datasets - shutil.copyfile(input_recipe, output_recipe, follow_symlinks=True) - _add_datasets_into_recipe(additional_datasets, output_recipe) - logger.info("Finished recipe filler. 
Go get some science done now!") - - -if __name__ == "__main__": - run() diff --git a/setup.py b/setup.py index 6b4636d1f7..86aab79854 100755 --- a/setup.py +++ b/setup.py @@ -250,8 +250,6 @@ def read_description(filename): 'nclcodestyle = esmvaltool.utils.nclcodestyle.nclcodestyle:_main', 'test_recipe = ' 'esmvaltool.utils.testing.recipe_settings.install_expand_run:main', - 'recipe_filler = ' - 'esmvaltool.utils.recipe_filler:run' ], 'esmvaltool_commands': [ 'colortables = ' diff --git a/tests/integration/test_recipe_filler.py b/tests/integration/test_recipe_filler.py deleted file mode 100644 index b78ac8c5f8..0000000000 --- a/tests/integration/test_recipe_filler.py +++ /dev/null @@ -1,211 +0,0 @@ -"""Tests for _data_finder.py.""" -import contextlib -import os -import shutil -import sys -import tempfile - -import pytest -import yaml - -from esmvaltool.utils.recipe_filler import run - - -# Load test configuration -with open(os.path.join(os.path.dirname(__file__), - 'recipe_filler.yml')) as file: - CONFIG = yaml.safe_load(file) - - -@contextlib.contextmanager -def arguments(*args): - backup = sys.argv - sys.argv = list(args) - yield - sys.argv = backup - - -def print_path(path): - """Print path.""" - txt = path - if os.path.isdir(path): - txt += '/' - if os.path.islink(path): - txt += ' -> ' + os.readlink(path) - print(txt) - - -def tree(path): - """Print path, similar to the the `tree` command.""" - print_path(path) - for dirpath, dirnames, filenames in os.walk(path): - for dirname in dirnames: - print_path(os.path.join(dirpath, dirname)) - for filename in filenames: - print_path(os.path.join(dirpath, filename)) - - -def create_file(filename): - """Create an empty file.""" - dirname = os.path.dirname(filename) - if not os.path.exists(dirname): - os.makedirs(dirname) - - with open(filename, 'a'): - pass - - -def create_tree(path, filenames=None, symlinks=None): - """Create directory structure and files.""" - for filename in filenames or []: - create_file(os.path.join(path, filename)) - - for symlink in symlinks or []: - link_name = os.path.join(path, symlink['link_name']) - os.symlink(symlink['target'], link_name) - - -def write_config_user_file(dirname, file_path, drs): - config_file = dirname / 'config-user.yml' - cfg = { - 'log_level': 'info', - 'output_dir': str(dirname / 'recipe_filler_output'), - 'rootpath': { - 'CMIP5': str(dirname / file_path), - 'CMIP6': str(dirname / file_path), - }, - 'drs': { - 'CMIP5': drs, - 'CMIP6': drs, - }, - } - config_file.write_text(yaml.safe_dump(cfg, encoding=None)) - return str(config_file) - - -def write_recipe(dirname, recipe_dict): - recipe_file = dirname / 'recipe.yml' - diags = {'diagnostics': recipe_dict} - recipe_file.write_text(yaml.safe_dump(diags, encoding=None)) - return str(recipe_file) - - -@pytest.fixture -def root(): - """Root function for tests.""" - dirname = tempfile.mkdtemp() - yield os.path.join(dirname, 'output1') - print("Directory structure was:") - tree(dirname) - shutil.rmtree(dirname) - - -def setup_files(tmp_path, root, cfg): - """Create config, recipe ,output recipe etc.""" - user_config_file = write_config_user_file(tmp_path, root, cfg['drs']) - diagnostics = {} - diagnostics["test_diagnostic"] = {} - diagnostics["test_diagnostic"]["variables"] = {} - diagnostics["test_diagnostic"]["variables"]["test_var"] = cfg["variable"] - recipe = write_recipe(tmp_path, diagnostics) - output_recipe = str(tmp_path / "recipe_auto.yml") - - return user_config_file, recipe, output_recipe - - -@pytest.mark.parametrize('cfg', 
CONFIG['has_additional_datasets']) -def test_adding_datasets(tmp_path, root, cfg): - """Test retrieving additional datasets.""" - create_tree(root, cfg.get('available_files'), - cfg.get('available_symlinks')) - - user_config_file, recipe, output_recipe = setup_files(tmp_path, root, cfg) - - with arguments( - 'recipe_filler', - recipe, - '-c', - user_config_file, - '-o', - output_recipe, - ): - run() - - with open(output_recipe, 'r') as file: - autofilled_recipe = yaml.safe_load(file) - diag = autofilled_recipe["diagnostics"]["test_diagnostic"] - var = diag["variables"]["test_var"] - assert "additional_datasets" in var - - -@pytest.mark.parametrize('cfg', CONFIG['no_additional_datasets']) -def test_not_adding_datasets(tmp_path, root, cfg): - """Test retrieving no additional datasets.""" - create_tree(root, cfg.get('available_files'), - cfg.get('available_symlinks')) - - user_config_file, recipe, output_recipe = setup_files(tmp_path, root, cfg) - - with arguments( - 'recipe_filler', - recipe, - '-c', - user_config_file, - '-o', - output_recipe, - ): - run() - - with open(output_recipe, 'r') as file: - autofilled_recipe = yaml.safe_load(file) - diag = autofilled_recipe["diagnostics"]["test_diagnostic"] - var = diag["variables"]["test_var"] - assert "additional_datasets" not in var - - -def test_bad_var(tmp_path, root): - """Test a bad variable in the works.""" - cfg = CONFIG['bad_variable'][0] - user_config_file, recipe, output_recipe = setup_files(tmp_path, root, cfg) - - # this doesn't fail and it shouldn't since it can go on - # and look for data for other valid variables - with arguments( - 'recipe_filler', - recipe, - '-c', - user_config_file, - '-o', - output_recipe, - ): - run() - - with open(output_recipe, 'r') as file: - autofilled_recipe = yaml.safe_load(file) - diag = autofilled_recipe["diagnostics"]["test_diagnostic"] - var = diag["variables"]["test_var"] - assert "additional_datasets" not in var - - -def test_no_short_name(tmp_path, root): - """Test a bad variable in the works.""" - cfg = CONFIG['no_short_name'][0] - user_config_file, recipe, output_recipe = setup_files(tmp_path, root, cfg) - - # this doesn't fail and it shouldn't since it can go on - # and look for data for other valid variables - with arguments( - 'recipe_filler', - recipe, - '-c', - user_config_file, - '-o', - output_recipe, - ): - run() - - with open(output_recipe, 'r') as file: - autofilled_recipe = yaml.safe_load(file) - diag = autofilled_recipe["diagnostics"]["test_diagnostic"] - var = diag["variables"]["test_var"] - assert "additional_datasets" not in var