From 3b3e92b2f6ba0f3da227994322876eeff736628d Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Tue, 20 Dec 2022 12:40:42 +0000 Subject: [PATCH] Add `OmegaConfLoader` (#2085) * Add OmegaConfLoader with tests * Use OmegaConf for merging configuration from different dirs * Clear built-in resolvers from OmegaConf * Improve class docstring * Refactor loading and processing of config files * Revert changes to common and refactor omegaconf methods instead * Merge load_omegaconf into load_conf method and clean up * Make test purpose clearer * Make methods class methods + allow for directly setting of config on loader instance * Rewrite check duplicate to check all files + add test Signed-off-by: Merel Theisen --- RELEASE.md | 5 +- dependency/requirements.txt | 2 +- kedro/config/__init__.py | 2 + kedro/config/config.py | 4 + kedro/config/omegaconf_config.py | 278 +++++++++++++++++ kedro/config/templated_config.py | 8 +- tests/config/test_config.py | 19 ++ tests/config/test_omegaconf_config.py | 423 ++++++++++++++++++++++++++ tests/config/test_templated_config.py | 22 ++ 9 files changed, 757 insertions(+), 6 deletions(-) create mode 100644 kedro/config/omegaconf_config.py create mode 100644 tests/config/test_omegaconf_config.py diff --git a/RELEASE.md b/RELEASE.md index d9bd8486bc..1a34728490 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -11,11 +11,12 @@ # Upcoming Release 0.18.5 ## Major features and improvements -* Add the `--conf-source` option to `kedro run`, allowing users to specify a source for project configuration for the run. +* Added new `OmegaConfLoader` which uses `OmegaConf` for loading and merging configuration. +* Added the `--conf-source` option to `kedro run`, allowing users to specify a source for project configuration for the run. +* Added `omegaconf` syntax as option for `--params`. Keys and values can now be separated by colons or equals signs. 
## Bug fixes and other changes * Fix bug where `micropkg` manifest section in `pyproject.toml` isn't recognised as allowed configuration. -* Added `omegaconf` syntax as option for `--params`. Keys and values can now be separated by colons or equals signs. * Added anyconfig's `ac_context` parameter to `kedro.config.commons` module functions for more flexible `ConfigLoader` customizations. ## Breaking changes to the API diff --git a/dependency/requirements.txt b/dependency/requirements.txt index 658f9a9180..5263fa5eb5 100644 --- a/dependency/requirements.txt +++ b/dependency/requirements.txt @@ -10,7 +10,7 @@ importlib-metadata>=3.6; python_version >= '3.8' importlib_metadata>=3.6, <5.0; python_version < '3.8' # The "selectable" entry points were introduced in `importlib_metadata` 3.6 and Python 3.10. Bandit on Python 3.7 relies on a library with `importlib_metadata` < 5.0 importlib_resources>=1.3 # The `files()` API was introduced in `importlib_resources` 1.3 and Python 3.9. jmespath>=0.9.5, <1.0 -omegaconf~=2.2 +omegaconf~=2.3 pip-tools~=6.12 pluggy~=1.0.0 PyYAML>=4.2, <7.0 diff --git a/kedro/config/__init__.py b/kedro/config/__init__.py index f632c4039a..c76508d5c1 100644 --- a/kedro/config/__init__.py +++ b/kedro/config/__init__.py @@ -8,6 +8,7 @@ MissingConfigException, ) from .config import ConfigLoader +from .omegaconf_config import OmegaConfLoader from .templated_config import TemplatedConfigLoader __all__ = [ @@ -16,4 +17,5 @@ "ConfigLoader", "MissingConfigException", "TemplatedConfigLoader", + "OmegaConfLoader", ] diff --git a/kedro/config/config.py b/kedro/config/config.py index 09105700e9..6f45e830d9 100644 --- a/kedro/config/config.py +++ b/kedro/config/config.py @@ -108,6 +108,10 @@ def __init__( ) def __getitem__(self, key): + # Allow bypassing of loading config from patterns if a key and value have been set + # explicitly on the ``ConfigLoader`` instance. 
+ if key in self: + return super().__getitem__(key) return self.get(*self.config_patterns[key]) def __repr__(self): # pragma: no cover diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py new file mode 100644 index 0000000000..ce4efe386f --- /dev/null +++ b/kedro/config/omegaconf_config.py @@ -0,0 +1,278 @@ +"""This module provides ``kedro.config`` with the functionality to load one +or more configuration files of yaml or json type from specified paths through OmegaConf. +""" +import logging +from glob import iglob +from pathlib import Path +from typing import Any, Dict, Iterable, List, Set # noqa + +from omegaconf import OmegaConf +from yaml.parser import ParserError +from yaml.scanner import ScannerError + +from kedro.config import AbstractConfigLoader, MissingConfigException + +_config_logger = logging.getLogger(__name__) + + +class OmegaConfLoader(AbstractConfigLoader): + """Recursively scan directories (config paths) contained in ``conf_source`` for + configuration files with a ``yaml``, ``yml`` or ``json`` extension, load and merge + them through ``OmegaConf`` (https://omegaconf.readthedocs.io/) + and return them in the form of a config dictionary. + + The first processed config path is the ``base`` directory inside + ``conf_source``. The optional ``env`` argument can be used to specify a + subdirectory of ``conf_source`` to process as a config path after ``base``. + + When the same top-level key appears in any two config files located in + the same (sub)directory, a ``ValueError`` is raised. + + When the same key appears in any two config files located in different + (sub)directories, the last processed config path takes precedence + and overrides this key and any sub-keys. 
+ + You can access the different configurations as follows: + :: + + >>> import logging.config + >>> from kedro.config import OmegaConfLoader + >>> from kedro.framework.project import settings + >>> + >>> conf_path = str(project_path / settings.CONF_SOURCE) + >>> conf_loader = OmegaConfLoader(conf_source=conf_path, env="local") + >>> + >>> conf_logging = conf_loader["logging"] + >>> logging.config.dictConfig(conf_logging) # set logging conf + >>> + >>> conf_catalog = conf_loader["catalog"] + >>> conf_params = conf_loader["parameters"] + + ``OmegaConf`` supports variable interpolation in configuration + https://omegaconf.readthedocs.io/en/2.2_branch/usage.html#variable-interpolation. It is + recommended to use this instead of yaml anchors with the ``OmegaConfLoader``. + + This version of the ``OmegaConfLoader`` does not support any of the built-in ``OmegaConf`` + resolvers. Support for resolvers might be added in future versions. + + To use this class, change the setting for the `CONFIG_LOADER_CLASS` constant + in `settings.py`. + + Example: + :: + + >>> # in settings.py + >>> from kedro.config import OmegaConfLoader + >>> + >>> CONFIG_LOADER_CLASS = OmegaConfLoader + + """ + + def __init__( + self, + conf_source: str, + env: str = None, + runtime_params: Dict[str, Any] = None, + *, + config_patterns: Dict[str, List[str]] = None, + base_env: str = "base", + default_run_env: str = "local", + ): + """Instantiates an ``OmegaConfLoader``. + + Args: + conf_source: Path to use as root directory for loading configuration. + env: Environment that will take precedence over base. + runtime_params: Extra parameters passed to a Kedro run. + config_patterns: Glob patterns that specify the naming convention for configuration + files so they can be loaded. Can be customised by supplying config_patterns as + in `CONFIG_LOADER_ARGS` in `settings.py`. + base_env: Name of the base environment. Defaults to `"base"`. 
+ This is used in ``__getitem__`` to construct the path of + the base configuration directory. + default_run_env: Name of the default run environment. Defaults to `"local"`. + Can be overridden by supplying the `env` argument. + """ + self.base_env = base_env + self.default_run_env = default_run_env + + self.config_patterns = { + "catalog": ["catalog*", "catalog*/**", "**/catalog*"], + "parameters": ["parameters*", "parameters*/**", "**/parameters*"], + "credentials": ["credentials*", "credentials*/**", "**/credentials*"], + "logging": ["logging*", "logging*/**", "**/logging*"], + } + self.config_patterns.update(config_patterns or {}) + + # In the first iteration of the OmegaConfLoader we'll keep the resolvers turned off. + # It's easier to introduce them step by step, but removing them would be a breaking change. + self._clear_omegaconf_resolvers() + + super().__init__( + conf_source=conf_source, + env=env, + runtime_params=runtime_params, + ) + + def __getitem__(self, key) -> Dict[str, Any]: + """Get configuration files by key, load and merge them, and + return them in the form of a config dictionary. + + Args: + key: Key of the configuration type to fetch. + + Raises: + KeyError: If key provided isn't present in the config_patterns of this + OmegaConfLoader instance. + MissingConfigException: If no configuration files exist matching the patterns + mapped to the provided key. + + Returns: + Dict[str, Any]: A Python dictionary with the combined + configuration from all configuration files. + """ + + # Allow bypassing of loading config from patterns if a key and value have been set + # explicitly on the ``OmegaConfLoader`` instance. 
+ if key in self: + return super().__getitem__(key) + + if key not in self.config_patterns: + raise KeyError( + f"No config patterns were found for '{key}' in your config loader" + ) + patterns = [*self.config_patterns[key]] + + # Load base env config + base_path = str(Path(self.conf_source) / self.base_env) + base_config = self.load_and_merge_dir_config(base_path, patterns) + config = base_config + + # Load chosen env config + run_env = self.env or self.default_run_env + env_path = str(Path(self.conf_source) / run_env) + env_config = self.load_and_merge_dir_config(env_path, patterns) + + # Destructively merge the two env dirs. The chosen env will override base. + common_keys = config.keys() & env_config.keys() + if common_keys: + sorted_keys = ", ".join(sorted(common_keys)) + msg = ( + "Config from path '%s' will override the following " + "existing top-level config keys: %s" + ) + _config_logger.debug(msg, env_path, sorted_keys) + + config.update(env_config) + + if not config: + raise MissingConfigException( + f"No files of YAML or JSON format found in {base_path} or {env_path} matching" + f" the glob pattern(s): {[*self.config_patterns[key]]}" + ) + return config + + def __repr__(self): # pragma: no cover + return ( + f"OmegaConfLoader(conf_source={self.conf_source}, env={self.env}, " + f"config_patterns={self.config_patterns})" + ) + + def load_and_merge_dir_config(self, conf_path: str, patterns: Iterable[str]): + """Recursively load and merge all configuration files in a directory using OmegaConf, + which satisfy a given list of glob patterns from a specific path. + + Args: + conf_path: Path to configuration directory. + patterns: List of glob patterns to match the filenames against. + + Raises: + MissingConfigException: If configuration path doesn't exist or isn't valid. + ValueError: If two or more configuration files contain the same key(s). + ParserError: If config file contains invalid YAML or JSON syntax. 
+ + Returns: + Resulting configuration dictionary. + + """ + if not Path(conf_path).is_dir(): + raise MissingConfigException( + f"Given configuration path either does not exist " + f"or is not a valid directory: {conf_path}" + ) + + paths = [ + Path(each).resolve() + for pattern in patterns + for each in iglob(f"{str(conf_path)}/{pattern}", recursive=True) + ] + deduplicated_paths = set(paths) + config_files_filtered = [ + path for path in deduplicated_paths if self._is_valid_config_path(path) + ] + + config_per_file = {} + + for config_filepath in config_files_filtered: + try: + config = OmegaConf.load(config_filepath) + config_per_file[config_filepath] = config + except (ParserError, ScannerError) as exc: + line = exc.problem_mark.line # type: ignore + cursor = exc.problem_mark.column # type: ignore + raise ParserError( + f"Invalid YAML or JSON file {config_filepath}, unable to read line {line}, " + f"position {cursor}." + ) from exc + + seen_file_to_keys = { + file: set(config.keys()) for file, config in config_per_file.items() + } + aggregate_config = config_per_file.values() + self._check_duplicates(seen_file_to_keys) + + if aggregate_config: + if len(aggregate_config) > 1: + return dict(OmegaConf.merge(*aggregate_config)) + return list(aggregate_config)[0] + return {} + + @staticmethod + def _is_valid_config_path(path): + """Check if given path is a file path and file type is yaml or json.""" + return path.is_file() and path.suffix in [".yml", ".yaml", ".json"] + + @staticmethod + def _check_duplicates(seen_files_to_keys: Dict[Path, Set[Any]]): + duplicates = [] + + filepaths = list(seen_files_to_keys.keys()) + for i, filepath1 in enumerate(filepaths, 1): + config1 = seen_files_to_keys[filepath1] + for filepath2 in filepaths[i:]: + config2 = seen_files_to_keys[filepath2] + + overlapping_keys = config1 & config2 + + if overlapping_keys: + sorted_keys = ", ".join(sorted(overlapping_keys)) + if len(sorted_keys) > 100: + sorted_keys = sorted_keys[:100] + "..." 
+ duplicates.append( + f"Duplicate keys found in {filepath1} and {filepath2}: {sorted_keys}" + ) + + if duplicates: + dup_str = "\n".join(duplicates) + raise ValueError(f"{dup_str}") + + @staticmethod + def _clear_omegaconf_resolvers(): + """Clear the built-in OmegaConf resolvers.""" + OmegaConf.clear_resolver("oc.env") + OmegaConf.clear_resolver("oc.create") + OmegaConf.clear_resolver("oc.deprecated") + OmegaConf.clear_resolver("oc.decode") + OmegaConf.clear_resolver("oc.select") + OmegaConf.clear_resolver("oc.dict.keys") + OmegaConf.clear_resolver("oc.dict.values") diff --git a/kedro/config/templated_config.py b/kedro/config/templated_config.py index d70b0c8cc2..3468bf10dc 100644 --- a/kedro/config/templated_config.py +++ b/kedro/config/templated_config.py @@ -145,6 +145,10 @@ def __init__( self._config_mapping = {**self._config_mapping, **globals_dict} def __getitem__(self, key): + # Allow bypassing of loading config from patterns if a key and value have been set + # explicitly on the ``TemplatedConfigLoader`` instance. + if key in self: + return super().__getitem__(key) return self.get(*self.config_patterns[key]) def __repr__(self): # pragma: no cover @@ -172,9 +176,7 @@ def get(self, *patterns: str) -> Dict[str, Any]: # type: ignore configuration files. **Note:** any keys that start with `_` will be ignored. String values wrapped in `${...}` will be replaced with the result of the corresponding JMESpath - expression evaluated against globals (see `__init` for more - configuration files. **Note:** any keys that start with `_` - details). + expression evaluated against globals. Raises: ValueError: malformed config found. 
diff --git a/tests/config/test_config.py b/tests/config/test_config.py index 88edc9e881..0c141168c9 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -352,3 +352,22 @@ def test_customised_config_patterns(self, tmp_path): "params*/**", "**/params*", ] + + @use_config_dir + def test_adding_extra_keys_to_confloader(self, tmp_path): + """Make sure extra keys can be added directly to the config loader instance.""" + conf = ConfigLoader(str(tmp_path)) + catalog = conf["catalog"] + conf["spark"] = {"spark_config": "emr.blabla"} + + assert catalog["trains"]["type"] == "MemoryDataSet" + assert conf["spark"] == {"spark_config": "emr.blabla"} + + @use_config_dir + def test_bypass_catalog_config_loading(self, tmp_path): + """Make sure core config loading can be bypassed by setting the key and values + directly on the config loader instance.""" + conf = ConfigLoader(str(tmp_path)) + conf["catalog"] = {"catalog_config": "something_new"} + + assert conf["catalog"] == {"catalog_config": "something_new"} diff --git a/tests/config/test_omegaconf_config.py b/tests/config/test_omegaconf_config.py new file mode 100644 index 0000000000..aebb0e3221 --- /dev/null +++ b/tests/config/test_omegaconf_config.py @@ -0,0 +1,423 @@ +# pylint: disable=expression-not-assigned, pointless-statement +import configparser +import json +import re +from pathlib import Path +from typing import Dict + +import pytest +import yaml +from yaml.parser import ParserError + +from kedro.config import MissingConfigException, OmegaConfLoader + +_DEFAULT_RUN_ENV = "local" +_BASE_ENV = "base" + + +def _write_yaml(filepath: Path, config: Dict): + filepath.parent.mkdir(parents=True, exist_ok=True) + yaml_str = yaml.dump(config) + filepath.write_text(yaml_str) + + +def _write_json(filepath: Path, config: Dict): + filepath.parent.mkdir(parents=True, exist_ok=True) + json_str = json.dumps(config) + filepath.write_text(json_str) + + +def _write_dummy_ini(filepath: Path): + 
filepath.parent.mkdir(parents=True, exist_ok=True) + config = configparser.ConfigParser() + config["prod"] = {"url": "postgresql://user:pass@url_prod/db"} + config["staging"] = {"url": "postgresql://user:pass@url_staging/db"} + with filepath.open("wt") as configfile: # save + config.write(configfile) + + +@pytest.fixture +def base_config(tmp_path): + filepath = str(tmp_path / "cars.csv") + return { + "trains": {"type": "MemoryDataSet"}, + "cars": { + "type": "pandas.CSVDataSet", + "filepath": filepath, + "save_args": {"index": True}, + }, + } + + +@pytest.fixture +def local_config(tmp_path): + filepath = str(tmp_path / "cars.csv") + return { + "cars": { + "type": "pandas.CSVDataSet", + "filepath": filepath, + "save_args": {"index": False}, + }, + "boats": {"type": "MemoryDataSet"}, + } + + +@pytest.fixture +def create_config_dir(tmp_path, base_config, local_config): + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" + local_catalog = tmp_path / _DEFAULT_RUN_ENV / "catalog.yml" + parameters = tmp_path / _BASE_ENV / "parameters.json" + project_parameters = dict(param1=1, param2=2) + + _write_yaml(proj_catalog, base_config) + _write_yaml(local_catalog, local_config) + _write_json(parameters, project_parameters) + + +@pytest.fixture +def proj_catalog(tmp_path, base_config): + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" + _write_yaml(proj_catalog, base_config) + + +@pytest.fixture +def proj_catalog_nested(tmp_path): + path = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" + _write_yaml(path, {"nested": {"type": "MemoryDataSet"}}) + + +use_config_dir = pytest.mark.usefixtures("create_config_dir") +use_proj_catalog = pytest.mark.usefixtures("proj_catalog") + + +class TestOmegaConfLoader: + @use_config_dir + def test_load_core_config_dict_syntax(self, tmp_path): + """Make sure core config can be fetched with a dict [] access.""" + conf = OmegaConfLoader(str(tmp_path)) + params = conf["parameters"] + catalog = conf["catalog"] + + assert params["param1"] 
== 1 + assert catalog["trains"]["type"] == "MemoryDataSet" + + @use_config_dir + def test_load_core_config_get_syntax(self, tmp_path): + """Make sure core config can be fetched with .get()""" + conf = OmegaConfLoader(str(tmp_path)) + params = conf.get("parameters") + catalog = conf.get("catalog") + + assert params["param1"] == 1 + assert catalog["trains"]["type"] == "MemoryDataSet" + + @use_config_dir + def test_load_local_config_overrides_base(self, tmp_path): + """Make sure that configs from `local/` override the ones + from `base/`""" + conf = OmegaConfLoader(str(tmp_path)) + params = conf["parameters"] + catalog = conf["catalog"] + + assert params["param1"] == 1 + assert catalog["trains"]["type"] == "MemoryDataSet" + assert catalog["cars"]["type"] == "pandas.CSVDataSet" + assert catalog["boats"]["type"] == "MemoryDataSet" + assert not catalog["cars"]["save_args"]["index"] + + @use_proj_catalog + def test_load_base_config(self, tmp_path, base_config): + """Test config loading if `local/` directory is empty""" + (tmp_path / _DEFAULT_RUN_ENV).mkdir(exist_ok=True) + catalog = OmegaConfLoader(str(tmp_path))["catalog"] + assert catalog == base_config + + @use_proj_catalog + def test_duplicate_patterns(self, tmp_path, base_config): + """Test config loading if the glob patterns cover the same file""" + (tmp_path / _DEFAULT_RUN_ENV).mkdir(exist_ok=True) + conf = OmegaConfLoader(str(tmp_path)) + catalog1 = conf["catalog"] + catalog2 = conf["catalog"] + assert catalog1 == catalog2 == base_config + + def test_subdirs_dont_exist(self, tmp_path, base_config): + """Check the error when config paths don't exist""" + pattern = ( + r"Given configuration path either does not exist " + r"or is not a valid directory\: {}" + ) + with pytest.raises(MissingConfigException, match=pattern.format(".*base")): + OmegaConfLoader(str(tmp_path))["catalog"] + with pytest.raises(MissingConfigException, match=pattern.format(".*local")): + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" + 
_write_yaml(proj_catalog, base_config) + OmegaConfLoader(str(tmp_path))["catalog"] + + @pytest.mark.usefixtures("create_config_dir", "proj_catalog", "proj_catalog_nested") + def test_nested(self, tmp_path): + """Test loading the config from subdirectories""" + config_loader = OmegaConfLoader(str(tmp_path)) + config_loader.default_run_env = "prod" + + prod_catalog = tmp_path / "prod" / "catalog.yml" + _write_yaml(prod_catalog, {}) + + catalog = config_loader["catalog"] + assert catalog.keys() == {"cars", "trains", "nested"} + assert catalog["cars"]["type"] == "pandas.CSVDataSet" + assert catalog["cars"]["save_args"]["index"] is True + assert catalog["nested"]["type"] == "MemoryDataSet" + + @use_config_dir + def test_nested_subdirs_duplicate(self, tmp_path, base_config): + """Check the error when the configs from subdirectories contain + duplicate keys""" + nested = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" + _write_yaml(nested, base_config) + + pattern = ( + r"Duplicate keys found in " + r"(.*catalog\.yml and .*nested\.yml|.*nested\.yml and .*catalog\.yml)" + r"\: cars, trains" + ) + with pytest.raises(ValueError, match=pattern): + OmegaConfLoader(str(tmp_path))["catalog"] + + @use_config_dir + def test_multiple_nested_subdirs_duplicates( + self, tmp_path, base_config, local_config + ): + """Check the error when several config files from subdirectories contain + duplicate keys""" + nested = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" + _write_yaml(nested, base_config) + + local = tmp_path / _BASE_ENV / "catalog" / "dir" / "local.yml" + _write_yaml(local, local_config) + + pattern_catalog_nested = ( + r"Duplicate keys found in " + r"(.*catalog\.yml and .*nested\.yml|.*nested\.yml and .*catalog\.yml)" + r"\: cars, trains" + ) + pattern_catalog_local = ( + r"Duplicate keys found in " + r"(.*catalog\.yml and .*local\.yml|.*local\.yml and .*catalog\.yml)" + r"\: cars" + ) + pattern_nested_local = ( + r"Duplicate keys found in " + 
+ r"(.*nested\.yml and .*local\.yml|.*local\.yml and .*nested\.yml)" + r"\: cars" + ) + + with pytest.raises(ValueError) as exc: + OmegaConfLoader(str(tmp_path))["catalog"] + assert re.search(pattern_catalog_nested, str(exc.value)) + assert re.search(pattern_catalog_local, str(exc.value)) + assert re.search(pattern_nested_local, str(exc.value)) + + @use_config_dir + def test_bad_config_syntax(self, tmp_path): + conf_path = tmp_path / _BASE_ENV + conf_path.mkdir(parents=True, exist_ok=True) + (conf_path / "catalog.yml").write_text("bad:\nconfig") + + pattern = f"Invalid YAML or JSON file {conf_path}" + with pytest.raises(ParserError, match=re.escape(pattern)): + OmegaConfLoader(str(tmp_path))["catalog"] + + def test_lots_of_duplicates(self, tmp_path): + data = {str(i): i for i in range(100)} + _write_yaml(tmp_path / _BASE_ENV / "catalog1.yml", data) + _write_yaml(tmp_path / _BASE_ENV / "catalog2.yml", data) + + conf = OmegaConfLoader(str(tmp_path)) + pattern = ( + r"Duplicate keys found in " + r"(.*catalog2\.yml and .*catalog1\.yml|.*catalog1\.yml and .*catalog2\.yml)" + r"\: .*\.\.\.$" + ) + with pytest.raises(ValueError, match=pattern): + conf["catalog"] + + @use_config_dir + def test_same_key_in_same_dir(self, tmp_path, base_config): + """Check the error if 2 files in the same config dir contain + the same top-level key""" + dup_json = tmp_path / _BASE_ENV / "catalog.json" + _write_json(dup_json, base_config) + + pattern = ( + r"Duplicate keys found in " + r"(.*catalog\.yml and .*catalog\.json|.*catalog\.json and .*catalog\.yml)" + r"\: cars, trains" + ) + with pytest.raises(ValueError, match=pattern): + OmegaConfLoader(str(tmp_path))["catalog"] + + @use_config_dir + def test_pattern_key_not_found(self, tmp_path): + """Check the error if no config patterns are registered for the given key""" + key = "non-existent-pattern" + pattern = f"No config patterns were found for '{key}' in your config loader" + with pytest.raises(KeyError, match=pattern): + 
OmegaConfLoader(str(tmp_path))[key] + + @use_config_dir + def test_cannot_load_non_yaml_or_json_files(self, tmp_path): + db_patterns = {"db": ["db*"]} + db_config_path = tmp_path / _BASE_ENV / "db.ini" + _write_dummy_ini(db_config_path) + + conf = OmegaConfLoader(str(tmp_path), config_patterns=db_patterns) + pattern = ( + r"No files of YAML or JSON format found in " + r".*base or " + r".*local " + r"matching the glob pattern\(s\): " + r"\[\'db\*\'\]" + ) + with pytest.raises(MissingConfigException, match=pattern): + conf["db"] + + @use_config_dir + def test_no_files_found(self, tmp_path): + """Check the error if no config files satisfy a given pattern""" + pattern = ( + r"No files of YAML or JSON format found in " + r".*base or " + r".*local " + r"matching the glob pattern\(s\): " + r"\[\'credentials\*\', \'credentials\*/\**\', \'\**/credentials\*\'\]" + ) + with pytest.raises(MissingConfigException, match=pattern): + OmegaConfLoader(str(tmp_path))["credentials"] + + def test_overlapping_patterns(self, tmp_path, mocker): + """Check that same configuration file is not loaded more than once.""" + _write_yaml( + tmp_path / _BASE_ENV / "catalog0.yml", + {"env": _BASE_ENV, "common": "common"}, + ) + _write_yaml( + tmp_path / "dev" / "catalog1.yml", {"env": "dev", "dev_specific": "wiz"} + ) + _write_yaml(tmp_path / "dev" / "user1" / "catalog2.yml", {"user1_c2": True}) + + catalog_patterns = { + "catalog": [ + "catalog*", + "catalog*/**", + "../**/user1/catalog2*", + "../**/catalog2*", + ] + } + + catalog = OmegaConfLoader( + conf_source=str(tmp_path), env="dev", config_patterns=catalog_patterns + )["catalog"] + expected_catalog = { + "env": "dev", + "common": "common", + "dev_specific": "wiz", + "user1_c2": True, + } + assert catalog == expected_catalog + + mocked_load = mocker.patch("omegaconf.OmegaConf.load") + expected_path = (tmp_path / "dev" / "user1" / "catalog2.yml").resolve() + assert mocked_load.called_once_with(expected_path) + + def 
test_yaml_parser_error(self, tmp_path): + conf_path = tmp_path / _BASE_ENV + conf_path.mkdir(parents=True, exist_ok=True) + + example_catalog = """ + example_iris_data: + type: pandas.CSVDataSet + filepath: data/01_raw/iris.csv + """ + + (conf_path / "catalog.yml").write_text(example_catalog) + + msg = ( + f"Invalid YAML or JSON file {conf_path / 'catalog.yml'}, unable to read" + f" line 3, position 10." + ) + with pytest.raises(ParserError, match=re.escape(msg)): + OmegaConfLoader(str(tmp_path))["catalog"] + + def test_customised_config_patterns(self, tmp_path): + config_loader = OmegaConfLoader( + conf_source=str(tmp_path), + config_patterns={ + "spark": ["spark*/"], + "parameters": ["params*", "params*/**", "**/params*"], + }, + ) + assert config_loader.config_patterns["catalog"] == [ + "catalog*", + "catalog*/**", + "**/catalog*", + ] + assert config_loader.config_patterns["spark"] == ["spark*/"] + assert config_loader.config_patterns["parameters"] == [ + "params*", + "params*/**", + "**/params*", + ] + + def test_destructive_merging_strategy(self, tmp_path): + mlflow_patterns = {"mlflow": ["mlflow*", "mlflow*/**", "**/mlflow*"]} + base_mlflow = tmp_path / _BASE_ENV / "mlflow.yml" + base_config = { + "tracking": { + "disable_tracking": {"pipelines": "[on_exit_notification]"}, + "experiment": { + "name": "name-of-local-experiment", + }, + "params": {"long_params_strategy": "tag"}, + } + } + local_mlflow = tmp_path / _DEFAULT_RUN_ENV / "mlflow.yml" + local_config = { + "tracking": { + "experiment": { + "name": "name-of-prod-experiment", + }, + } + } + + _write_yaml(base_mlflow, base_config) + _write_yaml(local_mlflow, local_config) + + conf = OmegaConfLoader(str(tmp_path), config_patterns=mlflow_patterns)["mlflow"] + + assert conf == { + "tracking": { + "experiment": { + "name": "name-of-prod-experiment", + }, + } + } + + @use_config_dir + def test_adding_extra_keys_to_confloader(self, tmp_path): + """Make sure extra keys can be added directly to the config 
loader instance.""" + conf = OmegaConfLoader(str(tmp_path)) + catalog = conf["catalog"] + conf["spark"] = {"spark_config": "emr.blabla"} + + assert catalog["trains"]["type"] == "MemoryDataSet" + assert conf["spark"] == {"spark_config": "emr.blabla"} + + @use_config_dir + def test_bypass_catalog_config_loading(self, tmp_path): + """Make sure core config loading can be bypassed by setting the key and values + directly on the config loader instance.""" + conf = OmegaConfLoader(str(tmp_path)) + conf["catalog"] = {"catalog_config": "something_new"} + + assert conf["catalog"] == {"catalog_config": "something_new"} diff --git a/tests/config/test_templated_config.py b/tests/config/test_templated_config.py index a78c0dc926..2dd4bb6b6c 100644 --- a/tests/config/test_templated_config.py +++ b/tests/config/test_templated_config.py @@ -480,3 +480,25 @@ def test_customised_patterns(self, tmp_path): "**/catalog*", ] assert config_loader.config_patterns["spark"] == ["spark*/"] + + @pytest.mark.usefixtures("proj_catalog_param") + def test_adding_extra_keys_to_confloader(self, tmp_path, template_config): + """Make sure extra keys can be added directly to the config loader instance.""" + config_loader = TemplatedConfigLoader( + str(tmp_path), globals_dict=template_config + ) + config_loader.default_run_env = "" + catalog = config_loader["catalog"] + config_loader["spark"] = {"spark_config": "emr.blabla"} + + assert catalog["boats"]["type"] == "SparkDataSet" + assert config_loader["spark"] == {"spark_config": "emr.blabla"} + + @pytest.mark.usefixtures("proj_catalog_param") + def test_bypass_catalog_config_loading(self, tmp_path): + """Make sure core config loading can be bypassed by setting the key and values + directly on the config loader instance.""" + conf = TemplatedConfigLoader(str(tmp_path)) + conf["catalog"] = {"catalog_config": "something_new"} + + assert conf["catalog"] == {"catalog_config": "something_new"}