From d1fa3b71d558a776c06f628f17457437b3b4d9d3 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 29 Nov 2022 17:40:39 +0000 Subject: [PATCH 01/18] Add OmegaConfLoader with tests Signed-off-by: Merel Theisen --- dependency/requirements.txt | 1 + kedro/config/__init__.py | 2 + kedro/config/common.py | 17 +- kedro/config/omegaconf_config.py | 236 ++++++++++++++++++ tests/config/test_omegaconf_config.py | 346 ++++++++++++++++++++++++++ 5 files changed, 598 insertions(+), 4 deletions(-) create mode 100644 kedro/config/omegaconf_config.py create mode 100644 tests/config/test_omegaconf_config.py diff --git a/dependency/requirements.txt b/dependency/requirements.txt index 7a60b430e7..c442b1045d 100644 --- a/dependency/requirements.txt +++ b/dependency/requirements.txt @@ -10,6 +10,7 @@ importlib-metadata>=3.6; python_version >= '3.8' importlib_metadata>=3.6, <5.0; python_version < '3.8' # The "selectable" entry points were introduced in `importlib_metadata` 3.6 and Python 3.10. Bandit on Python 3.7 relies on a library with `importlib_metadata` < 5.0 importlib_resources>=1.3 # The `files()` API was introduced in `importlib_resources` 1.3 and Python 3.9. jmespath>=0.9.5, <1.0 +omegaconf~=2.2 pip-tools~=6.10 pluggy~=1.0.0 PyYAML>=4.2, <7.0 diff --git a/kedro/config/__init__.py b/kedro/config/__init__.py index f632c4039a..c76508d5c1 100644 --- a/kedro/config/__init__.py +++ b/kedro/config/__init__.py @@ -8,6 +8,7 @@ MissingConfigException, ) from .config import ConfigLoader +from .omegaconf_config import OmegaConfLoader from .templated_config import TemplatedConfigLoader __all__ = [ @@ -16,4 +17,5 @@ "ConfigLoader", "MissingConfigException", "TemplatedConfigLoader", + "OmegaConfLoader", ] diff --git a/kedro/config/common.py b/kedro/config/common.py index 0b5d2b05e9..65e8d18e77 100644 --- a/kedro/config/common.py +++ b/kedro/config/common.py @@ -175,8 +175,9 @@ def _lookup_config_filepaths( patterns: Iterable[str], processed_files: Set[Path], logger: Any, + omegaconf: bool = False, ) -> List[Path]: - config_files = _path_lookup(conf_path, patterns) + config_files = _path_lookup(conf_path, patterns, omegaconf) seen_files = config_files & processed_files if seen_files: @@ -222,7 +223,11 @@ def _check_duplicate_keys( raise ValueError(f"Duplicate keys found in {filepath} and:\n- {dup_str}") -def _path_lookup(conf_path: Path, patterns: Iterable[str]) -> Set[Path]: +def _path_lookup( + conf_path: Path, + patterns: Iterable[str], + omegaconf: bool = False, +) -> Set[Path]: """Return a set of all configuration files from ``conf_path`` or its subdirectories, which satisfy a given list of glob patterns. @@ -242,7 +247,11 @@ def _path_lookup(conf_path: Path, patterns: Iterable[str]) -> Set[Path]: # therefore iglob is used instead for each in iglob(str(conf_path / pattern), recursive=True): path = Path(each).resolve() - if path.is_file() and path.suffix in SUPPORTED_EXTENSIONS: - config_files.add(path) + if omegaconf: + if path.is_file() and path.suffix in [".yml", ".yaml", ".json"]: + config_files.add(path) + else: + if path.is_file() and path.suffix in SUPPORTED_EXTENSIONS: + config_files.add(path) return config_files diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py new file mode 100644 index 0000000000..618a2434ef --- /dev/null +++ b/kedro/config/omegaconf_config.py @@ -0,0 +1,236 @@ +"""This module provides ``kedro.config`` with the functionality to load one +or more configuration files of yaml or json type from specified paths through OmegaConf. +""" +import logging +from pathlib import Path +from typing import AbstractSet, Any, Dict, Iterable, List, Set # noqa + +from omegaconf import OmegaConf +from yaml.parser import ParserError +from yaml.scanner import ScannerError + +from kedro.config import AbstractConfigLoader, MissingConfigException +from kedro.config.common import ( + _check_duplicate_keys, + _lookup_config_filepaths, + _remove_duplicates, +) + +_config_logger = logging.getLogger(__name__) + + +class OmegaConfLoader(AbstractConfigLoader): + """Recursively scan directories (config paths) contained in ``conf_source`` for + configuration files with a ``yaml``, ``yml`` or ``json`` extension, load and merge + them through OmegaConf and return them in the form of a config dictionary. + + The first processed config path is the ``base`` directory inside + ``conf_source``. The optional ``env`` argument can be used to specify a + subdirectory of ``conf_source`` to process as a config path after ``base``. + + When the same top-level key appears in any 2 config files located in + the same (sub)directory, a ``ValueError`` is raised. + + When the same key appears in any 2 config files located in different + (sub)directories, the last processed config path takes precedence + and overrides this key. + + You can access the different configurations as follows: + :: + + >>> import logging.config + >>> from kedro.config import OmegaConfLoader + >>> from kedro.framework.project import settings + >>> + >>> conf_path = str(project_path / settings.CONF_SOURCE) + >>> conf_loader = OmegaConfLoader(conf_source=conf_path, env="local") + >>> + >>> conf_logging = conf_loader["logging"] + >>> logging.config.dictConfig(conf_logging) # set logging conf + >>> + >>> conf_catalog = conf_loader["catalog"] + >>> conf_params = conf_loader["parameters"] + + + """ + + def __init__( + self, + conf_source: str, + env: str = None, + runtime_params: Dict[str, Any] = None, + config_patterns: Dict[str, List[str]] = None, + *, + base_env: str = "base", + default_run_env: str = "local", + ): + """Instantiates a ``OmegaConfLoader``. + + Args: + conf_source: Path to use as root directory for loading configuration. + env: Environment that will take precedence over base. + runtime_params: Extra parameters passed to a Kedro run. + config_patterns: Regex patterns that specify the naming convention for configuration + files so they can be loaded. Can be customised by supplying config_patterns as + in `CONFIG_LOADER_ARGS` in `settings.py`. + base_env: Name of the base environment. Defaults to `"base"`. + This is used in the `conf_paths` property method to construct + the configuration paths. + default_run_env: Name of the base environment. Defaults to `"local"`. + This is used in the `conf_paths` property method to construct + the configuration paths. Can be overriden by supplying the `env` argument. + """ + self.base_env = base_env + self.default_run_env = default_run_env + + self.config_patterns = { + "catalog": ["catalog*", "catalog*/**", "**/catalog*"], + "parameters": ["parameters*", "parameters*/**", "**/parameters*"], + "credentials": ["credentials*", "credentials*/**", "**/credentials*"], + "logging": ["logging*", "logging*/**", "**/logging*"], + } + self.config_patterns.update(config_patterns or {}) + + super().__init__( + conf_source=conf_source, + env=env, + runtime_params=runtime_params, + ) + + def __getitem__(self, key): + return self.get(*self.config_patterns[key]) + + def __repr__(self): # pragma: no cover + return ( + f"OmegaConfLoader(conf_source={self.conf_source}, env={self.env}, " + f"config_patterns={self.config_patterns})" + ) + + @property + def conf_paths(self): + """Property method to return deduplicated configuration paths.""" + return _remove_duplicates(self._build_conf_paths()) + + def get(self, *patterns: str) -> Dict[str, Any]: # type: ignore + return _get_config_from_patterns_omegaconf( + conf_paths=self.conf_paths, patterns=list(patterns) + ) + + def _build_conf_paths(self) -> Iterable[str]: + run_env = self.env or self.default_run_env + return [ + str(Path(self.conf_source) / self.base_env), + str(Path(self.conf_source) / run_env), + ] + + +def _get_config_from_patterns_omegaconf( + conf_paths: Iterable[str], + patterns: Iterable[str] = None, +) -> Dict[str, Any]: + """Recursively scan for configuration files, load and merge them using OmegaConf, and + return them in the form of a config dictionary. + + Args: + conf_paths: List of configuration paths to directories + patterns: Glob patterns to match. Files, which names match + any of the specified patterns, will be processed. + + Raises: + ValueError: If 2 or more configuration files inside the same + config path (or its subdirectories) contain the same + top-level key. + MissingConfigException: If no configuration files exist within + a specified config path. + ParserError: If configuration is poorly formatted and + cannot be loaded. + + Returns: + Dict[str, Any]: A Python dictionary with the combined + configuration from all configuration files. + """ + + if not patterns: + raise ValueError( + "'patterns' must contain at least one glob " + "pattern to match config filenames against." + ) + + config = {} # type: Dict[str, Any] + processed_files = set() # type: Set[Path] + + for conf_path in conf_paths: + if not Path(conf_path).is_dir(): + raise ValueError( + f"Given configuration path either does not exist " + f"or is not a valid directory: {conf_path}" + ) + + config_filepaths = _lookup_config_filepaths( + Path(conf_path), patterns, processed_files, _config_logger, True + ) + + new_conf = _load_configs_omegaconf(config_filepaths=config_filepaths) + + common_keys = config.keys() & new_conf.keys() + if common_keys: + sorted_keys = ", ".join(sorted(common_keys)) + msg = ( + "Config from path '%s' will override the following " + "existing top-level config keys: %s" + ) + _config_logger.info(msg, conf_path, sorted_keys) + + config.update(new_conf) + processed_files |= set(config_filepaths) + + if not processed_files: + raise MissingConfigException( + f"No files of YAML or JSON format found in {conf_paths} matching the glob " + f"pattern(s): {patterns}" + ) + return config + + +def _load_configs_omegaconf(config_filepaths: List[Path]) -> Dict[str, Any]: + """Recursively load and merge all configuration files using OmegaConf, which satisfy + a given list of glob patterns from a specific path. + + Args: + config_filepaths: Configuration files sorted in the order of precedence. + + Raises: + ValueError: If 2 or more configuration files contain the same key(s). + BadConfigException: If configuration is poorly formatted and + cannot be loaded. + + Returns: + Resulting configuration dictionary. + + """ + + config = {} + aggregate_config = [] + seen_file_to_keys = {} # type: Dict[Path, AbstractSet[str]] + + for config_filepath in config_filepaths: + try: + single_config = OmegaConf.load(config_filepath) + config = single_config + except (ParserError, ScannerError) as exc: + assert exc.problem_mark is not None + line = exc.problem_mark.line + cursor = exc.problem_mark.column + raise ParserError( + f"Invalid YAML file {config_filepath}, unable to read line {line}, " + f"position {cursor}." + ) from exc + + _check_duplicate_keys(seen_file_to_keys, config_filepath, dict(single_config)) + seen_file_to_keys[config_filepath] = single_config.keys() + aggregate_config.append(single_config) + + if len(aggregate_config) > 1: + return dict(OmegaConf.merge(*aggregate_config)) + + return config diff --git a/tests/config/test_omegaconf_config.py b/tests/config/test_omegaconf_config.py new file mode 100644 index 0000000000..769c3c25a4 --- /dev/null +++ b/tests/config/test_omegaconf_config.py @@ -0,0 +1,346 @@ +import configparser +import json +import re +from pathlib import Path +from typing import Dict + +import pytest +import yaml +from yaml.parser import ParserError + +from kedro.config import MissingConfigException, OmegaConfLoader + +_DEFAULT_RUN_ENV = "local" +_BASE_ENV = "base" + + +def _write_yaml(filepath: Path, config: Dict): + filepath.parent.mkdir(parents=True, exist_ok=True) + yaml_str = yaml.dump(config) + filepath.write_text(yaml_str) + + +def _write_json(filepath: Path, config: Dict): + filepath.parent.mkdir(parents=True, exist_ok=True) + json_str = json.dumps(config) + filepath.write_text(json_str) + + +def _write_dummy_ini(filepath: Path): + filepath.parent.mkdir(parents=True, exist_ok=True) + config = configparser.ConfigParser() + config["prod"] = {"url": "postgresql://user:pass@url_prod/db"} + config["staging"] = {"url": "postgresql://user:pass@url_staging/db"} + with filepath.open("wt") as configfile: # save + config.write(configfile) + + +@pytest.fixture +def base_config(tmp_path): + filepath = str(tmp_path / "cars.csv") + return { + "trains": {"type": "MemoryDataSet"}, + "cars": { + "type": "pandas.CSVDataSet", + "filepath": filepath, + "save_args": {"index": True}, + }, + } + + +@pytest.fixture +def local_config(tmp_path): + filepath = str(tmp_path / "cars.csv") + return { + "cars": { + "type": "pandas.CSVDataSet", + "filepath": filepath, + "save_args": {"index": False}, + }, + "boats": {"type": "MemoryDataSet"}, + } + + +@pytest.fixture +def create_config_dir(tmp_path, base_config, local_config): + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" + local_catalog = tmp_path / _DEFAULT_RUN_ENV / "catalog.yml" + parameters = tmp_path / _BASE_ENV / "parameters.json" + project_parameters = dict(param1=1, param2=2) + + _write_yaml(proj_catalog, base_config) + _write_yaml(local_catalog, local_config) + _write_json(parameters, project_parameters) + + +@pytest.fixture +def proj_catalog(tmp_path, base_config): + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" + _write_yaml(proj_catalog, base_config) + + +@pytest.fixture +def proj_catalog_nested(tmp_path): + path = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" + _write_yaml(path, {"nested": {"type": "MemoryDataSet"}}) + + +use_config_dir = pytest.mark.usefixtures("create_config_dir") +use_proj_catalog = pytest.mark.usefixtures("proj_catalog") + + +class TestOmegaConfLoader: + @use_config_dir + def test_load_core_config_dict_get(self, tmp_path): + """Make sure core config can be fetched with a dict [] access.""" + conf = OmegaConfLoader(str(tmp_path)) + params = conf["parameters"] + catalog = conf["catalog"] + + assert params["param1"] == 1 + assert catalog["trains"]["type"] == "MemoryDataSet" + assert catalog["cars"]["type"] == "pandas.CSVDataSet" + assert catalog["boats"]["type"] == "MemoryDataSet" + assert not catalog["cars"]["save_args"]["index"] + + @use_config_dir + def test_load_local_config(self, tmp_path): + """Make sure that configs from `local/` override the ones + from `base/`""" + conf = OmegaConfLoader(str(tmp_path)) + params = conf.get("parameters*") + catalog = conf.get("catalog*") + + assert params["param1"] == 1 + assert catalog["trains"]["type"] == "MemoryDataSet" + assert catalog["cars"]["type"] == "pandas.CSVDataSet" + assert catalog["boats"]["type"] == "MemoryDataSet" + assert not catalog["cars"]["save_args"]["index"] + + @use_proj_catalog + def test_load_base_config(self, tmp_path, base_config): + """Test config loading if `local/` directory is empty""" + (tmp_path / _DEFAULT_RUN_ENV).mkdir(exist_ok=True) + catalog = OmegaConfLoader(str(tmp_path)).get("catalog*.yml") + assert catalog == base_config + + @use_proj_catalog + def test_duplicate_patterns(self, tmp_path, base_config): + """Test config loading if the glob patterns cover the same file""" + (tmp_path / _DEFAULT_RUN_ENV).mkdir(exist_ok=True) + conf = OmegaConfLoader(str(tmp_path)) + catalog1 = conf.get("catalog*.yml", "catalog*.yml") + catalog2 = conf.get("catalog*.yml", "catalog.yml") + assert catalog1 == catalog2 == base_config + + def test_subdirs_dont_exist(self, tmp_path, base_config): + """Check the error when config paths don't exist""" + pattern = ( + r"Given configuration path either does not exist " + r"or is not a valid directory\: {}" + ) + with pytest.raises(ValueError, match=pattern.format(".*base")): + OmegaConfLoader(str(tmp_path)).get("catalog*") + with pytest.raises(ValueError, match=pattern.format(".*local")): + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" + _write_yaml(proj_catalog, base_config) + OmegaConfLoader(str(tmp_path)).get("catalog*") + + @pytest.mark.usefixtures("create_config_dir", "proj_catalog", "proj_catalog_nested") + def test_nested(self, tmp_path): + """Test loading the config from subdirectories""" + config_loader = OmegaConfLoader(str(tmp_path)) + config_loader.default_run_env = "" + catalog = config_loader.get("catalog*", "catalog*/**") + assert catalog.keys() == {"cars", "trains", "nested"} + assert catalog["cars"]["type"] == "pandas.CSVDataSet" + assert catalog["cars"]["save_args"]["index"] is True + assert catalog["nested"]["type"] == "MemoryDataSet" + + @use_config_dir + def test_nested_subdirs_duplicate(self, tmp_path, base_config): + """Check the error when the configs from subdirectories contain + duplicate keys""" + nested = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" + _write_yaml(nested, base_config) + + pattern = ( + r"Duplicate keys found in .*catalog\.yml " + r"and\:\n\- .*nested\.yml\: cars, trains" + ) + with pytest.raises(ValueError, match=pattern): + OmegaConfLoader(str(tmp_path)).get("catalog*", "catalog*/**") + + @use_config_dir + def test_bad_config_syntax(self, tmp_path): + conf_path = tmp_path / _BASE_ENV + conf_path.mkdir(parents=True, exist_ok=True) + (conf_path / "catalog.yml").write_text("bad:\nconfig") + + pattern = f"Invalid YAML file {conf_path}" + with pytest.raises(ParserError, match=re.escape(pattern)): + OmegaConfLoader(str(tmp_path)).get("catalog*.yml") + + def test_lots_of_duplicates(self, tmp_path): + data = {str(i): i for i in range(100)} + _write_yaml(tmp_path / _BASE_ENV / "catalog1.yml", data) + _write_yaml(tmp_path / _BASE_ENV / "catalog2.yml", data) + + conf = OmegaConfLoader(str(tmp_path)) + pattern = r"^Duplicate keys found in .*catalog2\.yml and\:\n\- .*catalog1\.yml\: .*\.\.\.$" + with pytest.raises(ValueError, match=pattern): + conf.get("**/catalog*") + + @use_config_dir + def test_same_key_in_same_dir(self, tmp_path, base_config): + """Check the error if 2 files in the same config dir contain + the same top-level key""" + dup_json = tmp_path / _BASE_ENV / "catalog.json" + _write_json(dup_json, base_config) + + pattern = ( + r"Duplicate keys found in .*catalog\.yml " + r"and\:\n\- .*catalog\.json\: cars, trains" + ) + with pytest.raises(ValueError, match=pattern): + OmegaConfLoader(str(tmp_path)).get("catalog*") + + @use_config_dir + def test_empty_patterns(self, tmp_path): + """Check the error if no config patterns were specified""" + pattern = ( + r"'patterns' must contain at least one glob pattern " + r"to match config filenames against" + ) + with pytest.raises(ValueError, match=pattern): + OmegaConfLoader(str(tmp_path)).get() + + @use_config_dir + def test_no_files_found(self, tmp_path): + """Check the error if no config files satisfy a given pattern""" + pattern = ( + r"No files of YAML or JSON format found in " + r"\[\'.*base\', " + r"\'.*local\'\] " + r"matching the glob pattern\(s\): " + r"\[\'non\-existent\-pattern\'\]" + ) + with pytest.raises(MissingConfigException, match=pattern): + OmegaConfLoader(str(tmp_path)).get("non-existent-pattern") + + @use_config_dir + def test_cannot_load_non_yaml_or_json_files(self, tmp_path): + db_config_path = tmp_path / _BASE_ENV / "db.ini" + _write_dummy_ini(db_config_path) + + conf = OmegaConfLoader(str(tmp_path)) + pattern = ( + r"No files of YAML or JSON format found in " + r"\[\'.*base\', " + r"\'.*local\'\] " + r"matching the glob pattern\(s\): " + r"\[\'db\*\'\]" + ) + with pytest.raises(MissingConfigException, match=pattern): + conf.get("db*") + + @use_config_dir + def test_key_not_found_dict_get(self, tmp_path): + """Check the error if no config files satisfy a given pattern""" + with pytest.raises(KeyError): + # pylint: disable=expression-not-assigned + OmegaConfLoader(str(tmp_path))["non-existent-pattern"] + + @use_config_dir + def test_no_files_found_dict_get(self, tmp_path): + """Check the error if no config files satisfy a given pattern""" + pattern = ( + r"No files of YAML or JSON format found in " + r"\[\'.*base\', " + r"\'.*local\'\] " + r"matching the glob pattern\(s\): " + r"\[\'credentials\*\', \'credentials\*/\**\', \'\**/credentials\*\'\]" + ) + with pytest.raises(MissingConfigException, match=pattern): + # pylint: disable=expression-not-assigned + OmegaConfLoader(str(tmp_path))["credentials"] + + def test_duplicate_paths(self, tmp_path, caplog): + """Check that trying to load the same environment config multiple times logs a + warning and skips the reload""" + _write_yaml(tmp_path / _BASE_ENV / "catalog.yml", {"env": _BASE_ENV, "a": "a"}) + config_loader = OmegaConfLoader(str(tmp_path), _BASE_ENV) + + with pytest.warns(UserWarning, match="Duplicate environment detected"): + config_paths = config_loader.conf_paths + assert config_paths == [str(tmp_path / _BASE_ENV)] + + config_loader.get("catalog*", "catalog*/**") + log_messages = [record.getMessage() for record in caplog.records] + assert not log_messages + + def test_overlapping_patterns(self, tmp_path, caplog): + """Check that same configuration file is not loaded more than once.""" + _write_yaml( + tmp_path / _BASE_ENV / "catalog0.yml", + {"env": _BASE_ENV, "common": "common"}, + ) + _write_yaml( + tmp_path / "dev" / "catalog1.yml", {"env": "dev", "dev_specific": "wiz"} + ) + _write_yaml(tmp_path / "dev" / "user1" / "catalog2.yml", {"user1_c2": True}) + _write_yaml(tmp_path / "dev" / "user1" / "catalog3.yml", {"user1_c3": True}) + + catalog = OmegaConfLoader(str(tmp_path), "dev").get( + "catalog*", "catalog*/**", "user1/catalog2*", "../**/catalog2*" + ) + expected_catalog = { + "env": "dev", + "common": "common", + "dev_specific": "wiz", + "user1_c2": True, + } + assert catalog == expected_catalog + + log_messages = [record.getMessage() for record in caplog.records] + expected_path = (tmp_path / "dev" / "user1" / "catalog2.yml").resolve() + expected_message = ( + f"Config file(s): {expected_path} already processed, skipping loading..." + ) + assert expected_message in log_messages + + def test_yaml_parser_error(self, tmp_path): + conf_path = tmp_path / _BASE_ENV + conf_path.mkdir(parents=True, exist_ok=True) + + example_catalog = """ + example_iris_data: + type: pandas.CSVDataSet + filepath: data/01_raw/iris.csv + """ + + (conf_path / "catalog.yml").write_text(example_catalog) + + msg = f"Invalid YAML file {conf_path / 'catalog.yml'}, unable to read line 3, position 10." + with pytest.raises(ParserError, match=re.escape(msg)): + OmegaConfLoader(str(tmp_path)).get("catalog*.yml") + + def test_customised_config_patterns(self, tmp_path): + config_loader = OmegaConfLoader( + conf_source=str(tmp_path), + config_patterns={ + "spark": ["spark*/"], + "parameters": ["params*", "params*/**", "**/params*"], + }, + ) + assert config_loader.config_patterns["catalog"] == [ + "catalog*", + "catalog*/**", + "**/catalog*", + ] + assert config_loader.config_patterns["spark"] == ["spark*/"] + assert config_loader.config_patterns["parameters"] == [ + "params*", + "params*/**", + "**/params*", + ] From df30001fa5b9598e437ffe2168dbe7572df9375c Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 29 Nov 2022 17:44:51 +0000 Subject: [PATCH 02/18] Use OmegaConf for merging configuration from different dirs Signed-off-by: Merel Theisen --- kedro/config/omegaconf_config.py | 15 +++--------- tests/config/test_omegaconf_config.py | 35 +++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index 618a2434ef..2a0fce2186 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -156,7 +156,7 @@ def _get_config_from_patterns_omegaconf( "pattern to match config filenames against." ) - config = {} # type: Dict[str, Any] + aggregate_config = [] processed_files = set() # type: Set[Path] for conf_path in conf_paths: @@ -172,16 +172,7 @@ def _get_config_from_patterns_omegaconf( new_conf = _load_configs_omegaconf(config_filepaths=config_filepaths) - common_keys = config.keys() & new_conf.keys() - if common_keys: - sorted_keys = ", ".join(sorted(common_keys)) - msg = ( - "Config from path '%s' will override the following " - "existing top-level config keys: %s" - ) - _config_logger.info(msg, conf_path, sorted_keys) - - config.update(new_conf) + aggregate_config.append(new_conf) processed_files |= set(config_filepaths) if not processed_files: @@ -189,7 +180,7 @@ def _get_config_from_patterns_omegaconf( f"No files of YAML or JSON format found in {conf_paths} matching the glob " f"pattern(s): {patterns}" ) - return config + return dict(OmegaConf.merge(*aggregate_config)) def _load_configs_omegaconf(config_filepaths: List[Path]) -> Dict[str, Any]: diff --git a/tests/config/test_omegaconf_config.py b/tests/config/test_omegaconf_config.py index 769c3c25a4..062343fb4d 100644 --- a/tests/config/test_omegaconf_config.py +++ b/tests/config/test_omegaconf_config.py @@ -344,3 +344,38 @@ def test_customised_config_patterns(self, tmp_path): "params*/**", "**/params*", ] + + def test_merging_strategy_only_overwrites_specified_key(self, tmp_path): + base_mlflow = tmp_path / _BASE_ENV / "mlflow.yml" + base_config = { + "tracking": { + "disable_tracking": {"pipelines": "[on_exit_notification]"}, + "experiment": { + "name": "name-of-local-experiment", + }, + "params": {"long_params_strategy": "tag"}, + } + } + local_mlflow = tmp_path / _DEFAULT_RUN_ENV / "mlflow.yml" + local_config = { + "tracking": { + "experiment": { + "name": "name-of-prod-experiment", + }, + } + } + + _write_yaml(base_mlflow, base_config) + _write_yaml(local_mlflow, local_config) + + conf = OmegaConfLoader(str(tmp_path)).get("mlflow*") + + assert conf == { + "tracking": { + "disable_tracking": {"pipelines": "[on_exit_notification]"}, + "experiment": { + "name": "name-of-prod-experiment", + }, + "params": {"long_params_strategy": "tag"}, + } + } From a846ce7f8943546b1dcfda3f803cc2c9543fd01f Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 29 Nov 2022 17:50:35 +0000 Subject: [PATCH 03/18] Clear built-in resolvers from OmegaConf Signed-off-by: Merel Theisen --- kedro/config/omegaconf_config.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index 2a0fce2186..d00b9b000e 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -156,6 +156,10 @@ def _get_config_from_patterns_omegaconf( "pattern to match config filenames against." ) + # In the first iteration of the OmegaConfLoader we'll keep the resolver turned-off. + # It's easier to introduce them step by step, but removing them would be a breaking change. + _clear_omegaconf_resolvers() + aggregate_config = [] processed_files = set() # type: Set[Path] @@ -225,3 +229,14 @@ def _load_configs_omegaconf(config_filepaths: List[Path]) -> Dict[str, Any]: return dict(OmegaConf.merge(*aggregate_config)) return config + + +def _clear_omegaconf_resolvers(): + """Clear the built-in OmegaConf resolvers.""" + OmegaConf.clear_resolver("oc.env") + OmegaConf.clear_resolver("oc.create") + OmegaConf.clear_resolver("oc.deprecated") + OmegaConf.clear_resolver("oc.decode") + OmegaConf.clear_resolver("oc.select") + OmegaConf.clear_resolver("oc.dict.keys") + OmegaConf.clear_resolver("oc.dict.values") From 3085841e9ffea53b628702007ded2b5952539517 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Wed, 30 Nov 2022 11:48:15 +0000 Subject: [PATCH 04/18] Improve class docstring Signed-off-by: Merel Theisen --- kedro/config/omegaconf_config.py | 24 ++++++++++++++++++++++-- kedro/config/templated_config.py | 4 +--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index d00b9b000e..6ba9521367 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -22,7 +22,8 @@ class OmegaConfLoader(AbstractConfigLoader): """Recursively scan directories (config paths) contained in ``conf_source`` for configuration files with a ``yaml``, ``yml`` or ``json`` extension, load and merge - them through OmegaConf and return them in the form of a config dictionary. + them through ``OmegaConf`` (https://omegaconf.readthedocs.io/) + and return them in the form of a config dictionary. The first processed config path is the ``base`` directory inside ``conf_source``. The optional ``env`` argument can be used to specify a @@ -33,7 +34,9 @@ class OmegaConfLoader(AbstractConfigLoader): When the same key appears in any 2 config files located in different (sub)directories, the last processed config path takes precedence - and overrides this key. + and overrides this key. You can find more information about how ``OmegaConf`` + does merging of configuration in their documentation + https://omegaconf.readthedocs.io/en/2.2_branch/usage.html#merging-configurations You can access the different configurations as follows: :: @@ -51,6 +54,23 @@ class OmegaConfLoader(AbstractConfigLoader): >>> conf_catalog = conf_loader["catalog"] >>> conf_params = conf_loader["parameters"] + ``OmegaConf`` supports variable interpolation in configuration + https://omegaconf.readthedocs.io/en/2.2_branch/usage.html#merging-configurations. It is + recommended to use this instead of yaml anchors with the ``OmegaConfLoader``. + + This version of the ``OmegaConfLoader`` does not support any of the built-in ``OmegaConf`` + resolvers. Support for resolvers might be added in future versions. + + To use this class, change the setting for the `CONFIG_LOADER_CLASS` constant + in `settings.py`. + + Example: + :: + + >>> # in settings.py + >>> from kedro.config import OmegaConfLoader + >>> + >>> CONFIG_LOADER_CLASS = OmegaConfLoader """ diff --git a/kedro/config/templated_config.py b/kedro/config/templated_config.py index d70b0c8cc2..3d2c85dc07 100644 --- a/kedro/config/templated_config.py +++ b/kedro/config/templated_config.py @@ -172,9 +172,7 @@ def get(self, *patterns: str) -> Dict[str, Any]: # type: ignore configuration files. **Note:** any keys that start with `_` will be ignored. String values wrapped in `${...}` will be replaced with the result of the corresponding JMESpath - expression evaluated against globals (see `__init` for more - configuration files. **Note:** any keys that start with `_` - details). + expression evaluated against globals. Raises: ValueError: malformed config found. From 26fb1ed06a6d8e3b711e623b3ac6e44573d6399e Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Thu, 8 Dec 2022 11:13:06 +0000 Subject: [PATCH 05/18] Refactor loading and processing of config files Signed-off-by: Merel Theisen --- kedro/config/common.py | 72 ++++++++++++-------------------- kedro/config/omegaconf_config.py | 11 ++++- 2 files changed, 36 insertions(+), 47 deletions(-) diff --git a/kedro/config/common.py b/kedro/config/common.py index 65e8d18e77..c3b2b12d33 100644 --- a/kedro/config/common.py +++ b/kedro/config/common.py @@ -5,7 +5,7 @@ import logging from glob import iglob from pathlib import Path -from typing import AbstractSet, Any, Dict, Iterable, List, Set +from typing import AbstractSet, Any, Dict, Iterable, List, Set, Union from warnings import warn from yaml.parser import ParserError @@ -71,9 +71,15 @@ def _get_config_from_patterns( f"or is not a valid directory: {conf_path}" ) - config_filepaths = _lookup_config_filepaths( - Path(conf_path), patterns, processed_files, _config_logger + config_files = _lookup_config_filepaths(Path(conf_path), patterns) + filtered_config_files = set() + for path in config_files: + if path.is_file() and path.suffix in SUPPORTED_EXTENSIONS: + filtered_config_files.add(path) + config_filepaths = _process_files( + filtered_config_files, processed_files, _config_logger ) + new_conf = _load_configs( config_filepaths=config_filepaths, ac_template=ac_template ) @@ -173,21 +179,31 @@ def _load_configs(config_filepaths: List[Path], ac_template: bool) -> Dict[str, def _lookup_config_filepaths( conf_path: Path, patterns: Iterable[str], - processed_files: Set[Path], - logger: Any, - omegaconf: bool = False, -) -> List[Path]: - config_files = _path_lookup(conf_path, patterns, omegaconf) +) -> Set[Union[Path, Any]]: + config_files = set() + conf_path = conf_path.resolve() - seen_files = config_files & processed_files + for pattern in patterns: + # `Path.glob()` ignores the files if pattern ends with "**", + # therefore iglob is used instead + for each in iglob(str(conf_path / pattern), recursive=True): + path = Path(each).resolve() + config_files.add(path) + return config_files + + +def _process_files( + filtered_config_files, processed_files: Set[Path], logger: Any +) -> List[Path]: + seen_files = filtered_config_files & processed_files if seen_files: logger.warning( "Config file(s): %s already processed, skipping loading...", ", ".join(str(seen) for seen in sorted(seen_files)), ) - config_files -= seen_files + filtered_config_files -= seen_files - return sorted(config_files) + return sorted(filtered_config_files) def _remove_duplicates(items: Iterable[str]): @@ -221,37 +237,3 @@ def _check_duplicate_keys( if duplicates: dup_str = "\n- ".join(duplicates) raise ValueError(f"Duplicate keys found in {filepath} and:\n- {dup_str}") - - -def _path_lookup( - conf_path: Path, - patterns: Iterable[str], - omegaconf: bool = False, -) -> Set[Path]: - """Return a set of all configuration files from ``conf_path`` or - its subdirectories, which satisfy a given list of glob patterns. - - Args: - conf_path: Path to configuration directory. - patterns: List of glob patterns to match the filenames against. - - Returns: - A set of paths to configuration files. - - """ - config_files = set() - conf_path = conf_path.resolve() - - for pattern in patterns: - # `Path.glob()` ignores the files if pattern ends with "**", - # therefore iglob is used instead - for each in iglob(str(conf_path / pattern), recursive=True): - path = Path(each).resolve() - if omegaconf: - if path.is_file() and path.suffix in [".yml", ".yaml", ".json"]: - config_files.add(path) - else: - if path.is_file() and path.suffix in SUPPORTED_EXTENSIONS: - config_files.add(path) - - return config_files diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index 6ba9521367..769c8b9fc3 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -13,6 +13,7 @@ from kedro.config.common import ( _check_duplicate_keys, _lookup_config_filepaths, + _process_files, _remove_duplicates, ) @@ -190,8 +191,14 @@ def _get_config_from_patterns_omegaconf( f"or is not a valid directory: {conf_path}" ) - config_filepaths = _lookup_config_filepaths( - Path(conf_path), patterns, processed_files, _config_logger, True + config_files = _lookup_config_filepaths(Path(conf_path), patterns) + filtered_config_files = set() + for path in config_files: + if path.is_file() and path.suffix in [".yml", ".yaml", ".json"]: + filtered_config_files.add(path) + + config_filepaths = _process_files( + filtered_config_files, processed_files, _config_logger ) new_conf = _load_configs_omegaconf(config_filepaths=config_filepaths) From 1c12a87259aca2b72be7277c46116da7c0e0d3de Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Fri, 9 Dec 2022 16:00:27 +0000 Subject: [PATCH 06/18] Revert changes to common and refactor omegaconf methods instead (step 1) Signed-off-by: Merel Theisen --- kedro/config/common.py | 63 +++++++------ kedro/config/omegaconf_config.py | 127 ++++++++++---------------- tests/config/test_omegaconf_config.py | 96 +++++++++---------- 3 files changed, 125 insertions(+), 161 deletions(-) diff --git a/kedro/config/common.py b/kedro/config/common.py index c3b2b12d33..0b5d2b05e9 100644 --- a/kedro/config/common.py +++ b/kedro/config/common.py @@ -5,7 +5,7 @@ import logging from glob import iglob from pathlib import Path -from typing import AbstractSet, Any, Dict, Iterable, List, Set, Union +from typing import AbstractSet, Any, Dict, Iterable, List, Set from warnings import warn from yaml.parser import ParserError @@ -71,15 +71,9 @@ def _get_config_from_patterns( f"or is not a valid directory: {conf_path}" ) - config_files = _lookup_config_filepaths(Path(conf_path), patterns) - filtered_config_files = set() - for path in config_files: - if path.is_file() and path.suffix in SUPPORTED_EXTENSIONS: - filtered_config_files.add(path) - config_filepaths = _process_files( - filtered_config_files, processed_files, _config_logger + config_filepaths = _lookup_config_filepaths( + Path(conf_path), patterns, processed_files, _config_logger ) - new_conf = _load_configs( config_filepaths=config_filepaths, ac_template=ac_template ) @@ -179,31 +173,20 @@ def _load_configs(config_filepaths: List[Path], ac_template: bool) -> Dict[str, def _lookup_config_filepaths( conf_path: Path, patterns: Iterable[str], -) -> Set[Union[Path, Any]]: - config_files = set() - conf_path = conf_path.resolve() - - for pattern in patterns: - # `Path.glob()` ignores the files if pattern ends with "**", - # therefore iglob is used instead - for each in iglob(str(conf_path / pattern), recursive=True): - path = Path(each).resolve() - config_files.add(path) - return config_files - - -def _process_files( - filtered_config_files, processed_files: Set[Path], logger: Any + processed_files: Set[Path], + logger: Any, ) -> List[Path]: - seen_files = filtered_config_files & processed_files + config_files = _path_lookup(conf_path, patterns) + + seen_files = config_files & processed_files if seen_files: logger.warning( "Config file(s): %s already processed, skipping loading...", ", ".join(str(seen) for seen in sorted(seen_files)), ) - filtered_config_files -= seen_files + config_files -= seen_files - return sorted(filtered_config_files) + return sorted(config_files) def _remove_duplicates(items: Iterable[str]): @@ -237,3 +220,29 @@ def _check_duplicate_keys( if duplicates: dup_str = "\n- ".join(duplicates) raise ValueError(f"Duplicate keys found in {filepath} and:\n- {dup_str}") + + +def _path_lookup(conf_path: Path, patterns: Iterable[str]) -> Set[Path]: + """Return a set of all configuration files from ``conf_path`` or + its subdirectories, which satisfy a given list of glob patterns. + + Args: + conf_path: Path to configuration directory. + patterns: List of glob patterns to match the filenames against. + + Returns: + A set of paths to configuration files. + + """ + config_files = set() + conf_path = conf_path.resolve() + + for pattern in patterns: + # `Path.glob()` ignores the files if pattern ends with "**", + # therefore iglob is used instead + for each in iglob(str(conf_path / pattern), recursive=True): + path = Path(each).resolve() + if path.is_file() and path.suffix in SUPPORTED_EXTENSIONS: + config_files.add(path) + + return config_files diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index 769c8b9fc3..4a7f02fadc 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -2,6 +2,7 @@ or more configuration files of yaml or json type from specified paths through OmegaConf. """ import logging +from glob import iglob from pathlib import Path from typing import AbstractSet, Any, Dict, Iterable, List, Set # noqa @@ -10,12 +11,7 @@ from yaml.scanner import ScannerError from kedro.config import AbstractConfigLoader, MissingConfigException -from kedro.config.common import ( - _check_duplicate_keys, - _lookup_config_filepaths, - _process_files, - _remove_duplicates, -) +from kedro.config.common import _check_duplicate_keys _config_logger = logging.getLogger(__name__) @@ -119,7 +115,31 @@ def __init__( ) def __getitem__(self, key): - return self.get(*self.config_patterns[key]) + # In the first iteration of the OmegaConfLoader we'll keep the resolver turned-off. + # It's easier to introduce them step by step, but removing them would be a breaking change. + _clear_omegaconf_resolvers() + + # 1. Load base env + base_path = str(Path(self.conf_source) / self.base_env) + try: + base_config = load_config(base_path, [*self.config_patterns[key]]) + except KeyError as exc: + raise KeyError("Key not found in patterns") + + # 2. Load other env + run_env = self.env or self.default_run_env + env_path = str(Path(self.conf_source) / run_env) + env_config = load_config(env_path, [*self.config_patterns[key]]) + + # TODO: 3. Destructive Merge the two env dirs + merged_config = dict(OmegaConf.merge(base_config, env_config)) + + if not merged_config: + raise MissingConfigException( + f"No files of YAML or JSON format found in {base_path} or {env_path} matching the glob " + f"pattern(s): {[*self.config_patterns[key]]}" + ) + return merged_config def __repr__(self): # pragma: no cover return ( @@ -127,91 +147,40 @@ def __repr__(self): # pragma: no cover f"config_patterns={self.config_patterns})" ) - @property - def conf_paths(self): - """Property method to return deduplicated configuration paths.""" - return _remove_duplicates(self._build_conf_paths()) - - def get(self, *patterns: str) -> Dict[str, Any]: # type: ignore - return _get_config_from_patterns_omegaconf( - conf_paths=self.conf_paths, patterns=list(patterns) - ) - - def _build_conf_paths(self) -> Iterable[str]: - run_env = self.env or self.default_run_env - return [ - str(Path(self.conf_source) / self.base_env), - str(Path(self.conf_source) / run_env), - ] +def is_valid_path(path): + if path.is_file() and path.suffix in [".yml", ".yaml", ".json"]: + return True + return False -def _get_config_from_patterns_omegaconf( - conf_paths: Iterable[str], - patterns: Iterable[str] = None, -) -> Dict[str, Any]: - """Recursively scan for configuration files, load and merge them using OmegaConf, and - return them in the form of a config dictionary. - - Args: - conf_paths: List of configuration paths to directories - patterns: Glob patterns to match. Files, which names match - any of the specified patterns, will be processed. - - Raises: - ValueError: If 2 or more configuration files inside the same - config path (or its subdirectories) contain the same - top-level key. - MissingConfigException: If no configuration files exist within - a specified config path. - ParserError: If configuration is poorly formatted and - cannot be loaded. - - Returns: - Dict[str, Any]: A Python dictionary with the combined - configuration from all configuration files. - """ +def load_config(conf_path: str, patterns: Iterable[str] = None): if not patterns: raise ValueError( "'patterns' must contain at least one glob " "pattern to match config filenames against." ) - # In the first iteration of the OmegaConfLoader we'll keep the resolver turned-off. - # It's easier to introduce them step by step, but removing them would be a breaking change. - _clear_omegaconf_resolvers() - - aggregate_config = [] - processed_files = set() # type: Set[Path] - - for conf_path in conf_paths: - if not Path(conf_path).is_dir(): - raise ValueError( - f"Given configuration path either does not exist " - f"or is not a valid directory: {conf_path}" - ) - - config_files = _lookup_config_filepaths(Path(conf_path), patterns) - filtered_config_files = set() - for path in config_files: - if path.is_file() and path.suffix in [".yml", ".yaml", ".json"]: - filtered_config_files.add(path) - - config_filepaths = _process_files( - filtered_config_files, processed_files, _config_logger + if not Path(conf_path).is_dir(): + raise ValueError( + f"Given configuration path either does not exist " + f"or is not a valid directory: {conf_path}" ) - new_conf = _load_configs_omegaconf(config_filepaths=config_filepaths) + # TODO: try to write cleaner + paths = [] + for pattern in patterns: + for each in iglob(f"{str(conf_path)}/{pattern}", recursive=True): + path = Path(each).resolve() + paths.append(path) - aggregate_config.append(new_conf) - processed_files |= set(config_filepaths) + # TODO: Should this be re-ordered? + deduplicated_paths = set(paths) + config_files = list(deduplicated_paths) + config_files_filtered = [path for path in config_files if is_valid_path(path)] - if not processed_files: - raise MissingConfigException( - f"No files of YAML or JSON format found in {conf_paths} matching the glob " - f"pattern(s): {patterns}" - ) - return dict(OmegaConf.merge(*aggregate_config)) + loaded_config = _load_configs_omegaconf(config_filepaths=config_files_filtered) + return loaded_config def _load_configs_omegaconf(config_filepaths: List[Path]) -> Dict[str, Any]: diff --git a/tests/config/test_omegaconf_config.py b/tests/config/test_omegaconf_config.py index 062343fb4d..25a930d109 100644 --- a/tests/config/test_omegaconf_config.py +++ b/tests/config/test_omegaconf_config.py @@ -108,8 +108,8 @@ def test_load_local_config(self, tmp_path): """Make sure that configs from `local/` override the ones from `base/`""" conf = OmegaConfLoader(str(tmp_path)) - params = conf.get("parameters*") - catalog = conf.get("catalog*") + params = conf.get("parameters") + catalog = conf.get("catalog") assert params["param1"] == 1 assert catalog["trains"]["type"] == "MemoryDataSet" @@ -121,7 +121,7 @@ def test_load_local_config(self, tmp_path): def test_load_base_config(self, tmp_path, base_config): """Test config loading if `local/` directory is empty""" (tmp_path / _DEFAULT_RUN_ENV).mkdir(exist_ok=True) - catalog = OmegaConfLoader(str(tmp_path)).get("catalog*.yml") + catalog = OmegaConfLoader(str(tmp_path))["catalog"] assert catalog == base_config @use_proj_catalog @@ -129,8 +129,8 @@ def test_duplicate_patterns(self, tmp_path, base_config): """Test config loading if the glob patterns cover the same file""" (tmp_path / _DEFAULT_RUN_ENV).mkdir(exist_ok=True) conf = OmegaConfLoader(str(tmp_path)) - catalog1 = conf.get("catalog*.yml", "catalog*.yml") - catalog2 = conf.get("catalog*.yml", "catalog.yml") + catalog1 = conf["catalog"] + catalog2 = conf["catalog"] assert catalog1 == catalog2 == base_config def test_subdirs_dont_exist(self, tmp_path, base_config): @@ -140,18 +140,22 @@ def test_subdirs_dont_exist(self, tmp_path, base_config): r"or is not a valid directory\: {}" ) with pytest.raises(ValueError, match=pattern.format(".*base")): - OmegaConfLoader(str(tmp_path)).get("catalog*") + OmegaConfLoader(str(tmp_path))["catalog"] with pytest.raises(ValueError, match=pattern.format(".*local")): proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" _write_yaml(proj_catalog, base_config) - OmegaConfLoader(str(tmp_path)).get("catalog*") + OmegaConfLoader(str(tmp_path))["catalog"] @pytest.mark.usefixtures("create_config_dir", "proj_catalog", "proj_catalog_nested") def test_nested(self, tmp_path): """Test loading the config from subdirectories""" config_loader = OmegaConfLoader(str(tmp_path)) - config_loader.default_run_env = "" - catalog = config_loader.get("catalog*", "catalog*/**") + config_loader.default_run_env = "prod" + + prod_catalog = tmp_path / "prod" / "catalog.yml" + _write_yaml(prod_catalog, {}) + + catalog = config_loader["catalog"] assert catalog.keys() == {"cars", "trains", "nested"} assert catalog["cars"]["type"] == "pandas.CSVDataSet" assert catalog["cars"]["save_args"]["index"] is True @@ -169,7 +173,7 @@ def test_nested_subdirs_duplicate(self, tmp_path, base_config): r"and\:\n\- .*nested\.yml\: cars, trains" ) with pytest.raises(ValueError, match=pattern): - OmegaConfLoader(str(tmp_path)).get("catalog*", "catalog*/**") + OmegaConfLoader(str(tmp_path))["catalog"] @use_config_dir def test_bad_config_syntax(self, tmp_path): @@ -179,7 +183,7 @@ def test_bad_config_syntax(self, tmp_path): pattern = f"Invalid YAML file {conf_path}" with pytest.raises(ParserError, match=re.escape(pattern)): - OmegaConfLoader(str(tmp_path)).get("catalog*.yml") + OmegaConfLoader(str(tmp_path))["catalog"] def test_lots_of_duplicates(self, tmp_path): data = {str(i): i for i in range(100)} @@ -189,7 +193,7 @@ def test_lots_of_duplicates(self, tmp_path): conf = OmegaConfLoader(str(tmp_path)) pattern = r"^Duplicate keys found in .*catalog2\.yml and\:\n\- .*catalog1\.yml\: .*\.\.\.$" with pytest.raises(ValueError, match=pattern): - conf.get("**/catalog*") + conf["catalog"] @use_config_dir def test_same_key_in_same_dir(self, tmp_path, base_config): @@ -203,46 +207,39 @@ def test_same_key_in_same_dir(self, tmp_path, base_config): r"and\:\n\- .*catalog\.json\: cars, trains" ) with pytest.raises(ValueError, match=pattern): - OmegaConfLoader(str(tmp_path)).get("catalog*") + OmegaConfLoader(str(tmp_path))["catalog"] @use_config_dir - def test_empty_patterns(self, tmp_path): - """Check the error if no config patterns were specified""" - pattern = ( - r"'patterns' must contain at least one glob pattern " - r"to match config filenames against" - ) - with pytest.raises(ValueError, match=pattern): - OmegaConfLoader(str(tmp_path)).get() + def test_pattern_key_not_found(self, tmp_path): + """Check the error if no config files satisfy a given pattern""" + pattern = "Key not found in patterns" + with pytest.raises(KeyError, match=pattern): + OmegaConfLoader(str(tmp_path))["non-existent-pattern"] + # TODO: write this test @use_config_dir def test_no_files_found(self, tmp_path): """Check the error if no config files satisfy a given pattern""" - pattern = ( - r"No files of YAML or JSON format found in " - r"\[\'.*base\', " - r"\'.*local\'\] " - r"matching the glob pattern\(s\): " - r"\[\'non\-existent\-pattern\'\]" - ) - with pytest.raises(MissingConfigException, match=pattern): - OmegaConfLoader(str(tmp_path)).get("non-existent-pattern") + pattern = "Key not found in patterns" + with pytest.raises(KeyError, match=pattern): + OmegaConfLoader(str(tmp_path))["non-existent-pattern"] @use_config_dir def test_cannot_load_non_yaml_or_json_files(self, tmp_path): + db_patterns = {"db": ["db*"]} db_config_path = tmp_path / _BASE_ENV / "db.ini" _write_dummy_ini(db_config_path) - conf = OmegaConfLoader(str(tmp_path)) + conf = OmegaConfLoader(str(tmp_path), config_patterns=db_patterns) pattern = ( r"No files of YAML or JSON format found in " - r"\[\'.*base\', " - r"\'.*local\'\] " + r".*base or " + r".*local " r"matching the glob pattern\(s\): " r"\[\'db\*\'\]" ) with pytest.raises(MissingConfigException, match=pattern): - conf.get("db*") + conf["db"] @use_config_dir def test_key_not_found_dict_get(self, tmp_path): @@ -256,8 +253,8 @@ def test_no_files_found_dict_get(self, tmp_path): """Check the error if no config files satisfy a given pattern""" pattern = ( r"No files of YAML or JSON format found in " - r"\[\'.*base\', " - r"\'.*local\'\] " + r".*base or " + r".*local " r"matching the glob pattern\(s\): " r"\[\'credentials\*\', \'credentials\*/\**\', \'\**/credentials\*\'\]" ) @@ -265,20 +262,6 @@ def test_no_files_found_dict_get(self, tmp_path): # pylint: disable=expression-not-assigned OmegaConfLoader(str(tmp_path))["credentials"] - def test_duplicate_paths(self, tmp_path, caplog): - """Check that trying to load the same environment config multiple times logs a - warning and skips the reload""" - _write_yaml(tmp_path / _BASE_ENV / "catalog.yml", {"env": _BASE_ENV, "a": "a"}) - config_loader = OmegaConfLoader(str(tmp_path), _BASE_ENV) - - with pytest.warns(UserWarning, match="Duplicate environment detected"): - config_paths = config_loader.conf_paths - assert config_paths == [str(tmp_path / _BASE_ENV)] - - config_loader.get("catalog*", "catalog*/**") - log_messages = [record.getMessage() for record in caplog.records] - assert not log_messages - def test_overlapping_patterns(self, tmp_path, caplog): """Check that same configuration file is not loaded more than once.""" _write_yaml( @@ -291,9 +274,11 @@ def test_overlapping_patterns(self, tmp_path, caplog): _write_yaml(tmp_path / "dev" / "user1" / "catalog2.yml", {"user1_c2": True}) _write_yaml(tmp_path / "dev" / "user1" / "catalog3.yml", {"user1_c3": True}) - catalog = OmegaConfLoader(str(tmp_path), "dev").get( - "catalog*", "catalog*/**", "user1/catalog2*", "../**/catalog2*" - ) + # catalog = OmegaConfLoader(str(tmp_path), "dev").get( + # "catalog*", "catalog*/**", "user1/catalog2*", "../**/catalog2*" + # ) + + catalog = OmegaConfLoader(str(tmp_path), "dev")["catalog"] expected_catalog = { "env": "dev", "common": "common", @@ -323,7 +308,7 @@ def test_yaml_parser_error(self, tmp_path): msg = f"Invalid YAML file {conf_path / 'catalog.yml'}, unable to read line 3, position 10." with pytest.raises(ParserError, match=re.escape(msg)): - OmegaConfLoader(str(tmp_path)).get("catalog*.yml") + OmegaConfLoader(str(tmp_path))["catalog"] def test_customised_config_patterns(self, tmp_path): config_loader = OmegaConfLoader( @@ -346,6 +331,7 @@ def test_customised_config_patterns(self, tmp_path): ] def test_merging_strategy_only_overwrites_specified_key(self, tmp_path): + mlflow_patterns = {"mlflow": ["mlflow*", "mlflow*/**", "**/mlflow*"]} base_mlflow = tmp_path / _BASE_ENV / "mlflow.yml" base_config = { "tracking": { @@ -368,7 +354,7 @@ def test_merging_strategy_only_overwrites_specified_key(self, tmp_path): _write_yaml(base_mlflow, base_config) _write_yaml(local_mlflow, local_config) - conf = OmegaConfLoader(str(tmp_path)).get("mlflow*") + conf = OmegaConfLoader(str(tmp_path), config_patterns=mlflow_patterns)["mlflow"] assert conf == { "tracking": { From 473c924a8a9fa419d7db7ca0e1910520fd90d254 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 12 Dec 2022 12:11:22 +0000 Subject: [PATCH 07/18] Merge load_omegaconf into load_conf method and clean up Signed-off-by: Merel Theisen --- kedro/config/omegaconf_config.py | 86 ++++++++++++++------------- tests/config/test_omegaconf_config.py | 7 +-- 2 files changed, 49 insertions(+), 44 deletions(-) diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index 4a7f02fadc..8ecd4dd9ec 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -119,27 +119,44 @@ def __getitem__(self, key): # It's easier to introduce them step by step, but removing them would be a breaking change. _clear_omegaconf_resolvers() - # 1. Load base env + # 1. Load base env config base_path = str(Path(self.conf_source) / self.base_env) try: - base_config = load_config(base_path, [*self.config_patterns[key]]) + base_config = load_and_merge_dir_config( + base_path, [*self.config_patterns[key]] + ) + config = base_config except KeyError as exc: raise KeyError("Key not found in patterns") - # 2. Load other env + # 2. Load chosen env config run_env = self.env or self.default_run_env env_path = str(Path(self.conf_source) / run_env) - env_config = load_config(env_path, [*self.config_patterns[key]]) + try: + env_config = load_and_merge_dir_config( + env_path, [*self.config_patterns[key]] + ) + except KeyError as exc: + raise KeyError("Key not found in patterns") + + # 3. Destructively merge the two env dirs. The chosen env will override base. + common_keys = config.keys() & env_config.keys() + if common_keys: + sorted_keys = ", ".join(sorted(common_keys)) + msg = ( + "Config from path '%s' will override the following " + "existing top-level config keys: %s" + ) + _config_logger.info(msg, env_path, sorted_keys) - # TODO: 3. Destructive Merge the two env dirs - merged_config = dict(OmegaConf.merge(base_config, env_config)) + config.update(env_config) - if not merged_config: + if not config: raise MissingConfigException( - f"No files of YAML or JSON format found in {base_path} or {env_path} matching the glob " - f"pattern(s): {[*self.config_patterns[key]]}" + f"No files of YAML or JSON format found in {base_path} or {env_path} matching" + f" the glob pattern(s): {[*self.config_patterns[key]]}" ) - return merged_config + return config def __repr__(self): # pragma: no cover return ( @@ -154,15 +171,25 @@ def is_valid_path(path): return False -def load_config(conf_path: str, patterns: Iterable[str] = None): - if not patterns: - raise ValueError( - "'patterns' must contain at least one glob " - "pattern to match config filenames against." - ) +def load_and_merge_dir_config(conf_path: str, patterns: Iterable[str] = None): + """Recursively load and merge all configuration files in a directory using OmegaConf, + which satisfy a given list of glob patterns from a specific path. + + Args: + conf_path: Path to configuration directory. + patterns: List of glob patterns to match the filenames against. + + Raises: + MissingConfigException: If configuration path doesn't exist or isn't valid. + ValueError: If two or more configuration files contain the same key(s). + ParserError: If config file contains invalid YAML or JSON syntax. + + Returns: + Resulting configuration dictionary. + """ if not Path(conf_path).is_dir(): - raise ValueError( + raise MissingConfigException( f"Given configuration path either does not exist " f"or is not a valid directory: {conf_path}" ) @@ -179,32 +206,11 @@ def load_config(conf_path: str, patterns: Iterable[str] = None): config_files = list(deduplicated_paths) config_files_filtered = [path for path in config_files if is_valid_path(path)] - loaded_config = _load_configs_omegaconf(config_filepaths=config_files_filtered) - return loaded_config - - -def _load_configs_omegaconf(config_filepaths: List[Path]) -> Dict[str, Any]: - """Recursively load and merge all configuration files using OmegaConf, which satisfy - a given list of glob patterns from a specific path. - - Args: - config_filepaths: Configuration files sorted in the order of precedence. - - Raises: - ValueError: If 2 or more configuration files contain the same key(s). - BadConfigException: If configuration is poorly formatted and - cannot be loaded. - - Returns: - Resulting configuration dictionary. - - """ - config = {} aggregate_config = [] seen_file_to_keys = {} # type: Dict[Path, AbstractSet[str]] - for config_filepath in config_filepaths: + for config_filepath in config_files_filtered: try: single_config = OmegaConf.load(config_filepath) config = single_config @@ -213,7 +219,7 @@ def _load_configs_omegaconf(config_filepaths: List[Path]) -> Dict[str, Any]: line = exc.problem_mark.line cursor = exc.problem_mark.column raise ParserError( - f"Invalid YAML file {config_filepath}, unable to read line {line}, " + f"Invalid YAML or JSON file {config_filepath}, unable to read line {line}, " f"position {cursor}." ) from exc diff --git a/tests/config/test_omegaconf_config.py b/tests/config/test_omegaconf_config.py index 25a930d109..a00c6b8458 100644 --- a/tests/config/test_omegaconf_config.py +++ b/tests/config/test_omegaconf_config.py @@ -1,3 +1,4 @@ +# pylint: disable=expression-not-assigned, pointless-statement import configparser import json import re @@ -139,9 +140,9 @@ def test_subdirs_dont_exist(self, tmp_path, base_config): r"Given configuration path either does not exist " r"or is not a valid directory\: {}" ) - with pytest.raises(ValueError, match=pattern.format(".*base")): + with pytest.raises(MissingConfigException, match=pattern.format(".*base")): OmegaConfLoader(str(tmp_path))["catalog"] - with pytest.raises(ValueError, match=pattern.format(".*local")): + with pytest.raises(MissingConfigException, match=pattern.format(".*local")): proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" _write_yaml(proj_catalog, base_config) OmegaConfLoader(str(tmp_path))["catalog"] @@ -245,7 +246,6 @@ def test_cannot_load_non_yaml_or_json_files(self, tmp_path): def test_key_not_found_dict_get(self, tmp_path): """Check the error if no config files satisfy a given pattern""" with pytest.raises(KeyError): - # pylint: disable=expression-not-assigned OmegaConfLoader(str(tmp_path))["non-existent-pattern"] @use_config_dir @@ -259,7 +259,6 @@ def test_no_files_found_dict_get(self, tmp_path): r"\[\'credentials\*\', \'credentials\*/\**\', \'\**/credentials\*\'\]" ) with pytest.raises(MissingConfigException, match=pattern): - # pylint: disable=expression-not-assigned OmegaConfLoader(str(tmp_path))["credentials"] def test_overlapping_patterns(self, tmp_path, caplog): From 808f335703e78fa7aedce4c32d78fd9220de6a5e Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 12 Dec 2022 13:23:44 +0000 Subject: [PATCH 08/18] Fix tests Signed-off-by: Merel Theisen --- kedro/config/omegaconf_config.py | 4 +- tests/config/test_omegaconf_config.py | 68 +++++++++++++-------------- 2 files changed, 33 insertions(+), 39 deletions(-) diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index 8ecd4dd9ec..3960d62fdb 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -201,10 +201,8 @@ def load_and_merge_dir_config(conf_path: str, patterns: Iterable[str] = None): path = Path(each).resolve() paths.append(path) - # TODO: Should this be re-ordered? deduplicated_paths = set(paths) - config_files = list(deduplicated_paths) - config_files_filtered = [path for path in config_files if is_valid_path(path)] + config_files_filtered = [path for path in deduplicated_paths if is_valid_path(path)] config = {} aggregate_config = [] diff --git a/tests/config/test_omegaconf_config.py b/tests/config/test_omegaconf_config.py index a00c6b8458..0f56a0f927 100644 --- a/tests/config/test_omegaconf_config.py +++ b/tests/config/test_omegaconf_config.py @@ -170,8 +170,9 @@ def test_nested_subdirs_duplicate(self, tmp_path, base_config): _write_yaml(nested, base_config) pattern = ( - r"Duplicate keys found in .*catalog\.yml " - r"and\:\n\- .*nested\.yml\: cars, trains" + r"Duplicate keys found in " + r"(.*catalog\.yml and\:\n\- .*nested\.yml|.*nested\.yml and\:\n\- .*catalog\.yml)" + r"\: cars, trains" ) with pytest.raises(ValueError, match=pattern): OmegaConfLoader(str(tmp_path))["catalog"] @@ -182,7 +183,7 @@ def test_bad_config_syntax(self, tmp_path): conf_path.mkdir(parents=True, exist_ok=True) (conf_path / "catalog.yml").write_text("bad:\nconfig") - pattern = f"Invalid YAML file {conf_path}" + pattern = f"Invalid YAML or JSON file {conf_path}" with pytest.raises(ParserError, match=re.escape(pattern)): OmegaConfLoader(str(tmp_path))["catalog"] @@ -192,7 +193,11 @@ def test_lots_of_duplicates(self, tmp_path): _write_yaml(tmp_path / _BASE_ENV / "catalog2.yml", data) conf = OmegaConfLoader(str(tmp_path)) - pattern = r"^Duplicate keys found in .*catalog2\.yml and\:\n\- .*catalog1\.yml\: .*\.\.\.$" + pattern = ( + r"Duplicate keys found in " + r"(.*catalog2\.yml and\:\n\- .*catalog1\.yml|.*catalog1\.yml and\:\n\- .*catalog2\.yml)" + r"\: .*\.\.\.$" + ) with pytest.raises(ValueError, match=pattern): conf["catalog"] @@ -204,8 +209,9 @@ def test_same_key_in_same_dir(self, tmp_path, base_config): _write_json(dup_json, base_config) pattern = ( - r"Duplicate keys found in .*catalog\.yml " - r"and\:\n\- .*catalog\.json\: cars, trains" + r"Duplicate keys found in " + r"(.*catalog\.yml and\:\n\- .*catalog\.json|.*catalog\.json and\:\n\- .*catalog\.yml)" + r"\: cars, trains" ) with pytest.raises(ValueError, match=pattern): OmegaConfLoader(str(tmp_path))["catalog"] @@ -217,14 +223,6 @@ def test_pattern_key_not_found(self, tmp_path): with pytest.raises(KeyError, match=pattern): OmegaConfLoader(str(tmp_path))["non-existent-pattern"] - # TODO: write this test - @use_config_dir - def test_no_files_found(self, tmp_path): - """Check the error if no config files satisfy a given pattern""" - pattern = "Key not found in patterns" - with pytest.raises(KeyError, match=pattern): - OmegaConfLoader(str(tmp_path))["non-existent-pattern"] - @use_config_dir def test_cannot_load_non_yaml_or_json_files(self, tmp_path): db_patterns = {"db": ["db*"]} @@ -243,13 +241,7 @@ def test_cannot_load_non_yaml_or_json_files(self, tmp_path): conf["db"] @use_config_dir - def test_key_not_found_dict_get(self, tmp_path): - """Check the error if no config files satisfy a given pattern""" - with pytest.raises(KeyError): - OmegaConfLoader(str(tmp_path))["non-existent-pattern"] - - @use_config_dir - def test_no_files_found_dict_get(self, tmp_path): + def test_no_files_found(self, tmp_path): """Check the error if no config files satisfy a given pattern""" pattern = ( r"No files of YAML or JSON format found in " @@ -261,7 +253,7 @@ def test_no_files_found_dict_get(self, tmp_path): with pytest.raises(MissingConfigException, match=pattern): OmegaConfLoader(str(tmp_path))["credentials"] - def test_overlapping_patterns(self, tmp_path, caplog): + def test_overlapping_patterns(self, tmp_path, mocker): """Check that same configuration file is not loaded more than once.""" _write_yaml( tmp_path / _BASE_ENV / "catalog0.yml", @@ -271,13 +263,19 @@ def test_overlapping_patterns(self, tmp_path, caplog): tmp_path / "dev" / "catalog1.yml", {"env": "dev", "dev_specific": "wiz"} ) _write_yaml(tmp_path / "dev" / "user1" / "catalog2.yml", {"user1_c2": True}) - _write_yaml(tmp_path / "dev" / "user1" / "catalog3.yml", {"user1_c3": True}) - # catalog = OmegaConfLoader(str(tmp_path), "dev").get( - # "catalog*", "catalog*/**", "user1/catalog2*", "../**/catalog2*" - # ) + catalog_patterns = { + "catalog": [ + "catalog*", + "catalog*/**", + "../**/user1/catalog2*", + "../**/catalog2*", + ] + } - catalog = OmegaConfLoader(str(tmp_path), "dev")["catalog"] + catalog = OmegaConfLoader( + conf_source=str(tmp_path), env="dev", config_patterns=catalog_patterns + )["catalog"] expected_catalog = { "env": "dev", "common": "common", @@ -286,12 +284,9 @@ def test_overlapping_patterns(self, tmp_path, caplog): } assert catalog == expected_catalog - log_messages = [record.getMessage() for record in caplog.records] + mocked_load = mocker.patch("omegaconf.OmegaConf.load") expected_path = (tmp_path / "dev" / "user1" / "catalog2.yml").resolve() - expected_message = ( - f"Config file(s): {expected_path} already processed, skipping loading..." - ) - assert expected_message in log_messages + assert mocked_load.called_once_with(expected_path) def test_yaml_parser_error(self, tmp_path): conf_path = tmp_path / _BASE_ENV @@ -305,7 +300,10 @@ def test_yaml_parser_error(self, tmp_path): (conf_path / "catalog.yml").write_text(example_catalog) - msg = f"Invalid YAML file {conf_path / 'catalog.yml'}, unable to read line 3, position 10." + msg = ( + f"Invalid YAML or JSON file {conf_path / 'catalog.yml'}, unable to read" + f" line 3, position 10." + ) with pytest.raises(ParserError, match=re.escape(msg)): OmegaConfLoader(str(tmp_path))["catalog"] @@ -329,7 +327,7 @@ def test_customised_config_patterns(self, tmp_path): "**/params*", ] - def test_merging_strategy_only_overwrites_specified_key(self, tmp_path): + def test_destructive_merging_strategy(self, tmp_path): mlflow_patterns = {"mlflow": ["mlflow*", "mlflow*/**", "**/mlflow*"]} base_mlflow = tmp_path / _BASE_ENV / "mlflow.yml" base_config = { @@ -357,10 +355,8 @@ def test_merging_strategy_only_overwrites_specified_key(self, tmp_path): assert conf == { "tracking": { - "disable_tracking": {"pipelines": "[on_exit_notification]"}, "experiment": { "name": "name-of-prod-experiment", }, - "params": {"long_params_strategy": "tag"}, } } From 4a26f16cd72586e358f2cc9a330dfae4d913c650 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 12 Dec 2022 16:42:25 +0000 Subject: [PATCH 09/18] Clean up code and docstring after refactoring Signed-off-by: Merel Theisen --- kedro/config/omegaconf_config.py | 80 ++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index 3960d62fdb..e3efb7a23d 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -4,7 +4,7 @@ import logging from glob import iglob from pathlib import Path -from typing import AbstractSet, Any, Dict, Iterable, List, Set # noqa +from typing import AbstractSet, Any, Dict, Iterable, List # noqa from omegaconf import OmegaConf from yaml.parser import ParserError @@ -26,14 +26,12 @@ class OmegaConfLoader(AbstractConfigLoader): ``conf_source``. The optional ``env`` argument can be used to specify a subdirectory of ``conf_source`` to process as a config path after ``base``. - When the same top-level key appears in any 2 config files located in + When the same top-level key appears in any two config files located in the same (sub)directory, a ``ValueError`` is raised. - When the same key appears in any 2 config files located in different + When the same key appears in any two config files located in different (sub)directories, the last processed config path takes precedence - and overrides this key. You can find more information about how ``OmegaConf`` - does merging of configuration in their documentation - https://omegaconf.readthedocs.io/en/2.2_branch/usage.html#merging-configurations + and overrides this key and any sub-keys. You can access the different configurations as follows: :: @@ -93,9 +91,8 @@ def __init__( base_env: Name of the base environment. Defaults to `"base"`. This is used in the `conf_paths` property method to construct the configuration paths. - default_run_env: Name of the base environment. Defaults to `"local"`. - This is used in the `conf_paths` property method to construct - the configuration paths. Can be overriden by supplying the `env` argument. + default_run_env: Name of the default run environment. Defaults to `"local"`. + Can be overridden by supplying the `env` argument. """ self.base_env = base_env self.default_run_env = default_run_env @@ -114,12 +111,29 @@ def __init__( runtime_params=runtime_params, ) - def __getitem__(self, key): + def __getitem__(self, key) -> Dict[str, Any]: + """Get configuration files by key, load and merge them, and + return them in the form of a config dictionary. + + Args: + key: Key of the configuration type to fetch. + + Raises: + KeyError: If key provided isn't present in the config_patterns of this + OmegaConfLoader instance. + MissingConfigException: If no configuration files exist matching the patterns + mapped to the provided key. + + Returns: + Dict[str, Any]: A Python dictionary with the combined + configuration from all configuration files. Configuration files will + """ + # In the first iteration of the OmegaConfLoader we'll keep the resolver turned-off. # It's easier to introduce them step by step, but removing them would be a breaking change. _clear_omegaconf_resolvers() - # 1. Load base env config + # Load base env config base_path = str(Path(self.conf_source) / self.base_env) try: base_config = load_and_merge_dir_config( @@ -127,19 +141,14 @@ def __getitem__(self, key): ) config = base_config except KeyError as exc: - raise KeyError("Key not found in patterns") + raise KeyError("Key not found in patterns") from exc - # 2. Load chosen env config + # Load chosen env config run_env = self.env or self.default_run_env env_path = str(Path(self.conf_source) / run_env) - try: - env_config = load_and_merge_dir_config( - env_path, [*self.config_patterns[key]] - ) - except KeyError as exc: - raise KeyError("Key not found in patterns") + env_config = load_and_merge_dir_config(env_path, [*self.config_patterns[key]]) - # 3. Destructively merge the two env dirs. The chosen env will override base. + # Destructively merge the two env dirs. The chosen env will override base. common_keys = config.keys() & env_config.keys() if common_keys: sorted_keys = ", ".join(sorted(common_keys)) @@ -165,13 +174,7 @@ def __repr__(self): # pragma: no cover ) -def is_valid_path(path): - if path.is_file() and path.suffix in [".yml", ".yaml", ".json"]: - return True - return False - - -def load_and_merge_dir_config(conf_path: str, patterns: Iterable[str] = None): +def load_and_merge_dir_config(conf_path: str, patterns: Iterable[str]): """Recursively load and merge all configuration files in a directory using OmegaConf, which satisfy a given list of glob patterns from a specific path. @@ -194,15 +197,15 @@ def load_and_merge_dir_config(conf_path: str, patterns: Iterable[str] = None): f"or is not a valid directory: {conf_path}" ) - # TODO: try to write cleaner - paths = [] - for pattern in patterns: - for each in iglob(f"{str(conf_path)}/{pattern}", recursive=True): - path = Path(each).resolve() - paths.append(path) - + paths = [ + Path(each).resolve() + for pattern in patterns + for each in iglob(f"{str(conf_path)}/{pattern}", recursive=True) + ] deduplicated_paths = set(paths) - config_files_filtered = [path for path in deduplicated_paths if is_valid_path(path)] + config_files_filtered = [ + path for path in deduplicated_paths if _is_valid_config_path(path) + ] config = {} aggregate_config = [] @@ -240,3 +243,10 @@ def _clear_omegaconf_resolvers(): OmegaConf.clear_resolver("oc.select") OmegaConf.clear_resolver("oc.dict.keys") OmegaConf.clear_resolver("oc.dict.values") + + +def _is_valid_config_path(path): + """Check if given path is a file path and file type is yaml or json.""" + if path.is_file() and path.suffix in [".yml", ".yaml", ".json"]: + return True + return False From 94cca6666ffe732ea452157b2dc93ac25d185069 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 12 Dec 2022 17:00:52 +0000 Subject: [PATCH 10/18] Make test purpose clearer Signed-off-by: Merel Theisen --- tests/config/test_omegaconf_config.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tests/config/test_omegaconf_config.py b/tests/config/test_omegaconf_config.py index 0f56a0f927..298ec08fb5 100644 --- a/tests/config/test_omegaconf_config.py +++ b/tests/config/test_omegaconf_config.py @@ -92,7 +92,7 @@ def proj_catalog_nested(tmp_path): class TestOmegaConfLoader: @use_config_dir - def test_load_core_config_dict_get(self, tmp_path): + def test_load_core_config_dict_syntax(self, tmp_path): """Make sure core config can be fetched with a dict [] access.""" conf = OmegaConfLoader(str(tmp_path)) params = conf["parameters"] @@ -100,20 +100,27 @@ def test_load_core_config_dict_get(self, tmp_path): assert params["param1"] == 1 assert catalog["trains"]["type"] == "MemoryDataSet" - assert catalog["cars"]["type"] == "pandas.CSVDataSet" - assert catalog["boats"]["type"] == "MemoryDataSet" - assert not catalog["cars"]["save_args"]["index"] @use_config_dir - def test_load_local_config(self, tmp_path): - """Make sure that configs from `local/` override the ones - from `base/`""" + def test_load_core_config_get_syntax(self, tmp_path): + """Make sure core config can be fetched with .get()""" conf = OmegaConfLoader(str(tmp_path)) params = conf.get("parameters") catalog = conf.get("catalog") assert params["param1"] == 1 assert catalog["trains"]["type"] == "MemoryDataSet" + + @use_config_dir + def test_load_local_config_overrides_base(self, tmp_path): + """Make sure that configs from `local/` override the ones + from `base/`""" + conf = OmegaConfLoader(str(tmp_path)) + params = conf["parameters"] + catalog = conf["catalog"] + + assert params["param1"] == 1 + assert catalog["trains"]["type"] == "MemoryDataSet" assert catalog["cars"]["type"] == "pandas.CSVDataSet" assert catalog["boats"]["type"] == "MemoryDataSet" assert not catalog["cars"]["save_args"]["index"] From ed0b82b3789ae8f53ee6c35975e4f326c6e1207e Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 12 Dec 2022 17:11:38 +0000 Subject: [PATCH 11/18] Fix lint Signed-off-by: Merel Theisen --- docs/source/kedro_project_setup/dependencies.md | 12 ++++++------ docs/source/nodes_and_pipelines/micro_packaging.md | 2 +- kedro/config/omegaconf_config.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/kedro_project_setup/dependencies.md b/docs/source/kedro_project_setup/dependencies.md index 08d58ece9b..271d96a2a1 100644 --- a/docs/source/kedro_project_setup/dependencies.md +++ b/docs/source/kedro_project_setup/dependencies.md @@ -5,8 +5,8 @@ Both `pip install kedro` and `conda install -c conda-forge kedro` install the co When you create a project, you then introduce additional dependencies for the tasks it performs. ## Project-specific dependencies -You can specify a project's exact dependencies in the `src/requirements.txt` file to make it easier for you and others to run your project in the future, -and to avoid version conflicts downstream. This can be achieved with the help of [`pip-tools`](https://pypi.org/project/pip-tools/). +You can specify a project's exact dependencies in the `src/requirements.txt` file to make it easier for you and others to run your project in the future, +and to avoid version conflicts downstream. This can be achieved with the help of [`pip-tools`](https://pypi.org/project/pip-tools/). To install `pip-tools` in your virtual environment, run the following command: ```bash pip install pip-tools @@ -18,10 +18,10 @@ To add or remove dependencies to a project, edit the `src/requirements.txt` file pip-compile --output-file=/src/requirements.txt --input-file=/src/requirements.txt ``` -This will [pip compile](https://github.com/jazzband/pip-tools#example-usage-for-pip-compile) the requirements listed in -the `src/requirements.txt` file into a `src/requirements.lock` that specifies a list of pinned project dependencies -(those with a strict version). You can also use this command with additional CLI arguments such as `--generate-hashes` -to use `pip`'s Hash Checking Mode or `--upgrade-package` to update specific packages to the latest or specific versions. +This will [pip compile](https://github.com/jazzband/pip-tools#example-usage-for-pip-compile) the requirements listed in +the `src/requirements.txt` file into a `src/requirements.lock` that specifies a list of pinned project dependencies +(those with a strict version). You can also use this command with additional CLI arguments such as `--generate-hashes` +to use `pip`'s Hash Checking Mode or `--upgrade-package` to update specific packages to the latest or specific versions. [Check out the `pip-tools` documentation](https://pypi.org/project/pip-tools/) for more information. ```{note} diff --git a/docs/source/nodes_and_pipelines/micro_packaging.md b/docs/source/nodes_and_pipelines/micro_packaging.md index c9da24beab..f034b14204 100644 --- a/docs/source/nodes_and_pipelines/micro_packaging.md +++ b/docs/source/nodes_and_pipelines/micro_packaging.md @@ -71,7 +71,7 @@ You can pull a micro-package from a tar file by executing `kedro micropkg pull < * To place parameters from a different config environment, run `kedro micropkg pull --env ` * Unit tests in `src/tests/` * Kedro will also parse any requirements packaged with the micro-package and add them to project level `requirements.in`. -* It is advised to compile an updated list of requirements after pulling a micro-package using [`pip-compile`](https://pypi.org/project/pip-tools/). +* It is advised to compile an updated list of requirements after pulling a micro-package using [`pip-compile`](https://pypi.org/project/pip-tools/). ```{note} If a micro-package has embedded requirements and a project `requirements.in` file does not already exist, it will be generated based on the project `requirements.txt` before appending the micro-package requirements. diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index e3efb7a23d..9801fec098 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -126,7 +126,7 @@ def __getitem__(self, key) -> Dict[str, Any]: Returns: Dict[str, Any]: A Python dictionary with the combined - configuration from all configuration files. Configuration files will + configuration from all configuration files. """ # In the first iteration of the OmegaConfLoader we'll keep the resolver turned-off. From 761f92f09920db8be22142c0993b496405d2848c Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Wed, 14 Dec 2022 11:27:58 +0000 Subject: [PATCH 12/18] Make methods class methods + allow for directly setting of config on loader instance Signed-off-by: Merel Theisen --- kedro/config/omegaconf_config.py | 156 +++++++++++++------------- tests/config/test_omegaconf_config.py | 19 ++++ 2 files changed, 100 insertions(+), 75 deletions(-) diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index 9801fec098..7df7c8f84f 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -129,14 +129,19 @@ def __getitem__(self, key) -> Dict[str, Any]: configuration from all configuration files. """ + # Allow bypassing of loading config from patterns if a key and value have been set + # explicitly on the ``OmegaConfLoader`` instance. + if key in self: + return super().__getitem__(key) + # In the first iteration of the OmegaConfLoader we'll keep the resolver turned-off. # It's easier to introduce them step by step, but removing them would be a breaking change. - _clear_omegaconf_resolvers() + self._clear_omegaconf_resolvers() # Load base env config base_path = str(Path(self.conf_source) / self.base_env) try: - base_config = load_and_merge_dir_config( + base_config = self.load_and_merge_dir_config( base_path, [*self.config_patterns[key]] ) config = base_config @@ -146,7 +151,9 @@ def __getitem__(self, key) -> Dict[str, Any]: # Load chosen env config run_env = self.env or self.default_run_env env_path = str(Path(self.conf_source) / run_env) - env_config = load_and_merge_dir_config(env_path, [*self.config_patterns[key]]) + env_config = self.load_and_merge_dir_config( + env_path, [*self.config_patterns[key]] + ) # Destructively merge the two env dirs. The chosen env will override base. common_keys = config.keys() & env_config.keys() @@ -156,7 +163,7 @@ def __getitem__(self, key) -> Dict[str, Any]: "Config from path '%s' will override the following " "existing top-level config keys: %s" ) - _config_logger.info(msg, env_path, sorted_keys) + _config_logger.debug(msg, env_path, sorted_keys) config.update(env_config) @@ -173,80 +180,79 @@ def __repr__(self): # pragma: no cover f"config_patterns={self.config_patterns})" ) + def load_and_merge_dir_config(self, conf_path: str, patterns: Iterable[str]): + """Recursively load and merge all configuration files in a directory using OmegaConf, + which satisfy a given list of glob patterns from a specific path. -def load_and_merge_dir_config(conf_path: str, patterns: Iterable[str]): - """Recursively load and merge all configuration files in a directory using OmegaConf, - which satisfy a given list of glob patterns from a specific path. + Args: + conf_path: Path to configuration directory. + patterns: List of glob patterns to match the filenames against. - Args: - conf_path: Path to configuration directory. - patterns: List of glob patterns to match the filenames against. + Raises: + MissingConfigException: If configuration path doesn't exist or isn't valid. + ValueError: If two or more configuration files contain the same key(s). + ParserError: If config file contains invalid YAML or JSON syntax. - Raises: - MissingConfigException: If configuration path doesn't exist or isn't valid. - ValueError: If two or more configuration files contain the same key(s). - ParserError: If config file contains invalid YAML or JSON syntax. + Returns: + Resulting configuration dictionary. - Returns: - Resulting configuration dictionary. + """ + if not Path(conf_path).is_dir(): + raise MissingConfigException( + f"Given configuration path either does not exist " + f"or is not a valid directory: {conf_path}" + ) - """ - if not Path(conf_path).is_dir(): - raise MissingConfigException( - f"Given configuration path either does not exist " - f"or is not a valid directory: {conf_path}" - ) + paths = [ + Path(each).resolve() + for pattern in patterns + for each in iglob(f"{str(conf_path)}/{pattern}", recursive=True) + ] + deduplicated_paths = set(paths) + config_files_filtered = [ + path for path in deduplicated_paths if self._is_valid_config_path(path) + ] + + config = {} + aggregate_config = [] + seen_file_to_keys = {} # type: Dict[Path, AbstractSet[str]] + + for config_filepath in config_files_filtered: + try: + single_config = OmegaConf.load(config_filepath) + config = single_config + except (ParserError, ScannerError) as exc: + assert exc.problem_mark is not None + line = exc.problem_mark.line + cursor = exc.problem_mark.column + raise ParserError( + f"Invalid YAML or JSON file {config_filepath}, unable to read line {line}, " + f"position {cursor}." + ) from exc + + _check_duplicate_keys( + seen_file_to_keys, config_filepath, dict(single_config) + ) + seen_file_to_keys[config_filepath] = single_config.keys() + aggregate_config.append(single_config) - paths = [ - Path(each).resolve() - for pattern in patterns - for each in iglob(f"{str(conf_path)}/{pattern}", recursive=True) - ] - deduplicated_paths = set(paths) - config_files_filtered = [ - path for path in deduplicated_paths if _is_valid_config_path(path) - ] - - config = {} - aggregate_config = [] - seen_file_to_keys = {} # type: Dict[Path, AbstractSet[str]] - - for config_filepath in config_files_filtered: - try: - single_config = OmegaConf.load(config_filepath) - config = single_config - except (ParserError, ScannerError) as exc: - assert exc.problem_mark is not None - line = exc.problem_mark.line - cursor = exc.problem_mark.column - raise ParserError( - f"Invalid YAML or JSON file {config_filepath}, unable to read line {line}, " - f"position {cursor}." - ) from exc - - _check_duplicate_keys(seen_file_to_keys, config_filepath, dict(single_config)) - seen_file_to_keys[config_filepath] = single_config.keys() - aggregate_config.append(single_config) - - if len(aggregate_config) > 1: - return dict(OmegaConf.merge(*aggregate_config)) - - return config - - -def _clear_omegaconf_resolvers(): - """Clear the built-in OmegaConf resolvers.""" - OmegaConf.clear_resolver("oc.env") - OmegaConf.clear_resolver("oc.create") - OmegaConf.clear_resolver("oc.deprecated") - OmegaConf.clear_resolver("oc.decode") - OmegaConf.clear_resolver("oc.select") - OmegaConf.clear_resolver("oc.dict.keys") - OmegaConf.clear_resolver("oc.dict.values") - - -def _is_valid_config_path(path): - """Check if given path is a file path and file type is yaml or json.""" - if path.is_file() and path.suffix in [".yml", ".yaml", ".json"]: - return True - return False + if len(aggregate_config) > 1: + return dict(OmegaConf.merge(*aggregate_config)) + + return config + + @staticmethod + def _is_valid_config_path(path): + """Check if given path is a file path and file type is yaml or json.""" + return path.is_file() and path.suffix in [".yml", ".yaml", ".json"] + + @staticmethod + def _clear_omegaconf_resolvers(): + """Clear the built-in OmegaConf resolvers.""" + OmegaConf.clear_resolver("oc.env") + OmegaConf.clear_resolver("oc.create") + OmegaConf.clear_resolver("oc.deprecated") + OmegaConf.clear_resolver("oc.decode") + OmegaConf.clear_resolver("oc.select") + OmegaConf.clear_resolver("oc.dict.keys") + OmegaConf.clear_resolver("oc.dict.values") diff --git a/tests/config/test_omegaconf_config.py b/tests/config/test_omegaconf_config.py index 298ec08fb5..01548cafd0 100644 --- a/tests/config/test_omegaconf_config.py +++ b/tests/config/test_omegaconf_config.py @@ -367,3 +367,22 @@ def test_destructive_merging_strategy(self, tmp_path): }, } } + + @use_config_dir + def test_adding_extra_keys_to_confloader(self, tmp_path): + """Make sure extra keys can be added directly to the config loader instance.""" + conf = OmegaConfLoader(str(tmp_path)) + catalog = conf["catalog"] + conf["spark"] = {"spark_config": "emr.blabla"} + + assert catalog["trains"]["type"] == "MemoryDataSet" + assert conf["spark"] == {"spark_config": "emr.blabla"} + + @use_config_dir + def test_bypass_catalog_config_loading(self, tmp_path): + """Make sure core config loading can be bypassed by setting the key and values + directly on the config loader instance.""" + conf = OmegaConfLoader(str(tmp_path)) + conf["catalog"] = {"catalog_config": "something_new"} + + assert conf["catalog"] == {"catalog_config": "something_new"} From 635f003b3747d58b8d69315cb02767eb13cba951 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Wed, 14 Dec 2022 17:00:13 +0000 Subject: [PATCH 13/18] Allow for directly setting config on loader instance for CL + TCL Signed-off-by: Merel Theisen --- kedro/config/config.py | 4 ++++ kedro/config/templated_config.py | 4 ++++ tests/config/test_config.py | 19 +++++++++++++++++++ tests/config/test_templated_config.py | 22 ++++++++++++++++++++++ 4 files changed, 49 insertions(+) diff --git a/kedro/config/config.py b/kedro/config/config.py index 09105700e9..6f45e830d9 100644 --- a/kedro/config/config.py +++ b/kedro/config/config.py @@ -108,6 +108,10 @@ def __init__( ) def __getitem__(self, key): + # Allow bypassing of loading config from patterns if a key and value have been set + # explicitly on the ``ConfigLoader`` instance. + if key in self: + return super().__getitem__(key) return self.get(*self.config_patterns[key]) def __repr__(self): # pragma: no cover diff --git a/kedro/config/templated_config.py b/kedro/config/templated_config.py index 3d2c85dc07..3468bf10dc 100644 --- a/kedro/config/templated_config.py +++ b/kedro/config/templated_config.py @@ -145,6 +145,10 @@ def __init__( self._config_mapping = {**self._config_mapping, **globals_dict} def __getitem__(self, key): + # Allow bypassing of loading config from patterns if a key and value have been set + # explicitly on the ``TemplatedConfigLoader`` instance. + if key in self: + return super().__getitem__(key) return self.get(*self.config_patterns[key]) def __repr__(self): # pragma: no cover diff --git a/tests/config/test_config.py b/tests/config/test_config.py index 88edc9e881..0c141168c9 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -352,3 +352,22 @@ def test_customised_config_patterns(self, tmp_path): "params*/**", "**/params*", ] + + @use_config_dir + def test_adding_extra_keys_to_confloader(self, tmp_path): + """Make sure extra keys can be added directly to the config loader instance.""" + conf = ConfigLoader(str(tmp_path)) + catalog = conf["catalog"] + conf["spark"] = {"spark_config": "emr.blabla"} + + assert catalog["trains"]["type"] == "MemoryDataSet" + assert conf["spark"] == {"spark_config": "emr.blabla"} + + @use_config_dir + def test_bypass_catalog_config_loading(self, tmp_path): + """Make sure core config loading can be bypassed by setting the key and values + directly on the config loader instance.""" + conf = ConfigLoader(str(tmp_path)) + conf["catalog"] = {"catalog_config": "something_new"} + + assert conf["catalog"] == {"catalog_config": "something_new"} diff --git a/tests/config/test_templated_config.py b/tests/config/test_templated_config.py index a78c0dc926..2dd4bb6b6c 100644 --- a/tests/config/test_templated_config.py +++ b/tests/config/test_templated_config.py @@ -480,3 +480,25 @@ def test_customised_patterns(self, tmp_path): "**/catalog*", ] assert config_loader.config_patterns["spark"] == ["spark*/"] + + @pytest.mark.usefixtures("proj_catalog_param") + def test_adding_extra_keys_to_confloader(self, tmp_path, template_config): + """Make sure extra keys can be added directly to the config loader instance.""" + config_loader = TemplatedConfigLoader( + str(tmp_path), globals_dict=template_config + ) + config_loader.default_run_env = "" + catalog = config_loader["catalog"] + config_loader["spark"] = {"spark_config": "emr.blabla"} + + assert catalog["boats"]["type"] == "SparkDataSet" + assert config_loader["spark"] == {"spark_config": "emr.blabla"} + + @pytest.mark.usefixtures("proj_catalog_param") + def test_bypass_catalog_config_loading(self, tmp_path): + """Make sure core config loading can be bypassed by setting the key and values + directly on the config loader instance.""" + conf = TemplatedConfigLoader(str(tmp_path)) + conf["catalog"] = {"catalog_config": "something_new"} + + assert conf["catalog"] == {"catalog_config": "something_new"} From e9d42e3520a9b525f7ea2e988ea914f6be1f9ce6 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Thu, 15 Dec 2022 13:05:49 +0000 Subject: [PATCH 14/18] Address comments from review Signed-off-by: Merel Theisen --- kedro/config/omegaconf_config.py | 30 +++++++++++++-------------- tests/config/test_omegaconf_config.py | 5 +++-- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index 7df7c8f84f..6e63b798f2 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -105,6 +105,10 @@ def __init__( } self.config_patterns.update(config_patterns or {}) + # In the first iteration of the OmegaConfLoader we'll keep the resolver turned-off. + # It's easier to introduce them step by step, but removing them would be a breaking change. + self._clear_omegaconf_resolvers() + super().__init__( conf_source=conf_source, env=env, @@ -134,26 +138,21 @@ def __getitem__(self, key) -> Dict[str, Any]: if key in self: return super().__getitem__(key) - # In the first iteration of the OmegaConfLoader we'll keep the resolver turned-off. - # It's easier to introduce them step by step, but removing them would be a breaking change. - self._clear_omegaconf_resolvers() + if key not in self.config_patterns: + raise KeyError( + f"No config patterns were found for '{key}' in your config loader" + ) + patterns = [*self.config_patterns[key]] # Load base env config base_path = str(Path(self.conf_source) / self.base_env) - try: - base_config = self.load_and_merge_dir_config( - base_path, [*self.config_patterns[key]] - ) - config = base_config - except KeyError as exc: - raise KeyError("Key not found in patterns") from exc + base_config = self.load_and_merge_dir_config(base_path, patterns) + config = base_config # Load chosen env config run_env = self.env or self.default_run_env env_path = str(Path(self.conf_source) / run_env) - env_config = self.load_and_merge_dir_config( - env_path, [*self.config_patterns[key]] - ) + env_config = self.load_and_merge_dir_config(env_path, patterns) # Destructively merge the two env dirs. The chosen env will override base. common_keys = config.keys() & env_config.keys() @@ -222,9 +221,8 @@ def load_and_merge_dir_config(self, conf_path: str, patterns: Iterable[str]): single_config = OmegaConf.load(config_filepath) config = single_config except (ParserError, ScannerError) as exc: - assert exc.problem_mark is not None - line = exc.problem_mark.line - cursor = exc.problem_mark.column + line = exc.problem_mark.line # type: ignore + cursor = exc.problem_mark.column # type: ignore raise ParserError( f"Invalid YAML or JSON file {config_filepath}, unable to read line {line}, " f"position {cursor}." diff --git a/tests/config/test_omegaconf_config.py b/tests/config/test_omegaconf_config.py index 01548cafd0..f4f3b2ba68 100644 --- a/tests/config/test_omegaconf_config.py +++ b/tests/config/test_omegaconf_config.py @@ -226,9 +226,10 @@ def test_same_key_in_same_dir(self, tmp_path, base_config): @use_config_dir def test_pattern_key_not_found(self, tmp_path): """Check the error if no config files satisfy a given pattern""" - pattern = "Key not found in patterns" + key = "non-existent-pattern" + pattern = f"No config patterns were found for '{key}' in your config loader" with pytest.raises(KeyError, match=pattern): - OmegaConfLoader(str(tmp_path))["non-existent-pattern"] + OmegaConfLoader(str(tmp_path))[key] @use_config_dir def test_cannot_load_non_yaml_or_json_files(self, tmp_path): From b7e4d09e3e13d3718925bc9ef97031f99cd0be27 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 19 Dec 2022 12:42:20 +0100 Subject: [PATCH 15/18] Rewrite check duplicate to check all files + add test Signed-off-by: Merel Theisen --- kedro/config/omegaconf_config.py | 42 ++++++++++++++++++++------- tests/config/test_omegaconf_config.py | 40 +++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index 6e63b798f2..b470bcdf7e 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -4,14 +4,13 @@ import logging from glob import iglob from pathlib import Path -from typing import AbstractSet, Any, Dict, Iterable, List # noqa +from typing import Any, Dict, Iterable, List, Set # noqa from omegaconf import OmegaConf from yaml.parser import ParserError from yaml.scanner import ScannerError from kedro.config import AbstractConfigLoader, MissingConfigException -from kedro.config.common import _check_duplicate_keys _config_logger = logging.getLogger(__name__) @@ -213,12 +212,12 @@ def load_and_merge_dir_config(self, conf_path: str, patterns: Iterable[str]): ] config = {} - aggregate_config = [] - seen_file_to_keys = {} # type: Dict[Path, AbstractSet[str]] + config_per_file = {} for config_filepath in config_files_filtered: try: single_config = OmegaConf.load(config_filepath) + config_per_file[config_filepath] = single_config config = single_config except (ParserError, ScannerError) as exc: line = exc.problem_mark.line # type: ignore @@ -228,12 +227,11 @@ def load_and_merge_dir_config(self, conf_path: str, patterns: Iterable[str]): f"position {cursor}." ) from exc - _check_duplicate_keys( - seen_file_to_keys, config_filepath, dict(single_config) - ) - seen_file_to_keys[config_filepath] = single_config.keys() - aggregate_config.append(single_config) - + seen_file_to_keys = { + file: set(config.keys()) for file, config in config_per_file.items() + } + aggregate_config = config_per_file.values() + self._check_duplicates(seen_file_to_keys) if len(aggregate_config) > 1: return dict(OmegaConf.merge(*aggregate_config)) @@ -244,6 +242,30 @@ def _is_valid_config_path(path): """Check if given path is a file path and file type is yaml or json.""" return path.is_file() and path.suffix in [".yml", ".yaml", ".json"] + @staticmethod + def _check_duplicates(seen_files_to_keys: Dict[Path, Set[Any]]): + duplicates = [] + + filepaths = list(seen_files_to_keys.keys()) + for i, key1 in enumerate(filepaths, 1): + config1 = seen_files_to_keys[key1] + for key2 in filepaths[i:]: + config2 = seen_files_to_keys[key2] + + overlapping_keys = config1 & config2 + + if overlapping_keys: + sorted_keys = ", ".join(sorted(overlapping_keys)) + if len(sorted_keys) > 100: + sorted_keys = sorted_keys[:100] + "..." + duplicates.append( + f"Duplicate keys found in {key1} and {key2}: {sorted_keys}" + ) + + if duplicates: + dup_str = "\n".join(duplicates) + raise ValueError(f"{dup_str}") + @staticmethod def _clear_omegaconf_resolvers(): """Clear the built-in OmegaConf resolvers.""" diff --git a/tests/config/test_omegaconf_config.py b/tests/config/test_omegaconf_config.py index f4f3b2ba68..aebb0e3221 100644 --- a/tests/config/test_omegaconf_config.py +++ b/tests/config/test_omegaconf_config.py @@ -178,12 +178,46 @@ def test_nested_subdirs_duplicate(self, tmp_path, base_config): pattern = ( r"Duplicate keys found in " - r"(.*catalog\.yml and\:\n\- .*nested\.yml|.*nested\.yml and\:\n\- .*catalog\.yml)" + r"(.*catalog\.yml and .*nested\.yml|.*nested\.yml and .*catalog\.yml)" r"\: cars, trains" ) with pytest.raises(ValueError, match=pattern): OmegaConfLoader(str(tmp_path))["catalog"] + @use_config_dir + def test_multiple_nested_subdirs_duplicates( + self, tmp_path, base_config, local_config + ): + """Check the error when several config files from subdirectories contain + duplicate keys""" + nested = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" + _write_yaml(nested, base_config) + + local = tmp_path / _BASE_ENV / "catalog" / "dir" / "local.yml" + _write_yaml(local, local_config) + + pattern_catalog_nested = ( + r"Duplicate keys found in " + r"(.*catalog\.yml and .*nested\.yml|.*nested\.yml and .*catalog\.yml)" + r"\: cars, trains" + ) + pattern_catalog_local = ( + r"Duplicate keys found in " + r"(.*catalog\.yml and .*local\.yml|.*local\.yml and .*catalog\.yml)" + r"\: cars" + ) + pattern_nested_local = ( + r"Duplicate keys found in " + r"(.*nested\.yml and .*local\.yml|.*local\.yml and .*nested\.yml)" + r"\: cars" + ) + + with pytest.raises(ValueError) as exc: + OmegaConfLoader(str(tmp_path))["catalog"] + assert re.search(pattern_catalog_nested, str(exc.value)) + assert re.search(pattern_catalog_local, str(exc.value)) + assert re.search(pattern_nested_local, str(exc.value)) + @use_config_dir def test_bad_config_syntax(self, tmp_path): conf_path = tmp_path / _BASE_ENV @@ -202,7 +236,7 @@ def test_lots_of_duplicates(self, tmp_path): conf = OmegaConfLoader(str(tmp_path)) pattern = ( r"Duplicate keys found in " - r"(.*catalog2\.yml and\:\n\- .*catalog1\.yml|.*catalog1\.yml and\:\n\- .*catalog2\.yml)" + r"(.*catalog2\.yml and .*catalog1\.yml|.*catalog1\.yml and .*catalog2\.yml)" r"\: .*\.\.\.$" ) with pytest.raises(ValueError, match=pattern): @@ -217,7 +251,7 @@ def test_same_key_in_same_dir(self, tmp_path, base_config): pattern = ( r"Duplicate keys found in " - r"(.*catalog\.yml and\:\n\- .*catalog\.json|.*catalog\.json and\:\n\- .*catalog\.yml)" + r"(.*catalog\.yml and .*catalog\.json|.*catalog\.json and .*catalog\.yml)" r"\: cars, trains" ) with pytest.raises(ValueError, match=pattern): From 11c9ac14740d4ec5d37322346733237b87470899 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 19 Dec 2022 12:45:04 +0100 Subject: [PATCH 16/18] Update release notes Signed-off-by: Merel Theisen --- RELEASE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE.md b/RELEASE.md index 1fac6d2830..65ca804627 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -11,6 +11,7 @@ # Upcoming Release 0.18.5 ## Major features and improvements +* Add new `OmegaConfLoader` which uses `OmegaConf` for loading and merging configuration. ## Bug fixes and other changes * Fix bug where `micropkg` manifest section in `pyproject.toml` isn't recognised as allowed configuration. From 51e9e9d675b5dc7a1ac111eb1e83b91b5f1839eb Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Mon, 19 Dec 2022 12:55:32 +0100 Subject: [PATCH 17/18] Fix lint Signed-off-by: Merel Theisen --- RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index 65ca804627..3fd4c264a0 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -11,7 +11,7 @@ # Upcoming Release 0.18.5 ## Major features and improvements -* Add new `OmegaConfLoader` which uses `OmegaConf` for loading and merging configuration. +* Add new `OmegaConfLoader` which uses `OmegaConf` for loading and merging configuration. ## Bug fixes and other changes * Fix bug where `micropkg` manifest section in `pyproject.toml` isn't recognised as allowed configuration. From 9ea525abf51f82fc2498c343c8c18e4043de5ec1 Mon Sep 17 00:00:00 2001 From: Merel Theisen Date: Tue, 20 Dec 2022 12:09:09 +0100 Subject: [PATCH 18/18] Address review comments + bump omegaconf version Signed-off-by: Merel Theisen --- dependency/requirements.txt | 2 +- kedro/config/omegaconf_config.py | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/dependency/requirements.txt b/dependency/requirements.txt index 55eb6b8e89..34ce5336ef 100644 --- a/dependency/requirements.txt +++ b/dependency/requirements.txt @@ -10,7 +10,7 @@ importlib-metadata>=3.6; python_version >= '3.8' importlib_metadata>=3.6, <5.0; python_version < '3.8' # The "selectable" entry points were introduced in `importlib_metadata` 3.6 and Python 3.10. Bandit on Python 3.7 relies on a library with `importlib_metadata` < 5.0 importlib_resources>=1.3 # The `files()` API was introduced in `importlib_resources` 1.3 and Python 3.9. jmespath>=0.9.5, <1.0 -omegaconf~=2.2 +omegaconf~=2.3 pip-tools~=6.11 pluggy~=1.0.0 PyYAML>=4.2, <7.0 diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index b470bcdf7e..ce4efe386f 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -73,8 +73,8 @@ def __init__( conf_source: str, env: str = None, runtime_params: Dict[str, Any] = None, - config_patterns: Dict[str, List[str]] = None, *, + config_patterns: Dict[str, List[str]] = None, base_env: str = "base", default_run_env: str = "local", ): @@ -211,14 +211,12 @@ def load_and_merge_dir_config(self, conf_path: str, patterns: Iterable[str]): path for path in deduplicated_paths if self._is_valid_config_path(path) ] - config = {} config_per_file = {} for config_filepath in config_files_filtered: try: - single_config = OmegaConf.load(config_filepath) - config_per_file[config_filepath] = single_config - config = single_config + config = OmegaConf.load(config_filepath) + config_per_file[config_filepath] = config except (ParserError, ScannerError) as exc: line = exc.problem_mark.line # type: ignore cursor = exc.problem_mark.column # type: ignore @@ -232,10 +230,12 @@ def load_and_merge_dir_config(self, conf_path: str, patterns: Iterable[str]): } aggregate_config = config_per_file.values() self._check_duplicates(seen_file_to_keys) - if len(aggregate_config) > 1: - return dict(OmegaConf.merge(*aggregate_config)) - return config + if aggregate_config: + if len(aggregate_config) > 1: + return dict(OmegaConf.merge(*aggregate_config)) + return list(aggregate_config)[0] + return {} @staticmethod def _is_valid_config_path(path): @@ -247,10 +247,10 @@ def _check_duplicates(seen_files_to_keys: Dict[Path, Set[Any]]): duplicates = [] filepaths = list(seen_files_to_keys.keys()) - for i, key1 in enumerate(filepaths, 1): - config1 = seen_files_to_keys[key1] - for key2 in filepaths[i:]: - config2 = seen_files_to_keys[key2] + for i, filepath1 in enumerate(filepaths, 1): + config1 = seen_files_to_keys[filepath1] + for filepath2 in filepaths[i:]: + config2 = seen_files_to_keys[filepath2] overlapping_keys = config1 & config2 @@ -259,7 +259,7 @@ def _check_duplicates(seen_files_to_keys: Dict[Path, Set[Any]]): if len(sorted_keys) > 100: sorted_keys = sorted_keys[:100] + "..." duplicates.append( - f"Duplicate keys found in {key1} and {key2}: {sorted_keys}" + f"Duplicate keys found in {filepath1} and {filepath2}: {sorted_keys}" ) if duplicates: