Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[KED-2565] Register DataCatalog in settings instead of through the registration hook. #1064

Merged
merged 11 commits into from
Dec 6, 2021
4 changes: 3 additions & 1 deletion RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,17 @@
* Removed the `--parallel` flag from `kedro run` in favour of `--runner=ParallelRunner`. The `-p` flag is now an alias for `--pipeline`.
* Removed deprecated `CONF_SOURCE`, `package_name`, `pipeline`, `pipelines`, and `io` attributes from `KedroContext` as well as the deprecated `KedroContext.run` method.
* Changed the behaviour of `kedro build-reqs` to compile requirements from `requirements.txt` instead of `requirements.in` and save them to `requirements.lock` instead of `requirements.txt`.
* Removed `ProjectHooks.register_catalog` `hook_spec` in favour of loading `DATA_CATALOG_CLASS` directly from `settings.py`. The default option for `DATA_CATALOG_CLASS` is now set to `kedro.io.DataCatalog`.

## Thanks for supporting contributions

[Deepyaman Datta](https://github.com/deepyaman), [Lucas Jamar](https://github.com/lucasjamar), [Simon Brugman](https://github.com/sbrugman)

## Migration guide from Kedro 0.17.* to 0.18.*
* Please remove any existing `hook_impl` of the `register_config_loader` method from `ProjectHooks` (or custom alternatives).
* Please remove any existing `hook_impl` of the `register_config_loader` and `register_catalog` methods from `ProjectHooks` (or custom alternatives).
* Populate `settings.py` with `CONFIG_LOADER_CLASS` set to your expected config loader class (for example `kedro.config.TemplatedConfigLoader` or custom implementation). If `CONFIG_LOADER_CLASS` value is not set, it will default to `kedro.config.ConfigLoader` at runtime.
* Populate `settings.py` with `CONFIG_LOADER_ARGS` set to a dictionary with expected keyword arguments. If `CONFIG_LOADER_ARGS` is not set, it will default to an empty dictionary.
* Populate `settings.py` with `DATA_CATALOG_CLASS` set to your expected data catalog class. If `DATA_CATALOG_CLASS` value is not set, it will default to `kedro.io.DataCatalog` at runtime.
* Optional: You can now remove all `params:` prefix when supplying values to `parameters` argument in a `pipeline()` call.
* If you're using `pandas.ExcelDataSet`, make sure you have `openpyxl` installed in your environment. Note that this is automatically pulled if you specify `kedro[pandas.ExcelDataSet]==0.18.0` in your `requirements.in`. You can uninstall `xlrd` if you were only using it for this dataset.
* If you're using `pandas.ParquetDataSet`, please pass pandas saving arguments directly to `save_args` instead of nested in `from_pandas` (e.g. `save_args = {"preserve_index": False}` instead of `save_args = {"from_pandas": {"preserve_index": False}}`).
Expand Down
3 changes: 1 addition & 2 deletions docs/source/extend_kedro/hooks.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,9 @@ The naming convention for error hooks is `on_<noun>_error`, in which:

#### Registration Hooks

In addition, Kedro defines Hook specifications to register certain library components to be used with the project. This is where users can define their custom class implementations. Currently, the following Hook specifications are provided:
In addition, Kedro defines Hook specifications to register certain library components to be used with the project. This is where users can define their custom class implementations. Currently, the following Hook specification is provided:

* `register_pipelines`
* `register_catalog`

The naming convention for registration hooks is `register_<library_component>`.

Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,5 @@
"""Project hooks."""
from typing import Any, Dict, Optional

from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog


class ProjectHooks:
@hook_impl
def register_catalog(
self,
catalog: Optional[Dict[str, Dict[str, Any]]],
credentials: Dict[str, Dict[str, Any]],
load_versions: Dict[str, str],
save_version: str,
) -> DataCatalog:
return DataCatalog.from_config(
catalog, credentials, load_versions, save_version
)
pass
21 changes: 11 additions & 10 deletions kedro/framework/context/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from kedro.config import ConfigLoader, MissingConfigException
from kedro.framework.hooks import get_hook_manager
from kedro.framework.project import settings
from kedro.io import DataCatalog
from kedro.pipeline.pipeline import _transcode_split

Expand Down Expand Up @@ -267,18 +268,18 @@ def _get_catalog(
)
conf_creds = self._get_config_credentials()

hook_manager = get_hook_manager()
catalog = hook_manager.hook.register_catalog( # pylint: disable=no-member
catalog=conf_catalog,
credentials=conf_creds,
load_versions=load_versions,
save_version=save_version,
)
if not isinstance(catalog, DataCatalog):
try:
catalog = settings.DATA_CATALOG_CLASS.from_config(
catalog=conf_catalog,
credentials=conf_creds,
load_versions=load_versions,
save_version=save_version,
)
except (TypeError, AttributeError) as exc:
raise KedroContextError(
f"Expected an instance of `DataCatalog`, "
f"got `{type(catalog).__name__}` instead."
)
f"got `{type(settings.DATA_CATALOG_CLASS)}` instead."
) from exc

feed_dict = self._get_feed_dict()
catalog.add_feed_dict(feed_dict)
Expand Down
15 changes: 0 additions & 15 deletions kedro/framework/hooks/specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,18 +297,3 @@ def register_pipelines(self) -> Dict[str, Pipeline]:

"""
pass

@hook_spec(firstresult=True)
def register_catalog( # pylint: disable=too-many-arguments
self,
catalog: Optional[Dict[str, Dict[str, Any]]],
credentials: Dict[str, Dict[str, Any]],
load_versions: Dict[str, str],
save_version: str,
) -> DataCatalog:
"""Hook to be invoked to register a project's data catalog.

Returns:
An instance of a ``DataCatalog``.
"""
pass
4 changes: 4 additions & 0 deletions kedro/framework/project/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ class _ProjectSettings(LazySettings):
"CONFIG_LOADER_CLASS", default=_get_default_class("kedro.config.ConfigLoader")
)
_CONFIG_LOADER_ARGS = Validator("CONFIG_LOADER_ARGS", default={})
_DATA_CATALOG_CLASS = Validator(
"DATA_CATALOG_CLASS", default=_get_default_class("kedro.io.DataCatalog")
)

def __init__(self, *args, **kwargs):

Expand All @@ -76,6 +79,7 @@ def __init__(self, *args, **kwargs):
self._DISABLE_HOOKS_FOR_PLUGINS,
self._CONFIG_LOADER_CLASS,
self._CONFIG_LOADER_ARGS,
self._DATA_CATALOG_CLASS,
]
)
super().__init__(*args, **kwargs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,4 @@


class ProjectHooks:
@hook_impl
def register_catalog(
self,
catalog: Optional[Dict[str, Dict[str, Any]]],
credentials: Dict[str, Dict[str, Any]],
load_versions: Dict[str, str],
save_version: str,
) -> DataCatalog:
return DataCatalog.from_config(
catalog, credentials, load_versions, save_version
)
pass
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,8 @@
# Define the configuration folder. Defaults to `conf`
# CONF_SOURCE = "conf"

# Select the project ConfigLoader class here.
# Define the project ConfigLoader class here.
# Defaults to kedro.config.ConfigLoader
# Define the config loader. Defaults to ConfigLoader.
# from kedro.config import TemplatedConfigLoader
# CONFIG_LOADER_CLASS = TemplatedConfigLoader

Expand All @@ -35,3 +34,7 @@
# "base_env": "base",
# "default_run_env": "local",
# }

# Define the project DataCatalog class here.
# Defaults to kedro.io.DataCatalog
# DATA_CATALOG_CLASS = DataCatalog
30 changes: 17 additions & 13 deletions tests/framework/context/test_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,19 @@
from kedro.config import ConfigLoader, MissingConfigException
from kedro.framework.context import KedroContext
from kedro.framework.context.context import (
KedroContextError,
_convert_paths_to_absolute_posix,
_is_relative_path,
_update_nested_dict,
_validate_layers_for_transcoding,
)
from kedro.framework.hooks import get_hook_manager, hook_impl
from kedro.framework.hooks import get_hook_manager
from kedro.framework.project import (
Validator,
_ProjectSettings,
configure_project,
pipelines,
)
from kedro.io import DataCatalog

MOCK_PACKAGE_NAME = "mock_package_name"

Expand Down Expand Up @@ -145,23 +145,21 @@ def prepare_project_dir(tmp_path, base_config, local_config, local_logging_confi
_write_toml(tmp_path / "pyproject.toml", pyproject_toml_payload)


class RegistrationHooks:
@hook_impl
def register_catalog(
self, catalog, credentials, load_versions, save_version
) -> DataCatalog:
return DataCatalog.from_config(
catalog, credentials, load_versions, save_version
)
class BrokenSettings(_ProjectSettings):
_DATA_CATALOG_CLASS = Validator("DATA_CATALOG_CLASS", default="it breaks")


class MockSettings(_ProjectSettings):
_HOOKS = Validator("HOOKS", default=(RegistrationHooks(),))
@pytest.fixture
def broken_settings(mocker):
mocked_settings = BrokenSettings()
mocker.patch("kedro.framework.session.session.settings", mocked_settings)
mocker.patch("kedro.framework.context.context.settings", mocked_settings)
return mocker.patch("kedro.framework.project.settings", mocked_settings)


@pytest.fixture(autouse=True)
def mock_settings(mocker):
mocked_settings = MockSettings()
mocked_settings = _ProjectSettings()
mocker.patch("kedro.framework.session.session.settings", mocked_settings)
return mocker.patch("kedro.framework.project.settings", mocked_settings)

Expand Down Expand Up @@ -271,6 +269,12 @@ def test_catalog(self, dummy_context, dummy_dataframe):
reloaded_df = dummy_context.catalog.load("cars")
assert_frame_equal(reloaded_df, dummy_dataframe)

# pylint: disable=unused-argument
def test_broken_catalog(self, broken_settings, dummy_context):
pattern = f"Expected an instance of `DataCatalog`, got `{type('')}` instead."
with pytest.raises(KedroContextError, match=re.escape(pattern)):
_ = dummy_context.catalog

@pytest.mark.parametrize(
"extra_params",
[None, {}, {"foo": "bar", "baz": [1, 2], "qux": None}],
Expand Down
23 changes: 1 addition & 22 deletions tests/framework/session/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from logging.handlers import QueueHandler, QueueListener
from multiprocessing import Queue
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List

import pandas as pd
import pytest
Expand Down Expand Up @@ -345,27 +345,6 @@ def after_dataset_saved(self, dataset_name: str, data: Any) -> None:
"After dataset saved", extra={"dataset_name": dataset_name, "data": data}
)

@hook_impl
def register_catalog(
self,
catalog: Optional[Dict[str, Dict[str, Any]]],
credentials: Dict[str, Dict[str, Any]],
load_versions: Dict[str, str],
save_version: str,
) -> DataCatalog:
logger.info(
"Registering catalog",
extra={
"catalog": catalog,
"credentials": credentials,
"load_versions": load_versions,
"save_version": save_version,
},
)
return DataCatalog.from_config(
catalog, credentials, load_versions, save_version
)


@pytest.fixture
def project_hooks():
Expand Down
56 changes: 0 additions & 56 deletions tests/framework/session/test_session_registration_hooks.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
import logging
import re
from typing import Any, Dict, Optional

import pytest
from dynaconf.validator import Validator

from kedro.framework.context import KedroContextError
from kedro.framework.hooks import hook_impl
from kedro.framework.project import _ProjectSettings, pipelines
from kedro.framework.session import KedroSession
from kedro.io import DataCatalog
from tests.framework.session.conftest import (
_assert_hook_call_record_has_expected_parameters,
_mock_imported_settings_paths,
Expand Down Expand Up @@ -53,32 +50,6 @@ def mock_settings_duplicate_hooks(mocker, project_hooks, pipeline_registration_h
)


class RequiredRegistrationHooks:
"""Mandatory registration hooks"""

@hook_impl
def register_catalog(
self,
catalog: Optional[Dict[str, Dict[str, Any]]],
credentials: Dict[str, Dict[str, Any]],
load_versions: Dict[str, str],
save_version: str,
) -> DataCatalog:
return DataCatalog.from_config( # pragma: no cover
catalog, credentials, load_versions, save_version
)


@pytest.fixture
def mock_settings_broken_catalog_hooks(mocker):
class BrokenCatalogHooks(RequiredRegistrationHooks):
@hook_impl
def register_catalog(self): # pylint: disable=arguments-differ
return None

return _mock_settings_with_hooks(mocker, hooks=(BrokenCatalogHooks(),))


@pytest.fixture
def mock_session(
mock_settings_with_pipeline_hooks, mock_package_name, tmp_path
Expand Down Expand Up @@ -114,23 +85,6 @@ def test_register_pipelines_is_called(
}
assert pipelines == expected_pipelines

def test_register_catalog_is_called(self, mock_session, caplog):
context = mock_session.load_context()
catalog = context.catalog
assert isinstance(catalog, DataCatalog)

relevant_records = [
r for r in caplog.records if r.getMessage() == "Registering catalog"
]
assert len(relevant_records) == 1

record = relevant_records[0]
assert record.catalog.keys() == {"cars", "boats"}
assert record.credentials == {"dev_s3": "foo"}
# save_version is only passed during a run, not on the property getter
assert record.save_version is None
assert record.load_versions is None


class TestDuplicatePipelineRegistration:
"""Test to make sure that if pipelines are defined in both registration hooks
Expand All @@ -150,13 +104,3 @@ def test_register_pipelines_with_duplicate_entries(
)
with pytest.warns(UserWarning, match=re.escape(pattern)):
assert pipelines == expected_pipelines


class TestBrokenRegistrationHooks:
@pytest.mark.usefixtures("mock_settings_broken_catalog_hooks")
def test_broken_register_catalog_hook(self, tmp_path, mock_package_name):
pattern = "Expected an instance of `DataCatalog`, got `NoneType` instead."
with KedroSession.create(mock_package_name, tmp_path) as session:
context = session.load_context()
with pytest.raises(KedroContextError, match=re.escape(pattern)):
_ = context.catalog