Update kedro catalog list command to account for dataset factories #2793

Merged · 17 commits · Jul 27, 2023
Changes from 14 commits
1 change: 1 addition & 0 deletions RELEASE.md
@@ -16,6 +16,7 @@

## Bug fixes and other changes
* Consolidated dependencies and optional dependencies in `pyproject.toml`.
* Updated `kedro catalog list` to show datasets generated with factories.
* Pin `pip<23.2` for CI due to a breaking change. See https://github.com/kedro-org/kedro/pull/2813

## Documentation changes
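To make the changelog entry above concrete: a dataset factory is a patterned catalog entry that stands in for many concrete datasets. A minimal sketch, assuming a hypothetical "{name}_csv" pattern (the config style mirrors the test fixture further down):

from kedro.io import DataCatalog

# Hypothetical catalog config: the "{name}_csv" key is a factory pattern,
# not a concrete dataset. A pipeline dataset called "companies_csv" is
# resolved against it instead of falling back to DefaultDataset.
catalog_config = {
    "{name}_csv": {
        "type": "pandas.CSVDataSet",
        "filepath": "data/01_raw/{name}.csv",
    },
}
catalog = DataCatalog.from_config(catalog_config)

With this PR, `kedro catalog list` reports such matches in a new "Datasets generated from factories" section, keyed by the resolved dataset type.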
29 changes: 25 additions & 4 deletions kedro/framework/cli/catalog.py
@@ -1,5 +1,6 @@
"""A collection of CLI commands for working with Kedro catalog."""
from collections import defaultdict
from itertools import chain

import click
import yaml
@@ -32,7 +33,7 @@ def catalog():
"""Commands for working with catalog."""


-# noqa: too-many-locals
+# noqa: too-many-locals,protected-access
@catalog.command("list")
@env_option
@click.option(
@@ -50,10 +51,15 @@ def list_datasets(metadata: ProjectMetadata, pipeline, env):
title = "Datasets in '{}' pipeline"
not_mentioned = "Datasets not mentioned in pipeline"
mentioned = "Datasets mentioned in pipeline"
factories = "Datasets generated from factories"

session = _create_session(metadata.package_name, env=env)
context = session.load_context()
-datasets_meta = context.catalog._data_sets  # noqa: protected-access
-catalog_ds = set(context.catalog.list())
+data_catalog = context.catalog
+datasets_meta = data_catalog._data_sets
+catalog_ds = set(data_catalog.list())

target_pipelines = pipeline or pipelines.keys()
@@ -73,15 +79,30 @@ def list_datasets(metadata: ProjectMetadata, pipeline, env):
default_ds = pipeline_ds - catalog_ds
used_ds = catalog_ds - unused_ds

# resolve any factory datasets in the pipeline
factory_ds_by_type = defaultdict(list)
for ds_name in default_ds:
matched_pattern = data_catalog._match_pattern(
data_catalog._dataset_patterns, ds_name
)
if matched_pattern:
ds_config = data_catalog._resolve_config(ds_name, matched_pattern)
factory_ds_by_type[ds_config["type"]].append(ds_name)

default_ds = default_ds - set(chain.from_iterable(factory_ds_by_type.values()))

unused_by_type = _map_type_to_datasets(unused_ds, datasets_meta)
used_by_type = _map_type_to_datasets(used_ds, datasets_meta)

if default_ds:
used_by_type["DefaultDataset"].extend(default_ds)

-data = ((not_mentioned, dict(unused_by_type)), (mentioned, dict(used_by_type)))
+data = (
+    (mentioned, dict(used_by_type)),
+    (factories, dict(factory_ds_by_type)),
+    (not_mentioned, dict(unused_by_type)),
+)
result[title.format(pipe)] = {key: value for key, value in data if value}

secho(yaml.dump(result))


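The resolution step above relies on the catalog's private helpers (_match_pattern and _resolve_config). As a rough, self-contained sketch of the idea rather than kedro's actual implementation, matching a dataset name against "{}"-style patterns and substituting the captured fields into the config can be done with the third-party parse library (an assumption here, though kedro's factories are believed to build on it):

from typing import Optional

from parse import parse


def match_pattern(patterns: dict, dataset_name: str) -> Optional[str]:
    """Return the first pattern that the dataset name matches, if any."""
    for pattern in patterns:
        if parse(pattern, dataset_name):
            return pattern
    return None


def resolve_config(pattern: str, config: dict, dataset_name: str) -> dict:
    """Substitute the fields captured from the name into the config values."""
    fields = parse(pattern, dataset_name).named
    return {
        key: value.format(**fields) if isinstance(value, str) else value
        for key, value in config.items()
    }


patterns = {"{name}_csv": {"type": "pandas.CSVDataSet", "filepath": "data/{name}.csv"}}
matched = match_pattern(patterns, "companies_csv")  # -> "{name}_csv"
print(resolve_config(matched, patterns[matched], "companies_csv"))
# {'type': 'pandas.CSVDataSet', 'filepath': 'data/companies.csv'}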
53 changes: 53 additions & 0 deletions tests/framework/cli/test_catalog.py
@@ -30,6 +30,18 @@ def mock_pipelines(mocker):
return mocker.patch("kedro.framework.cli.catalog.pipelines", dummy_pipelines)


@pytest.fixture
def fake_catalog_config():
config = {
"parquet_{factory_pattern}": {
"type": "pandas.ParquetDataSet",
"filepath": "test.pq",
},
"csv_{factory_pattern}": {"type": "pandas.CSVDataSet", "filepath": "test.csv"},
}
return config


@pytest.mark.usefixtures(
"chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
)
@@ -150,6 +162,47 @@ def test_default_dataset(
assert yaml_dump_mock.call_count == 1
assert yaml_dump_mock.call_args[0][0][key] == expected_dict[key]

def test_list_factory_generated_datasets(
self,
fake_project_cli,
fake_metadata,
fake_load_context,
mocker,
mock_pipelines,
fake_catalog_config,
):
"""Test that datasets generated from factory patterns in the catalog
are resolved correctly under the correct dataset classes.
"""
yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
mocked_context = fake_load_context.return_value
mocked_context.catalog = DataCatalog.from_config(fake_catalog_config)
mocker.patch.object(
mock_pipelines[PIPELINE_NAME],
"data_sets",
return_value=mocked_context.catalog._data_sets.keys()
| {"csv_example", "parquet_example"},
)

result = CliRunner().invoke(
fake_project_cli,
["catalog", "list"],
obj=fake_metadata,
)

assert not result.exit_code
expected_dict = {
f"Datasets in '{PIPELINE_NAME}' pipeline": {
"Datasets generated from factories": {
"pandas.CSVDataSet": ["csv_example"],
"pandas.ParquetDataSet": ["parquet_example"],
}
}
}
key = f"Datasets in '{PIPELINE_NAME}' pipeline"
assert yaml_dump_mock.call_count == 1
assert yaml_dump_mock.call_args[0][0][key] == expected_dict[key]


def identity(data):
return data # pragma: no cover
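End to end, assuming a project whose default pipeline references a dataset named "companies_csv" and whose catalog contains only the "{name}_csv" pattern from the sketches above, the command's output would look roughly like this (the grouping matches the expected_dict in the test above):

$ kedro catalog list
Datasets in '__default__' pipeline:
  Datasets generated from factories:
    pandas.CSVDataSet:
    - companies_csv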