Update kedro catalog list command to account for dataset factories #2793

Merged · 17 commits · Jul 27, 2023
Changes from 14 commits
1 change: 1 addition & 0 deletions RELEASE.md
@@ -16,6 +16,7 @@

## Bug fixes and other changes
* Consolidated dependencies and optional dependencies in `pyproject.toml`.
* Updated `kedro catalog list` to show datasets generated with factories.
* Pin `pip<23.2` for CI due to a breaking change. See https://github.com/kedro-org/kedro/pull/2813

## Documentation changes
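To make the changelog entry above concrete: a dataset factory is a patterned catalog entry that stands in for many concrete datasets. A minimal sketch, assuming a hypothetical "{name}_csv" pattern (the config style mirrors the test fixture further down):

from kedro.io import DataCatalog

# Hypothetical catalog config: the "{name}_csv" key is a factory pattern,
# not a concrete dataset. A pipeline dataset called "companies_csv" is
# resolved against it instead of falling back to DefaultDataset.
catalog_config = {
    "{name}_csv": {
        "type": "pandas.CSVDataSet",
        "filepath": "data/01_raw/{name}.csv",
    },
}
catalog = DataCatalog.from_config(catalog_config)

With this PR, `kedro catalog list` reports such matches in a new "Datasets generated from factories" section, keyed by the resolved dataset type.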
29 changes: 25 additions & 4 deletions kedro/framework/cli/catalog.py
@@ -1,5 +1,6 @@
"""A collection of CLI commands for working with Kedro catalog."""
from collections import defaultdict
from itertools import chain

import click
import yaml
@@ -32,7 +33,7 @@ def catalog():
"""Commands for working with catalog."""


-# noqa: too-many-locals
+# noqa: too-many-locals,protected-access
@catalog.command("list")
@env_option
@click.option(
@@ -50,10 +51,15 @@ def list_datasets(metadata: ProjectMetadata, pipeline, env):
title = "Datasets in '{}' pipeline"
not_mentioned = "Datasets not mentioned in pipeline"
mentioned = "Datasets mentioned in pipeline"
factories = "Datasets generated from factories"

session = _create_session(metadata.package_name, env=env)
context = session.load_context()
-datasets_meta = context.catalog._data_sets  # noqa: protected-access
-catalog_ds = set(context.catalog.list())
+data_catalog = context.catalog
+datasets_meta = data_catalog._data_sets
+catalog_ds = set(data_catalog.list())

target_pipelines = pipeline or pipelines.keys()
@@ -73,15 +79,30 @@ def list_datasets(metadata: ProjectMetadata, pipeline, env):
default_ds = pipeline_ds - catalog_ds
used_ds = catalog_ds - unused_ds

# resolve any factory datasets in the pipeline
factory_ds_by_type = defaultdict(list)
for ds_name in default_ds:
matched_pattern = data_catalog._match_pattern(
data_catalog._dataset_patterns, ds_name
)
if matched_pattern:
ds_config = data_catalog._resolve_config(ds_name, matched_pattern)
factory_ds_by_type[ds_config["type"]].append(ds_name)

default_ds = default_ds - set(chain.from_iterable(factory_ds_by_type.values()))

unused_by_type = _map_type_to_datasets(unused_ds, datasets_meta)
used_by_type = _map_type_to_datasets(used_ds, datasets_meta)

if default_ds:
used_by_type["DefaultDataset"].extend(default_ds)

-data = ((not_mentioned, dict(unused_by_type)), (mentioned, dict(used_by_type)))
+data = (
+    (mentioned, dict(used_by_type)),
+    (factories, dict(factory_ds_by_type)),
+    (not_mentioned, dict(unused_by_type)),
+)
result[title.format(pipe)] = {key: value for key, value in data if value}

secho(yaml.dump(result))


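The resolution step above relies on the catalog's private helpers (_match_pattern and _resolve_config). As a rough, self-contained sketch of the idea rather than kedro's actual implementation, matching a dataset name against "{}"-style patterns and substituting the captured fields into the config can be done with the third-party parse library (an assumption here, though kedro's factories are believed to build on it):

from typing import Optional

from parse import parse


def match_pattern(patterns: dict, dataset_name: str) -> Optional[str]:
    """Return the first pattern that the dataset name matches, if any."""
    for pattern in patterns:
        if parse(pattern, dataset_name):
            return pattern
    return None


def resolve_config(pattern: str, config: dict, dataset_name: str) -> dict:
    """Substitute the fields captured from the name into the config values."""
    fields = parse(pattern, dataset_name).named
    return {
        key: value.format(**fields) if isinstance(value, str) else value
        for key, value in config.items()
    }


patterns = {"{name}_csv": {"type": "pandas.CSVDataSet", "filepath": "data/{name}.csv"}}
matched = match_pattern(patterns, "companies_csv")  # -> "{name}_csv"
print(resolve_config(matched, patterns[matched], "companies_csv"))
# {'type': 'pandas.CSVDataSet', 'filepath': 'data/companies.csv'}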
53 changes: 53 additions & 0 deletions tests/framework/cli/test_catalog.py
@@ -30,6 +30,18 @@ def mock_pipelines(mocker):
return mocker.patch("kedro.framework.cli.catalog.pipelines", dummy_pipelines)


@pytest.fixture
def fake_catalog_config():
config = {
"parquet_{factory_pattern}": {
"type": "pandas.ParquetDataSet",
"filepath": "test.pq",
},
"csv_{factory_pattern}": {"type": "pandas.CSVDataSet", "filepath": "test.csv"},
}
return config


@pytest.mark.usefixtures(
"chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
)
@@ -150,6 +162,47 @@ def test_default_dataset(
assert yaml_dump_mock.call_count == 1
assert yaml_dump_mock.call_args[0][0][key] == expected_dict[key]

def test_list_factory_generated_datasets(
self,
fake_project_cli,
fake_metadata,
fake_load_context,
mocker,
mock_pipelines,
fake_catalog_config,
):
"""Test that datasets generated from factory patterns in the catalog
are resolved correctly under the correct dataset classes.
"""
yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
mocked_context = fake_load_context.return_value
mocked_context.catalog = DataCatalog.from_config(fake_catalog_config)
mocker.patch.object(
mock_pipelines[PIPELINE_NAME],
"data_sets",
return_value=mocked_context.catalog._data_sets.keys()
| {"csv_example", "parquet_example"},
)

result = CliRunner().invoke(
fake_project_cli,
["catalog", "list"],
obj=fake_metadata,
)

assert not result.exit_code
expected_dict = {
f"Datasets in '{PIPELINE_NAME}' pipeline": {
"Datasets generated from factories": {
"pandas.CSVDataSet": ["csv_example"],
"pandas.ParquetDataSet": ["parquet_example"],
}
}
}
key = f"Datasets in '{PIPELINE_NAME}' pipeline"
assert yaml_dump_mock.call_count == 1
assert yaml_dump_mock.call_args[0][0][key] == expected_dict[key]


def identity(data):
return data # pragma: no cover
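End to end, assuming a project whose default pipeline references a dataset named "companies_csv" and whose catalog contains only the "{name}_csv" pattern from the sketches above, the command's output would look roughly like this (the grouping matches the expected_dict in the test above):

$ kedro catalog list
Datasets in '__default__' pipeline:
  Datasets generated from factories:
    pandas.CSVDataSet:
    - companies_csv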