From f6cac070f7186a496b238205882e24d5440a8496 Mon Sep 17 00:00:00 2001
From: Ankita Katiyar
Date: Wed, 21 Feb 2024 15:37:04 +0000
Subject: [PATCH 1/4] Make all paths relative in kedro catalog resolve

Signed-off-by: Ankita Katiyar
---
 kedro/framework/cli/catalog.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py
index 417a168a5e..519d08fd25 100644
--- a/kedro/framework/cli/catalog.py
+++ b/kedro/framework/cli/catalog.py
@@ -264,7 +264,21 @@ def resolve_patterns(metadata: ProjectMetadata, env: str) -> None:
             ds_config = data_catalog._resolve_config(
                 ds_name, matched_pattern, ds_config_copy
             )
-
-            explicit_datasets[ds_name] = ds_config
+            # Dataset factory config contains absolute paths
+            ds_config_trimmed = _trim_filepath(
+                project_path=str(context.project_path) + "/", config=ds_config
+            )
+            explicit_datasets[ds_name] = ds_config_trimmed
 
     secho(yaml.dump(explicit_datasets))
+
+
+def _trim_filepath(project_path: str, config: dict) -> dict:
+    """Trim the filepaths in the config dictionary, making them relative to the project path."""
+    conf_keys_with_path = ["filepath", "path", "filename"]
+    for key, value in config.items():
+        if isinstance(value, dict):
+            config[key] = _trim_filepath(project_path, value)
+        if key in conf_keys_with_path:
+            config[key] = str(value).replace(project_path, "", 1)
+    return config

From 5db55eabf7a2dcc40f3ef431b66229d614a7623b Mon Sep 17 00:00:00 2001
From: Ankita Katiyar
Date: Wed, 21 Feb 2024 17:34:03 +0000
Subject: [PATCH 2/4] Refactor existing tests

Signed-off-by: Ankita Katiyar
---
 tests/framework/cli/test_catalog.py | 309 ++++++++++++++--------------
 1 file changed, 149 insertions(+), 160 deletions(-)

diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py
index 604c086e85..4238998213 100644
--- a/tests/framework/cli/test_catalog.py
+++ b/tests/framework/cli/test_catalog.py
@@ -403,192 +403,181 @@ def test_bad_env(self, fake_project_cli, fake_metadata):
         assert "Unable to instantiate Kedro Catalog" in result.output
 
 
-@pytest.mark.usefixtures(
-    "chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
-)
-def test_rank_catalog_factories(
-    fake_project_cli,
-    fake_metadata,
-    mocker,
-    fake_load_context,
-    fake_catalog_with_overlapping_factories,
-):
-    yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
-    mocked_context = fake_load_context.return_value
-    mocked_context.catalog = DataCatalog.from_config(
-        fake_catalog_with_overlapping_factories
-    )
+@pytest.mark.usefixtures("chdir_to_dummy_project", "fake_load_context")
+class TestCatalogFactoryCommands:
+    @pytest.mark.usefixtures("mock_pipelines")
+    def test_rank_catalog_factories(
+        self,
+        fake_project_cli,
+        fake_metadata,
+        mocker,
+        fake_load_context,
+        fake_catalog_with_overlapping_factories,
+    ):
+        yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
+        mocked_context = fake_load_context.return_value
+        mocked_context.catalog = DataCatalog.from_config(
+            fake_catalog_with_overlapping_factories
+        )
 
-    result = CliRunner().invoke(
-        fake_project_cli, ["catalog", "rank"], obj=fake_metadata
-    )
-    assert not result.exit_code
+        result = CliRunner().invoke(
+            fake_project_cli, ["catalog", "rank"], obj=fake_metadata
+        )
+        assert not result.exit_code
 
-    expected_patterns_sorted = [
-        "an_example_{place}_{holder}",
-        "an_example_{placeholder}",
-        "an_{example_placeholder}",
-        "on_{example_placeholder}",
-    ]
+        expected_patterns_sorted = [
"an_example_{place}_{holder}", + "an_example_{placeholder}", + "an_{example_placeholder}", + "on_{example_placeholder}", + ] - assert yaml_dump_mock.call_count == 1 - assert yaml_dump_mock.call_args[0][0] == expected_patterns_sorted + assert yaml_dump_mock.call_count == 1 + assert yaml_dump_mock.call_args[0][0] == expected_patterns_sorted + def test_rank_catalog_factories_with_no_factories( + self, fake_project_cli, fake_metadata, fake_load_context + ): + mocked_context = fake_load_context.return_value -@pytest.mark.usefixtures( - "chdir_to_dummy_project", - "fake_load_context", -) -def test_rank_catalog_factories_with_no_factories( - fake_project_cli, fake_metadata, fake_load_context -): - mocked_context = fake_load_context.return_value - - catalog_datasets = { - "iris_data": CSVDataset(filepath="test.csv"), - "intermediate": MemoryDataset(), - "not_used": CSVDataset(filepath="test2.csv"), - } - mocked_context.catalog = DataCatalog(datasets=catalog_datasets) + catalog_datasets = { + "iris_data": CSVDataset(filepath="test.csv"), + "intermediate": MemoryDataset(), + "not_used": CSVDataset(filepath="test2.csv"), + } + mocked_context.catalog = DataCatalog(datasets=catalog_datasets) - result = CliRunner().invoke( - fake_project_cli, ["catalog", "rank"], obj=fake_metadata - ) + result = CliRunner().invoke( + fake_project_cli, ["catalog", "rank"], obj=fake_metadata + ) - assert not result.exit_code - expected_output = "There are no dataset factories in the catalog." - assert expected_output in result.output + assert not result.exit_code + expected_output = "There are no dataset factories in the catalog." + assert expected_output in result.output + @pytest.mark.usefixtures("mock_pipelines") + def test_catalog_resolve( + self, + fake_project_cli, + fake_metadata, + fake_load_context, + mocker, + mock_pipelines, + fake_catalog_config, + ): + """Test that datasets factories are correctly resolved to the explicit datasets in the pipeline.""" + yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML") + mocked_context = fake_load_context.return_value + mocked_context.catalog = DataCatalog.from_config(fake_catalog_config) -@pytest.mark.usefixtures( - "chdir_to_dummy_project", "fake_load_context", "mock_pipelines" -) -def test_catalog_resolve( - fake_project_cli, - fake_metadata, - fake_load_context, - mocker, - mock_pipelines, - fake_catalog_config, -): - """Test that datasets factories are correctly resolved to the explicit datasets in the pipeline.""" - yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML") - mocked_context = fake_load_context.return_value - mocked_context.catalog = DataCatalog.from_config(fake_catalog_config) - - placeholder_ds = mocked_context.catalog._datasets.keys() - explicit_ds = {"csv_example", "parquet_example"} - - mocker.patch.object( - mock_pipelines[PIPELINE_NAME], - "datasets", - return_value=explicit_ds, - ) + placeholder_ds = mocked_context.catalog._datasets.keys() + explicit_ds = {"csv_example", "parquet_example"} - result = CliRunner().invoke( - fake_project_cli, ["catalog", "resolve"], obj=fake_metadata - ) + mocker.patch.object( + mock_pipelines[PIPELINE_NAME], + "datasets", + return_value=explicit_ds, + ) - assert not result.exit_code - assert yaml_dump_mock.call_count == 1 + result = CliRunner().invoke( + fake_project_cli, ["catalog", "resolve"], obj=fake_metadata + ) - output = yaml_dump_mock.call_args[0][0] + assert not result.exit_code + assert yaml_dump_mock.call_count == 1 - for ds in placeholder_ds: - assert ds not in output + 
+        output = yaml_dump_mock.call_args[0][0]
 
-    for ds in placeholder_ds:
-        assert ds not in output
-
-    for ds in explicit_ds:
-        assert ds in output
+        for ds in placeholder_ds:
+            assert ds not in output
 
+        for ds in explicit_ds:
+            assert ds in output
 
-@pytest.mark.usefixtures(
-    "chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
-)
-def test_no_overwrite(
-    fake_project_cli,
-    fake_metadata,
-    fake_load_context,
-    mocker,
-    mock_pipelines,
-    fake_catalog_config_with_resolvable_dataset,
-):
-    """Test that explicit catalog entries are not overwritten by factory config."""
-    yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
-    mocked_context = fake_load_context.return_value
-
-    mocked_context.config_loader = {
-        "catalog": fake_catalog_config_with_resolvable_dataset
-    }
-    mocked_context.catalog = DataCatalog.from_config(
-        fake_catalog_config_with_resolvable_dataset
-    )
+    @pytest.mark.usefixtures("mock_pipelines")
+    def test_no_overwrite(
+        self,
+        fake_project_cli,
+        fake_metadata,
+        fake_load_context,
+        mocker,
+        mock_pipelines,
+        fake_catalog_config_with_resolvable_dataset,
+    ):
+        """Test that explicit catalog entries are not overwritten by factory config."""
+        yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
+        mocked_context = fake_load_context.return_value
 
-    mocker.patch.object(
-        mock_pipelines[PIPELINE_NAME],
-        "datasets",
-        return_value=mocked_context.catalog._datasets.keys()
-        | {"csv_example", "parquet_example"},
-    )
+        mocked_context.config_loader = {
+            "catalog": fake_catalog_config_with_resolvable_dataset
+        }
+        mocked_context.catalog = DataCatalog.from_config(
+            fake_catalog_config_with_resolvable_dataset
+        )
 
-    result = CliRunner().invoke(
-        fake_project_cli, ["catalog", "resolve"], obj=fake_metadata
-    )
+        mocker.patch.object(
+            mock_pipelines[PIPELINE_NAME],
+            "datasets",
+            return_value=mocked_context.catalog._datasets.keys()
+            | {"csv_example", "parquet_example"},
+        )
 
-    assert not result.exit_code
-    assert yaml_dump_mock.call_count == 1
+        result = CliRunner().invoke(
+            fake_project_cli, ["catalog", "resolve"], obj=fake_metadata
+        )
 
-    assert (
-        yaml_dump_mock.call_args[0][0]["explicit_ds"]
-        == fake_catalog_config_with_resolvable_dataset["explicit_ds"]
-    )
+        assert not result.exit_code
+        assert yaml_dump_mock.call_count == 1
 
+        assert (
+            yaml_dump_mock.call_args[0][0]["explicit_ds"]
+            == fake_catalog_config_with_resolvable_dataset["explicit_ds"]
+        )
 
-@pytest.mark.usefixtures(
-    "chdir_to_dummy_project", "fake_load_context", "mock_pipelines"
-)
-def test_no_param_datasets_in_resolve(
-    fake_project_cli, fake_metadata, fake_load_context, mocker, mock_pipelines
-):
-    yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
-    mocked_context = fake_load_context.return_value
-
-    catalog_config = {
-        "iris_data": {
-            "type": "pandas.CSVDataset",
-            "filepath": "test.csv",
-        },
-        "intermediate": {"type": "MemoryDataset"},
-    }
-
-    catalog_datasets = {
-        "iris_data": CSVDataset(filepath="test.csv"),
-        "intermediate": MemoryDataset(),
-        "parameters": MemoryDataset(),
-        "params:data_ratio": MemoryDataset(),
-    }
+    @pytest.mark.usefixtures("mock_pipelines")
+    def test_no_param_datasets_in_resolve(
+        self, fake_project_cli, fake_metadata, fake_load_context, mocker, mock_pipelines
+    ):
+        yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
+        mocked_context = fake_load_context.return_value
 
+        catalog_config = {
+            "iris_data": {
+                "type": "pandas.CSVDataset",
+                "filepath": "test.csv",
+            },
+            "intermediate": {"type": "MemoryDataset"},
+        }
 
+        catalog_datasets = {
CSVDataset(filepath="test.csv"), + "intermediate": MemoryDataset(), + "parameters": MemoryDataset(), + "params:data_ratio": MemoryDataset(), + } - mocked_context.config_loader = {"catalog": catalog_config} - mocked_context.catalog = DataCatalog(datasets=catalog_datasets) + mocked_context.config_loader = {"catalog": catalog_config} + mocked_context.catalog = DataCatalog(datasets=catalog_datasets) - mocker.patch.object( - mock_pipelines[PIPELINE_NAME], - "datasets", - return_value=catalog_datasets.keys(), - ) + mocker.patch.object( + mock_pipelines[PIPELINE_NAME], + "datasets", + return_value=catalog_datasets.keys(), + ) - result = CliRunner().invoke( - fake_project_cli, - ["catalog", "resolve"], - obj=fake_metadata, - ) + result = CliRunner().invoke( + fake_project_cli, + ["catalog", "resolve"], + obj=fake_metadata, + ) - assert not result.exit_code - assert yaml_dump_mock.call_count == 1 + assert not result.exit_code + assert yaml_dump_mock.call_count == 1 - # 'parameters' and 'params:data_ratio' should not appear in the output - output = yaml_dump_mock.call_args[0][0] + # 'parameters' and 'params:data_ratio' should not appear in the output + output = yaml_dump_mock.call_args[0][0] - assert "parameters" not in output.keys() - assert "params:data_ratio" not in output.keys() - assert "iris_data" in output.keys() - assert "intermediate" in output.keys() + assert "parameters" not in output.keys() + assert "params:data_ratio" not in output.keys() + assert "iris_data" in output.keys() + assert "intermediate" in output.keys() From e93e09bf9e230aa89b44588d93a5f0578c721a23 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Thu, 22 Feb 2024 11:27:35 +0000 Subject: [PATCH 3/4] Add partitioneddataset to test + refactor Signed-off-by: Ankita Katiyar --- tests/framework/cli/test_catalog.py | 127 ++++++++++++++++++++-------- 1 file changed, 93 insertions(+), 34 deletions(-) diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py index 4238998213..ed7ff2f744 100644 --- a/tests/framework/cli/test_catalog.py +++ b/tests/framework/cli/test_catalog.py @@ -33,9 +33,29 @@ def fake_catalog_config(): config = { "parquet_{factory_pattern}": { "type": "pandas.ParquetDataset", - "filepath": "test.pq", + "filepath": "data/01_raw/{factory_pattern}.pq", }, - "csv_{factory_pattern}": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, + "csv_{factory_pattern}": { + "type": "pandas.CSVDataset", + "filepath": "data/01_raw/{factory_pattern}.csv", + }, + "csv_test": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, + } + return config + + +@pytest.fixture +def fake_catalog_config_resolved(): + config = { + "parquet_example": { + "type": "pandas.ParquetDataset", + "filepath": "data/01_raw/example.pq", + }, + "csv_example": { + "type": "pandas.CSVDataset", + "filepath": "data/01_raw/example.csv", + }, + "csv_test": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, } return config @@ -68,17 +88,62 @@ def fake_catalog_with_overlapping_factories(): @pytest.fixture -def fake_catalog_config_with_resolvable_dataset(): +def fake_catalog_config_with_factories(fake_metadata): config = { "parquet_{factory_pattern}": { "type": "pandas.ParquetDataset", - "filepath": "test.pq", + "filepath": str(fake_metadata.project_path) + + "/" + + "data/01_raw/{factory_pattern}.pq", + }, + "csv_{factory_pattern}": { + "type": "pandas.CSVDataset", + "filepath": str(fake_metadata.project_path) + + "/" + + "data/01_raw/{factory_pattern}.csv", }, - "csv_{factory_pattern}": {"type": "pandas.CSVDataset", 
"filepath": "test.csv"}, "explicit_ds": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, "{factory_pattern}_ds": { "type": "pandas.ParquetDataset", - "filepath": "test.pq", + "filepath": str(fake_metadata.project_path) + + "/" + + "data/01_raw/{factory_pattern}_ds.pq", + }, + "partitioned_{factory_pattern}": { + "type": "partitions.PartitionedDataset", + "path": str(fake_metadata.project_path) + "/" + "data/01_raw", + "dataset": "pandas.CSVDataset", + "metadata": { + "my-plugin": { + "path": str(fake_metadata.project_path) + "/" + "data/01_raw", + } + }, + }, + } + return config + + +@pytest.fixture +def fake_catalog_config_with_factories_resolved(): + config = { + "parquet_example": { + "type": "pandas.ParquetDataset", + "filepath": "data/01_raw/example.pq", + }, + "csv_example": { + "type": "pandas.CSVDataset", + "filepath": "data/01_raw/example.csv", + }, + "explicit_ds": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, + "partitioned_example": { + "type": "partitions.PartitionedDataset", + "path": "data/01_raw", + "dataset": "pandas.CSVDataset", + "metadata": { + "my-plugin": { + "path": "data/01_raw", + } + }, }, } return config @@ -231,14 +296,16 @@ def test_list_factory_generated_datasets( ["catalog", "list"], obj=fake_metadata, ) - assert not result.exit_code expected_dict = { f"Datasets in '{PIPELINE_NAME}' pipeline": { "Datasets generated from factories": { "pandas.CSVDataset": ["csv_example"], "pandas.ParquetDataset": ["parquet_example"], - } + }, + "Datasets mentioned in pipeline": { + "CSVDataset": ["csv_test"], + }, } } key = f"Datasets in '{PIPELINE_NAME}' pipeline" @@ -419,7 +486,7 @@ def test_rank_catalog_factories( mocked_context.catalog = DataCatalog.from_config( fake_catalog_with_overlapping_factories ) - + print("!!!!", mocked_context.catalog._dataset_patterns) result = CliRunner().invoke( fake_project_cli, ["catalog", "rank"], obj=fake_metadata ) @@ -464,19 +531,19 @@ def test_catalog_resolve( mocker, mock_pipelines, fake_catalog_config, + fake_catalog_config_resolved, ): """Test that datasets factories are correctly resolved to the explicit datasets in the pipeline.""" - yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML") mocked_context = fake_load_context.return_value + mocked_context.config_loader = {"catalog": fake_catalog_config} mocked_context.catalog = DataCatalog.from_config(fake_catalog_config) - - placeholder_ds = mocked_context.catalog._datasets.keys() - explicit_ds = {"csv_example", "parquet_example"} + placeholder_ds = mocked_context.catalog._dataset_patterns.keys() + pipeline_datasets = {"csv_example", "parquet_example", "explicit_dataset"} mocker.patch.object( mock_pipelines[PIPELINE_NAME], "datasets", - return_value=explicit_ds, + return_value=pipeline_datasets, ) result = CliRunner().invoke( @@ -484,42 +551,37 @@ def test_catalog_resolve( ) assert not result.exit_code - assert yaml_dump_mock.call_count == 1 - - output = yaml_dump_mock.call_args[0][0] + resolved_config = yaml.safe_load(result.output) + assert resolved_config == fake_catalog_config_resolved for ds in placeholder_ds: - assert ds not in output - - for ds in explicit_ds: - assert ds in output + assert ds not in result.output @pytest.mark.usefixtures("mock_pipelines") - def test_no_overwrite( + def test_catalog_resolve_nested_config( self, fake_project_cli, fake_metadata, fake_load_context, mocker, mock_pipelines, - fake_catalog_config_with_resolvable_dataset, + fake_catalog_config_with_factories, + fake_catalog_config_with_factories_resolved, ): """Test that 
-        yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML")
         mocked_context = fake_load_context.return_value
+        mocked_context.project_path = fake_metadata.project_path
 
-        mocked_context.config_loader = {
-            "catalog": fake_catalog_config_with_resolvable_dataset
-        }
+        mocked_context.config_loader = {"catalog": fake_catalog_config_with_factories}
         mocked_context.catalog = DataCatalog.from_config(
-            fake_catalog_config_with_resolvable_dataset
+            fake_catalog_config_with_factories
         )
 
         mocker.patch.object(
             mock_pipelines[PIPELINE_NAME],
             "datasets",
             return_value=mocked_context.catalog._datasets.keys()
-            | {"csv_example", "parquet_example"},
+            | {"csv_example", "parquet_example", "partitioned_example"},
         )
 
         result = CliRunner().invoke(
             fake_project_cli, ["catalog", "resolve"], obj=fake_metadata
         )
 
         assert not result.exit_code
-        assert yaml_dump_mock.call_count == 1
+        resolved_config = yaml.safe_load(result.output)
 
-        assert (
-            yaml_dump_mock.call_args[0][0]["explicit_ds"]
-            == fake_catalog_config_with_resolvable_dataset["explicit_ds"]
-        )
+        assert resolved_config == fake_catalog_config_with_factories_resolved
 
     @pytest.mark.usefixtures("mock_pipelines")
     def test_no_param_datasets_in_resolve(

From e283d417e3c11a96dff9f029840cfa0d66b4424b Mon Sep 17 00:00:00 2001
From: Ankita Katiyar
Date: Fri, 23 Feb 2024 16:55:57 +0000
Subject: [PATCH 4/4] Change to DataCatalog

Signed-off-by: Ankita Katiyar
---
 kedro/framework/cli/catalog.py      | 20 +++-----------------
 tests/framework/cli/test_catalog.py | 16 +++++-----------
 2 files changed, 8 insertions(+), 28 deletions(-)

diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py
index 519d08fd25..7b8249170f 100644
--- a/kedro/framework/cli/catalog.py
+++ b/kedro/framework/cli/catalog.py
@@ -16,6 +16,7 @@
 from kedro.framework.session import KedroSession
 from kedro.framework.startup import ProjectMetadata
 from kedro.io import AbstractDataset
+from kedro.io.data_catalog import DataCatalog
 
 
 def _create_session(package_name: str, **kwargs: Any) -> KedroSession:
@@ -231,8 +232,8 @@ def resolve_patterns(metadata: ProjectMetadata, env: str) -> None:
 
     session = _create_session(metadata.package_name, env=env)
     context = session.load_context()
-    data_catalog = context.catalog
     catalog_config = context.config_loader["catalog"]
+    data_catalog = DataCatalog.from_config(catalog_config)
 
     explicit_datasets = {
         ds_name: ds_config
@@ -264,21 +265,6 @@ def resolve_patterns(metadata: ProjectMetadata, env: str) -> None:
             ds_config = data_catalog._resolve_config(
                 ds_name, matched_pattern, ds_config_copy
             )
-            # Dataset factory config contains absolute paths
-            ds_config_trimmed = _trim_filepath(
-                project_path=str(context.project_path) + "/", config=ds_config
-            )
-            explicit_datasets[ds_name] = ds_config_trimmed
+            explicit_datasets[ds_name] = ds_config
 
     secho(yaml.dump(explicit_datasets))
-
-
-def _trim_filepath(project_path: str, config: dict) -> dict:
-    """Trim the filepaths in the config dictionary, making them relative to the project path."""
-    conf_keys_with_path = ["filepath", "path", "filename"]
-    for key, value in config.items():
-        if isinstance(value, dict):
-            config[key] = _trim_filepath(project_path, value)
-        if key in conf_keys_with_path:
-            config[key] = str(value).replace(project_path, "", 1)
-    return config
diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py
index ed7ff2f744..df74cfd015 100644
--- a/tests/framework/cli/test_catalog.py
+++ b/tests/framework/cli/test_catalog.py
@@ -92,30 +92,24 @@ def fake_catalog_config_with_factories(fake_metadata):
     config = {
         "parquet_{factory_pattern}": {
             "type": "pandas.ParquetDataset",
-            "filepath": str(fake_metadata.project_path)
-            + "/"
-            + "data/01_raw/{factory_pattern}.pq",
+            "filepath": "data/01_raw/{factory_pattern}.pq",
         },
         "csv_{factory_pattern}": {
             "type": "pandas.CSVDataset",
-            "filepath": str(fake_metadata.project_path)
-            + "/"
-            + "data/01_raw/{factory_pattern}.csv",
+            "filepath": "data/01_raw/{factory_pattern}.csv",
         },
         "explicit_ds": {"type": "pandas.CSVDataset", "filepath": "test.csv"},
         "{factory_pattern}_ds": {
             "type": "pandas.ParquetDataset",
-            "filepath": str(fake_metadata.project_path)
-            + "/"
-            + "data/01_raw/{factory_pattern}_ds.pq",
+            "filepath": "data/01_raw/{factory_pattern}_ds.pq",
         },
         "partitioned_{factory_pattern}": {
             "type": "partitions.PartitionedDataset",
-            "path": str(fake_metadata.project_path) + "/" + "data/01_raw",
+            "path": "data/01_raw",
             "dataset": "pandas.CSVDataset",
             "metadata": {
                 "my-plugin": {
-                    "path": str(fake_metadata.project_path) + "/" + "data/01_raw",
+                    "path": "data/01_raw",
                 }
            },
        },
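
For readers following the series, the behaviour these patches converge on — matching a pipeline dataset name against a catalog factory pattern and substituting the captured placeholder into the config template — can be illustrated without Kedro. The sketch below is illustrative only: resolve_factory_entry is a hypothetical helper, not Kedro's API (Kedro does this inside DataCatalog via the private _match_pattern/_resolve_config helpers the diffs above call), and it assumes placeholders only ever appear as {name} tokens in string values.

    import re


    def resolve_factory_entry(pattern: str, template: dict, ds_name: str) -> dict | None:
        """Hypothetical stand-in for DataCatalog's pattern resolution.

        Matches ds_name against a factory pattern such as "csv_{factory_pattern}"
        and substitutes the captured values into every string in the template,
        recursing into nested dicts (e.g. PartitionedDataset's "metadata" block).
        """
        # Turn "csv_{factory_pattern}" into the regex "^csv_(?P<factory_pattern>.+)$".
        regex = "^" + re.sub(r"\{(\w+)\}", r"(?P<\1>.+)", pattern) + "$"
        match = re.match(regex, ds_name)
        if match is None:
            return None  # the dataset name does not belong to this factory
        resolved = {}
        for key, value in template.items():
            if isinstance(value, dict):
                value = resolve_factory_entry(pattern, value, ds_name)
            elif isinstance(value, str):
                value = value.format(**match.groupdict())  # fill "{factory_pattern}"
            resolved[key] = value
        return resolved


    # Mirrors the fake_catalog_config fixture: "csv_example" resolves against
    # "csv_{factory_pattern}" to an explicit entry with a relative filepath.
    template = {"type": "pandas.CSVDataset", "filepath": "data/01_raw/{factory_pattern}.csv"}
    print(resolve_factory_entry("csv_{factory_pattern}", template, "csv_example"))
    # {'type': 'pandas.CSVDataset', 'filepath': 'data/01_raw/example.csv'}

This also shows why PATCH 4 can drop _trim_filepath: once resolve_patterns builds its DataCatalog from the raw catalog config instead of context.catalog, the resolved entries keep whatever paths the YAML declares, so relative paths never get expanded to absolute ones in the first place.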