From 27ec9a31c4e32726703fe04b48b0b904543bcd05 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Fri, 10 Feb 2023 14:32:15 -0500 Subject: [PATCH 01/74] committing first version of UnityTableCatalog with unit tests. This datasets allows users to interface with Unity catalog tables in Databricks to both read and write. Signed-off-by: Danny Farah --- kedro-datasets/.gitignore | 3 + .../kedro_datasets/databricks/__init__.py | 8 + .../kedro_datasets/databricks/unity.py | 202 ++++++++ kedro-datasets/setup.py | 3 + kedro-datasets/tests/databricks/__init__.py | 0 kedro-datasets/tests/databricks/conftest.py | 26 + .../tests/databricks/test_unity_dataset.py | 448 ++++++++++++++++++ 7 files changed, 690 insertions(+) create mode 100644 kedro-datasets/kedro_datasets/databricks/__init__.py create mode 100644 kedro-datasets/kedro_datasets/databricks/unity.py create mode 100644 kedro-datasets/tests/databricks/__init__.py create mode 100644 kedro-datasets/tests/databricks/conftest.py create mode 100644 kedro-datasets/tests/databricks/test_unity_dataset.py diff --git a/kedro-datasets/.gitignore b/kedro-datasets/.gitignore index d20ee9733..3725bd847 100644 --- a/kedro-datasets/.gitignore +++ b/kedro-datasets/.gitignore @@ -145,3 +145,6 @@ kedro.db kedro/html docs/tmp-build-artifacts docs/build +spark-warehouse +metastore_db/ +derby.log \ No newline at end of file diff --git a/kedro-datasets/kedro_datasets/databricks/__init__.py b/kedro-datasets/kedro_datasets/databricks/__init__.py new file mode 100644 index 000000000..2fd3eccb9 --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/__init__.py @@ -0,0 +1,8 @@ +"""Provides interface to Unity Catalog Tables.""" + +__all__ = ["UnityTableDataSet"] + +from contextlib import suppress + +with suppress(ImportError): + from .unity import UnityTableDataSet diff --git a/kedro-datasets/kedro_datasets/databricks/unity.py b/kedro-datasets/kedro_datasets/databricks/unity.py new file mode 100644 index 000000000..8921fca1b --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/unity.py @@ -0,0 +1,202 @@ +import logging +from typing import Any, Dict, List, Union +import pandas as pd + +from kedro.io.core import ( + AbstractVersionedDataSet, + DataSetError, + VersionNotFoundError, +) +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.types import StructType +from pyspark.sql.utils import AnalysisException +from cachetools import Cache + +logger = logging.getLogger(__name__) + + +class UnityTableDataSet(AbstractVersionedDataSet): + """``UnityTableDataSet`` loads data into Unity managed tables.""" + + # this dataset cannot be used with ``ParallelRunner``, + # therefore it has the attribute ``_SINGLE_PROCESS = True`` + # for parallelism within a Spark pipeline please consider + # using ``ThreadRunner`` instead + _SINGLE_PROCESS = True + _VALID_WRITE_MODES = ["overwrite", "upsert", "append"] + _VALID_DATAFRAME_TYPES = ["spark", "pandas"] + + def __init__( + self, + table: str, + catalog: str = None, + database: str = "default", + write_mode: str = "overwrite", + dataframe_type: str = "spark", + primary_key: Union[str, List[str]] = None, + version: int = None, + *, + # the following parameters are used by the hook to create or update unity + schema: Dict[str, Any] = None, # pylint: disable=unused-argument + partition_columns: List[str] = None, # pylint: disable=unused-argument + owner_group: str = None, + ) -> None: + """Creates a new instance of ``UnityTableDataSet``.""" + + self._database = database + self._catalog = catalog + self._table = table + 
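+        # `owner_group` and `partition_columns` are stored for the hook that
+        # creates or updates the Unity table; they are not used by the
+        # load/save logic in this class.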
+        self._owner_group = owner_group
+        self._partition_columns = partition_columns
+        if catalog and database and table:
+            self._full_table_address = f"{catalog}.{database}.{table}"
+        elif table:
+            self._full_table_address = f"{database}.{table}"
+
+        if write_mode not in self._VALID_WRITE_MODES:
+            valid_modes = ", ".join(self._VALID_WRITE_MODES)
+            raise DataSetError(
+                f"Invalid `write_mode` provided: {write_mode}. "
+                f"`write_mode` must be one of: {valid_modes}"
+            )
+        self._write_mode = write_mode
+
+        if dataframe_type not in self._VALID_DATAFRAME_TYPES:
+            valid_types = ", ".join(self._VALID_DATAFRAME_TYPES)
+            raise DataSetError(f"`dataframe_type` must be one of {valid_types}")
+        self._dataframe_type = dataframe_type
+
+        if primary_key is None or len(primary_key) == 0:
+            if write_mode == "upsert":
+                raise DataSetError(
+                    f"`primary_key` must be provided for `write_mode` {write_mode}"
+                )
+
+        self._primary_key = primary_key
+
+        self._version = version
+        self._version_cache = Cache(maxsize=2)
+
+        self._schema = None
+        if schema is not None:
+            self._schema = StructType.fromJson(schema)
+
+    def _get_spark(self) -> SparkSession:
+        return (
+            SparkSession.builder.config(
+                "spark.jars.packages", "io.delta:delta-core_2.12:1.2.1"
+            )
+            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
+            .config(
+                "spark.sql.catalog.spark_catalog",
+                "org.apache.spark.sql.delta.catalog.DeltaCatalog",
+            )
+            .getOrCreate()
+        )
+
+    def _load(self) -> Union[DataFrame, pd.DataFrame]:
+        if self._version is not None and self._version >= 0:
+            # Load a specific table version via Delta time travel.
+            try:
+                data = (
+                    self._get_spark()
+                    .read.format("delta")
+                    .option("versionAsOf", self._version)
+                    .table(self._full_table_address)
+                )
+            except Exception as exc:
+                raise VersionNotFoundError(self._version) from exc
+        else:
+            data = self._get_spark().table(self._full_table_address)
+        if self._dataframe_type == "pandas":
+            data = data.toPandas()
+        return data
+
+    def _save_append(self, data: DataFrame) -> None:
+        data.write.format("delta").mode("append").saveAsTable(self._full_table_address)
+
+    def _save_overwrite(self, data: DataFrame) -> None:
+        delta_table = data.write.format("delta")
+        if self._write_mode == "overwrite":
+            delta_table = delta_table.mode("overwrite").option(
+                "overwriteSchema", "true"
+            )
+        delta_table.saveAsTable(self._full_table_address)
+
+    def _save_upsert(self, update_data: DataFrame) -> None:
+        if self._exists():
+            base_data = self._get_spark().table(self._full_table_address)
+            base_columns = base_data.columns
+            update_columns = update_data.columns
+
+            if set(update_columns) != set(base_columns):
+                raise DataSetError(
+                    f"Upsert requires tables to have identical columns. "
+                    f"Delta table {self._full_table_address} "
+                    f"has columns: {base_columns}, whereas "
+                    f"dataframe has columns {update_columns}"
+                )
+
+            where_expr = ""
+            if isinstance(self._primary_key, str):
+                where_expr = f"base.{self._primary_key}=update.{self._primary_key}"
+            elif isinstance(self._primary_key, list):
+                where_expr = " AND ".join(
+                    f"base.{col}=update.{col}" for col in self._primary_key
+                )
+
+            update_data.createOrReplaceTempView("update")
+
+            # Delta MERGE: rows matching the primary key(s) are updated,
+            # all other rows are inserted.
+            upsert_sql = f"""MERGE INTO {self._full_table_address} base USING update
+            ON {where_expr} WHEN MATCHED THEN UPDATE SET * WHEN NOT MATCHED THEN INSERT *
+            """
+            self._get_spark().sql(upsert_sql)
+        else:
+            self._save_append(update_data)
+
+    def _save(self, data: Any) -> None:
+        # filter columns specified in schema and match their ordering
+        if self._schema:
+            cols = self._schema.fieldNames()
+            if self._dataframe_type == "pandas":
+                data = self._get_spark().createDataFrame(
+                    data.loc[:, cols], schema=self._schema
+                )
+            else:
+                data = data.select(*cols)
+        else:
+            if self._dataframe_type == "pandas":
+                data = self._get_spark().createDataFrame(data)
+        if self._write_mode == "overwrite":
+            self._save_overwrite(data)
+        elif self._write_mode == "upsert":
+            self._save_upsert(data)
+        elif self._write_mode == "append":
+            self._save_append(data)
+
+    def _describe(self) -> Dict[str, str]:
+        return dict(
+            catalog=self._catalog,
+            database=self._database,
+            table=self._table,
+            write_mode=self._write_mode,
+            dataframe_type=self._dataframe_type,
+            primary_key=self._primary_key,
+            version=self._version,
+        )
+
+    def _exists(self) -> bool:
+        if self._catalog:
+            try:
+                self._get_spark().sql(f"USE CATALOG {self._catalog}")
+            except Exception as exc:
+                logger.warning("catalog %s not found: %s", self._catalog, exc)
+        try:
+            return (
+                self._get_spark()
+                .sql(f"SHOW TABLES IN `{self._database}`")
+                .filter(f"tableName = '{self._table}'")
+                .count()
+                > 0
+            )
+        except Exception:
+            return False
diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py
index f75d3cad1..8c5440a75 100644
--- a/kedro-datasets/setup.py
+++ b/kedro-datasets/setup.py
@@ -37,6 +37,7 @@ def _collect_requirements(requires):
 api_require = {"api.APIDataSet": ["requests~=2.20"]}
 biosequence_require = {"biosequence.BioSequenceDataSet": ["biopython~=1.73"]}
 dask_require = {"dask.ParquetDataSet": ["dask[complete]~=2021.10", "triad>=0.6.7, <1.0"]}
+databricks_require = {"databricks.UnityTableDataSet": [SPARK]}
 geopandas_require = {
     "geopandas.GeoJSONDataSet": ["geopandas>=0.6.0, <1.0", "pyproj~=3.0"]
 }
@@ -90,6 +91,7 @@ def _collect_requirements(requires):
     "api": _collect_requirements(api_require),
     "biosequence": _collect_requirements(biosequence_require),
     "dask": _collect_requirements(dask_require),
+    "databricks": _collect_requirements(databricks_require),
     "docs": [
         "docutils==0.16",
         "sphinx~=3.4.3",
@@ -117,6 +119,7 @@ def _collect_requirements(requires):
     **api_require,
     **biosequence_require,
     **dask_require,
+    **databricks_require,
     **geopandas_require,
     **matplotlib_require,
     **holoviews_require,
diff --git a/kedro-datasets/tests/databricks/__init__.py b/kedro-datasets/tests/databricks/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/kedro-datasets/tests/databricks/conftest.py b/kedro-datasets/tests/databricks/conftest.py
new file mode 100644
index 000000000..d360ffb68
--- /dev/null
+++ b/kedro-datasets/tests/databricks/conftest.py
@@ -0,0 +1,26 @@
+"""
+This file contains the fixtures that are reusable by any tests within
+this directory. 
You don't need to import the fixtures as pytest will +discover them automatically. More info here: +https://docs.pytest.org/en/latest/fixture.html +""" +import pytest +from pyspark.sql import SparkSession +from delta.pip_utils import configure_spark_with_delta_pip + + +@pytest.fixture(scope="class", autouse=True) +def spark_session(): + spark = ( + SparkSession.builder.appName("test") + .config("spark.jars.packages", "io.delta:delta-core_2.12:1.2.1") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config( + "spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.DeltaCatalog", + ) + .getOrCreate() + ) + spark.sql("create database if not exists test") + yield spark + spark.sql("drop database test cascade;") diff --git a/kedro-datasets/tests/databricks/test_unity_dataset.py b/kedro-datasets/tests/databricks/test_unity_dataset.py new file mode 100644 index 000000000..3f29a1e95 --- /dev/null +++ b/kedro-datasets/tests/databricks/test_unity_dataset.py @@ -0,0 +1,448 @@ +import pytest +from kedro.io.core import DataSetError, VersionNotFoundError +from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from pyspark.sql import DataFrame, SparkSession +import pandas as pd +from kedro_datasets.databricks import UnityTableDataSet + + +@pytest.fixture +def sample_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [("Alex", 31), ("Bob", 12), ("Clarke", 65), ("Dave", 29)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def upsert_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [("Alex", 32), ("Evan", 23)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def mismatched_upsert_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("height", IntegerType(), True), + ] + ) + + data = [("Alex", 32, 174), ("Evan", 23, 166)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def subset_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("height", IntegerType(), True), + ] + ) + + data = [("Alex", 32, 174), ("Evan", 23, 166)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def subset_pandas_df(): + return pd.DataFrame( + {"name": ["Alex", "Evan"], "age": [32, 23], "height": [174, 166]} + ) + + +@pytest.fixture +def subset_expected_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [("Alex", 32), ("Evan", 23)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def sample_pandas_df(): + return pd.DataFrame( + {"name": ["Alex", "Bob", "Clarke", "Dave"], "age": [31, 12, 65, 29]} + ) + + +@pytest.fixture +def append_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [("Evan", 23), ("Frank", 13)] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def expected_append_spark_df(spark_session: 
SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [ + ("Alex", 31), + ("Bob", 12), + ("Clarke", 65), + ("Dave", 29), + ("Evan", 23), + ("Frank", 13), + ] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def expected_upsert_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [ + ("Alex", 32), + ("Bob", 12), + ("Clarke", 65), + ("Dave", 29), + ("Evan", 23), + ] + + return spark_session.createDataFrame(data, schema) + + +@pytest.fixture +def expected_upsert_multiple_primary_spark_df(spark_session: SparkSession): + schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + + data = [ + ("Alex", 31), + ("Alex", 32), + ("Bob", 12), + ("Clarke", 65), + ("Dave", 29), + ("Evan", 23), + ] + + return spark_session.createDataFrame(data, schema) + + +class TestUnityTableDataSet: + def test_full_table(self): + unity_ds = UnityTableDataSet(catalog="test", database="test", table="test") + assert unity_ds._full_table_address == "test.test.test" + + def test_database_table(self): + unity_ds = UnityTableDataSet(database="test", table="test") + assert unity_ds._full_table_address == "test.test" + + def test_table_only(self): + unity_ds = UnityTableDataSet(table="test") + assert unity_ds._full_table_address == "default.test" + + def test_table_missing(self): + with pytest.raises(TypeError): + UnityTableDataSet() + + def test_describe(self): + unity_ds = UnityTableDataSet(table="test") + assert unity_ds._describe() == { + "catalog": None, + "database": "default", + "table": "test", + "write_mode": "overwrite", + "dataframe_type": "spark", + "primary_key": None, + "version": None, + } + + def test_invalid_write_mode(self): + with pytest.raises(DataSetError): + UnityTableDataSet(table="test", write_mode="invalid") + + def test_dataframe_type(self): + with pytest.raises(DataSetError): + UnityTableDataSet(table="test", dataframe_type="invalid") + + def test_missing_primary_key_upsert(self): + with pytest.raises(DataSetError): + UnityTableDataSet(table="test", write_mode="upsert") + + def test_schema(self): + unity_ds = UnityTableDataSet( + table="test", + schema={ + "fields": [ + { + "metadata": {}, + "name": "name", + "nullable": True, + "type": "string", + }, + { + "metadata": {}, + "name": "age", + "nullable": True, + "type": "integer", + }, + ], + "type": "struct", + }, + ) + expected_schema = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + assert unity_ds._schema == expected_schema + + def test_catalog_exists(self): + unity_ds = UnityTableDataSet(catalog="test", database="invalid", table="test_not_there") + assert not unity_ds._exists() + + def test_table_does_not_exist(self): + unity_ds = UnityTableDataSet(database="invalid", table="test_not_there") + assert not unity_ds._exists() + + def test_save_default(self, sample_spark_df: DataFrame): + unity_ds = UnityTableDataSet(database="test", table="test_save") + unity_ds.save(sample_spark_df) + saved_table = unity_ds.load() + assert unity_ds.exists() and sample_spark_df.exceptAll(saved_table).count() == 0 + + def test_save_schema_spark( + self, subset_spark_df: DataFrame, subset_expected_df: DataFrame + ): + unity_ds = UnityTableDataSet( + database="test", + table="test_save_spark_schema", + schema={ 
+ "fields": [ + { + "metadata": {}, + "name": "name", + "nullable": True, + "type": "string", + }, + { + "metadata": {}, + "name": "age", + "nullable": True, + "type": "integer", + }, + ], + "type": "struct", + }, + ) + unity_ds.save(subset_spark_df) + saved_table = unity_ds.load() + assert subset_expected_df.exceptAll(saved_table).count() == 0 + + def test_save_schema_pandas( + self, subset_pandas_df: pd.DataFrame, subset_expected_df: DataFrame + ): + unity_ds = UnityTableDataSet( + database="test", + table="test_save_pd_schema", + schema={ + "fields": [ + { + "metadata": {}, + "name": "name", + "nullable": True, + "type": "string", + }, + { + "metadata": {}, + "name": "age", + "nullable": True, + "type": "integer", + }, + ], + "type": "struct", + }, + dataframe_type="pandas", + ) + unity_ds.save(subset_pandas_df) + saved_ds = UnityTableDataSet( + database="test", + table="test_save_pd_schema", + ) + saved_table = saved_ds.load() + assert subset_expected_df.exceptAll(saved_table).count() == 0 + + def test_save_overwrite( + self, sample_spark_df: DataFrame, append_spark_df: DataFrame + ): + unity_ds = UnityTableDataSet(database="test", table="test_save") + unity_ds.save(sample_spark_df) + unity_ds.save(append_spark_df) + + overwritten_table = unity_ds.load() + + assert append_spark_df.exceptAll(overwritten_table).count() == 0 + + def test_save_append( + self, + sample_spark_df: DataFrame, + append_spark_df: DataFrame, + expected_append_spark_df: DataFrame, + ): + unity_ds = UnityTableDataSet( + database="test", table="test_save_append", write_mode="append" + ) + unity_ds.save(sample_spark_df) + unity_ds.save(append_spark_df) + + appended_table = unity_ds.load() + + assert expected_append_spark_df.exceptAll(appended_table).count() == 0 + + def test_save_upsert( + self, + sample_spark_df: DataFrame, + upsert_spark_df: DataFrame, + expected_upsert_spark_df: DataFrame, + ): + unity_ds = UnityTableDataSet( + database="test", + table="test_save_upsert", + write_mode="upsert", + primary_key="name", + ) + unity_ds.save(sample_spark_df) + unity_ds.save(upsert_spark_df) + + upserted_table = unity_ds.load() + + assert expected_upsert_spark_df.exceptAll(upserted_table).count() == 0 + + def test_save_upsert_multiple_primary( + self, + sample_spark_df: DataFrame, + upsert_spark_df: DataFrame, + expected_upsert_multiple_primary_spark_df: DataFrame, + ): + unity_ds = UnityTableDataSet( + database="test", + table="test_save_upsert_multiple", + write_mode="upsert", + primary_key=["name", "age"], + ) + unity_ds.save(sample_spark_df) + unity_ds.save(upsert_spark_df) + + upserted_table = unity_ds.load() + + assert ( + expected_upsert_multiple_primary_spark_df.exceptAll(upserted_table).count() + == 0 + ) + + def test_save_upsert_mismatched_columns( + self, + sample_spark_df: DataFrame, + mismatched_upsert_spark_df: DataFrame, + ): + unity_ds = UnityTableDataSet( + database="test", + table="test_save_upsert_mismatch", + write_mode="upsert", + primary_key="name", + ) + unity_ds.save(sample_spark_df) + with pytest.raises(DataSetError): + unity_ds.save(mismatched_upsert_spark_df) + + def test_load_spark(self, sample_spark_df: DataFrame): + unity_ds = UnityTableDataSet(database="test", table="test_load_spark") + unity_ds.save(sample_spark_df) + + delta_ds = UnityTableDataSet(database="test", table="test_load_spark") + delta_table = delta_ds.load() + + assert ( + isinstance(delta_table, DataFrame) + and delta_table.exceptAll(sample_spark_df).count() == 0 + ) + + def test_load_spark_no_version(self, sample_spark_df: 
DataFrame): + unity_ds = UnityTableDataSet(database="test", table="test_load_spark") + unity_ds.save(sample_spark_df) + + delta_ds = UnityTableDataSet( + database="test", table="test_load_spark", version=2 + ) + with pytest.raises(VersionNotFoundError): + _ = delta_ds.load() + + def test_load_version(self, sample_spark_df: DataFrame, append_spark_df: DataFrame): + unity_ds = UnityTableDataSet( + database="test", table="test_load_version", write_mode="append" + ) + unity_ds.save(sample_spark_df) + unity_ds.save(append_spark_df) + + loaded_ds = UnityTableDataSet( + database="test", table="test_load_version", version=0 + ) + loaded_df = loaded_ds.load() + + assert loaded_df.exceptAll(sample_spark_df).count() == 0 + + def test_load_pandas(self, sample_pandas_df: pd.DataFrame): + unity_ds = UnityTableDataSet( + database="test", table="test_load_pandas", dataframe_type="pandas" + ) + unity_ds.save(sample_pandas_df) + + pandas_ds = UnityTableDataSet( + database="test", table="test_load_pandas", dataframe_type="pandas" + ) + pandas_df = pandas_ds.load().sort_values("name", ignore_index=True) + + assert isinstance(pandas_df, pd.DataFrame) and pandas_df.equals( + sample_pandas_df + ) From fa14ea05b41d62336eaa9a4f05fb2aa24b0b853f Mon Sep 17 00:00:00 2001 From: adamfrly <45516720+adamfrly@users.noreply.github.com> Date: Wed, 1 Feb 2023 03:20:16 -0500 Subject: [PATCH 02/74] Replace kedro.pipeline with modular_pipeline.pipeline factory (#99) * Add non-spark related test changes Replace kedro.pipeline.Pipeline with kedro.pipeline.modular_pipeline.pipeline factory. This is for symmetry with changes made to the main kedro library. Signed-off-by: Adam Farley Signed-off-by: Danny Farah --- kedro-airflow/RELEASE.md | 1 + kedro-airflow/tests/test_plugin.py | 5 +++-- kedro-datasets/RELEASE.md | 1 + kedro-datasets/tests/spark/test_deltatable_dataset.py | 5 +++-- kedro-datasets/tests/spark/test_spark_dataset.py | 11 ++++++----- kedro-telemetry/RELEASE.md | 1 + kedro-telemetry/tests/test_plugin.py | 7 ++++--- 7 files changed, 19 insertions(+), 12 deletions(-) diff --git a/kedro-airflow/RELEASE.md b/kedro-airflow/RELEASE.md index 3e9d35d3e..75e4654e6 100755 --- a/kedro-airflow/RELEASE.md +++ b/kedro-airflow/RELEASE.md @@ -1,4 +1,5 @@ # Upcoming release 0.5.2 +* Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. # Release 0.5.1 * Added additional CLI argument `--jinja-file` to provide a path to a custom Jinja2 template. 
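The diffs below all apply the same mechanical substitution. A minimal sketch of
the before/after pattern (`identity` here is a stand-in for the helper functions
defined in these test suites):

    from kedro.pipeline import node
    from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline

    def identity(arg):
        return arg

    # Before: the tests constructed a ``Pipeline`` object directly.
    #     from kedro.pipeline import Pipeline
    #     test_pipeline = Pipeline([node(identity, "input", "output")])
    # After: the tests call the ``pipeline`` factory, for symmetry with main kedro.
    test_pipeline = modular_pipeline([node(identity, "input", "output")])
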
diff --git a/kedro-airflow/tests/test_plugin.py b/kedro-airflow/tests/test_plugin.py index 48d1fb7b0..77c051ff5 100644 --- a/kedro-airflow/tests/test_plugin.py +++ b/kedro-airflow/tests/test_plugin.py @@ -2,7 +2,8 @@ import pytest from kedro.framework.project import pipelines -from kedro.pipeline import Pipeline, node +from kedro.pipeline import node +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro_airflow.plugin import commands @@ -30,7 +31,7 @@ def test_create_airflow_dag( ): """Check the generation and validity of a simple Airflow DAG.""" dag_file = Path.cwd() / "airflow_dags" / f"{dag_name}.py" - mock_pipeline = Pipeline( + mock_pipeline = modular_pipeline( [ node(identity, ["input"], ["intermediate"], name="node0"), node(identity, ["intermediate"], ["output"], name="node1"), diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index de80c50e5..72237defd 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,5 +1,6 @@ # Upcoming Release: +* Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. * Relaxed PyArrow range in line with Pandas diff --git a/kedro-datasets/tests/spark/test_deltatable_dataset.py b/kedro-datasets/tests/spark/test_deltatable_dataset.py index fe1d49d37..5cbbe62b7 100644 --- a/kedro-datasets/tests/spark/test_deltatable_dataset.py +++ b/kedro-datasets/tests/spark/test_deltatable_dataset.py @@ -1,7 +1,8 @@ import pytest from delta import DeltaTable from kedro.io import DataCatalog, DataSetError -from kedro.pipeline import Pipeline, node +from kedro.pipeline import node +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner from pyspark.sql import SparkSession from pyspark.sql.types import IntegerType, StringType, StructField, StructType @@ -80,7 +81,7 @@ def no_output(x): delta_ds = DeltaTableDataSet(filepath="") catalog = DataCatalog(data_sets={"delta_in": delta_ds}) - pipeline = Pipeline([node(no_output, "delta_in", None)]) + pipeline = modular_pipeline([node(no_output, "delta_in", None)]) pattern = ( r"The following data sets cannot be used with " r"multiprocessing: \['delta_in'\]" diff --git a/kedro-datasets/tests/spark/test_spark_dataset.py b/kedro-datasets/tests/spark/test_spark_dataset.py index 4567d6fc9..d02f99bff 100644 --- a/kedro-datasets/tests/spark/test_spark_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_dataset.py @@ -8,7 +8,8 @@ import pytest from kedro.io import DataCatalog, DataSetError, Version from kedro.io.core import generate_timestamp -from kedro.pipeline import Pipeline, node +from kedro.pipeline import node +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner, SequentialRunner from moto import mock_s3 from pyspark.sql import SparkSession @@ -413,7 +414,7 @@ def test_exists_raises_error(self, mocker): def test_parallel_runner(self, is_async, spark_in): """Test ParallelRunner with SparkDataSet fails.""" catalog = DataCatalog(data_sets={"spark_in": spark_in}) - pipeline = Pipeline([node(identity, "spark_in", "spark_out")]) + pipeline = modular_pipeline([node(identity, "spark_in", "spark_out")]) pattern = ( r"The following data sets cannot be used with " r"multiprocessing: \['spark_in'\]" @@ -949,7 +950,7 @@ def data_catalog(tmp_path): class TestDataFlowSequentialRunner: def test_spark_load_save(self, is_async, data_catalog): """SparkDataSet(load) -> node -> Spark (save).""" - 
pipeline = Pipeline([node(identity, "spark_in", "spark_out")]) + pipeline = modular_pipeline([node(identity, "spark_in", "spark_out")]) SequentialRunner(is_async=is_async).run(pipeline, data_catalog) save_path = Path(data_catalog._data_sets["spark_out"]._filepath.as_posix()) @@ -958,7 +959,7 @@ def test_spark_load_save(self, is_async, data_catalog): def test_spark_pickle(self, is_async, data_catalog): """SparkDataSet(load) -> node -> PickleDataSet (save)""" - pipeline = Pipeline([node(identity, "spark_in", "pickle_ds")]) + pipeline = modular_pipeline([node(identity, "spark_in", "pickle_ds")]) pattern = ".* was not serialised due to.*" with pytest.raises(DataSetError, match=pattern): SequentialRunner(is_async=is_async).run(pipeline, data_catalog) @@ -966,7 +967,7 @@ def test_spark_pickle(self, is_async, data_catalog): def test_spark_memory_spark(self, is_async, data_catalog): """SparkDataSet(load) -> node -> MemoryDataSet (save and then load) -> node -> SparkDataSet (save)""" - pipeline = Pipeline( + pipeline = modular_pipeline( [ node(identity, "spark_in", "memory_ds"), node(identity, "memory_ds", "spark_out"), diff --git a/kedro-telemetry/RELEASE.md b/kedro-telemetry/RELEASE.md index 595b27147..c8892d6bf 100644 --- a/kedro-telemetry/RELEASE.md +++ b/kedro-telemetry/RELEASE.md @@ -1,4 +1,5 @@ # Upcoming Release 0.2.4 +* Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. # Release 0.2.3 diff --git a/kedro-telemetry/tests/test_plugin.py b/kedro-telemetry/tests/test_plugin.py index 0da62eb62..26ed0be6e 100644 --- a/kedro-telemetry/tests/test_plugin.py +++ b/kedro-telemetry/tests/test_plugin.py @@ -7,7 +7,8 @@ from kedro.framework.project import pipelines from kedro.framework.startup import ProjectMetadata from kedro.io import DataCatalog, MemoryDataSet -from kedro.pipeline import node, pipeline +from kedro.pipeline import node +from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from pytest import fixture from kedro_telemetry import __version__ as TELEMETRY_VERSION @@ -55,7 +56,7 @@ def identity(arg): @fixture def fake_default_pipeline(): - mock_default_pipeline = pipeline( + mock_default_pipeline = modular_pipeline( [ node(identity, ["input"], ["intermediate"], name="node0"), node(identity, ["intermediate"], ["output"], name="node1"), @@ -66,7 +67,7 @@ def fake_default_pipeline(): @fixture def fake_sub_pipeline(): - mock_sub_pipeline = pipeline( + mock_sub_pipeline = modular_pipeline( [ node(identity, ["input"], ["intermediate"], name="node0"), ], From c0724c602921c4bc4d81d6d7d48cbedb95801ad6 Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Wed, 1 Feb 2023 10:32:19 +0000 Subject: [PATCH 03/74] Fix outdated links in Kedro Datasets (#111) * fix links * fix dill links Signed-off-by: Danny Farah --- kedro-datasets/kedro_datasets/pickle/pickle_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/redis/redis_dataset.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index 522950308..b4d3f20b7 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -105,7 +105,7 @@ def __init__( You can pass in arguments that the backend load function specified accepts, e.g: pickle.load: https://docs.python.org/3/library/pickle.html#pickle.load joblib.load: 
https://joblib.readthedocs.io/en/latest/generated/joblib.load.html - dill.load: https://dill.readthedocs.io/en/latest/dill.html#dill._dill.load + dill.load: https://dill.readthedocs.io/en/latest/index.html#dill.load compress_pickle.load: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.load All defaults are preserved. @@ -113,7 +113,7 @@ def __init__( You can pass in arguments that the backend dump function specified accepts, e.g: pickle.dump: https://docs.python.org/3/library/pickle.html#pickle.dump joblib.dump: https://joblib.readthedocs.io/en/latest/generated/joblib.dump.html - dill.dump: https://dill.readthedocs.io/en/latest/dill.html#dill._dill.dump + dill.dump: https://dill.readthedocs.io/en/latest/index.html#dill.dump compress_pickle.dump: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.dump All defaults are preserved. diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index 49ad02dce..8263cede2 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -87,14 +87,14 @@ def __init__( load_args: Pickle options for loading pickle files. You can pass in arguments that the backend load function specified accepts, e.g: pickle.loads: https://docs.python.org/3/library/pickle.html#pickle.loads - dill.loads: https://dill.readthedocs.io/en/latest/dill.html#dill._dill.loads + dill.loads: https://dill.readthedocs.io/en/latest/index.html#dill.loads compress_pickle.loads: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.loads All defaults are preserved. save_args: Pickle options for saving pickle files. You can pass in arguments that the backend dump function specified accepts, e.g: pickle.dumps: https://docs.python.org/3/library/pickle.html#pickle.dump - dill.dumps: https://dill.readthedocs.io/en/latest/dill.html#dill._dill.dumps + dill.dumps: https://dill.readthedocs.io/en/latest/index.html#dill.dumps compress_pickle.dumps: https://lucianopaz.github.io/compress_pickle/html/api/compress_pickle.html#compress_pickle.compress_pickle.dumps All defaults are preserved. 
From 227a5dfcf905c2d78e229784f019d0436ff1a49a Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Wed, 1 Feb 2023 11:59:07 -0500 Subject: [PATCH 04/74] Fix docs formatting and phrasing for some datasets (#107) * Fix docs formatting and phrasing for some datasets Signed-off-by: Deepyaman Datta * Manually fix files not resolved with patch command Signed-off-by: Deepyaman Datta * Apply fix from #98 Signed-off-by: Deepyaman Datta --------- Signed-off-by: Deepyaman Datta Signed-off-by: Danny Farah --- .../kedro_datasets/api/api_dataset.py | 35 +++-- .../kedro_datasets/dask/parquet_dataset.py | 131 +++++++++--------- .../kedro_datasets/json/json_dataset.py | 27 ++-- .../matplotlib/matplotlib_writer.py | 26 ++-- .../kedro_datasets/pandas/csv_dataset.py | 43 +++--- .../kedro_datasets/pandas/excel_dataset.py | 60 ++++---- .../kedro_datasets/pandas/feather_dataset.py | 37 +++-- .../kedro_datasets/pandas/gbq_dataset.py | 31 +++-- .../kedro_datasets/pandas/generic_dataset.py | 50 +++---- .../kedro_datasets/pandas/hdf_dataset.py | 18 +-- .../kedro_datasets/pandas/json_dataset.py | 28 ++-- .../kedro_datasets/pandas/parquet_dataset.py | 52 +++---- .../kedro_datasets/pandas/sql_dataset.py | 69 ++++----- .../kedro_datasets/pandas/xml_dataset.py | 4 +- .../kedro_datasets/pickle/pickle_dataset.py | 34 ++--- .../kedro_datasets/pillow/image_dataset.py | 4 +- .../kedro_datasets/plotly/json_dataset.py | 18 ++- .../kedro_datasets/plotly/plotly_dataset.py | 36 ++--- .../kedro_datasets/redis/redis_dataset.py | 38 ++--- .../spark/deltatable_dataset.py | 78 ++++++----- .../kedro_datasets/spark/spark_dataset.py | 64 ++++----- .../spark/spark_hive_dataset.py | 20 +-- .../spark/spark_jdbc_dataset.py | 33 ++--- .../svmlight/svmlight_dataset.py | 46 +++--- .../tensorflow/tensorflow_model_dataset.py | 30 ++-- .../kedro_datasets/text/text_dataset.py | 14 +- .../kedro_datasets/tracking/json_dataset.py | 17 +-- .../tracking/metrics_dataset.py | 17 +-- .../kedro_datasets/video/video_dataset.py | 29 ++-- .../kedro_datasets/yaml/yaml_dataset.py | 17 +-- 30 files changed, 580 insertions(+), 526 deletions(-) diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index 2d175f26e..93e39fb51 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -12,27 +12,26 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): """``APIDataSet`` loads the data from HTTP(S) APIs. It uses the python requests library: https://requests.readthedocs.io/en/latest/ - Example adding a catalog entry with - `YAML API - `_: + Example usage for the + `YAML API `_: .. 
code-block:: yaml - >>> usda: - >>> type: api.APIDataSet - >>> url: https://quickstats.nass.usda.gov - >>> params: - >>> key: SOME_TOKEN, - >>> format: JSON, - >>> commodity_desc: CORN, - >>> statisticcat_des: YIELD, - >>> agg_level_desc: STATE, - >>> year: 2000 - >>> - - - Example using Python API: + usda: + type: api.APIDataSet + url: https://quickstats.nass.usda.gov + params: + key: SOME_TOKEN, + format: JSON, + commodity_desc: CORN, + statisticcat_des: YIELD, + agg_level_desc: STATE, + year: 2000 + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.api import APIDataSet diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index 01624a44b..9161fa4e6 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -15,73 +15,72 @@ class ParquetDataSet(AbstractDataSet[dd.DataFrame, dd.DataFrame]): remote data services to handle the corresponding load and save operations: https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html - Example adding a catalog entry with - `YAML API - `_: - - .. code-block:: yaml - - >>> cars: - >>> type: dask.ParquetDataSet - >>> filepath: s3://bucket_name/path/to/folder - >>> save_args: - >>> compression: GZIP - >>> credentials: - >>> client_kwargs: - >>> aws_access_key_id: YOUR_KEY - >>> aws_secret_access_key: YOUR_SECRET + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + cars: + type: dask.ParquetDataSet + filepath: s3://bucket_name/path/to/folder + save_args: + compression: GZIP + credentials: + client_kwargs: + aws_access_key_id: YOUR_KEY + aws_secret_access_key: YOUR_SECRET + + Example usage for the + `Python API `_: + :: + + >>> from kedro.extras.datasets.dask import ParquetDataSet + >>> import pandas as pd + >>> import dask.dataframe as dd >>> - - - Example using Python API (AWS S3): - :: - - >>> from kedro_datasets.dask import ParquetDataSet - >>> import pandas as pd - >>> import dask.dataframe as dd - >>> - >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - >>> 'col3': [[5, 6], [7, 8]]}) - >>> ddf = dd.from_pandas(data, npartitions=2) - >>> - >>> data_set = ParquetDataSet( - >>> filepath="s3://bucket_name/path/to/folder", - >>> credentials={ - >>> 'client_kwargs':{ - >>> 'aws_access_key_id': 'YOUR_KEY', - >>> 'aws_secret_access_key': 'YOUR SECRET', - >>> } - >>> }, - >>> save_args={"compression": "GZIP"} - >>> ) - >>> data_set.save(ddf) - >>> reloaded = data_set.load() - >>> - >>> assert ddf.compute().equals(reloaded.compute()) - - The output schema can also be explicitly specified using Triad's grammar. - This is processed to map specific columns into pyarrow field types or schema. - - References: - https://triad.readthedocs.io/en/latest/api/triad.collections.html#module-triad.collections.schema - https://arrow.apache.org/docs/python/api/datatypes.html - - .. 
code-block:: yaml - - >>> parquet_dataset: - >>> type: dask.ParquetDataSet - >>> filepath: "s3://bucket_name/path/to/folder" - >>> credentials: - >>> client_kwargs: - >>> aws_access_key_id: YOUR_KEY - >>> aws_secret_access_key: "YOUR SECRET" - >>> save_args: - >>> compression: GZIP - >>> schema: - >>> col1: [int32] - >>> col2: [int32] - >>> col3: [[int32]] + >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], + >>> 'col3': [[5, 6], [7, 8]]}) + >>> ddf = dd.from_pandas(data, npartitions=2) + >>> + >>> data_set = ParquetDataSet( + >>> filepath="s3://bucket_name/path/to/folder", + >>> credentials={ + >>> 'client_kwargs':{ + >>> 'aws_access_key_id': 'YOUR_KEY', + >>> 'aws_secret_access_key': 'YOUR SECRET', + >>> } + >>> }, + >>> save_args={"compression": "GZIP"} + >>> ) + >>> data_set.save(ddf) + >>> reloaded = data_set.load() + >>> + >>> assert ddf.compute().equals(reloaded.compute()) + + The output schema can also be explicitly specified using + `Triad `_. + This is processed to map specific columns to + `PyArrow field types `_ or schema. For instance: + + .. code-block:: yaml + + parquet_dataset: + type: dask.ParquetDataSet + filepath: "s3://bucket_name/path/to/folder" + credentials: + client_kwargs: + aws_access_key_id: YOUR_KEY + aws_secret_access_key: "YOUR SECRET" + save_args: + compression: GZIP + schema: + col1: [int32] + col2: [int32] + col3: [[int32]] """ DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index 89b7081cd..73268b223 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -20,22 +20,21 @@ class JSONDataSet(AbstractVersionedDataSet[Any, Any]): """``JSONDataSet`` loads/saves data from/to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. - Example adding a catalog entry with the ``YAML API``: - + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> json_dataset: - >>> type: json.JSONDataSet - >>> filepath: data/01_raw/location.json - >>> - >>> cars: - >>> type: json.JSONDataSet - >>> filepath: gcs://your_bucket/cars.json - >>> fs_args: - >>> project: my-project - >>> credentials: my_gcp_credentials - - Example using Python API: + cars: + type: json.JSONDataSet + filepath: gcs://your_bucket/cars.json + fs_args: + project: my-project + credentials: my_gcp_credentials + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.json import JSONDataSet diff --git a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py index 0a6163a23..5757b08ab 100644 --- a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py +++ b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py @@ -26,21 +26,21 @@ class MatplotlibWriter( """``MatplotlibWriter`` saves one or more Matplotlib objects as image files to an underlying filesystem (e.g. local, S3, GCS). - Example adding a catalog entry with the `YAML API - `_: + Example usage for the + `YAML API `_: .. 
code-block:: yaml - >>> output_plot: - >>> type: matplotlib.MatplotlibWriter - >>> filepath: data/08_reporting/output_plot.png - >>> save_args: - >>> format: png - >>> - - Example using the Python API: + output_plot: + type: matplotlib.MatplotlibWriter + filepath: data/08_reporting/output_plot.png + save_args: + format: png + Example usage for the + `Python API `_: :: >>> import matplotlib.pyplot as plt @@ -55,7 +55,6 @@ class MatplotlibWriter( >>> plot_writer.save(fig) Example saving a plot as a PDF file: - :: >>> import matplotlib.pyplot as plt @@ -70,9 +69,7 @@ class MatplotlibWriter( >>> plt.close() >>> pdf_plot_writer.save(fig) - Example saving multiple plots in a folder, using a dictionary: - :: >>> import matplotlib.pyplot as plt @@ -90,7 +87,6 @@ class MatplotlibWriter( >>> dict_plot_writer.save(plots_dict) Example saving multiple plots in a folder, using a list: - :: >>> import matplotlib.pyplot as plt diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 41bc27c9d..2a6366bd0 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -25,30 +25,31 @@ class CSVDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``CSVDataSet`` loads/saves data from/to a CSV file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the CSV file. - Example adding a catalog entry with - `YAML API - `_: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> cars: - >>> type: pandas.CSVDataSet - >>> filepath: data/01_raw/company/cars.csv - >>> load_args: - >>> sep: "," - >>> na_values: ["#NA", NA] - >>> save_args: - >>> index: False - >>> date_format: "%Y-%m-%d %H:%M" - >>> decimal: . - >>> - >>> motorbikes: - >>> type: pandas.CSVDataSet - >>> filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.csv - >>> credentials: dev_s3 - - Example using Python API: + cars: + type: pandas.CSVDataSet + filepath: data/01_raw/company/cars.csv + load_args: + sep: "," + na_values: ["#NA", NA] + save_args: + index: False + date_format: "%Y-%m-%d %H:%M" + decimal: . + + motorbikes: + type: pandas.CSVDataSet + filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.csv + credentials: dev_s3 + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import CSVDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py index ec072d7c4..aec96c6ed 100644 --- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py @@ -30,26 +30,30 @@ class ExcelDataSet( """``ExcelDataSet`` loads/saves data from/to a Excel file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Excel file. - Example adding a catalog entry with the ``YAML API``: + Example usage for the + `YAML API `_: .. 
code-block:: yaml - >>> rockets: - >>> type: pandas.ExcelDataSet - >>> filepath: gcs://your_bucket/rockets.xlsx - >>> fs_args: - >>> project: my-project - >>> credentials: my_gcp_credentials - >>> save_args: - >>> sheet_name: Sheet1 - >>> load_args: - >>> sheet_name: Sheet1 - >>> - >>> shuttles: - >>> type: pandas.ExcelDataSet - >>> filepath: data/01_raw/shuttles.xlsx - - Example using Python API: + rockets: + type: pandas.ExcelDataSet + filepath: gcs://your_bucket/rockets.xlsx + fs_args: + project: my-project + credentials: my_gcp_credentials + save_args: + sheet_name: Sheet1 + load_args: + sheet_name: Sheet1 + + shuttles: + type: pandas.ExcelDataSet + filepath: data/01_raw/shuttles.xlsx + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import ExcelDataSet @@ -63,21 +67,27 @@ class ExcelDataSet( >>> reloaded = data_set.load() >>> assert data.equals(reloaded) - Note: To save a multi-sheet Excel file, no special ``save_args`` are required. + To save a multi-sheet Excel file, no special ``save_args`` are required. Instead, return a dictionary of ``Dict[str, pd.DataFrame]`` where the string keys are your sheet names. - Example adding a catalog entry for multi-sheet Excel file with the ``YAML API``: + Example usage for the + `YAML API `_ + for a multi-sheet Excel file: .. code-block:: yaml - >>> trains: - >>> type: pandas.ExcelDataSet - >>> filepath: data/02_intermediate/company/trains.xlsx - >>> load_args: - >>> sheet_name: [Sheet1, Sheet2, Sheet3] + trains: + type: pandas.ExcelDataSet + filepath: data/02_intermediate/company/trains.xlsx + load_args: + sheet_name: [Sheet1, Sheet2, Sheet3] - Example multi-sheet Excel file using Python API: + Example usage for the + `Python API `_ + for a multi-sheet Excel file: :: >>> from kedro_datasets.pandas import ExcelDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index a6dfa3ca8..9dc56b2b5 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -27,28 +27,27 @@ class FeatherDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): is supported by pandas, so it supports all allowed pandas options for loading and saving csv files. - Example adding a catalog entry with - `YAML API - `_: + Example usage for the + `YAML API `_: .. 
code-block:: yaml - >>> cars: - >>> type: pandas.FeatherDataSet - >>> filepath: data/01_raw/company/cars.feather - >>> load_args: - >>> columns: ['col1', 'col2', 'col3'] - >>> use_threads: True - >>> - >>> motorbikes: - >>> type: pandas.FeatherDataSet - >>> filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.feather - >>> credentials: dev_s3 - >>> - - - Example using Python API: + cars: + type: pandas.FeatherDataSet + filepath: data/01_raw/company/cars.feather + load_args: + columns: ['col1', 'col2', 'col3'] + use_threads: True + + motorbikes: + type: pandas.FeatherDataSet + filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.feather + credentials: dev_s3 + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import FeatherDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index 4a9464c5c..02dc31002 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -24,25 +24,26 @@ class GBQTableDataSet(AbstractDataSet[None, pd.DataFrame]): """``GBQTableDataSet`` loads and saves data from/to Google BigQuery. It uses pandas-gbq to read and write from/to BigQuery table. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> vehicles: - >>> type: pandas.GBQTableDataSet - >>> dataset: big_query_dataset - >>> table_name: big_query_table - >>> project: my-project - >>> credentials: gbq-creds - >>> load_args: - >>> reauth: True - >>> save_args: - >>> chunk_size: 100 - - - Example using Python API: + vehicles: + type: pandas.GBQTableDataSet + dataset: big_query_dataset + table_name: big_query_table + project: my-project + credentials: gbq-creds + load_args: + reauth: True + save_args: + chunk_size: 100 + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import GBQTableDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index 117d99015..08717fbb3 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -33,37 +33,39 @@ class GenericDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): filesystem (e.g.: local, S3, GCS). It uses pandas to dynamically select the appropriate type of read/write target on a best effort basis. - Example using `YAML API - `_: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> cars: - >>> type: pandas.GenericDataSet - >>> file_format: csv - >>> filepath: s3://data/01_raw/company/cars.csv - >>> load_args: - >>> sep: "," - >>> na_values: ["#NA", NA] - >>> save_args: - >>> index: False - >>> date_format: "%Y-%m-%d" - - This second example is able to load a SAS7BDAT file via the :code:`pd.read_sas` method. - Trying to save this dataset will raise a `DataSetError` since pandas does not provide an - equivalent :code:`pd.DataFrame.to_sas` write method. + cars: + type: pandas.GenericDataSet + file_format: csv + filepath: s3://data/01_raw/company/cars.csv + load_args: + sep: "," + na_values: ["#NA", NA] + save_args: + index: False + date_format: "%Y-%m-%d" + + This second example is able to load a SAS7BDAT file via the ``pd.read_sas`` method. 
+ Trying to save this dataset will raise a ``DataSetError`` since pandas does not provide an + equivalent ``pd.DataFrame.to_sas`` write method. .. code-block:: yaml - >>> flights: - >>> type: pandas.GenericDataSet - >>> file_format: sas - >>> filepath: data/01_raw/airplanes.sas7bdat - >>> load_args: - >>> format: sas7bdat + flights: + type: pandas.GenericDataSet + file_format: sas + filepath: data/01_raw/airplanes.sas7bdat + load_args: + format: sas7bdat - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import GenericDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index b790e6529..bf43a883e 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -23,19 +23,21 @@ class HDFDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``HDFDataSet`` loads/saves data from/to a hdf file using an underlying filesystem (e.g. local, S3, GCS). It uses pandas.HDFStore to handle the hdf file. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> hdf_dataset: - >>> type: pandas.HDFDataSet - >>> filepath: s3://my_bucket/raw/sensor_reading.h5 - >>> credentials: aws_s3_creds - >>> key: data + hdf_dataset: + type: pandas.HDFDataSet + filepath: s3://my_bucket/raw/sensor_reading.h5 + credentials: aws_s3_creds + key: data - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import HDFDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index 9c44fb502..cea0b985d 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -25,24 +25,26 @@ class JSONDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``JSONDataSet`` loads/saves data from/to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the json file. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> clickstream_dataset: - >>> type: pandas.JSONDataSet - >>> filepath: abfs://landing_area/primary/click_stream.json - >>> credentials: abfs_creds - >>> - >>> json_dataset: - >>> type: pandas.JSONDataSet - >>> filepath: data/01_raw/Video_Games.json - >>> load_args: - >>> lines: True + clickstream_dataset: + type: pandas.JSONDataSet + filepath: abfs://landing_area/primary/click_stream.json + credentials: abfs_creds + + json_dataset: + type: pandas.JSONDataSet + filepath: data/01_raw/Video_Games.json + load_args: + lines: True - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import JSONDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index 2352c2fd7..d0acdc5d1 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -25,35 +25,37 @@ class ParquetDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``ParquetDataSet`` loads/saves data from/to a Parquet file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Parquet file. 
- Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> boats: - >>> type: pandas.ParquetDataSet - >>> filepath: data/01_raw/boats.parquet - >>> load_args: - >>> engine: pyarrow - >>> use_nullable_dtypes: True - >>> save_args: - >>> file_scheme: hive - >>> has_nulls: False - >>> engine: pyarrow - >>> - >>> trucks: - >>> type: pandas.ParquetDataSet - >>> filepath: abfs://container/02_intermediate/trucks.parquet - >>> credentials: dev_abs - >>> load_args: - >>> columns: [name, gear, disp, wt] - >>> index: name - >>> save_args: - >>> compression: GZIP - >>> partition_on: [name] - - Example using Python API: + boats: + type: pandas.ParquetDataSet + filepath: data/01_raw/boats.parquet + load_args: + engine: pyarrow + use_nullable_dtypes: True + save_args: + file_scheme: hive + has_nulls: False + engine: pyarrow + + trucks: + type: pandas.ParquetDataSet + filepath: abfs://container/02_intermediate/trucks.parquet + credentials: dev_abs + load_args: + columns: [name, gear, disp, wt] + index: name + save_args: + compression: GZIP + partition_on: [name] + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import ParquetDataSet diff --git a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py index 4de537812..400195719 100644 --- a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py @@ -101,30 +101,32 @@ class SQLTableDataSet(AbstractDataSet[pd.DataFrame, pd.DataFrame]): the data with no index. This is designed to make load and save methods symmetric. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> shuttles_table_dataset: - >>> type: pandas.SQLTableDataSet - >>> credentials: db_credentials - >>> table_name: shuttles - >>> load_args: - >>> schema: dwschema - >>> save_args: - >>> schema: dwschema - >>> if_exists: replace + shuttles_table_dataset: + type: pandas.SQLTableDataSet + credentials: db_credentials + table_name: shuttles + load_args: + schema: dwschema + save_args: + schema: dwschema + if_exists: replace Sample database credentials entry in ``credentials.yml``: .. code-block:: yaml - >>> db_credentials: - >>> con: postgresql://scott:tiger@localhost/test + db_credentials: + con: postgresql://scott:tiger@localhost/test - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import SQLTableDataSet @@ -270,38 +272,40 @@ class SQLQueryDataSet(AbstractDataSet[None, pd.DataFrame]): To save data to a SQL server use ``SQLTableDataSet``. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> shuttle_id_dataset: - >>> type: pandas.SQLQueryDataSet - >>> sql: "select shuttle, shuttle_id from spaceflights.shuttles;" - >>> credentials: db_credentials + shuttle_id_dataset: + type: pandas.SQLQueryDataSet + sql: "select shuttle, shuttle_id from spaceflights.shuttles;" + credentials: db_credentials - Advanced example using the `stream_results` and `chunksize` option to reduce memory usage + Advanced example using the ``stream_results`` and ``chunksize`` options to reduce memory usage: .. 
code-block:: yaml - >>> shuttle_id_dataset: - >>> type: pandas.SQLQueryDataSet - >>> sql: "select shuttle, shuttle_id from spaceflights.shuttles;" - >>> credentials: db_credentials - >>> execution_options: - >>> stream_results: true - >>> load_args: - >>> chunksize: 1000 + shuttle_id_dataset: + type: pandas.SQLQueryDataSet + sql: "select shuttle, shuttle_id from spaceflights.shuttles;" + credentials: db_credentials + execution_options: + stream_results: true + load_args: + chunksize: 1000 Sample database credentials entry in ``credentials.yml``: .. code-block:: yaml - >>> db_credentials: - >>> con: postgresql://scott:tiger@localhost/test + db_credentials: + con: postgresql://scott:tiger@localhost/test - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import SQLQueryDataSet @@ -317,7 +321,6 @@ class SQLQueryDataSet(AbstractDataSet[None, pd.DataFrame]): >>> credentials=credentials) >>> >>> sql_data = data_set.load() - >>> """ diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index 64a3f9541..5760268a7 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -25,7 +25,9 @@ class XMLDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """``XMLDataSet`` loads/saves data from/to a XML file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the XML file. - Example: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pandas import XMLDataSet diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index b4d3f20b7..611865078 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -24,26 +24,28 @@ class PickleDataSet(AbstractVersionedDataSet[Any, Any]): the specified backend library passed in (defaults to the ``pickle`` library), so it supports all allowed options for loading and saving pickle files. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. 
code-block:: yaml - >>> test_model: # simple example without compression - >>> type: pickle.PickleDataSet - >>> filepath: data/07_model_output/test_model.pkl - >>> backend: pickle - >>> - >>> final_model: # example with load and save args - >>> type: pickle.PickleDataSet - >>> filepath: s3://your_bucket/final_model.pkl.lz4 - >>> backend: joblib - >>> credentials: s3_credentials - >>> save_args: - >>> compress: lz4 - - Example using Python API: + test_model: # simple example without compression + type: pickle.PickleDataSet + filepath: data/07_model_output/test_model.pkl + backend: pickle + + final_model: # example with load and save args + type: pickle.PickleDataSet + filepath: s3://your_bucket/final_model.pkl.lz4 + backend: joblib + credentials: s3_credentials + save_args: + compress: lz4 + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pickle import PickleDataSet diff --git a/kedro-datasets/kedro_datasets/pillow/image_dataset.py b/kedro-datasets/kedro_datasets/pillow/image_dataset.py index 71f6eb974..8c2fdc983 100644 --- a/kedro-datasets/kedro_datasets/pillow/image_dataset.py +++ b/kedro-datasets/kedro_datasets/pillow/image_dataset.py @@ -20,7 +20,9 @@ class ImageDataSet(AbstractVersionedDataSet[Image.Image, Image.Image]): """``ImageDataSet`` loads/saves image data as `numpy` from an underlying filesystem (e.g.: local, S3, GCS). It uses Pillow to handle image file. - Example: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.pillow import ImageDataSet diff --git a/kedro-datasets/kedro_datasets/plotly/json_dataset.py b/kedro-datasets/kedro_datasets/plotly/json_dataset.py index 528751086..7eaae8da9 100644 --- a/kedro-datasets/kedro_datasets/plotly/json_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/json_dataset.py @@ -22,17 +22,21 @@ class JSONDataSet( """``JSONDataSet`` loads/saves a plotly figure from/to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). - Example adding a catalog entry with YAML API: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> scatter_plot: - >>> type: plotly.JSONDataSet - >>> filepath: data/08_reporting/scatter_plot.json - >>> save_args: - >>> engine: auto + scatter_plot: + type: plotly.JSONDataSet + filepath: data/08_reporting/scatter_plot.json + save_args: + engine: auto - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.plotly import JSONDataSet diff --git a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py index 4325d105c..1bb0acef6 100644 --- a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py @@ -21,25 +21,29 @@ class PlotlyDataSet(JSONDataSet): ``PlotlyDataSet`` is a convenience wrapper for ``plotly.JSONDataSet``. It generates the JSON file directly from a pandas DataFrame through ``plotly_args``. - Example adding a catalog entry with YAML API: + Example usage for the + `YAML API `_: .. 
code-block:: yaml - >>> bar_plot: - >>> type: plotly.PlotlyDataSet - >>> filepath: data/08_reporting/bar_plot.json - >>> plotly_args: - >>> type: bar - >>> fig: - >>> x: features - >>> y: importance - >>> orientation: h - >>> layout: - >>> xaxis_title: x - >>> yaxis_title: y - >>> title: Title - - Example using Python API: + bar_plot: + type: plotly.PlotlyDataSet + filepath: data/08_reporting/bar_plot.json + plotly_args: + type: bar + fig: + x: features + y: importance + orientation: h + layout: + xaxis_title: x + yaxis_title: y + title: Title + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.plotly import PlotlyDataSet diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index 8263cede2..f012f0cd7 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -17,28 +17,30 @@ class PickleDataSet(AbstractDataSet[Any, Any]): all allowed options for instantiating the redis app ``from_url`` and setting a value. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> my_python_object: # simple example - >>> type: redis.PickleDataSet - >>> key: my_object - >>> from_url_args: - >>> url: redis://127.0.0.1:6379 - >>> - >>> final_python_object: # example with save args - >>> type: redis.PickleDataSet - >>> key: my_final_object - >>> from_url_args: - >>> url: redis://127.0.0.1:6379 - >>> db: 1 - >>> save_args: - >>> ex: 10 - - Example using Python API: + my_python_object: # simple example + type: redis.PickleDataSet + key: my_object + from_url_args: + url: redis://127.0.0.1:6379 + + final_python_object: # example with save args + type: redis.PickleDataSet + key: my_final_object + from_url_args: + url: redis://127.0.0.1:6379 + db: 1 + save_args: + ex: 10 + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.redis import PickleDataSet diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index eaa593e87..db45bc12c 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -15,44 +15,46 @@ class DeltaTableDataSet(AbstractDataSet[None, DeltaTable]): """``DeltaTableDataSet`` loads data into DeltaTable objects. - Example adding a catalog entry with - `YAML API `_: - - .. 
code-block:: yaml - - >>> weather@spark: - >>> type: spark.SparkDataSet - >>> filepath: data/02_intermediate/data.parquet - >>> file_format: "delta" - >>> - >>> weather@delta: - >>> type: spark.DeltaTableDataSet - >>> filepath: data/02_intermediate/data.parquet - - Example using Python API: - :: - - >>> from pyspark.sql import SparkSession - >>> from pyspark.sql.types import (StructField, StringType, - >>> IntegerType, StructType) - >>> - >>> from kedro_datasets.spark import DeltaTableDataSet, SparkDataSet - >>> - >>> schema = StructType([StructField("name", StringType(), True), - >>> StructField("age", IntegerType(), True)]) - >>> - >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] - >>> - >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) - >>> - >>> data_set = SparkDataSet(filepath="test_data", file_format="delta") - >>> data_set.save(spark_df) - >>> deltatable_dataset = DeltaTableDataSet(filepath="test_data") - >>> delta_table = deltatable_dataset.load() - >>> - >>> delta_table.update() - """ + Example usage for the + `YAML API `_: + + .. code-block:: yaml + + weather@spark: + type: spark.SparkDataSet + filepath: data/02_intermediate/data.parquet + file_format: "delta" + + weather@delta: + type: spark.DeltaTableDataSet + filepath: data/02_intermediate/data.parquet + + Example usage for the + `Python API `_: + :: + + >>> from pyspark.sql import SparkSession + >>> from pyspark.sql.types import (StructField, StringType, + >>> IntegerType, StructType) + >>> + >>> from kedro.extras.datasets.spark import DeltaTableDataSet, SparkDataSet + >>> + >>> schema = StructType([StructField("name", StringType(), True), + >>> StructField("age", IntegerType(), True)]) + >>> + >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + >>> + >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) + >>> + >>> data_set = SparkDataSet(filepath="test_data", file_format="delta") + >>> data_set.save(spark_df) + >>> deltatable_dataset = DeltaTableDataSet(filepath="test_data") + >>> delta_table = deltatable_dataset.load() + >>> + >>> delta_table.update() + """ # this dataset cannot be used with ``ParallelRunner``, # therefore it has the attribute ``_SINGLE_PROCESS = True`` diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index 7a2b54eef..2250ae337 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -159,41 +159,43 @@ def hdfs_glob(self, pattern: str) -> List[str]: class SparkDataSet(AbstractVersionedDataSet[DataFrame, DataFrame]): """``SparkDataSet`` loads and saves Spark dataframes. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. 
code-block:: yaml - >>> weather: - >>> type: spark.SparkDataSet - >>> filepath: s3a://your_bucket/data/01_raw/weather/* - >>> file_format: csv - >>> load_args: - >>> header: True - >>> inferSchema: True - >>> save_args: - >>> sep: '|' - >>> header: True - >>> - >>> weather_schema: - >>> type: spark.SparkDataSet - >>> filepath: s3a://your_bucket/data/01_raw/weather/* - >>> file_format: csv - >>> load_args: - >>> header: True - >>> schema: - >>> filepath: path/to/schema.json - >>> save_args: - >>> sep: '|' - >>> header: True - >>> - >>> weather_cleaned: - >>> type: spark.SparkDataSet - >>> filepath: data/02_intermediate/data.parquet - >>> file_format: parquet - - Example using Python API: + weather: + type: spark.SparkDataSet + filepath: s3a://your_bucket/data/01_raw/weather/* + file_format: csv + load_args: + header: True + inferSchema: True + save_args: + sep: '|' + header: True + + weather_with_schema: + type: spark.SparkDataSet + filepath: s3a://your_bucket/data/01_raw/weather/* + file_format: csv + load_args: + header: True + schema: + filepath: path/to/schema.json + save_args: + sep: '|' + header: True + + weather_cleaned: + type: spark.SparkDataSet + filepath: data/02_intermediate/data.parquet + file_format: parquet + + Example usage for the + `Python API `_: :: >>> from pyspark.sql import SparkSession diff --git a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py index c4cb80bf2..613b6af5f 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py @@ -18,25 +18,29 @@ class SparkHiveDataSet(AbstractDataSet[DataFrame, DataFrame]): of the existing file/partition. This DataSet has some key assumptions: + - Schemas do not change during the pipeline run (defined PKs must be present for the duration of the pipeline) - Tables are not being externally modified during upserts. The upsert method is NOT ATOMIC + to external changes to the target table while executing. Upsert methodology works by leveraging Spark DataFrame execution plan checkpointing. - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. code-block:: yaml - >>> hive_dataset: - >>> type: spark.SparkHiveDataSet - >>> database: hive_database - >>> table: table_name - >>> write_mode: overwrite + hive_dataset: + type: spark.SparkHiveDataSet + database: hive_database + table: table_name + write_mode: overwrite - Example using Python API: + Example usage for the + `Python API `_: :: >>> from pyspark.sql import SparkSession diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index 9567d3e73..24bb3220a 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -16,26 +16,27 @@ class SparkJDBCDataSet(AbstractDataSet[DataFrame, DataFrame]): ``pyspark.sql.DataFrameReader`` and ``pyspark.sql.DataFrameWriter`` internally, so it supports all allowed PySpark options on ``jdbc``. - - Example adding a catalog entry with + Example usage for the `YAML API `_: + data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_: .. 
code-block:: yaml - >>> weather: - >>> type: spark.SparkJDBCDataSet - >>> table: weather_table - >>> url: jdbc:postgresql://localhost/test - >>> credentials: db_credentials - >>> load_args: - >>> properties: - >>> driver: org.postgresql.Driver - >>> save_args: - >>> properties: - >>> driver: org.postgresql.Driver - - Example using Python API: + weather: + type: spark.SparkJDBCDataSet + table: weather_table + url: jdbc:postgresql://localhost/test + credentials: db_credentials + load_args: + properties: + driver: org.postgresql.Driver + save_args: + properties: + driver: org.postgresql.Driver + + Example usage for the + `Python API `_: :: >>> import pandas as pd diff --git a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py index 4763abff4..5c9e0699f 100644 --- a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py +++ b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py @@ -42,30 +42,34 @@ class SVMLightDataSet(AbstractVersionedDataSet[_DI, _DO]): This format is used as the default format for both svmlight and the libsvm command line programs. - Example adding a catalog entry with the ``YAML API``: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> svm_dataset: - >>> type: svmlight.SVMLightDataSet - >>> filepath: data/01_raw/location.svm - >>> load_args: - >>> zero_based: False - >>> save_args: - >>> zero_based: False - >>> - >>> cars: - >>> type: svmlight.SVMLightDataSet - >>> filepath: gcs://your_bucket/cars.svm - >>> fs_args: - >>> project: my-project - >>> credentials: my_gcp_credentials - >>> load_args: - >>> zero_based: False - >>> save_args: - >>> zero_based: False - - Example using Python API: + svm_dataset: + type: svmlight.SVMLightDataSet + filepath: data/01_raw/location.svm + load_args: + zero_based: False + save_args: + zero_based: False + + cars: + type: svmlight.SVMLightDataSet + filepath: gcs://your_bucket/cars.svm + fs_args: + project: my-project + credentials: my_gcp_credentials + load_args: + zero_based: False + save_args: + zero_based: False + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.svmlight import SVMLightDataSet diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index 97bf4e505..63e53b7b4 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -24,21 +24,25 @@ class TensorFlowModelDataset(AbstractVersionedDataSet[tf.keras.Model, tf.keras.M The underlying functionality is supported by, and passes input arguments through to, TensorFlow 2.X load_model and save_model methods. - .. code-block:: yaml - - >>> tensorflow_model: - >>> type: tensorflow.TensorFlowModelDataset - >>> filepath: data/06_models/tensorflow_model.h5 - >>> load_args: - >>> compile: False - >>> save_args: - >>> overwrite: True - >>> include_optimizer: False - >>> credentials: tf_creds - >>> + Example usage for the + `YAML API `_: + .. 
code-block:: yaml - Example using Python API: + tensorflow_model: + type: tensorflow.TensorFlowModelDataset + filepath: data/06_models/tensorflow_model.h5 + load_args: + compile: False + save_args: + overwrite: True + include_optimizer: False + credentials: tf_creds + + Example usage for the + `Python API `_: :: >>> from kedro_datasets.tensorflow import TensorFlowModelDataset diff --git a/kedro-datasets/kedro_datasets/text/text_dataset.py b/kedro-datasets/kedro_datasets/text/text_dataset.py index 80ddbaf55..5ba2ee060 100644 --- a/kedro-datasets/kedro_datasets/text/text_dataset.py +++ b/kedro-datasets/kedro_datasets/text/text_dataset.py @@ -19,15 +19,19 @@ class TextDataSet(AbstractVersionedDataSet[str, str]): """``TextDataSet`` loads/saves data from/to a text file using an underlying filesystem (e.g.: local, S3, GCS) - Example adding a catalog entry with YAML API: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> alice_book: - >>> type: text.TextDataSet - >>> filepath: data/01_raw/alice.txt + alice_book: + type: text.TextDataSet + filepath: data/01_raw/alice.txt - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.text import TextDataSet diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py index 9454b2cbc..4235df999 100644 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/json_dataset.py @@ -15,18 +15,19 @@ class JSONDataSet(JDS): The ``JSONDataSet`` is part of Kedro Experiment Tracking. The dataset is write-only and it is versioned by default. - Example adding a catalog entry with - `YAML API - `_: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> cars: - >>> type: tracking.JSONDataSet - >>> filepath: data/09_tracking/cars.json + cars: + type: tracking.JSONDataSet + filepath: data/09_tracking/cars.json - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.tracking import JSONDataSet diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py index f65adc7a3..7c7546a85 100644 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py @@ -17,18 +17,19 @@ class MetricsDataSet(JSONDataSet): ``MetricsDataSet`` is part of Kedro Experiment Tracking. The dataset is write-only, it is versioned by default and only takes metrics of numeric values. -Example adding a catalog entry with - `YAML API - `_: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> cars: - >>> type: tracking.MetricsDataSet - >>> filepath: data/09_tracking/cars.json + cars: + type: tracking.MetricsDataSet + filepath: data/09_tracking/cars.json - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.tracking import MetricsDataSet diff --git a/kedro-datasets/kedro_datasets/video/video_dataset.py b/kedro-datasets/kedro_datasets/video/video_dataset.py index ad550d5b3..22bd51bc5 100644 --- a/kedro-datasets/kedro_datasets/video/video_dataset.py +++ b/kedro-datasets/kedro_datasets/video/video_dataset.py @@ -195,25 +195,24 @@ class VideoDataSet(AbstractDataSet[AbstractVideo, AbstractVideo]): """``VideoDataSet`` loads / save video data from a given filepath as sequence of PIL.Image.Image using OpenCV. - Example adding a catalog entry with - `YAML API - `_: + Example usage for the + `YAML API `_: .. 
code-block:: yaml - >>> cars: - >>> type: video.VideoDataSet - >>> filepath: data/01_raw/cars.mp4 - >>> - >>> motorbikes: - >>> type: video.VideoDataSet - >>> filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.mp4 - >>> credentials: dev_s3 - >>> + cars: + type: video.VideoDataSet + filepath: data/01_raw/cars.mp4 + motorbikes: + type: video.VideoDataSet + filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.mp4 + credentials: dev_s3 - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.video import VideoDataSet @@ -241,7 +240,7 @@ class VideoDataSet(AbstractDataSet[AbstractVideo, AbstractVideo]): >>> video.save(SequenceVideo(imgs, fps=25)) - Example creating a video from numpy frames using a generator and Python API: + Example creating a video from numpy frames using a generator and the Python API: :: >>> from kedro_datasets.video.video_dataset import VideoDataSet, GeneratorVideo diff --git a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py index 5e9126e93..1ab2fa43b 100644 --- a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py +++ b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py @@ -20,18 +20,19 @@ class YAMLDataSet(AbstractVersionedDataSet[Dict, Dict]): """``YAMLDataSet`` loads/saves data from/to a YAML file using an underlying filesystem (e.g.: local, S3, GCS). It uses PyYAML to handle the YAML file. - Example adding a catalog entry with - `YAML API - `_: + Example usage for the + `YAML API `_: .. code-block:: yaml - >>> cars: - >>> type: yaml.YAMLDataSet - >>> filepath: cars.yaml + cars: + type: yaml.YAMLDataSet + filepath: cars.yaml - Example using Python API: + Example usage for the + `Python API `_: :: >>> from kedro_datasets.yaml import YAMLDataSet From 5f01c6703687c133a56178c3d845d852cbb381bf Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Thu, 2 Feb 2023 09:45:31 +0000 Subject: [PATCH 05/74] Release `kedro-datasets` `version 1.0.2` (#112) * bump version and update release notes * fix pylint errors Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 6 ++- kedro-datasets/kedro_datasets/__init__.py | 2 +- .../kedro_datasets/api/api_dataset.py | 2 +- .../biosequence/biosequence_dataset.py | 12 ++--- .../kedro_datasets/dask/parquet_dataset.py | 10 ++-- .../kedro_datasets/email/message_dataset.py | 18 +++---- .../geopandas/geojson_dataset.py | 14 ++--- .../holoviews/holoviews_writer.py | 12 ++--- .../kedro_datasets/json/json_dataset.py | 12 ++--- .../matplotlib/matplotlib_writer.py | 14 ++--- .../kedro_datasets/networkx/gml_dataset.py | 14 ++--- .../networkx/graphml_dataset.py | 14 ++--- .../kedro_datasets/networkx/json_dataset.py | 14 ++--- .../kedro_datasets/pandas/csv_dataset.py | 14 ++--- .../kedro_datasets/pandas/excel_dataset.py | 16 +++--- .../kedro_datasets/pandas/feather_dataset.py | 12 ++--- .../kedro_datasets/pandas/gbq_dataset.py | 12 ++--- .../kedro_datasets/pandas/generic_dataset.py | 16 +++--- .../kedro_datasets/pandas/hdf_dataset.py | 16 +++--- .../kedro_datasets/pandas/json_dataset.py | 14 ++--- .../kedro_datasets/pandas/parquet_dataset.py | 14 ++--- .../kedro_datasets/pandas/sql_dataset.py | 22 ++++---- .../kedro_datasets/pandas/xml_dataset.py | 14 ++--- .../kedro_datasets/pickle/pickle_dataset.py | 16 +++--- .../kedro_datasets/pillow/image_dataset.py | 12 ++--- .../kedro_datasets/plotly/json_dataset.py | 14 ++--- .../kedro_datasets/redis/redis_dataset.py | 2 +- 
.../spark/deltatable_dataset.py | 2 +- .../kedro_datasets/spark/spark_dataset.py | 14 ++--- .../spark/spark_hive_dataset.py | 16 +++--- .../spark/spark_jdbc_dataset.py | 9 ++-- .../svmlight/svmlight_dataset.py | 14 ++--- .../tensorflow/tensorflow_model_dataset.py | 14 ++--- .../kedro_datasets/text/text_dataset.py | 10 ++-- .../kedro_datasets/video/video_dataset.py | 2 +- .../kedro_datasets/yaml/yaml_dataset.py | 12 ++--- .../tests/pandas/test_sql_dataset.py | 54 +++++++++---------- 37 files changed, 245 insertions(+), 240 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 72237defd..9c6deef45 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,8 +1,12 @@ # Upcoming Release: -* Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. +# Release 1.0.2: + +## Bug fixes and other changes +* Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. * Relaxed PyArrow range in line with Pandas +* Fixed outdated links to the dill package documentation # Release 1.0.1: diff --git a/kedro-datasets/kedro_datasets/__init__.py b/kedro-datasets/kedro_datasets/__init__.py index d34d03c10..d8bcc2d13 100644 --- a/kedro-datasets/kedro_datasets/__init__.py +++ b/kedro-datasets/kedro_datasets/__init__.py @@ -1,3 +1,3 @@ """``kedro_datasets`` is where you can find all of Kedro's data connectors.""" -__version__ = "1.0.1" +__version__ = "1.0.2" diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index 93e39fb51..4f0ffb4cc 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -111,7 +111,7 @@ def __init__( } def _describe(self) -> Dict[str, Any]: - return dict(**self._request_args) + return {**self._request_args} def _execute_request(self) -> requests.Response: try: diff --git a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py index ae34b30e8..7c45743da 100644 --- a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py +++ b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py @@ -101,12 +101,12 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + } def _load(self) -> List: load_path = get_filepath_str(self._filepath, self._protocol) diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index 9161fa4e6..f02144892 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -134,11 +134,11 @@ def fs_args(self) -> Dict[str, Any]: return fs_args def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - load_args=self._load_args, - save_args=self._save_args, - ) + return { + "filepath": self._filepath, + "load_args": self._load_args, + "save_args": self._save_args, + } def _load(self) -> dd.DataFrame: return dd.read_parquet( diff --git a/kedro-datasets/kedro_datasets/email/message_dataset.py 
b/kedro-datasets/kedro_datasets/email/message_dataset.py index fdc684504..0b8623f63 100644 --- a/kedro-datasets/kedro_datasets/email/message_dataset.py +++ b/kedro-datasets/kedro_datasets/email/message_dataset.py @@ -141,15 +141,15 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - parser_args=self._parser_args, - save_args=self._save_args, - generator_args=self._generator_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "parser_args": self._parser_args, + "save_args": self._save_args, + "generator_args": self._generator_args, + "version": self._version, + } def _load(self) -> Message: load_path = get_filepath_str(self._get_load_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py index d0ca02722..ba9237909 100644 --- a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py +++ b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py @@ -135,13 +135,13 @@ def _exists(self) -> bool: return self._fs.exists(load_path) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _release(self) -> None: self.invalidate_cache() diff --git a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py index 7be8790e2..7f61909b9 100644 --- a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py +++ b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py @@ -98,12 +98,12 @@ def __init__( self._save_args.update(save_args) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> NoReturn: raise DataSetError(f"Loading not supported for '{self.__class__.__name__}'") diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index 73268b223..ad86c9a17 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -115,12 +115,12 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> Any: load_path = get_filepath_str(self._get_load_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py index 5757b08ab..3fc396cb1 100644 --- a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py +++ b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py @@ -75,7 +75,7 @@ class 
MatplotlibWriter( >>> import matplotlib.pyplot as plt >>> from kedro_datasets.matplotlib import MatplotlibWriter >>> - >>> plots_dict = dict() + >>> plots_dict = {} >>> for colour in ["blue", "green", "red"]: >>> plots_dict[f"{colour}.png"] = plt.figure() >>> plt.plot([1, 2, 3], color=colour) @@ -177,12 +177,12 @@ def __init__( self._overwrite = overwrite def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> NoReturn: raise DataSetError(f"Loading not supported for '{self.__class__.__name__}'") diff --git a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py index 1755674c9..bc8d4f86f 100644 --- a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py @@ -122,13 +122,13 @@ def _exists(self) -> bool: return self._fs.exists(load_path) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _release(self) -> None: super()._release() diff --git a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py index d48c53b5f..2105fb67f 100644 --- a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py @@ -120,13 +120,13 @@ def _exists(self) -> bool: return self._fs.exists(load_path) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _release(self) -> None: super()._release() diff --git a/kedro-datasets/kedro_datasets/networkx/json_dataset.py b/kedro-datasets/kedro_datasets/networkx/json_dataset.py index b7c47c823..8cc436721 100644 --- a/kedro-datasets/kedro_datasets/networkx/json_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/json_dataset.py @@ -127,13 +127,13 @@ def _exists(self) -> bool: return self._fs.exists(load_path) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _release(self) -> None: super()._release() diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 2a6366bd0..7b20813f3 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -139,13 +139,13 @@ def __init__( self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - 
load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> pd.DataFrame: load_path = str(self._get_load_path()) diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py index aec96c6ed..4a981bc11 100644 --- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py @@ -199,14 +199,14 @@ def __init__( self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - writer_args=self._writer_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "writer_args": self._writer_args, + "version": self._version, + } def _load(self) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]: load_path = str(self._get_load_path()) diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index 9dc56b2b5..1116d4168 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -139,12 +139,12 @@ def __init__( self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "version": self._version, + } def _load(self) -> pd.DataFrame: load_path = str(self._get_load_path()) diff --git a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index 02dc31002..c0122a6c0 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -126,12 +126,12 @@ def __init__( ) def _describe(self) -> Dict[str, Any]: - return dict( - dataset=self._dataset, - table_name=self._table_name, - load_args=self._load_args, - save_args=self._save_args, - ) + return { + "dataset": self._dataset, + "table_name": self._table_name, + "load_args": self._load_args, + "save_args": self._save_args, + } def _load(self) -> pd.DataFrame: sql = f"select * from {self._dataset}.{self._table_name}" # nosec diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index 08717fbb3..86e347d70 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -223,14 +223,14 @@ def _exists(self) -> bool: return self._fs.exists(load_path) def _describe(self) -> Dict[str, Any]: - return dict( - file_format=self._file_format, - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "file_format": self._file_format, + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _release(self) -> None: super()._release() diff --git 
a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index bf43a883e..f11fe320f 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -135,14 +135,14 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - key=self._key, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "key": self._key, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> pd.DataFrame: load_path = get_filepath_str(self._get_load_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index cea0b985d..d29ef57bd 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -133,13 +133,13 @@ def __init__( self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> pd.DataFrame: load_path = str(self._get_load_path()) diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index d0acdc5d1..acb478bd9 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -148,13 +148,13 @@ def __init__( self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> pd.DataFrame: load_path = str(self._get_load_path()) diff --git a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py index 400195719..1400e4981 100644 --- a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py @@ -237,11 +237,11 @@ def _describe(self) -> Dict[str, Any]: save_args = copy.deepcopy(self._save_args) del load_args["table_name"] del save_args["name"] - return dict( - table_name=self._load_args["table_name"], - load_args=load_args, - save_args=save_args, - ) + return { + "table_name": self._load_args["table_name"], + "load_args": load_args, + "save_args": save_args, + } def _load(self) -> pd.DataFrame: engine = self.engines[self._connection_str] # type:ignore @@ -434,12 +434,12 @@ def create_connection(cls, connection_str: str) -> None: def _describe(self) -> Dict[str, Any]: load_args = copy.deepcopy(self._load_args) - return dict( - sql=str(load_args.pop("sql", None)), - filepath=str(self._filepath), - load_args=str(load_args), - execution_options=str(self._execution_options), - ) + return { + "sql": str(load_args.pop("sql", None)), + 
"filepath": str(self._filepath), + "load_args": str(load_args), + "execution_options": str(self._execution_options), + } def _load(self) -> pd.DataFrame: load_args = copy.deepcopy(self._load_args) diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index 5760268a7..ca8fc0dd2 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -117,13 +117,13 @@ def __init__( self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> pd.DataFrame: load_path = str(self._get_load_path()) diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index 611865078..436fba29a 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -192,14 +192,14 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - backend=self._backend, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "backend": self._backend, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> Any: load_path = get_filepath_str(self._get_load_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/pillow/image_dataset.py b/kedro-datasets/kedro_datasets/pillow/image_dataset.py index 8c2fdc983..ca939b722 100644 --- a/kedro-datasets/kedro_datasets/pillow/image_dataset.py +++ b/kedro-datasets/kedro_datasets/pillow/image_dataset.py @@ -100,12 +100,12 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> Image.Image: load_path = get_filepath_str(self._get_load_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/plotly/json_dataset.py b/kedro-datasets/kedro_datasets/plotly/json_dataset.py index 7eaae8da9..f819dd338 100644 --- a/kedro-datasets/kedro_datasets/plotly/json_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/json_dataset.py @@ -125,13 +125,13 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> Union[go.Figure, go.FigureWidget]: load_path = get_filepath_str(self._get_load_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py 
b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index f012f0cd7..6d2f80df9 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -152,7 +152,7 @@ def __init__( ) def _describe(self) -> Dict[str, Any]: - return dict(key=self._key, **self._redis_from_url_args) + return {"key": self._key, **self._redis_from_url_args} # `redis_db` mypy does not work since it is optional and optional is not # accepted by pickle.loads. diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index db45bc12c..34ee6f6a5 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -100,4 +100,4 @@ def _exists(self) -> bool: return True def _describe(self): - return dict(filepath=str(self._filepath), fs_prefix=self._fs_prefix) + return {"filepath": str(self._filepath), "fs_prefix": self._fs_prefix} diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index 2250ae337..ca923c72e 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -364,13 +364,13 @@ def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: ) from exc def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._fs_prefix + str(self._filepath), - file_format=self._file_format, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._fs_prefix + str(self._filepath), + "file_format": self._file_format, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } @staticmethod def _get_spark(): diff --git a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py index 613b6af5f..08b0666ea 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py @@ -121,14 +121,14 @@ def __init__( self._eager_checkpoint = self._save_args.pop("eager_checkpoint", None) or True def _describe(self) -> Dict[str, Any]: - return dict( - database=self._database, - table=self._table, - write_mode=self._write_mode, - table_pk=self._table_pk, - partition_by=self._save_args.get("partitionBy"), - format=self._format, - ) + return { + "database": self._database, + "table": self._table, + "write_mode": self._write_mode, + "table_pk": self._table_pk, + "partition_by": self._save_args.get("partitionBy"), + "format": self._format, + } @staticmethod def _get_spark() -> SparkSession: diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index 24bb3220a..aab501f26 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -156,9 +156,12 @@ def _describe(self) -> Dict[str, Any]: save_properties.pop("password", None) save_args = {**save_args, "properties": save_properties} - return dict( - url=self._url, table=self._table, load_args=load_args, save_args=save_args - ) + return { + "url": self._url, + "table": self._table, + "load_args": load_args, + "save_args": save_args, + } @staticmethod def _get_spark(): diff --git a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py 
b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py index 5c9e0699f..f909c1976 100644 --- a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py +++ b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py @@ -131,13 +131,13 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self): - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> _DO: load_path = get_filepath_str(self._get_load_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index 63e53b7b4..544aadb06 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -169,13 +169,13 @@ def _exists(self) -> bool: return self._fs.exists(load_path) def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - load_args=self._load_args, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } def _release(self) -> None: super()._release() diff --git a/kedro-datasets/kedro_datasets/text/text_dataset.py b/kedro-datasets/kedro_datasets/text/text_dataset.py index 5ba2ee060..0bb559e29 100644 --- a/kedro-datasets/kedro_datasets/text/text_dataset.py +++ b/kedro-datasets/kedro_datasets/text/text_dataset.py @@ -100,11 +100,11 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "version": self._version, + } def _load(self) -> str: load_path = get_filepath_str(self._get_load_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/video/video_dataset.py b/kedro-datasets/kedro_datasets/video/video_dataset.py index 22bd51bc5..07f0e1c8f 100644 --- a/kedro-datasets/kedro_datasets/video/video_dataset.py +++ b/kedro-datasets/kedro_datasets/video/video_dataset.py @@ -350,7 +350,7 @@ def _write_to_filepath(self, video: AbstractVideo, filepath: str) -> None: writer.release() def _describe(self) -> Dict[str, Any]: - return dict(filepath=self._filepath, protocol=self._protocol) + return {"filepath": self._filepath, "protocol": self._protocol} def _exists(self) -> bool: return self._fs.exists(self._filepath) diff --git a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py index 1ab2fa43b..f2a3c2696 100644 --- a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py +++ b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py @@ -113,12 +113,12 @@ def __init__( self._fs_open_args_save = _fs_open_args_save def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - protocol=self._protocol, - save_args=self._save_args, - version=self._version, - ) + return { + "filepath": self._filepath, + "protocol": self._protocol, + "save_args": self._save_args, + "version": self._version, + } def _load(self) -> Dict: load_path = 
get_filepath_str(self._get_load_path(), self._protocol) diff --git a/kedro-datasets/tests/pandas/test_sql_dataset.py b/kedro-datasets/tests/pandas/test_sql_dataset.py index c882751b1..a1c6839d6 100644 --- a/kedro-datasets/tests/pandas/test_sql_dataset.py +++ b/kedro-datasets/tests/pandas/test_sql_dataset.py @@ -40,21 +40,21 @@ def sql_file(tmp_path: PosixPath): @pytest.fixture(params=[{}]) def table_data_set(request): - kwargs = dict(table_name=TABLE_NAME, credentials=dict(con=CONNECTION)) + kwargs = {"table_name": TABLE_NAME, "credentials": {"con": CONNECTION}} kwargs.update(request.param) return SQLTableDataSet(**kwargs) @pytest.fixture(params=[{}]) def query_data_set(request): - kwargs = dict(sql=SQL_QUERY, credentials=dict(con=CONNECTION)) + kwargs = {"sql": SQL_QUERY, "credentials": {"con": CONNECTION}} kwargs.update(request.param) return SQLQueryDataSet(**kwargs) @pytest.fixture(params=[{}]) def query_file_data_set(request, sql_file): - kwargs = dict(filepath=sql_file, credentials=dict(con=CONNECTION)) + kwargs = {"filepath": sql_file, "credentials": {"con": CONNECTION}} kwargs.update(request.param) return SQLQueryDataSet(**kwargs) @@ -74,7 +74,7 @@ def test_empty_table_name(self): """Check the error when instantiating with an empty table""" pattern = r"'table\_name' argument cannot be empty\." with pytest.raises(DataSetError, match=pattern): - SQLTableDataSet(table_name="", credentials=dict(con=CONNECTION)) + SQLTableDataSet(table_name="", credentials={"con": CONNECTION}) def test_empty_connection(self): """Check the error when instantiating with an empty @@ -84,7 +84,7 @@ def test_empty_connection(self): r"Please provide a SQLAlchemy connection string\." ) with pytest.raises(DataSetError, match=pattern): - SQLTableDataSet(table_name=TABLE_NAME, credentials=dict(con="")) + SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": ""}) def test_driver_missing(self, mocker): """Check the error when the sql driver is missing""" @@ -93,7 +93,7 @@ def test_driver_missing(self, mocker): side_effect=ImportError("No module named 'mysqldb'"), ) with pytest.raises(DataSetError, match=ERROR_PREFIX + "mysqlclient"): - SQLTableDataSet(table_name=TABLE_NAME, credentials=dict(con=CONNECTION)) + SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": CONNECTION}) def test_unknown_sql(self): """Check the error when unknown sql dialect is provided; @@ -102,7 +102,7 @@ def test_unknown_sql(self): """ pattern = r"The SQL dialect in your connection is not supported by SQLAlchemy" with pytest.raises(DataSetError, match=pattern): - SQLTableDataSet(table_name=TABLE_NAME, credentials=dict(con=FAKE_CONN_STR)) + SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": FAKE_CONN_STR}) def test_unknown_module(self, mocker): """Test that if an unknown module/driver is encountered by SQLAlchemy @@ -113,7 +113,7 @@ def test_unknown_module(self, mocker): ) pattern = ERROR_PREFIX + r"No module named \'unknown\_module\'" with pytest.raises(DataSetError, match=pattern): - SQLTableDataSet(table_name=TABLE_NAME, credentials=dict(con=CONNECTION)) + SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": CONNECTION}) def test_str_representation_table(self, table_data_set): """Test the data set instance string representation""" @@ -131,7 +131,7 @@ def test_table_exists(self, mocker, table_data_set): self._assert_sqlalchemy_called_once() @pytest.mark.parametrize( - "table_data_set", [{"load_args": dict(schema="ingested")}], indirect=True + "table_data_set", [{"load_args": {"schema": "ingested"}}], indirect=True ) 
def test_table_exists_schema(self, mocker, table_data_set): """Test `exists` method invocation with DB schema provided""" @@ -162,7 +162,7 @@ def test_save_default_index(self, mocker, table_data_set, dummy_dataframe): ) @pytest.mark.parametrize( - "table_data_set", [{"save_args": dict(index=True)}], indirect=True + "table_data_set", [{"save_args": {"index": True}}], indirect=True ) def test_save_overwrite_index(self, mocker, table_data_set, dummy_dataframe): """Test writing DataFrame index as a column""" @@ -173,7 +173,7 @@ def test_save_overwrite_index(self, mocker, table_data_set, dummy_dataframe): ) @pytest.mark.parametrize( - "table_data_set", [{"save_args": dict(name="TABLE_B")}], indirect=True + "table_data_set", [{"save_args": {"name": "TABLE_B"}}], indirect=True ) def test_save_ignore_table_name_override( self, mocker, table_data_set, dummy_dataframe @@ -192,7 +192,7 @@ def test_single_connection(self, dummy_dataframe, mocker): """Test to make sure multiple instances use the same connection object.""" mocker.patch("pandas.read_sql_table") dummy_to_sql = mocker.patch.object(dummy_dataframe, "to_sql") - kwargs = dict(table_name=TABLE_NAME, credentials=dict(con=CONNECTION)) + kwargs = {"table_name": TABLE_NAME, "credentials": {"con": CONNECTION}} first = SQLTableDataSet(**kwargs) unique_connection = first.engines[CONNECTION] @@ -216,11 +216,11 @@ def test_create_connection_only_once(self, mocker): (but different tables, for example) only create a connection once. """ mock_engine = mocker.patch("kedro_datasets.pandas.sql_dataset.create_engine") - first = SQLTableDataSet(table_name=TABLE_NAME, credentials=dict(con=CONNECTION)) + first = SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": CONNECTION}) assert len(first.engines) == 1 second = SQLTableDataSet( - table_name="other_table", credentials=dict(con=CONNECTION) + table_name="other_table", credentials={"con": CONNECTION} ) assert len(second.engines) == 1 assert len(first.engines) == 1 @@ -232,13 +232,11 @@ def test_multiple_connections(self, mocker): only create one connection per db. """ mock_engine = mocker.patch("kedro_datasets.pandas.sql_dataset.create_engine") - first = SQLTableDataSet(table_name=TABLE_NAME, credentials=dict(con=CONNECTION)) + first = SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": CONNECTION}) assert len(first.engines) == 1 second_con = f"other_{CONNECTION}" - second = SQLTableDataSet( - table_name=TABLE_NAME, credentials=dict(con=second_con) - ) + second = SQLTableDataSet(table_name=TABLE_NAME, credentials={"con": second_con}) assert len(second.engines) == 2 assert len(first.engines) == 2 @@ -254,7 +252,7 @@ def test_empty_query_error(self): r"Please provide a sql query or path to a sql query file\." 
) with pytest.raises(DataSetError, match=pattern): - SQLQueryDataSet(sql="", filepath="", credentials=dict(con=CONNECTION)) + SQLQueryDataSet(sql="", filepath="", credentials={"con": CONNECTION}) def test_empty_con_error(self): """Check the error when instantiating with empty connection string""" @@ -263,7 +261,7 @@ def test_empty_con_error(self): r"a SQLAlchemy connection string" ) with pytest.raises(DataSetError, match=pattern): - SQLQueryDataSet(sql=SQL_QUERY, credentials=dict(con="")) + SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": ""}) @pytest.mark.parametrize( "query_data_set, has_execution_options", @@ -319,7 +317,7 @@ def test_load_driver_missing(self, mocker): "kedro_datasets.pandas.sql_dataset.create_engine", side_effect=_err ) with pytest.raises(DataSetError, match=ERROR_PREFIX + "mysqlclient"): - SQLQueryDataSet(sql=SQL_QUERY, credentials=dict(con=CONNECTION)) + SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": CONNECTION}) def test_invalid_module(self, mocker): """Test that if an unknown module/driver is encountered by SQLAlchemy @@ -330,7 +328,7 @@ def test_invalid_module(self, mocker): ) pattern = ERROR_PREFIX + r"Invalid module some\_module" with pytest.raises(DataSetError, match=pattern): - SQLQueryDataSet(sql=SQL_QUERY, credentials=dict(con=CONNECTION)) + SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": CONNECTION}) def test_load_unknown_module(self, mocker): """Test that if an unknown module/driver is encountered by SQLAlchemy @@ -341,14 +339,14 @@ def test_load_unknown_module(self, mocker): ) pattern = ERROR_PREFIX + r"No module named \'unknown\_module\'" with pytest.raises(DataSetError, match=pattern): - SQLQueryDataSet(sql=SQL_QUERY, credentials=dict(con=CONNECTION)) + SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": CONNECTION}) def test_load_unknown_sql(self): """Check the error when unknown SQL dialect is provided in the connection string""" pattern = r"The SQL dialect in your connection is not supported by SQLAlchemy" with pytest.raises(DataSetError, match=pattern): - SQLQueryDataSet(sql=SQL_QUERY, credentials=dict(con=FAKE_CONN_STR)) + SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": FAKE_CONN_STR}) def test_save_error(self, query_data_set, dummy_dataframe): """Check the error when trying to save to the data set""" @@ -390,12 +388,12 @@ def test_create_connection_only_once(self, mocker): tables and execution options, for example) only create a connection once. 
""" mock_engine = mocker.patch("kedro_datasets.pandas.sql_dataset.create_engine") - first = SQLQueryDataSet(sql=SQL_QUERY, credentials=dict(con=CONNECTION)) + first = SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": CONNECTION}) assert len(first.engines) == 1 # second engine has identical params to the first one # => no new engine should be created - second = SQLQueryDataSet(sql=SQL_QUERY, credentials=dict(con=CONNECTION)) + second = SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": CONNECTION}) mock_engine.assert_called_once_with(CONNECTION) assert second.engines == first.engines assert len(first.engines) == 1 @@ -404,7 +402,7 @@ def test_create_connection_only_once(self, mocker): # => no new engine should be created third = SQLQueryDataSet( sql="a different query", - credentials=dict(con=CONNECTION), + credentials={"con": CONNECTION}, execution_options=EXECUTION_OPTIONS, ) assert mock_engine.call_count == 1 @@ -414,7 +412,7 @@ def test_create_connection_only_once(self, mocker): # fourth engine has a different connection string # => a new engine has to be created fourth = SQLQueryDataSet( - sql=SQL_QUERY, credentials=dict(con="an other connection string") + sql=SQL_QUERY, credentials={"con": "an other connection string"} ) assert mock_engine.call_count == 2 assert fourth.engines == first.engines From 6734a7e063d59edc649e33461065013d4886b3fa Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Tue, 7 Feb 2023 11:00:40 +0000 Subject: [PATCH 06/74] Bump pytest to 7.2 (#113) Signed-off-by: Merel Theisen Signed-off-by: Danny Farah --- kedro-datasets/test_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/test_requirements.txt b/kedro-datasets/test_requirements.txt index f0a0c8f48..d0472d429 100644 --- a/kedro-datasets/test_requirements.txt +++ b/kedro-datasets/test_requirements.txt @@ -42,7 +42,7 @@ pyspark>=2.2, <4.0 pytest-cov~=3.0 pytest-mock>=1.7.1, <2.0 pytest-xdist[psutil]~=2.2.1 -pytest~=6.2 +pytest~=7.2 redis~=4.1 requests-mock~=1.6 requests~=2.20 From 86aa3a74414ad27148a7b50abfe0c04d332d7779 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Tue, 7 Feb 2023 06:01:39 -0500 Subject: [PATCH 07/74] Prefix Docker plugin name with "Kedro-" in usage message (#57) * Prefix Docker plugin name with "Kedro-" in usage message Signed-off-by: Deepyaman Datta Signed-off-by: Danny Farah --- kedro-docker/kedro_docker/plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-docker/kedro_docker/plugin.py b/kedro-docker/kedro_docker/plugin.py index 730de899b..02c5d2bbc 100644 --- a/kedro-docker/kedro_docker/plugin.py +++ b/kedro-docker/kedro_docker/plugin.py @@ -86,7 +86,7 @@ def _make_docker_args_option(**kwargs): return click.option("--docker-args", **kwargs) -@click.group(name="Docker") +@click.group(name="Kedro-Docker") def commands(): """Kedro plugin for packaging a project with Docker""" pass From 51156077a5f5d57a05e643687a3e141160315792 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Tue, 7 Feb 2023 07:04:08 -0500 Subject: [PATCH 08/74] Keep Kedro-Docker plugin docstring from appearing in `kedro -h` (#56) * Keep Kedro-Docker plugin docstring from appearing in `kedro -h` Signed-off-by: Deepyaman Datta Signed-off-by: Danny Farah --- kedro-docker/.pre-commit-config.yaml | 4 +-- kedro-docker/kedro_docker/plugin.py | 38 +++++++++++----------------- kedro-docker/tests/test_helpers.py | 26 +++++++++---------- 3 files changed, 30 insertions(+), 38 deletions(-) diff --git 
a/kedro-docker/.pre-commit-config.yaml b/kedro-docker/.pre-commit-config.yaml index 049c5e56e..5fa408473 100644 --- a/kedro-docker/.pre-commit-config.yaml +++ b/kedro-docker/.pre-commit-config.yaml @@ -48,7 +48,7 @@ repos: language: system types: [file, python] files: ^kedro-docker/tests/ - entry: pylint --disable=missing-docstring,redefined-outer-name,no-self-use,invalid-name,protected-access,too-many-arguments + entry: pylint --disable=missing-docstring,redefined-outer-name,invalid-name,protected-access,too-many-arguments stages: [commit] # The same pylint checks, but running on all files. It's for manual run with `make lint` - id: pylint-kedro_docker @@ -68,7 +68,7 @@ repos: language: system pass_filenames: false stages: [manual] - entry: pylint --disable=missing-docstring,redefined-outer-name,no-self-use,invalid-name,protected-access,too-many-arguments kedro-docker/tests + entry: pylint --disable=missing-docstring,redefined-outer-name,invalid-name,protected-access,too-many-arguments kedro-docker/tests - id: isort name: "Sort imports" language: system diff --git a/kedro-docker/kedro_docker/plugin.py b/kedro-docker/kedro_docker/plugin.py index 02c5d2bbc..27af7db96 100644 --- a/kedro-docker/kedro_docker/plugin.py +++ b/kedro-docker/kedro_docker/plugin.py @@ -1,4 +1,5 @@ """ Kedro plugin for packaging a project with Docker """ +# pylint: disable=unused-argument import shlex import subprocess from pathlib import Path @@ -39,13 +40,13 @@ DIVE_IMAGE = "wagoodman/dive:latest" -def _image_callback(ctx, param, value): # pylint: disable=unused-argument +def _image_callback(ctx, param, value): image = value or Path.cwd().name check_docker_image_exists(image) return image -def _port_callback(ctx, param, value): # pylint: disable=unused-argument +def _port_callback(ctx, param, value): if is_port_in_use(value): raise KedroCliError( f"Port {value} is already in use on the host. " @@ -87,14 +88,11 @@ def _make_docker_args_option(**kwargs): @click.group(name="Kedro-Docker") -def commands(): - """Kedro plugin for packaging a project with Docker""" +def commands(): # pylint: disable=missing-function-docstring pass -@commands.group( - name="docker", context_settings=dict(help_option_names=["-h", "--help"]) -) +@commands.group(name="docker", context_settings={"help_option_names": ["-h", "--help"]}) def docker_group(): """Dockerize your Kedro project.""" # check that docker is running @@ -196,18 +194,18 @@ def docker_build( def _mount_info() -> Dict[str, Union[str, Tuple]]: - res = dict( - host_root=str(Path.cwd()), - container_root="/home/kedro_docker", - mount_volumes=DOCKER_DEFAULT_VOLUMES, - ) + res = { + "host_root": str(Path.cwd()), + "container_root": "/home/kedro_docker", + "mount_volumes": DOCKER_DEFAULT_VOLUMES, + } return res @forward_command(docker_group, "run") @_make_image_option(callback=_image_callback) @_make_docker_args_option() -def docker_run(image, docker_args, args, **kwargs): # pylint: disable=unused-argument +def docker_run(image, docker_args, args, **kwargs): """Run the pipeline in the Docker container. Any extra arguments unspecified in this help are passed to `docker run` as is. @@ -230,9 +228,7 @@ def docker_run(image, docker_args, args, **kwargs): # pylint: disable=unused-ar @forward_command(docker_group, "ipython") @_make_image_option(callback=_image_callback) @_make_docker_args_option() -def docker_ipython( - image, docker_args, args, **kwargs -): # pylint: disable=unused-argument +def docker_ipython(image, docker_args, args, **kwargs): """Run ipython in the Docker container. 
Any extra arguments unspecified in this help are passed to `kedro ipython` command inside the container as is. @@ -261,9 +257,7 @@ def docker_jupyter(): @_make_image_option(callback=_image_callback) @_make_port_option() @_make_docker_args_option() -def docker_jupyter_notebook( - docker_args, port, image, args, **kwargs -): # pylint: disable=unused-argument): +def docker_jupyter_notebook(docker_args, port, image, args, **kwargs): """Run jupyter notebook in the Docker container. Any extra arguments unspecified in this help are passed to `kedro jupyter notebook` command inside the container as is. @@ -292,9 +286,7 @@ def docker_jupyter_notebook( @_make_image_option(callback=_image_callback) @_make_port_option() @_make_docker_args_option() -def docker_jupyter_lab( - docker_args, port, image, args, **kwargs -): # pylint: disable=unused-argument): +def docker_jupyter_lab(docker_args, port, image, args, **kwargs): """Run jupyter lab in the Docker container. Any extra arguments unspecified in this help are passed to `kedro jupyter lab` command inside the container as is. @@ -319,7 +311,7 @@ def docker_jupyter_lab( @forward_command(docker_group, "cmd") @_make_image_option(callback=_image_callback) @_make_docker_args_option() -def docker_cmd(args, docker_args, image, **kwargs): # pylint: disable=unused-argument): +def docker_cmd(args, docker_args, image, **kwargs): """Run arbitrary command from ARGS in the Docker container. If ARGS are not specified, this will invoke `kedro run` inside the container. diff --git a/kedro-docker/tests/test_helpers.py b/kedro-docker/tests/test_helpers.py index 34d8a8a50..40b5d9306 100644 --- a/kedro-docker/tests/test_helpers.py +++ b/kedro-docker/tests/test_helpers.py @@ -43,25 +43,25 @@ def test_make_container_name(args): class TestComposeDockerRunArgs: def test_args(self, tmp_path): """Test composing the arguments for `docker run` command""" - kwargs = dict( - host_root=str(tmp_path), - container_root="/home/kedro/projectname", - optional_args=[("-arg1", "projectname"), ("--arg4", "x4")], - required_args=[("-arg2", None), ("-arg3", "x2")], - user_args=["-arg1", "-arg2=y2", "-arg3", "y3"], - ) + kwargs = { + "host_root": str(tmp_path), + "container_root": "/home/kedro/projectname", + "optional_args": [("-arg1", "projectname"), ("--arg4", "x4")], + "required_args": [("-arg2", None), ("-arg3", "x2")], + "user_args": ["-arg1", "-arg2=y2", "-arg3", "y3"], + } expected = ["-arg2", "-arg3", "x2", "--arg4", "x4"] + kwargs["user_args"] assert compose_docker_run_args(**kwargs) == expected def test_mount(self, tmp_path): """Test composing the arguments with volumes to mount""" host_root = tmp_path.resolve() - kwargs = dict( - host_root=str(host_root), - container_root="/home/kedro/projectname", - mount_volumes=("conf/local", "data", "logs"), - user_args=["-v", "y1"], - ) + kwargs = { + "host_root": str(host_root), + "container_root": "/home/kedro/projectname", + "mount_volumes": ("conf/local", "data", "logs"), + "user_args": ["-v", "y1"], + } expected = [] for _vol in kwargs["mount_volumes"]: _mount_vol = f"{host_root / _vol}:{kwargs['container_root']}/{_vol}" From 4b5da98ae210468e4c9d979f9dd546ff358dacda Mon Sep 17 00:00:00 2001 From: Walber Moreira <58264877+wmoreiraa@users.noreply.github.com> Date: Thu, 9 Feb 2023 10:50:38 -0300 Subject: [PATCH 09/74] [kedro-datasets ] Add `Polars.CSVDataSet` (#95) Signed-off-by: wmoreiraa Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 15 +- .../kedro_datasets/polars/__init__.py | 8 + .../kedro_datasets/polars/csv_dataset.py | 191 
+++++
 kedro-datasets/setup.py | 4 +
 kedro-datasets/test_requirements.txt | 1 +
 kedro-datasets/tests/polars/__init__.py | 0
 .../tests/polars/test_csv_dataset.py | 376 ++++++++++++++++++
 7 files changed, 594 insertions(+), 1 deletion(-)
 mode change 100755 => 100644 kedro-datasets/RELEASE.md
 create mode 100644 kedro-datasets/kedro_datasets/polars/__init__.py
 create mode 100644 kedro-datasets/kedro_datasets/polars/csv_dataset.py
 create mode 100644 kedro-datasets/tests/polars/__init__.py
 create mode 100644 kedro-datasets/tests/polars/test_csv_dataset.py
diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md
old mode 100755
new mode 100644
index 9c6deef45..3b51df818
--- a/kedro-datasets/RELEASE.md
+++ b/kedro-datasets/RELEASE.md
@@ -1,5 +1,17 @@
-# Upcoming Release:
+# Upcoming Release 1.1.0:
+
+
+## Major features and improvements:
+
+* Added the following new datasets:
+
+| Type | Description | Location |
+| ------------------------------------ | -------------------------------------------------------------------------- | ----------------------------- |
+| `polars.CSVDataSet` | A `CSVDataSet` backed by [polars](https://www.pola.rs/), a lightning fast dataframe package built entirely using Rust. | `kedro_datasets.polars` |
+
+## Bug fixes and other changes
+
 # Release 1.0.2:
@@ -13,6 +25,7 @@ ## Bug fixes and other changes
 * Fixed doc string formatting in `VideoDataSet` causing the documentation builds to fail.
+
 # Release 1.0.0:
 First official release of Kedro-Datasets.
diff --git a/kedro-datasets/kedro_datasets/polars/__init__.py b/kedro-datasets/kedro_datasets/polars/__init__.py
new file mode 100644
index 000000000..34d39c985
--- /dev/null
+++ b/kedro-datasets/kedro_datasets/polars/__init__.py
@@ -0,0 +1,8 @@
+"""``AbstractDataSet`` implementations that produce polars DataFrames."""
+
+__all__ = ["CSVDataSet"]
+
+from contextlib import suppress
+
+with suppress(ImportError):
+    from .csv_dataset import CSVDataSet
diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py
new file mode 100644
index 000000000..60a0d456a
--- /dev/null
+++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py
@@ -0,0 +1,191 @@
+"""``CSVDataSet`` loads/saves data from/to a CSV file using an underlying
+filesystem (e.g.: local, S3, GCS). It uses polars to handle the CSV file.
+"""
+import logging
+from copy import deepcopy
+from io import BytesIO
+from pathlib import PurePosixPath
+from typing import Any, Dict
+
+import fsspec
+import polars as pl
+from kedro.io.core import (
+    PROTOCOL_DELIMITER,
+    AbstractVersionedDataSet,
+    DataSetError,
+    Version,
+    get_filepath_str,
+    get_protocol_and_path,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class CSVDataSet(AbstractVersionedDataSet[pl.DataFrame, pl.DataFrame]):
+    """``CSVDataSet`` loads/saves data from/to a CSV file using an underlying
+    filesystem (e.g.: local, S3, GCS). It uses polars to handle the CSV file.
+
+    Example adding a catalog entry with
+    `YAML API
+    `_:
+
+    .. 
code-block:: yaml
+
+        >>> cars:
+        >>>   type: polars.CSVDataSet
+        >>>   filepath: data/01_raw/company/cars.csv
+        >>>   load_args:
+        >>>     sep: ","
+        >>>     parse_dates: False
+        >>>   save_args:
+        >>>     has_header: False
+        >>>     null_value: "somenullstring"
+        >>>
+        >>> motorbikes:
+        >>>   type: polars.CSVDataSet
+        >>>   filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.csv
+        >>>   credentials: dev_s3
+
+    Example using Python API:
+    ::
+
+        >>> from kedro_datasets.polars import CSVDataSet
+        >>> import polars as pl
+        >>>
+        >>> data = pl.DataFrame({'col1': [1, 2], 'col2': [4, 5],
+        >>>                      'col3': [5, 6]})
+        >>>
+        >>> data_set = CSVDataSet(filepath="test.csv")
+        >>> data_set.save(data)
+        >>> reloaded = data_set.load()
+        >>> assert data.frame_equal(reloaded)
+
+    """
+
+    DEFAULT_LOAD_ARGS = {"rechunk": True}  # type: Dict[str, Any]
+    DEFAULT_SAVE_ARGS = {}  # type: Dict[str, Any]
+
+    # pylint: disable=too-many-arguments
+    def __init__(
+        self,
+        filepath: str,
+        load_args: Dict[str, Any] = None,
+        save_args: Dict[str, Any] = None,
+        version: Version = None,
+        credentials: Dict[str, Any] = None,
+        fs_args: Dict[str, Any] = None,
+    ) -> None:
+        """Creates a new instance of ``CSVDataSet`` pointing to a concrete CSV file
+        on a specific filesystem.
+
+        Args:
+            filepath: Filepath in POSIX format to a CSV file prefixed with a protocol
+                `s3://`.
+                If prefix is not provided, `file` protocol (local filesystem)
+                will be used.
+                The prefix should be any protocol supported by ``fsspec``.
+                Note: `http(s)` doesn't support versioning.
+            load_args: Polars options for loading CSV files.
+                Here you can find all available arguments:
+                https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_csv.html#polars.read_csv
+                All defaults are preserved, but we explicitly use `rechunk=True` for `seaborn`
+                compatibility.
+            save_args: Polars options for saving CSV files.
+                Here you can find all available arguments:
+                https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_csv.html
+                All defaults are preserved.
+            version: If specified, should be an instance of
+                ``kedro.io.core.Version``. If its ``load`` attribute is
+                None, the latest version will be loaded. If its ``save``
+                attribute is None, save version will be autogenerated.
+            credentials: Credentials required to get access to the underlying filesystem.
+                E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
+            fs_args: Extra arguments to pass into underlying filesystem class constructor
+                (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). 
+ """ + _fs_args = deepcopy(fs_args) or {} + _credentials = deepcopy(credentials) or {} + + protocol, path = get_protocol_and_path(filepath, version) + if protocol == "file": + _fs_args.setdefault("auto_mkdir", True) + + self._protocol = protocol + self._storage_options = {**_credentials, **_fs_args} + self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + + super().__init__( + filepath=PurePosixPath(path), + version=version, + exists_function=self._fs.exists, + glob_function=self._fs.glob, + ) + + # Handle default load and save arguments + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + if "storage_options" in self._save_args or "storage_options" in self._load_args: + logger.warning( + "Dropping 'storage_options' for %s, " + "please specify them under 'fs_args' or 'credentials'.", + self._filepath, + ) + self._save_args.pop("storage_options", None) + self._load_args.pop("storage_options", None) + + def _describe(self) -> Dict[str, Any]: + return { + "filepath": self._filepath, + "protocol": self._protocol, + "load_args": self._load_args, + "save_args": self._save_args, + "version": self._version, + } + + def _load(self) -> pl.DataFrame: + load_path = str(self._get_load_path()) + if self._protocol == "file": + # file:// protocol seems to misbehave on Windows + # (), + # so we don't join that back to the filepath; + # storage_options also don't work with local paths + return pl.read_csv(load_path, **self._load_args) + + load_path = f"{self._protocol}{PROTOCOL_DELIMITER}{load_path}" + return pl.read_csv( + load_path, storage_options=self._storage_options, **self._load_args + ) + + def _save(self, data: pl.DataFrame) -> None: + save_path = get_filepath_str(self._get_save_path(), self._protocol) + + buf = BytesIO() + data.write_csv(file=buf, **self._save_args) + + with self._fs.open(save_path, mode="wb") as fs_file: + fs_file.write(buf.getvalue()) + + self._invalidate_cache() + + def _exists(self) -> bool: + try: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + except DataSetError: + return False + + return self._fs.exists(load_path) + + def _release(self) -> None: + super()._release() + self._invalidate_cache() + + def _invalidate_cache(self) -> None: + """Invalidate underlying filesystem caches.""" + filepath = get_filepath_str(self._filepath, self._protocol) + self._fs.invalidate_cache(filepath) diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 8c5440a75..cf02edd0f 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -13,6 +13,7 @@ SPARK = "pyspark>=2.2, <4.0" HDFS = "hdfs>=2.5.8, <3.0" S3FS = "s3fs>=0.3.0, <0.5" +POLARS = "polars~=0.15.16" with open("requirements.txt", "r", encoding="utf-8") as f: install_requires = [x.strip() for x in f if x.strip()] @@ -63,6 +64,7 @@ def _collect_requirements(requires): "pandas.GenericDataSet": [PANDAS], } pillow_require = {"pillow.ImageDataSet": ["Pillow~=9.0"]} +polars_require = {"polars.CSVDataSet": [POLARS],} video_require = { "video.VideoDataSet": ["opencv-python~=4.5.5.64"] } @@ -109,6 +111,7 @@ def _collect_requirements(requires): "networkx": _collect_requirements(networkx_require), "pandas": _collect_requirements(pandas_require), "pillow": _collect_requirements(pillow_require), + "polars": _collect_requirements(polars_require), "video": _collect_requirements(video_require), "plotly": 
_collect_requirements(plotly_require), "redis": _collect_requirements(redis_require), @@ -126,6 +129,7 @@ def _collect_requirements(requires): **networkx_require, **pandas_require, **pillow_require, + **polars_require, **video_require, **plotly_require, **spark_require, diff --git a/kedro-datasets/test_requirements.txt b/kedro-datasets/test_requirements.txt index d0472d429..8dec3619b 100644 --- a/kedro-datasets/test_requirements.txt +++ b/kedro-datasets/test_requirements.txt @@ -33,6 +33,7 @@ pandas-gbq>=0.12.0, <0.18.0 pandas~=1.3 # 1.3 for read_xml/to_xml Pillow~=9.0 plotly>=4.8.0, <6.0 +polars~=0.15.13 pre-commit>=2.9.2, <3.0 # The hook `mypy` requires pre-commit version 2.9.2. psutil==5.8.0 pyarrow>=1.0, <7.0 diff --git a/kedro-datasets/tests/polars/__init__.py b/kedro-datasets/tests/polars/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py new file mode 100644 index 000000000..8b05a2025 --- /dev/null +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -0,0 +1,376 @@ +import os +import sys +from pathlib import Path, PurePosixPath +from time import sleep + +import boto3 +import polars as pl +import pytest +from adlfs import AzureBlobFileSystem +from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from kedro.io import DataSetError +from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from moto import mock_s3 +from polars.testing import assert_frame_equal +from s3fs.core import S3FileSystem + +from kedro_datasets.polars import CSVDataSet + +BUCKET_NAME = "test_bucket" +FILE_NAME = "test.csv" + + +@pytest.fixture +def filepath_csv(tmp_path): + return (tmp_path / "test.csv").as_posix() + + +@pytest.fixture +def csv_data_set(filepath_csv, load_args, save_args, fs_args): + return CSVDataSet( + filepath=filepath_csv, load_args=load_args, save_args=save_args, fs_args=fs_args + ) + + +@pytest.fixture +def versioned_csv_data_set(filepath_csv, load_version, save_version): + return CSVDataSet( + filepath=filepath_csv, version=Version(load_version, save_version) + ) + + +@pytest.fixture +def dummy_dataframe(): + return pl.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) + + +@pytest.fixture +def partitioned_data_polars(): + return { + f"p{counter:02d}/data.csv": pl.DataFrame( + {"part": counter, "col": list(range(counter + 1))} + ) + for counter in range(5) + } + + +@pytest.fixture +def mocked_s3_bucket(): + """Create a bucket for testing using moto.""" + with mock_s3(): + conn = boto3.client( + "s3", + aws_access_key_id="fake_access_key", + aws_secret_access_key="fake_secret_key", + ) + conn.create_bucket(Bucket=BUCKET_NAME) + yield conn + + +@pytest.fixture +def mocked_dataframe(): + df = pl.DataFrame({"dummy": ["dummy"]}) + return df + + +@pytest.fixture +def mocked_csv_in_s3(mocked_s3_bucket, mocked_dataframe: pl.DataFrame): + + binarycsv = mocked_dataframe.write_csv()[:-1] + + mocked_s3_bucket.put_object( + Bucket=BUCKET_NAME, + Key=FILE_NAME, + Body=binarycsv, + ) + + return f"s3://{BUCKET_NAME}/{FILE_NAME}" + + +class TestCSVDataSet: + def test_save_and_load(self, csv_data_set, dummy_dataframe): + """Test saving and reloading the data set.""" + csv_data_set.save(dummy_dataframe) + reloaded = csv_data_set.load() + assert_frame_equal(dummy_dataframe, reloaded) + + def test_exists(self, csv_data_set, dummy_dataframe): + """Test `exists` method 
invocation for both existing and + nonexistent data set.""" + assert not csv_data_set.exists() + csv_data_set.save(dummy_dataframe) + assert csv_data_set.exists() + + @pytest.mark.parametrize( + "load_args", [{"k1": "v1", "index": "value"}], indirect=True + ) + def test_load_extra_params(self, csv_data_set, load_args): + """Test overriding the default load arguments.""" + for key, value in load_args.items(): + assert csv_data_set._load_args[key] == value + + @pytest.mark.parametrize( + "save_args", [{"k1": "v1", "index": "value"}], indirect=True + ) + def test_save_extra_params(self, csv_data_set, save_args): + """Test overriding the default save arguments.""" + for key, value in save_args.items(): + assert csv_data_set._save_args[key] == value + + @pytest.mark.parametrize( + "load_args,save_args", + [ + ({"storage_options": {"a": "b"}}, {}), + ({}, {"storage_options": {"a": "b"}}), + ({"storage_options": {"a": "b"}}, {"storage_options": {"x": "y"}}), + ], + ) + def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): + filepath = str(tmp_path / "test.csv") + + ds = CSVDataSet(filepath=filepath, load_args=load_args, save_args=save_args) + + records = [r for r in caplog.records if r.levelname == "WARNING"] + expected_log_message = ( + f"Dropping 'storage_options' for {filepath}, " + f"please specify them under 'fs_args' or 'credentials'." + ) + assert records[0].getMessage() == expected_log_message + assert "storage_options" not in ds._save_args + assert "storage_options" not in ds._load_args + + def test_load_missing_file(self, csv_data_set): + """Check the error when trying to load missing file.""" + pattern = r"Failed while loading data from data set CSVDataSet\(.*\)" + with pytest.raises(DataSetError, match=pattern): + csv_data_set.load() + + @pytest.mark.parametrize( + "filepath,instance_type,credentials", + [ + ("s3://bucket/file.csv", S3FileSystem, {}), + ("file:///tmp/test.csv", LocalFileSystem, {}), + ("/tmp/test.csv", LocalFileSystem, {}), + ("gcs://bucket/file.csv", GCSFileSystem, {}), + ("https://example.com/file.csv", HTTPFileSystem, {}), + ( + "abfs://bucket/file.csv", + AzureBlobFileSystem, + {"account_name": "test", "account_key": "test"}, + ), + ], + ) + def test_protocol_usage(self, filepath, instance_type, credentials): + data_set = CSVDataSet(filepath=filepath, credentials=credentials) + assert isinstance(data_set._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(data_set._filepath) == path + assert isinstance(data_set._filepath, PurePosixPath) + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test.csv" + data_set = CSVDataSet(filepath=filepath) + assert data_set._version_cache.currsize == 0 # no cache if unversioned + data_set.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + assert data_set._version_cache.currsize == 0 + + +class TestCSVDataSetVersioned: + def test_version_str_repr(self, load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = "test.csv" + ds = CSVDataSet(filepath=filepath) + ds_versioned = CSVDataSet( + filepath=filepath, version=Version(load_version, save_version) + ) + assert filepath in str(ds) + assert "version" not in str(ds) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "CSVDataSet" in str(ds_versioned) 
+ assert "CSVDataSet" in str(ds) + assert "protocol" in str(ds_versioned) + assert "protocol" in str(ds) + # Default save_args + assert "load_args={'rechunk': True}" in str(ds) + assert "load_args={'rechunk': True}" in str(ds_versioned) + + def test_save_and_load(self, versioned_csv_data_set, dummy_dataframe): + """Test that saved and reloaded data matches the original one for + the versioned data set.""" + versioned_csv_data_set.save(dummy_dataframe) + reloaded_df = versioned_csv_data_set.load() + assert_frame_equal(dummy_dataframe, reloaded_df) + + def test_multiple_loads( + self, versioned_csv_data_set, dummy_dataframe, filepath_csv + ): + """Test that if a new version is created mid-run, by an + external system, it won't be loaded in the current run.""" + versioned_csv_data_set.save(dummy_dataframe) + versioned_csv_data_set.load() + v1 = versioned_csv_data_set.resolve_load_version() + + sleep(0.5) + # force-drop a newer version into the same location + v_new = generate_timestamp() + CSVDataSet(filepath=filepath_csv, version=Version(v_new, v_new)).save( + dummy_dataframe + ) + + versioned_csv_data_set.load() + v2 = versioned_csv_data_set.resolve_load_version() + + assert v2 == v1 # v2 should not be v_new! + ds_new = CSVDataSet(filepath=filepath_csv, version=Version(None, None)) + assert ( + ds_new.resolve_load_version() == v_new + ) # new version is discoverable by a new instance + + def test_multiple_saves(self, dummy_dataframe, filepath_csv): + """Test multiple cycles of save followed by load for the same dataset""" + ds_versioned = CSVDataSet(filepath=filepath_csv, version=Version(None, None)) + + # first save + ds_versioned.save(dummy_dataframe) + first_save_version = ds_versioned.resolve_save_version() + first_load_version = ds_versioned.resolve_load_version() + assert first_load_version == first_save_version + + # second save + sleep(0.5) + ds_versioned.save(dummy_dataframe) + second_save_version = ds_versioned.resolve_save_version() + second_load_version = ds_versioned.resolve_load_version() + assert second_load_version == second_save_version + assert second_load_version > first_load_version + + # another dataset + ds_new = CSVDataSet(filepath=filepath_csv, version=Version(None, None)) + assert ds_new.resolve_load_version() == second_load_version + + def test_release_instance_cache(self, dummy_dataframe, filepath_csv): + """Test that cache invalidation does not affect other instances""" + ds_a = CSVDataSet(filepath=filepath_csv, version=Version(None, None)) + assert ds_a._version_cache.currsize == 0 + ds_a.save(dummy_dataframe) # create a version + assert ds_a._version_cache.currsize == 2 + + ds_b = CSVDataSet(filepath=filepath_csv, version=Version(None, None)) + assert ds_b._version_cache.currsize == 0 + ds_b.resolve_save_version() + assert ds_b._version_cache.currsize == 1 + ds_b.resolve_load_version() + assert ds_b._version_cache.currsize == 2 + + ds_a.release() + + # dataset A cache is cleared + assert ds_a._version_cache.currsize == 0 + + # dataset B cache is unaffected + assert ds_b._version_cache.currsize == 2 + + def test_no_versions(self, versioned_csv_data_set): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any versions for CSVDataSet\(.+\)" + with pytest.raises(DataSetError, match=pattern): + versioned_csv_data_set.load() + + def test_exists(self, versioned_csv_data_set, dummy_dataframe): + """Test `exists` method invocation for versioned data set.""" + assert not versioned_csv_data_set.exists() + 
versioned_csv_data_set.save(dummy_dataframe) + assert versioned_csv_data_set.exists() + + def test_prevent_overwrite(self, versioned_csv_data_set, dummy_dataframe): + """Check the error when attempting to override the data set if the + corresponding CSV file for a given save version already exists.""" + versioned_csv_data_set.save(dummy_dataframe) + pattern = ( + r"Save path \'.+\' for CSVDataSet\(.+\) must " + r"not exist if versioning is enabled\." + ) + with pytest.raises(DataSetError, match=pattern): + versioned_csv_data_set.save(dummy_dataframe) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, versioned_csv_data_set, load_version, save_version, dummy_dataframe + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + rf"Save version '{save_version}' did not match load version " + rf"'{load_version}' for CSVDataSet\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + versioned_csv_data_set.save(dummy_dataframe) + + def test_http_filesystem_no_versioning(self): + pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + + with pytest.raises(DataSetError, match=pattern): + CSVDataSet( + filepath="https://example.com/file.csv", version=Version(None, None) + ) + + def test_versioning_existing_dataset( + self, csv_data_set, versioned_csv_data_set, dummy_dataframe + ): + """Check the error when attempting to save a versioned dataset on top of an + already existing (non-versioned) dataset.""" + csv_data_set.save(dummy_dataframe) + assert csv_data_set.exists() + assert csv_data_set._filepath == versioned_csv_data_set._filepath + pattern = ( + f"(?=.*file with the same name already exists in the directory)" + f"(?=.*{versioned_csv_data_set._filepath.parent.as_posix()})" + ) + with pytest.raises(DataSetError, match=pattern): + versioned_csv_data_set.save(dummy_dataframe) + + # Remove non-versioned dataset and try again + Path(csv_data_set._filepath.as_posix()).unlink() + versioned_csv_data_set.save(dummy_dataframe) + assert versioned_csv_data_set.exists() + + +class TestCSVDataSetS3: + os.environ["AWS_ACCESS_KEY_ID"] = "FAKE_ACCESS_KEY" + os.environ["AWS_SECRET_ACCESS_KEY"] = "FAKE_SECRET_KEY" + + def test_load_and_confirm(self, mocker, mocked_csv_in_s3, mocked_dataframe): + """Test the standard flow for loading, confirming and reloading a + IncrementalDataSet in S3 + + Unmodified Test fails in Python >= 3.10 if executed after test_protocol_usage + (any implementation using S3FileSystem). 
Likely to be a bug with moto (tested + with moto==4.0.8, moto==3.0.4) -- see #67 + """ + df = CSVDataSet(mocked_csv_in_s3) + assert df._protocol == "s3" + # if Python >= 3.10, modify test procedure (see #67) + if sys.version_info[1] >= 10: + read_patch = mocker.patch("polars.read_csv", return_value=mocked_dataframe) + df.load() + read_patch.assert_called_once_with( + mocked_csv_in_s3, storage_options={}, rechunk=True + ) + else: + loaded = df.load() + assert_frame_equal(loaded, mocked_dataframe) From deb3cce11ef6eb450bb2c9adde4b07c19e10ffff Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Thu, 9 Feb 2023 08:51:23 -0500 Subject: [PATCH 10/74] Remove deprecated `test_requires` from `setup.py` in Kedro-Docker (#54) Signed-off-by: Deepyaman Datta Signed-off-by: Danny Farah --- kedro-docker/setup.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kedro-docker/setup.py b/kedro-docker/setup.py index bf150ba9a..b2ef23ca3 100644 --- a/kedro-docker/setup.py +++ b/kedro-docker/setup.py @@ -5,7 +5,6 @@ from setuptools import setup name = "kedro-docker" - here = path.abspath(path.dirname(__file__)) # get package version @@ -17,16 +16,10 @@ with open("requirements.txt", "r", encoding="utf-8") as f: requires = [x.strip() for x in f if x.strip()] -# get test dependencies and installs -with open("test_requirements.txt", "r", encoding="utf-8") as f: - test_requires = [x.strip() for x in f if x.strip() and not x.startswith("-r")] - - -# Get the long description from the README file +# get the long description from the README file with open(path.join(here, "README.md"), encoding="utf-8") as f: readme = f.read() - setup( name=name, version=version, @@ -37,7 +30,6 @@ license="Apache Software License (Apache 2.0)", python_requires=">=3.7, <3.11", install_requires=requires, - tests_require=test_requires, author="Kedro", packages=["kedro_docker"], package_data={ From 76e477ac75ebd3d486b3729bf5aba187727591ef Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Mon, 13 Feb 2023 23:29:15 -0500 Subject: [PATCH 11/74] renaming dataset Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/__init__.py | 4 +- .../kedro_datasets/databricks/unity.py | 6 +- .../{test_unity_dataset.py => test_unity.py} | 60 +++++++++---------- 3 files changed, 35 insertions(+), 35 deletions(-) rename kedro-datasets/tests/databricks/{test_unity_dataset.py => test_unity.py} (87%) diff --git a/kedro-datasets/kedro_datasets/databricks/__init__.py b/kedro-datasets/kedro_datasets/databricks/__init__.py index 2fd3eccb9..313f3bdba 100644 --- a/kedro-datasets/kedro_datasets/databricks/__init__.py +++ b/kedro-datasets/kedro_datasets/databricks/__init__.py @@ -1,8 +1,8 @@ """Provides interface to Unity Catalog Tables.""" -__all__ = ["UnityTableDataSet"] +__all__ = ["ManagedTableDataSet"] from contextlib import suppress with suppress(ImportError): - from .unity import UnityTableDataSet + from .unity import ManagedTableDataSet diff --git a/kedro-datasets/kedro_datasets/databricks/unity.py b/kedro-datasets/kedro_datasets/databricks/unity.py index 8921fca1b..b6270f58c 100644 --- a/kedro-datasets/kedro_datasets/databricks/unity.py +++ b/kedro-datasets/kedro_datasets/databricks/unity.py @@ -15,8 +15,8 @@ logger = logging.getLogger(__name__) -class UnityTableDataSet(AbstractVersionedDataSet): - """``UnityTableDataSet`` loads data into Unity managed tables.""" +class ManagedTableDataSet(AbstractVersionedDataSet): + """``ManagedTableDataSet`` loads data into Unity managed tables.""" # this dataset cannot be used with 
``ParallelRunner``, # therefore it has the attribute ``_SINGLE_PROCESS = True`` @@ -41,7 +41,7 @@ def __init__( partition_columns: List[str] = None, # pylint: disable=unused-argument owner_group: str = None, ) -> None: - """Creates a new instance of ``UnityTableDataSet``.""" + """Creates a new instance of ``ManagedTableDataSet``.""" self._database = database self._catalog = catalog diff --git a/kedro-datasets/tests/databricks/test_unity_dataset.py b/kedro-datasets/tests/databricks/test_unity.py similarity index 87% rename from kedro-datasets/tests/databricks/test_unity_dataset.py rename to kedro-datasets/tests/databricks/test_unity.py index 3f29a1e95..471f81f57 100644 --- a/kedro-datasets/tests/databricks/test_unity_dataset.py +++ b/kedro-datasets/tests/databricks/test_unity.py @@ -3,7 +3,7 @@ from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.sql import DataFrame, SparkSession import pandas as pd -from kedro_datasets.databricks import UnityTableDataSet +from kedro_datasets.databricks import ManagedTableDataSet @pytest.fixture @@ -168,25 +168,25 @@ def expected_upsert_multiple_primary_spark_df(spark_session: SparkSession): return spark_session.createDataFrame(data, schema) -class TestUnityTableDataSet: +class TestManagedTableDataSet: def test_full_table(self): - unity_ds = UnityTableDataSet(catalog="test", database="test", table="test") + unity_ds = ManagedTableDataSet(catalog="test", database="test", table="test") assert unity_ds._full_table_address == "test.test.test" def test_database_table(self): - unity_ds = UnityTableDataSet(database="test", table="test") + unity_ds = ManagedTableDataSet(database="test", table="test") assert unity_ds._full_table_address == "test.test" def test_table_only(self): - unity_ds = UnityTableDataSet(table="test") + unity_ds = ManagedTableDataSet(table="test") assert unity_ds._full_table_address == "default.test" def test_table_missing(self): with pytest.raises(TypeError): - UnityTableDataSet() + ManagedTableDataSet() def test_describe(self): - unity_ds = UnityTableDataSet(table="test") + unity_ds = ManagedTableDataSet(table="test") assert unity_ds._describe() == { "catalog": None, "database": "default", @@ -199,18 +199,18 @@ def test_describe(self): def test_invalid_write_mode(self): with pytest.raises(DataSetError): - UnityTableDataSet(table="test", write_mode="invalid") + ManagedTableDataSet(table="test", write_mode="invalid") def test_dataframe_type(self): with pytest.raises(DataSetError): - UnityTableDataSet(table="test", dataframe_type="invalid") + ManagedTableDataSet(table="test", dataframe_type="invalid") def test_missing_primary_key_upsert(self): with pytest.raises(DataSetError): - UnityTableDataSet(table="test", write_mode="upsert") + ManagedTableDataSet(table="test", write_mode="upsert") def test_schema(self): - unity_ds = UnityTableDataSet( + unity_ds = ManagedTableDataSet( table="test", schema={ "fields": [ @@ -239,15 +239,15 @@ def test_schema(self): assert unity_ds._schema == expected_schema def test_catalog_exists(self): - unity_ds = UnityTableDataSet(catalog="test", database="invalid", table="test_not_there") + unity_ds = ManagedTableDataSet(catalog="test", database="invalid", table="test_not_there") assert not unity_ds._exists() def test_table_does_not_exist(self): - unity_ds = UnityTableDataSet(database="invalid", table="test_not_there") + unity_ds = ManagedTableDataSet(database="invalid", table="test_not_there") assert not unity_ds._exists() def test_save_default(self, sample_spark_df: DataFrame): - 
unity_ds = UnityTableDataSet(database="test", table="test_save") + unity_ds = ManagedTableDataSet(database="test", table="test_save") unity_ds.save(sample_spark_df) saved_table = unity_ds.load() assert unity_ds.exists() and sample_spark_df.exceptAll(saved_table).count() == 0 @@ -255,7 +255,7 @@ def test_save_default(self, sample_spark_df: DataFrame): def test_save_schema_spark( self, subset_spark_df: DataFrame, subset_expected_df: DataFrame ): - unity_ds = UnityTableDataSet( + unity_ds = ManagedTableDataSet( database="test", table="test_save_spark_schema", schema={ @@ -283,7 +283,7 @@ def test_save_schema_spark( def test_save_schema_pandas( self, subset_pandas_df: pd.DataFrame, subset_expected_df: DataFrame ): - unity_ds = UnityTableDataSet( + unity_ds = ManagedTableDataSet( database="test", table="test_save_pd_schema", schema={ @@ -306,7 +306,7 @@ def test_save_schema_pandas( dataframe_type="pandas", ) unity_ds.save(subset_pandas_df) - saved_ds = UnityTableDataSet( + saved_ds = ManagedTableDataSet( database="test", table="test_save_pd_schema", ) @@ -316,7 +316,7 @@ def test_save_schema_pandas( def test_save_overwrite( self, sample_spark_df: DataFrame, append_spark_df: DataFrame ): - unity_ds = UnityTableDataSet(database="test", table="test_save") + unity_ds = ManagedTableDataSet(database="test", table="test_save") unity_ds.save(sample_spark_df) unity_ds.save(append_spark_df) @@ -330,7 +330,7 @@ def test_save_append( append_spark_df: DataFrame, expected_append_spark_df: DataFrame, ): - unity_ds = UnityTableDataSet( + unity_ds = ManagedTableDataSet( database="test", table="test_save_append", write_mode="append" ) unity_ds.save(sample_spark_df) @@ -346,7 +346,7 @@ def test_save_upsert( upsert_spark_df: DataFrame, expected_upsert_spark_df: DataFrame, ): - unity_ds = UnityTableDataSet( + unity_ds = ManagedTableDataSet( database="test", table="test_save_upsert", write_mode="upsert", @@ -365,7 +365,7 @@ def test_save_upsert_multiple_primary( upsert_spark_df: DataFrame, expected_upsert_multiple_primary_spark_df: DataFrame, ): - unity_ds = UnityTableDataSet( + unity_ds = ManagedTableDataSet( database="test", table="test_save_upsert_multiple", write_mode="upsert", @@ -386,7 +386,7 @@ def test_save_upsert_mismatched_columns( sample_spark_df: DataFrame, mismatched_upsert_spark_df: DataFrame, ): - unity_ds = UnityTableDataSet( + unity_ds = ManagedTableDataSet( database="test", table="test_save_upsert_mismatch", write_mode="upsert", @@ -397,10 +397,10 @@ def test_save_upsert_mismatched_columns( unity_ds.save(mismatched_upsert_spark_df) def test_load_spark(self, sample_spark_df: DataFrame): - unity_ds = UnityTableDataSet(database="test", table="test_load_spark") + unity_ds = ManagedTableDataSet(database="test", table="test_load_spark") unity_ds.save(sample_spark_df) - delta_ds = UnityTableDataSet(database="test", table="test_load_spark") + delta_ds = ManagedTableDataSet(database="test", table="test_load_spark") delta_table = delta_ds.load() assert ( @@ -409,23 +409,23 @@ def test_load_spark(self, sample_spark_df: DataFrame): ) def test_load_spark_no_version(self, sample_spark_df: DataFrame): - unity_ds = UnityTableDataSet(database="test", table="test_load_spark") + unity_ds = ManagedTableDataSet(database="test", table="test_load_spark") unity_ds.save(sample_spark_df) - delta_ds = UnityTableDataSet( + delta_ds = ManagedTableDataSet( database="test", table="test_load_spark", version=2 ) with pytest.raises(VersionNotFoundError): _ = delta_ds.load() def test_load_version(self, sample_spark_df: DataFrame, 
append_spark_df: DataFrame): - unity_ds = UnityTableDataSet( + unity_ds = ManagedTableDataSet( database="test", table="test_load_version", write_mode="append" ) unity_ds.save(sample_spark_df) unity_ds.save(append_spark_df) - loaded_ds = UnityTableDataSet( + loaded_ds = ManagedTableDataSet( database="test", table="test_load_version", version=0 ) loaded_df = loaded_ds.load() @@ -433,12 +433,12 @@ def test_load_version(self, sample_spark_df: DataFrame, append_spark_df: DataFra assert loaded_df.exceptAll(sample_spark_df).count() == 0 def test_load_pandas(self, sample_pandas_df: pd.DataFrame): - unity_ds = UnityTableDataSet( + unity_ds = ManagedTableDataSet( database="test", table="test_load_pandas", dataframe_type="pandas" ) unity_ds.save(sample_pandas_df) - pandas_ds = UnityTableDataSet( + pandas_ds = ManagedTableDataSet( database="test", table="test_load_pandas", dataframe_type="pandas" ) pandas_df = pandas_ds.load().sort_values("name", ignore_index=True) From d0542fcfb8cc7bf945c4ec3ea5543730c1a2d171 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Thu, 23 Feb 2023 13:49:30 -0500 Subject: [PATCH 12/74] adding mlflow connectors Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/__init__.py | 3 +- .../databricks/mlflow/__init__.py | 6 + .../databricks/mlflow/artifact.py | 133 +++++++++++++++ .../databricks/mlflow/common.py | 89 ++++++++++ .../databricks/mlflow/dataset.py | 80 +++++++++ .../databricks/mlflow/flavors/__init__.py | 0 .../mlflow/flavors/kedro_dataset_flavor.py | 154 ++++++++++++++++++ .../databricks/mlflow/metrics.py | 93 +++++++++++ .../kedro_datasets/databricks/mlflow/model.py | 75 +++++++++ .../databricks/mlflow/model_metadata.py | 49 ++++++ .../kedro_datasets/databricks/mlflow/tags.py | 94 +++++++++++ .../databricks/unity/__init__.py | 1 + .../managed_table_dataset.py} | 1 + kedro-datasets/setup.py | 2 +- 14 files changed, 778 insertions(+), 2 deletions(-) create mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/__init__.py create mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/artifact.py create mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/common.py create mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/dataset.py create mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/flavors/__init__.py create mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/flavors/kedro_dataset_flavor.py create mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/metrics.py create mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/model.py create mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/model_metadata.py create mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/tags.py create mode 100644 kedro-datasets/kedro_datasets/databricks/unity/__init__.py rename kedro-datasets/kedro_datasets/databricks/{unity.py => unity/managed_table_dataset.py} (99%) diff --git a/kedro-datasets/kedro_datasets/databricks/__init__.py b/kedro-datasets/kedro_datasets/databricks/__init__.py index 313f3bdba..ec9d4b45d 100644 --- a/kedro-datasets/kedro_datasets/databricks/__init__.py +++ b/kedro-datasets/kedro_datasets/databricks/__init__.py @@ -1,8 +1,9 @@ """Provides interface to Unity Catalog Tables.""" -__all__ = ["ManagedTableDataSet"] +__all__ = ["ManagedTableDataSet", "MLFlowModel", "MLFlowArtifact", "MLFlowDataSet", "MLFlowMetrics", "MLFlowModelMetadata", "MLFlowTags"] from contextlib import suppress with suppress(ImportError): from .unity import ManagedTableDataSet + from .mlflow import MLFlowModel, 
MLFlowArtifact, MLFlowDataSet, MLFlowMetrics, MLFlowModelMetadata, MLFlowTags
diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/__init__.py b/kedro-datasets/kedro_datasets/databricks/mlflow/__init__.py
new file mode 100644
index 000000000..f4cc1567a
--- /dev/null
+++ b/kedro-datasets/kedro_datasets/databricks/mlflow/__init__.py
@@ -0,0 +1,6 @@
+from .artifact import MLFlowArtifact
+from .dataset import MLFlowDataSet
+from .metrics import MLFlowMetrics
+from .model_metadata import MLFlowModelMetadata
+from .tags import MLFlowTags
+from .model import MLFlowModel
\ No newline at end of file
diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/artifact.py b/kedro-datasets/kedro_datasets/databricks/mlflow/artifact.py
new file mode 100644
index 000000000..15691db43
--- /dev/null
+++ b/kedro-datasets/kedro_datasets/databricks/mlflow/artifact.py
@@ -0,0 +1,133 @@
+import logging
+import os
+from pathlib import Path
+from tempfile import mkdtemp
+from typing import Any, Dict
+
+import mlflow
+from kedro.io.core import AbstractDataSet
+from kedro.utils import load_obj as load_dataset
+from mlflow.exceptions import MlflowException
+from mlflow.tracking.artifact_utils import _download_artifact_from_uri
+
+from .common import MLFLOW_RUN_ID_ENV_VAR, ModelOpsException
+
+logger = logging.getLogger(__name__)
+
+
+class MLFlowArtifact(AbstractDataSet):
+    def __init__(
+        self,
+        dataset_name: str,
+        dataset_type: str,
+        dataset_args: Dict[str, Any] = None,
+        *,
+        file_suffix: str,
+        run_id: str = None,
+        registered_model_name: str = None,
+        registered_model_version: str = None,
+    ):
+        """
+        Log arbitrary Kedro datasets as mlflow artifacts
+
+        Args:
+            dataset_name: dataset name as it should appear on mlflow run
+            dataset_type: full kedro dataset class name (incl. module)
+            dataset_args: kedro dataset args
+            file_suffix: file extension as it should appear on mlflow run
+            run_id: mlflow run-id, this should only be used when loading a
+                dataset saved from a run which is different from the active run
+            registered_model_name: mlflow registered model name, this should
+                only be used when loading an artifact linked to a model of
+                interest (i.e. back tracing artifacts from the run corresponding
+                to the model)
+            registered_model_version: mlflow registered model version, should be
+                used in combination with `registered_model_name`
+
+        `run_id` and `registered_model_name` can't be specified together. 
+ """ + if None in (registered_model_name, registered_model_version): + if registered_model_name or registered_model_version: + raise ModelOpsException( + "'registered_model_name' and " + "'registered_model_version' should be " + "set together" + ) + + if run_id and registered_model_name: + raise ModelOpsException( + "'run_id' cannot be passed when " "'registered_model_name' is set" + ) + + self._dataset_name = dataset_name + self._dataset_type = dataset_type + self._dataset_args = dataset_args or {} + self._file_suffix = file_suffix + self._run_id = run_id or os.environ.get(MLFLOW_RUN_ID_ENV_VAR) + self._registered_model_name = registered_model_name + self._registered_model_version = registered_model_version + + self._artifact_path = f"{dataset_name}{self._file_suffix}" + + self._filepath = Path(mkdtemp()) / self._artifact_path + + if registered_model_name: + self._version = f"{registered_model_name}/{registered_model_version}" + else: + self._version = run_id + + def _save(self, data: Any) -> None: + cls = load_dataset(self._dataset_type) + ds = cls(filepath=self._filepath.as_posix(), **self._dataset_args) + ds.save(data) + + filepath = self._filepath.as_posix() + if os.path.isdir(filepath): + mlflow.log_artifacts(self._filepath.as_posix(), self._artifact_path) + elif os.path.isfile(filepath): + mlflow.log_artifact(self._filepath.as_posix()) + else: + raise RuntimeError("cls.save() didn't work. Unexpected error.") + + run_id = mlflow.active_run().info.run_id + if self._version is not None: + logger.warning( + f"Ignoring version {self._version} set " + f"earlier, will use version='{run_id}' for loading" + ) + self._version = run_id + + def _load(self) -> Any: + if self._version is None: + msg = ( + "Could not determine the version to load. " + "Please specify either 'run_id' or 'registered_model_name' " + "along with 'registered_model_version' explicitly in " + "MLFlowArtifact constructor" + ) + raise MlflowException(msg) + + if "/" in self._version: + model_uri = f"models:/{self._version}" + model = mlflow.pyfunc.load_model(model_uri) + run_id = model._model_meta.run_id + else: + run_id = self._version + + local_path = _download_artifact_from_uri( + f"runs:/{run_id}/{self._artifact_path}" + ) + + cls = load_dataset(self._dataset_type) + ds = cls(filepath=local_path, **self._dataset_args) + return ds.load() + + def _describe(self) -> Dict[str, Any]: + return dict( + dataset_name=self._dataset_name, + dataset_type=self._dataset_type, + dataset_args=self._dataset_args, + file_suffix=self._file_suffix, + registered_model_name=self._registered_model_name, + registered_model_version=self._registered_model_version, + ) diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/common.py b/kedro-datasets/kedro_datasets/databricks/mlflow/common.py new file mode 100644 index 000000000..af102d6b3 --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/mlflow/common.py @@ -0,0 +1,89 @@ +import mlflow +from mlflow.tracking import MlflowClient + +MLFLOW_RUN_ID_ENV_VAR = "mlflow_run_id" + + +def parse_model_uri(model_uri): + parts = model_uri.split("/") + + if len(parts) < 2 or len(parts) > 3: + raise ValueError( + f"model uri should have the format " + f"'models:/' or " + f"'models://', got {model_uri}" + ) + + if parts[0] == "models:": + protocol = "models" + else: + raise ValueError("model uri should start with `models:/`, got %s", model_uri) + + name = parts[1] + + client = MlflowClient() + if len(parts) == 2: + results = client.search_model_versions(f"name='{name}'") + sorted_results = 
sorted( + results, + key=lambda modelversion: modelversion.creation_timestamp, + reverse=True, + ) + latest_version = sorted_results[0].version + version = latest_version + else: + version = parts[2] + if version in ["Production", "Staging", "Archived"]: + results = client.get_latest_versions(name, stages=[version]) + if len(results) > 0: + version = results[0].version + else: + version = None + + return protocol, name, version + + +def promote_model(model_name, model_version, stage): + import datetime + + now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + client = MlflowClient() + + new_model_uri = f"models:/{model_name}/{model_version}" + _, _, new_model_version = parse_model_uri(new_model_uri) + new_model = mlflow.pyfunc.load_model(new_model_uri) + new_model_runid = new_model._model_meta.run_id + + msg = f"```Promoted version {model_version} to {stage}, at {now}```" + client.set_tag(new_model_runid, "mlflow.note.content", msg) + client.set_tag(new_model_runid, "Promoted at", now) + + results = client.get_latest_versions(model_name, stages=[stage]) + if len(results) > 0: + old_model_uri = f"models:/{model_name}/{stage}" + _, _, old_model_version = parse_model_uri(old_model_uri) + old_model = mlflow.pyfunc.load_model(old_model_uri) + old_model_runid = old_model._model_meta.run_id + + client.set_tag( + old_model._model_meta.run_id, + "mlflow.note.content", + f"```Replaced by version {new_model_version}, at {now}```", + ) + client.set_tag(old_model_runid, "Retired at", now) + client.set_tag(old_model_runid, "Replaced by", new_model_version) + + client.set_tag(new_model_runid, "Replaces", old_model_version) + + client.transition_model_version_stage( + name=model_name, version=old_model_version, stage="Archived" + ) + + client.transition_model_version_stage( + name=model_name, version=new_model_version, stage=stage + ) + + +class ModelOpsException(Exception): + pass diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/dataset.py b/kedro-datasets/kedro_datasets/databricks/mlflow/dataset.py new file mode 100644 index 000000000..ee0a1e0ed --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/mlflow/dataset.py @@ -0,0 +1,80 @@ +import importlib +import logging +from typing import Any, Dict + +from kedro.io.core import AbstractDataSet + +from .common import ModelOpsException, parse_model_uri + +logger = logging.getLogger(__name__) + + +class MLFlowDataSet(AbstractDataSet): + def __init__( + self, + flavor: str, + dataset_name: str = None, + dataset_type: str = None, + dataset_args: Dict[str, Any] = None, + *, + file_suffix: str = None, + load_version: str = None, + ): + self._flavor = flavor + self._dataset_name = dataset_name + self._dataset_type = dataset_type + self._dataset_args = dataset_args or {} + self._file_suffix = file_suffix + self._load_version = load_version + + def _save(self, model: Any) -> None: + if self._load_version is not None: + msg = ( + f"Trying to save an MLFlowDataSet::{self._describe} which " + f"was initialized with load_version={self._load_version}. " + f"This can lead to inconsistency between saved and loaded " + f"versions, therefore disallowed. Please create separate " + f"catalog entries for saved and loaded datasets." 
+ ) + raise ModelOpsException(msg) + + importlib.import_module(self._flavor).log_model( + model, + self._dataset_name, + registered_model_name=self._dataset_name, + dataset_type=self._dataset_type, + dataset_args=self._dataset_args, + file_suffix=self._file_suffix, + ) + + def _load(self) -> Any: + *_, latest_version = parse_model_uri(f"models:/{self._dataset_name}") + + dataset_version = self._load_version or latest_version + *_, dataset_version = parse_model_uri( + f"models:/{self._dataset_name}/{dataset_version}" + ) + + logger.info(f"Loading model '{self._dataset_name}' version '{dataset_version}'") + + if dataset_version != latest_version: + logger.warning(f"Newer version {latest_version} exists in repo") + + model = importlib.import_module(self._flavor).load_model( + f"models:/{self._dataset_name}/{dataset_version}", + dataset_type=self._dataset_type, + dataset_args=self._dataset_args, + file_suffix=self._file_suffix, + ) + + return model + + def _describe(self) -> Dict[str, Any]: + return dict( + flavor=self._flavor, + dataset_name=self._dataset_name, + dataset_type=self._dataset_type, + dataset_args=self._dataset_args, + file_suffix=self._file_suffix, + load_version=self._load_version, + ) diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/flavors/__init__.py b/kedro-datasets/kedro_datasets/databricks/mlflow/flavors/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/flavors/kedro_dataset_flavor.py b/kedro-datasets/kedro_datasets/databricks/mlflow/flavors/kedro_dataset_flavor.py new file mode 100644 index 000000000..e0a43a1b0 --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/mlflow/flavors/kedro_dataset_flavor.py @@ -0,0 +1,154 @@ +import os +import sys +from pathlib import Path +from typing import Any, Dict, Union + +import kedro +import yaml +from kedro.utils import load_obj as load_dataset +from mlflow import pyfunc +from mlflow.exceptions import MlflowException +from mlflow.models import Model +from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS +from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow.utils.environment import _mlflow_conda_env +from mlflow.utils.model_utils import _get_flavor_configuration + +FLAVOR_NAME = "kedro_dataset" + + +DEFAULT_CONDA_ENV = _mlflow_conda_env( + additional_conda_deps=["kedro[all]={}".format(kedro.__version__)], + additional_pip_deps=None, + additional_conda_channels=None, +) + + +def save_model( + data: Any, + path: str, + conda_env: Union[str, Dict[str, Any]] = None, + mlflow_model: Model = Model(), + *, + dataset_type: str, + dataset_args: Dict[str, Any], + file_suffix: str, +): + if os.path.exists(path): + raise RuntimeError("Path '{}' already exists".format(path)) + os.makedirs(path) + + model_data_subpath = f"data.{file_suffix}" + model_data_path = os.path.join(path, model_data_subpath) + + cls = load_dataset(dataset_type) + ds = cls(filepath=model_data_path, **dataset_args) + ds.save(data) + + conda_env_subpath = "conda.yaml" + if conda_env is None: + conda_env = DEFAULT_CONDA_ENV + elif not isinstance(conda_env, dict): + with open(conda_env, "r") as f: + conda_env = yaml.safe_load(f) + with open(os.path.join(path, conda_env_subpath), "w") as f: + yaml.safe_dump(conda_env, stream=f, default_flow_style=False) + + pyfunc.add_to_model( + mlflow_model, + loader_module=__name__, + data=model_data_subpath, + env=conda_env_subpath, + ) + + mlflow_model.add_flavor( + FLAVOR_NAME, + 
data=model_data_subpath, + dataset_type=dataset_type, + dataset_args=dataset_args, + file_suffix=file_suffix, + ) + mlflow_model.save(os.path.join(path, "MLmodel")) + + +def log_model( + model: Any, + artifact_path: str, + conda_env: Dict[str, Any] = None, + registered_model_name: str = None, + await_registration_for: int = DEFAULT_AWAIT_MAX_SLEEP_SECONDS, + *, + dataset_type: str, + dataset_args: Dict[str, Any], + file_suffix: str, +): + return Model.log( + artifact_path=artifact_path, + flavor=sys.modules[__name__], + registered_model_name=registered_model_name, + await_registration_for=await_registration_for, + data=model, + conda_env=conda_env, + dataset_type=dataset_type, + dataset_args=dataset_args, + file_suffix=file_suffix, + ) + + +def _load_model_from_local_file( + local_path: str, + *, + dataset_type: str = None, + dataset_args: Dict[str, Any] = None, + file_suffix: str = None, +): + if dataset_type is not None: + model_data_subpath = f"data.{file_suffix}" + data_path = os.path.join(local_path, model_data_subpath) + else: + flavor_conf = _get_flavor_configuration( + model_path=local_path, flavor_name=FLAVOR_NAME + ) + data_path = os.path.join(local_path, flavor_conf["data"]) + dataset_type = flavor_conf["dataset_type"] + dataset_args = flavor_conf["dataset_args"] + + cls = load_dataset(dataset_type) + ds = cls(filepath=data_path, **dataset_args) + return ds.load() + + +def load_model( + model_uri: str, + *, + dataset_type: str = None, + dataset_args: Dict[str, Any] = None, + file_suffix: str = None, +): + if dataset_type is not None or dataset_args is not None or file_suffix is not None: + assert ( + dataset_type is not None + and dataset_args is not None + and file_suffix is not None + ), ("Please set 'dataset_type', " "'dataset_args' and 'file_suffix'") + + local_path = _download_artifact_from_uri(model_uri) + return _load_model_from_local_file( + local_path, + dataset_type=dataset_type, + dataset_args=dataset_args, + file_suffix=file_suffix, + ) + + +def _load_pyfunc(model_file: str): + local_path = Path(model_file).parent.absolute() + model = _load_model_from_local_file(local_path) + if not hasattr(model, "predict"): + try: + setattr(model, "predict", None) + except AttributeError: + raise MlflowException( + f"`pyfunc` flavor not supported, use " f"{__name__}.load instead" + ) + return model diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/metrics.py b/kedro-datasets/kedro_datasets/databricks/mlflow/metrics.py new file mode 100644 index 000000000..1c7760375 --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/mlflow/metrics.py @@ -0,0 +1,93 @@ +import logging +from typing import Any, Dict, Union + +import mlflow +from kedro.io.core import AbstractDataSet +from mlflow.exceptions import MlflowException +from mlflow.tracking import MlflowClient + +from .common import ModelOpsException + +logger = logging.getLogger(__name__) + + +class MLFlowMetrics(AbstractDataSet): + def __init__( + self, + prefix: str = None, + run_id: str = None, + registered_model_name: str = None, + registered_model_version: str = None, + ): + if None in (registered_model_name, registered_model_version): + if registered_model_name or registered_model_version: + raise ModelOpsException( + "'registered_model_name' and " + "'registered_model_version' should be " + "set together" + ) + + if run_id and registered_model_name: + raise ModelOpsException( + "'run_id' cannot be passed when " "'registered_model_name' is set" + ) + + self._prefix = prefix + self._run_id = run_id + 
self._registered_model_name = registered_model_name + self._registered_model_version = registered_model_version + + if registered_model_name: + self._version = f"{registered_model_name}/{registered_model_version}" + else: + self._version = run_id + + def _save(self, metrics: Dict[str, Union[str, float, int]]) -> None: + if self._prefix is not None: + metrics = {f"{self._prefix}_{key}": value for key, value in metrics.items()} + mlflow.log_metrics(metrics) + + run_id = mlflow.active_run().info.run_id + if self._version is not None: + logger.warning( + f"Ignoring version {self._version.save} set " + f"earlier, will use version='{run_id}' for loading" + ) + self._version = run_id + + def _load(self) -> Any: + if self._version is None: + msg = ( + "Could not determine the version to load. " + "Please specify either 'run_id' or 'registered_model_name' " + "along with 'registered_model_version' explicitly in " + "MLFlowMetrics constructor" + ) + raise MlflowException(msg) + + client = MlflowClient() + + if "/" in self._version: + model_uri = f"models:/{self._version}" + model = mlflow.pyfunc.load_model(model_uri) + run_id = model._model_meta.run_id + else: + run_id = self._version + + run = client.get_run(run_id) + metrics = run.data.metrics + if self._prefix is not None: + metrics = { + key[len(self._prefix) + 1 :]: value + for key, value in metrics.items() + if key[: len(self._prefix)] == self._prefix + } + return metrics + + def _describe(self) -> Dict[str, Any]: + return dict( + prefix=self._prefix, + run_id=self._run_id, + registered_model_name=self._registered_model_name, + registered_model_version=self._registered_model_version, + ) diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/model.py b/kedro-datasets/kedro_datasets/databricks/mlflow/model.py new file mode 100644 index 000000000..c5f2356a2 --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/mlflow/model.py @@ -0,0 +1,75 @@ +import importlib +import logging +from typing import Any, Dict + +from kedro.io.core import AbstractDataSet +from mlflow.models.signature import ModelSignature + +from .common import ModelOpsException, parse_model_uri + +logger = logging.getLogger(__name__) + + +class MLFlowModel(AbstractDataSet): + def __init__( + self, + flavor: str, + model_name: str, + signature: Dict[str, Dict[str, str]] = None, + input_example: Dict[str, Any] = None, + load_version: str = None, + ): + self._flavor = flavor + self._model_name = model_name + + if signature: + self._signature = ModelSignature.from_dict(signature) + else: + self._signature = None + self._input_example = input_example + + self._load_version = load_version + + def _save(self, model: Any) -> None: + if self._load_version is not None: + msg = ( + f"Trying to save an MLFlowModel::{self._describe} which " + f"was initialized with load_version={self._load_version}. " + f"This can lead to inconsistency between saved and loaded " + f"versions, therefore disallowed. Please create separate " + f"catalog entries for saved and loaded datasets." 
+ ) + raise ModelOpsException(msg) + + importlib.import_module(self._flavor).log_model( + model, + self._model_name, + registered_model_name=self._model_name, + signature=self._signature, + input_example=self._input_example, + ) + + def _load(self) -> Any: + *_, latest_version = parse_model_uri(f"models:/{self._model_name}") + + model_version = self._load_version or latest_version + + logger.info(f"Loading model '{self._model_name}' version '{model_version}'") + + if model_version != latest_version: + logger.warning(f"Newer version {latest_version} exists in repo") + + model = importlib.import_module(self._flavor).load_model( + f"models:/{self._model_name}/{model_version}" + ) + + return model + + def _describe(self) -> Dict[str, Any]: + return dict( + flavor=self._flavor, + model_name=self._model_name, + signature=self._signature, + input_example=self._input_example, + load_version=self._load_version, + ) diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/model_metadata.py b/kedro-datasets/kedro_datasets/databricks/mlflow/model_metadata.py new file mode 100644 index 000000000..3c160cec4 --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/mlflow/model_metadata.py @@ -0,0 +1,49 @@ +import logging +from typing import Any, Dict, Union + +import mlflow +from kedro.io.core import AbstractDataSet + +from .common import ModelOpsException, parse_model_uri + +logger = logging.getLogger(__name__) + + +class MLFlowModelMetadata(AbstractDataSet): + def __init__( + self, registered_model_name: str, registered_model_version: str = None + ): + self._model_name = registered_model_name + self._model_version = registered_model_version + + def _save(self, tags: Dict[str, Union[str, float, int]]) -> None: + raise NotImplementedError() + + def _load(self) -> Any: + if self._model_version is None: + model_uri = f"models:/{self._model_name}" + else: + model_uri = f"models:/{self._model_name}/{self._model_version}" + _, _, load_version = parse_model_uri(model_uri) + + if load_version is None: + raise ModelOpsException( + f"No model with version " f"'{self._model_version}'" + ) + + pyfunc_model = mlflow.pyfunc.load_model( + f"models:/{self._model_name}/{load_version}" + ) + all_metadata = pyfunc_model._model_meta + model_metadata = { + "model_name": self._model_name, + "model_version": int(load_version), + "run_id": all_metadata.run_id, + } + return model_metadata + + def _describe(self) -> Dict[str, Any]: + return dict( + registered_model_name=self._model_name, + registered_model_version=self._model_version, + ) diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/tags.py b/kedro-datasets/kedro_datasets/databricks/mlflow/tags.py new file mode 100644 index 000000000..153810ae4 --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/mlflow/tags.py @@ -0,0 +1,94 @@ +import logging +from typing import Any, Dict, Union + +import mlflow +from kedro.io.core import AbstractDataSet +from mlflow.exceptions import MlflowException +from mlflow.tracking import MlflowClient + +from .common import ModelOpsException + +logger = logging.getLogger(__name__) + + +class MLFlowTags(AbstractDataSet): + def __init__( + self, + prefix: str = None, + run_id: str = None, + registered_model_name: str = None, + registered_model_version: str = None, + ): + if None in (registered_model_name, registered_model_version): + if registered_model_name or registered_model_version: + raise ModelOpsException( + "'registered_model_name' and " + "'registered_model_version' should be " + "set together" + ) + + if run_id and 
registered_model_name: + raise ModelOpsException( + "'run_id' cannot be passed when " "'registered_model_name' is set" + ) + + self._prefix = prefix + self._run_id = run_id + self._registered_model_name = registered_model_name + self._registered_model_version = registered_model_version + + if registered_model_name: + self._version = f"{registered_model_name}/{registered_model_version}" + else: + self._version = run_id + + def _save(self, tags: Dict[str, Union[str, float, int]]) -> None: + if self._prefix is not None: + tags = {f"{self._prefix}_{key}": value for key, value in tags.items()} + + mlflow.set_tags(tags) + + run_id = mlflow.active_run().info.run_id + if self._version is not None: + logger.warning( + f"Ignoring version {self._version.save} set " + f"earlier, will use version='{run_id}' for loading" + ) + self._version = run_id + + def _load(self) -> Any: + if self._version is None: + msg = ( + "Could not determine the version to load. " + "Please specify either 'run_id' or 'registered_model_name' " + "along with 'registered_model_version' explicitly in " + "MLFlowTags constructor" + ) + raise MlflowException(msg) + + client = MlflowClient() + + if "/" in self._version: + model_uri = f"models:/{self._version}" + model = mlflow.pyfunc.load_model(model_uri) + run_id = model._model_meta.run_id + else: + run_id = self._version + + run = client.get_run(run_id) + tags = run.data.tags + if self._prefix is not None: + tags = { + key[len(self._prefix) + 1 :]: value + for key, value in tags.items() + if key[: len(self._prefix)] == self._prefix + } + return tags + + def _describe(self) -> Dict[str, Any]: + return dict( + prefix=self._prefix, + run_id=self._run_id, + registered_model_name=self._registered_model_name, + registered_model_version=self._registered_model_version, + ) diff --git a/kedro-datasets/kedro_datasets/databricks/unity/__init__.py b/kedro-datasets/kedro_datasets/databricks/unity/__init__.py new file mode 100644 index 000000000..ab452e146 --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/unity/__init__.py @@ -0,0 +1 @@ +from .managed_table_dataset import ManagedTableDataSet \ No newline at end of file diff --git a/kedro-datasets/kedro_datasets/databricks/unity.py b/kedro-datasets/kedro_datasets/databricks/unity/managed_table_dataset.py similarity index 99% rename from kedro-datasets/kedro_datasets/databricks/unity.py rename to kedro-datasets/kedro_datasets/databricks/unity/managed_table_dataset.py index b6270f58c..b46122197 100644 --- a/kedro-datasets/kedro_datasets/databricks/unity.py +++ b/kedro-datasets/kedro_datasets/databricks/unity/managed_table_dataset.py @@ -182,6 +182,7 @@ def _describe(self) -> Dict[str, str]: dataframe_type=self._dataframe_type, primary_key=self._primary_key, version=self._version, + owner_group=self._owner_group, ) def _exists(self) -> bool: diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index cf02edd0f..48e5eec34 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -38,7 +38,7 @@ def _collect_requirements(requires): api_require = {"api.APIDataSet": ["requests~=2.20"]} biosequence_require = {"biosequence.BioSequenceDataSet": ["biopython~=1.73"]} dask_require = {"dask.ParquetDataSet": ["dask[complete]~=2021.10", "triad>=0.6.7, <1.0"]} -databricks_require = {"databricks.UnityTableDataSet": [SPARK]} +databricks_require = {"databricks.ManagedTableDataSet": [SPARK]} geopandas_require = { "geopandas.GeoJSONDataSet": ["geopandas>=0.6.0, <1.0", "pyproj~=3.0"] } From b2a03b6549f5e871950aa6fbf6830de3a48fcb30 Mon 
Sep 17 00:00:00 2001 From: Danny Farah Date: Thu, 23 Feb 2023 17:21:24 -0500 Subject: [PATCH 13/74] fixing mlflow imports Signed-off-by: Danny Farah --- kedro-datasets/kedro_datasets/databricks/__init__.py | 9 ++------- .../kedro_datasets/databricks/mlflow/__init__.py | 2 +- kedro-datasets/setup.py | 10 +++++++++- kedro-datasets/test_requirements.txt | 1 + 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/kedro-datasets/kedro_datasets/databricks/__init__.py b/kedro-datasets/kedro_datasets/databricks/__init__.py index ec9d4b45d..cba69d17c 100644 --- a/kedro-datasets/kedro_datasets/databricks/__init__.py +++ b/kedro-datasets/kedro_datasets/databricks/__init__.py @@ -1,9 +1,4 @@ """Provides interface to Unity Catalog Tables.""" -__all__ = ["ManagedTableDataSet", "MLFlowModel", "MLFlowArtifact", "MLFlowDataSet", "MLFlowMetrics", "MLFlowModelMetadata", "MLFlowTags"] - -from contextlib import suppress - -with suppress(ImportError): - from .unity import ManagedTableDataSet - from .mlflow import MLFlowModel, MLFlowArtifact, MLFlowDataSet, MLFlowMetrics, MLFlowModelMetadata, MLFlowTags +from .unity import ManagedTableDataSet +from .mlflow import MLFlowModel, MLFlowArtifact, MLFlowDataSet, MLFlowMetrics, MLFlowModelMetadata, MLFlowTags diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/__init__.py b/kedro-datasets/kedro_datasets/databricks/mlflow/__init__.py index f4cc1567a..1c3babc0f 100644 --- a/kedro-datasets/kedro_datasets/databricks/mlflow/__init__.py +++ b/kedro-datasets/kedro_datasets/databricks/mlflow/__init__.py @@ -3,4 +3,4 @@ from .metrics import MLFlowMetrics from .model_metadata import MLFlowModelMetadata from .tags import MLFlowTags -from .model import MLFlowModel \ No newline at end of file +from .model import MLFlowModel diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 48e5eec34..4b9a05f1a 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -38,7 +38,15 @@ def _collect_requirements(requires): api_require = {"api.APIDataSet": ["requests~=2.20"]} biosequence_require = {"biosequence.BioSequenceDataSet": ["biopython~=1.73"]} dask_require = {"dask.ParquetDataSet": ["dask[complete]~=2021.10", "triad>=0.6.7, <1.0"]} -databricks_require = {"databricks.ManagedTableDataSet": [SPARK]} +databricks_require = { + "databricks.ManagedTableDataSet": [SPARK, PANDAS], + "databricks.MLFlowModel":[SPARK, PANDAS, "mlflow>=2.0.0"], + "databricks.MLFlowArtifact":[SPARK, PANDAS, "mlflow>=2.0.0"], + "databricks.MLFlowDataSet":[SPARK, PANDAS, "mlflow>=2.0.0"], + "databricks.MLFlowMetrics":[SPARK, PANDAS, "mlflow>=2.0.0"], + "databricks.MLFlowModelMetadata":[SPARK, PANDAS, "mlflow>=2.0.0"], + "databricks.MLFlowTags":[SPARK, PANDAS, "mlflow>=2.0.0"] +} geopandas_require = { "geopandas.GeoJSONDataSet": ["geopandas>=0.6.0, <1.0", "pyproj~=3.0"] } diff --git a/kedro-datasets/test_requirements.txt b/kedro-datasets/test_requirements.txt index 8dec3619b..b4424ce7d 100644 --- a/kedro-datasets/test_requirements.txt +++ b/kedro-datasets/test_requirements.txt @@ -24,6 +24,7 @@ lxml~=4.6 matplotlib>=3.0.3, <3.4; python_version < '3.10' # 3.4.0 breaks holoviews matplotlib>=3.5, <3.6; python_version == '3.10' memory_profiler>=0.50.0, <1.0 +mlflow==2.2.1 moto==1.3.7; python_version < '3.10' moto==3.0.4; python_version == '3.10' networkx~=2.4 From 201fc8a07e24fd2fd79f34c29dd1d7a8509534f2 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 8 Mar 2023 14:26:57 -0500 Subject: [PATCH 14/74] cleaned up mlflow for initial release Signed-off-by: Danny Farah --- 
.../kedro_datasets/databricks/__init__.py | 1 - .../databricks/mlflow/__init__.py | 6 - .../databricks/mlflow/artifact.py | 133 --------------- .../databricks/mlflow/common.py | 89 ---------- .../databricks/mlflow/dataset.py | 80 --------- .../databricks/mlflow/flavors/__init__.py | 0 .../mlflow/flavors/kedro_dataset_flavor.py | 154 ------------------ .../databricks/mlflow/metrics.py | 93 ----------- .../kedro_datasets/databricks/mlflow/model.py | 75 --------- .../databricks/mlflow/model_metadata.py | 49 ------ .../kedro_datasets/databricks/mlflow/tags.py | 94 ----------- .../databricks/unity/managed_table_dataset.py | 29 ++-- kedro-datasets/tests/databricks/conftest.py | 1 - kedro-datasets/tests/databricks/test_unity.py | 7 +- 14 files changed, 16 insertions(+), 795 deletions(-) delete mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/__init__.py delete mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/artifact.py delete mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/common.py delete mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/dataset.py delete mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/flavors/__init__.py delete mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/flavors/kedro_dataset_flavor.py delete mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/metrics.py delete mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/model.py delete mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/model_metadata.py delete mode 100644 kedro-datasets/kedro_datasets/databricks/mlflow/tags.py diff --git a/kedro-datasets/kedro_datasets/databricks/__init__.py b/kedro-datasets/kedro_datasets/databricks/__init__.py index cba69d17c..7819a2e06 100644 --- a/kedro-datasets/kedro_datasets/databricks/__init__.py +++ b/kedro-datasets/kedro_datasets/databricks/__init__.py @@ -1,4 +1,3 @@ """Provides interface to Unity Catalog Tables.""" from .unity import ManagedTableDataSet -from .mlflow import MLFlowModel, MLFlowArtifact, MLFlowDataSet, MLFlowMetrics, MLFlowModelMetadata, MLFlowTags diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/__init__.py b/kedro-datasets/kedro_datasets/databricks/mlflow/__init__.py deleted file mode 100644 index 1c3babc0f..000000000 --- a/kedro-datasets/kedro_datasets/databricks/mlflow/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .artifact import MLFlowArtifact -from .dataset import MLFlowDataSet -from .metrics import MLFlowMetrics -from .model_metadata import MLFlowModelMetadata -from .tags import MLFlowTags -from .model import MLFlowModel diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/artifact.py b/kedro-datasets/kedro_datasets/databricks/mlflow/artifact.py deleted file mode 100644 index 15691db43..000000000 --- a/kedro-datasets/kedro_datasets/databricks/mlflow/artifact.py +++ /dev/null @@ -1,133 +0,0 @@ -import logging -import os -from pathlib import Path -from tempfile import mkdtemp -from typing import Any, Dict - -import mlflow -from kedro.io.core import AbstractDataSet -from kedro.utils import load_obj as load_dataset -from mlflow.exceptions import MlflowException -from mlflow.tracking.artifact_utils import _download_artifact_from_uri - -from .common import MLFLOW_RUN_ID_ENV_VAR, ModelOpsException - -logger = logging.getLogger(__name__) - - -class MLFlowArtifact(AbstractDataSet): - def __init__( - self, - dataset_name: str, - dataset_type: str, - dataset_args: Dict[str, Any] = None, - *, - file_suffix: str, - run_id: str = None, - registered_model_name: str 
= None, - registered_model_version: str = None, - ): - """ - Log arbitrary Kedro datasets as mlflow artifacts - - Args: - dataset_name: dataset name as it should appear on mlflow run - dataset_type: full kedro dataset class name (incl. module) - dataset_args: kedro dataset args - file_suffix: file extension as it should appear on mlflow run - run_id: mlflow run-id, this should only be used when loading a - dataset saved from run which is different from active run - registered_model_name: mlflow registered model name, this should - only be used when loading an artifact linked to a model of - interest (i.e. back tracing atifacts from the run corresponding - to the model) - registered_model_version: mlflow registered model name, should be - used in combination with `registered_model_name` - - `run_id` and `registered_model_name` can't be specified together. - """ - if None in (registered_model_name, registered_model_version): - if registered_model_name or registered_model_version: - raise ModelOpsException( - "'registered_model_name' and " - "'registered_model_version' should be " - "set together" - ) - - if run_id and registered_model_name: - raise ModelOpsException( - "'run_id' cannot be passed when " "'registered_model_name' is set" - ) - - self._dataset_name = dataset_name - self._dataset_type = dataset_type - self._dataset_args = dataset_args or {} - self._file_suffix = file_suffix - self._run_id = run_id or os.environ.get(MLFLOW_RUN_ID_ENV_VAR) - self._registered_model_name = registered_model_name - self._registered_model_version = registered_model_version - - self._artifact_path = f"{dataset_name}{self._file_suffix}" - - self._filepath = Path(mkdtemp()) / self._artifact_path - - if registered_model_name: - self._version = f"{registered_model_name}/{registered_model_version}" - else: - self._version = run_id - - def _save(self, data: Any) -> None: - cls = load_dataset(self._dataset_type) - ds = cls(filepath=self._filepath.as_posix(), **self._dataset_args) - ds.save(data) - - filepath = self._filepath.as_posix() - if os.path.isdir(filepath): - mlflow.log_artifacts(self._filepath.as_posix(), self._artifact_path) - elif os.path.isfile(filepath): - mlflow.log_artifact(self._filepath.as_posix()) - else: - raise RuntimeError("cls.save() didn't work. Unexpected error.") - - run_id = mlflow.active_run().info.run_id - if self._version is not None: - logger.warning( - f"Ignoring version {self._version} set " - f"earlier, will use version='{run_id}' for loading" - ) - self._version = run_id - - def _load(self) -> Any: - if self._version is None: - msg = ( - "Could not determine the version to load. 
" - "Please specify either 'run_id' or 'registered_model_name' " - "along with 'registered_model_version' explicitly in " - "MLFlowArtifact constructor" - ) - raise MlflowException(msg) - - if "/" in self._version: - model_uri = f"models:/{self._version}" - model = mlflow.pyfunc.load_model(model_uri) - run_id = model._model_meta.run_id - else: - run_id = self._version - - local_path = _download_artifact_from_uri( - f"runs:/{run_id}/{self._artifact_path}" - ) - - cls = load_dataset(self._dataset_type) - ds = cls(filepath=local_path, **self._dataset_args) - return ds.load() - - def _describe(self) -> Dict[str, Any]: - return dict( - dataset_name=self._dataset_name, - dataset_type=self._dataset_type, - dataset_args=self._dataset_args, - file_suffix=self._file_suffix, - registered_model_name=self._registered_model_name, - registered_model_version=self._registered_model_version, - ) diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/common.py b/kedro-datasets/kedro_datasets/databricks/mlflow/common.py deleted file mode 100644 index af102d6b3..000000000 --- a/kedro-datasets/kedro_datasets/databricks/mlflow/common.py +++ /dev/null @@ -1,89 +0,0 @@ -import mlflow -from mlflow.tracking import MlflowClient - -MLFLOW_RUN_ID_ENV_VAR = "mlflow_run_id" - - -def parse_model_uri(model_uri): - parts = model_uri.split("/") - - if len(parts) < 2 or len(parts) > 3: - raise ValueError( - f"model uri should have the format " - f"'models:/' or " - f"'models://', got {model_uri}" - ) - - if parts[0] == "models:": - protocol = "models" - else: - raise ValueError("model uri should start with `models:/`, got %s", model_uri) - - name = parts[1] - - client = MlflowClient() - if len(parts) == 2: - results = client.search_model_versions(f"name='{name}'") - sorted_results = sorted( - results, - key=lambda modelversion: modelversion.creation_timestamp, - reverse=True, - ) - latest_version = sorted_results[0].version - version = latest_version - else: - version = parts[2] - if version in ["Production", "Staging", "Archived"]: - results = client.get_latest_versions(name, stages=[version]) - if len(results) > 0: - version = results[0].version - else: - version = None - - return protocol, name, version - - -def promote_model(model_name, model_version, stage): - import datetime - - now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - client = MlflowClient() - - new_model_uri = f"models:/{model_name}/{model_version}" - _, _, new_model_version = parse_model_uri(new_model_uri) - new_model = mlflow.pyfunc.load_model(new_model_uri) - new_model_runid = new_model._model_meta.run_id - - msg = f"```Promoted version {model_version} to {stage}, at {now}```" - client.set_tag(new_model_runid, "mlflow.note.content", msg) - client.set_tag(new_model_runid, "Promoted at", now) - - results = client.get_latest_versions(model_name, stages=[stage]) - if len(results) > 0: - old_model_uri = f"models:/{model_name}/{stage}" - _, _, old_model_version = parse_model_uri(old_model_uri) - old_model = mlflow.pyfunc.load_model(old_model_uri) - old_model_runid = old_model._model_meta.run_id - - client.set_tag( - old_model._model_meta.run_id, - "mlflow.note.content", - f"```Replaced by version {new_model_version}, at {now}```", - ) - client.set_tag(old_model_runid, "Retired at", now) - client.set_tag(old_model_runid, "Replaced by", new_model_version) - - client.set_tag(new_model_runid, "Replaces", old_model_version) - - client.transition_model_version_stage( - name=model_name, version=old_model_version, stage="Archived" - ) - - 
client.transition_model_version_stage( - name=model_name, version=new_model_version, stage=stage - ) - - -class ModelOpsException(Exception): - pass diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/dataset.py b/kedro-datasets/kedro_datasets/databricks/mlflow/dataset.py deleted file mode 100644 index ee0a1e0ed..000000000 --- a/kedro-datasets/kedro_datasets/databricks/mlflow/dataset.py +++ /dev/null @@ -1,80 +0,0 @@ -import importlib -import logging -from typing import Any, Dict - -from kedro.io.core import AbstractDataSet - -from .common import ModelOpsException, parse_model_uri - -logger = logging.getLogger(__name__) - - -class MLFlowDataSet(AbstractDataSet): - def __init__( - self, - flavor: str, - dataset_name: str = None, - dataset_type: str = None, - dataset_args: Dict[str, Any] = None, - *, - file_suffix: str = None, - load_version: str = None, - ): - self._flavor = flavor - self._dataset_name = dataset_name - self._dataset_type = dataset_type - self._dataset_args = dataset_args or {} - self._file_suffix = file_suffix - self._load_version = load_version - - def _save(self, model: Any) -> None: - if self._load_version is not None: - msg = ( - f"Trying to save an MLFlowDataSet::{self._describe} which " - f"was initialized with load_version={self._load_version}. " - f"This can lead to inconsistency between saved and loaded " - f"versions, therefore disallowed. Please create separate " - f"catalog entries for saved and loaded datasets." - ) - raise ModelOpsException(msg) - - importlib.import_module(self._flavor).log_model( - model, - self._dataset_name, - registered_model_name=self._dataset_name, - dataset_type=self._dataset_type, - dataset_args=self._dataset_args, - file_suffix=self._file_suffix, - ) - - def _load(self) -> Any: - *_, latest_version = parse_model_uri(f"models:/{self._dataset_name}") - - dataset_version = self._load_version or latest_version - *_, dataset_version = parse_model_uri( - f"models:/{self._dataset_name}/{dataset_version}" - ) - - logger.info(f"Loading model '{self._dataset_name}' version '{dataset_version}'") - - if dataset_version != latest_version: - logger.warning(f"Newer version {latest_version} exists in repo") - - model = importlib.import_module(self._flavor).load_model( - f"models:/{self._dataset_name}/{dataset_version}", - dataset_type=self._dataset_type, - dataset_args=self._dataset_args, - file_suffix=self._file_suffix, - ) - - return model - - def _describe(self) -> Dict[str, Any]: - return dict( - flavor=self._flavor, - dataset_name=self._dataset_name, - dataset_type=self._dataset_type, - dataset_args=self._dataset_args, - file_suffix=self._file_suffix, - load_version=self._load_version, - ) diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/flavors/__init__.py b/kedro-datasets/kedro_datasets/databricks/mlflow/flavors/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/flavors/kedro_dataset_flavor.py b/kedro-datasets/kedro_datasets/databricks/mlflow/flavors/kedro_dataset_flavor.py deleted file mode 100644 index e0a43a1b0..000000000 --- a/kedro-datasets/kedro_datasets/databricks/mlflow/flavors/kedro_dataset_flavor.py +++ /dev/null @@ -1,154 +0,0 @@ -import os -import sys -from pathlib import Path -from typing import Any, Dict, Union - -import kedro -import yaml -from kedro.utils import load_obj as load_dataset -from mlflow import pyfunc -from mlflow.exceptions import MlflowException -from mlflow.models import Model -from mlflow.tracking._model_registry 
import DEFAULT_AWAIT_MAX_SLEEP_SECONDS -from mlflow.tracking.artifact_utils import _download_artifact_from_uri -from mlflow.utils.environment import _mlflow_conda_env -from mlflow.utils.model_utils import _get_flavor_configuration - -FLAVOR_NAME = "kedro_dataset" - - -DEFAULT_CONDA_ENV = _mlflow_conda_env( - additional_conda_deps=["kedro[all]={}".format(kedro.__version__)], - additional_pip_deps=None, - additional_conda_channels=None, -) - - -def save_model( - data: Any, - path: str, - conda_env: Union[str, Dict[str, Any]] = None, - mlflow_model: Model = Model(), - *, - dataset_type: str, - dataset_args: Dict[str, Any], - file_suffix: str, -): - if os.path.exists(path): - raise RuntimeError("Path '{}' already exists".format(path)) - os.makedirs(path) - - model_data_subpath = f"data.{file_suffix}" - model_data_path = os.path.join(path, model_data_subpath) - - cls = load_dataset(dataset_type) - ds = cls(filepath=model_data_path, **dataset_args) - ds.save(data) - - conda_env_subpath = "conda.yaml" - if conda_env is None: - conda_env = DEFAULT_CONDA_ENV - elif not isinstance(conda_env, dict): - with open(conda_env, "r") as f: - conda_env = yaml.safe_load(f) - with open(os.path.join(path, conda_env_subpath), "w") as f: - yaml.safe_dump(conda_env, stream=f, default_flow_style=False) - - pyfunc.add_to_model( - mlflow_model, - loader_module=__name__, - data=model_data_subpath, - env=conda_env_subpath, - ) - - mlflow_model.add_flavor( - FLAVOR_NAME, - data=model_data_subpath, - dataset_type=dataset_type, - dataset_args=dataset_args, - file_suffix=file_suffix, - ) - mlflow_model.save(os.path.join(path, "MLmodel")) - - -def log_model( - model: Any, - artifact_path: str, - conda_env: Dict[str, Any] = None, - registered_model_name: str = None, - await_registration_for: int = DEFAULT_AWAIT_MAX_SLEEP_SECONDS, - *, - dataset_type: str, - dataset_args: Dict[str, Any], - file_suffix: str, -): - return Model.log( - artifact_path=artifact_path, - flavor=sys.modules[__name__], - registered_model_name=registered_model_name, - await_registration_for=await_registration_for, - data=model, - conda_env=conda_env, - dataset_type=dataset_type, - dataset_args=dataset_args, - file_suffix=file_suffix, - ) - - -def _load_model_from_local_file( - local_path: str, - *, - dataset_type: str = None, - dataset_args: Dict[str, Any] = None, - file_suffix: str = None, -): - if dataset_type is not None: - model_data_subpath = f"data.{file_suffix}" - data_path = os.path.join(local_path, model_data_subpath) - else: - flavor_conf = _get_flavor_configuration( - model_path=local_path, flavor_name=FLAVOR_NAME - ) - data_path = os.path.join(local_path, flavor_conf["data"]) - dataset_type = flavor_conf["dataset_type"] - dataset_args = flavor_conf["dataset_args"] - - cls = load_dataset(dataset_type) - ds = cls(filepath=data_path, **dataset_args) - return ds.load() - - -def load_model( - model_uri: str, - *, - dataset_type: str = None, - dataset_args: Dict[str, Any] = None, - file_suffix: str = None, -): - if dataset_type is not None or dataset_args is not None or file_suffix is not None: - assert ( - dataset_type is not None - and dataset_args is not None - and file_suffix is not None - ), ("Please set 'dataset_type', " "'dataset_args' and 'file_suffix'") - - local_path = _download_artifact_from_uri(model_uri) - return _load_model_from_local_file( - local_path, - dataset_type=dataset_type, - dataset_args=dataset_args, - file_suffix=file_suffix, - ) - - -def _load_pyfunc(model_file: str): - local_path = Path(model_file).parent.absolute() - 
model = _load_model_from_local_file(local_path) - if not hasattr(model, "predict"): - try: - setattr(model, "predict", None) - except AttributeError: - raise MlflowException( - f"`pyfunc` flavor not supported, use " f"{__name__}.load instead" - ) - return model diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/metrics.py b/kedro-datasets/kedro_datasets/databricks/mlflow/metrics.py deleted file mode 100644 index 1c7760375..000000000 --- a/kedro-datasets/kedro_datasets/databricks/mlflow/metrics.py +++ /dev/null @@ -1,93 +0,0 @@ -import logging -from typing import Any, Dict, Union - -import mlflow -from kedro.io.core import AbstractDataSet -from mlflow.exceptions import MlflowException -from mlflow.tracking import MlflowClient - -from .common import ModelOpsException - -logger = logging.getLogger(__name__) - - -class MLFlowMetrics(AbstractDataSet): - def __init__( - self, - prefix: str = None, - run_id: str = None, - registered_model_name: str = None, - registered_model_version: str = None, - ): - if None in (registered_model_name, registered_model_version): - if registered_model_name or registered_model_version: - raise ModelOpsException( - "'registered_model_name' and " - "'registered_model_version' should be " - "set together" - ) - - if run_id and registered_model_name: - raise ModelOpsException( - "'run_id' cannot be passed when " "'registered_model_name' is set" - ) - - self._prefix = prefix - self._run_id = run_id - self._registered_model_name = registered_model_name - self._registered_model_version = registered_model_version - - if registered_model_name: - self._version = f"{registered_model_name}/{registered_model_version}" - else: - self._version = run_id - - def _save(self, metrics: Dict[str, Union[str, float, int]]) -> None: - if self._prefix is not None: - metrics = {f"{self._prefix}_{key}": value for key, value in metrics.items()} - mlflow.log_metrics(metrics) - - run_id = mlflow.active_run().info.run_id - if self._version is not None: - logger.warning( - f"Ignoring version {self._version.save} set " - f"earlier, will use version='{run_id}' for loading" - ) - self._version = run_id - - def _load(self) -> Any: - if self._version is None: - msg = ( - "Could not determine the version to load. 
" - "Please specify either 'run_id' or 'registered_model_name' " - "along with 'registered_model_version' explicitly in " - "MLFlowMetrics constructor" - ) - raise MlflowException(msg) - - client = MlflowClient() - - if "/" in self._version: - model_uri = f"models:/{self._version}" - model = mlflow.pyfunc.load_model(model_uri) - run_id = model._model_meta.run_id - else: - run_id = self._version - - run = client.get_run(run_id) - metrics = run.data.metrics - if self._prefix is not None: - metrics = { - key[len(self._prefix) + 1 :]: value - for key, value in metrics.items() - if key[: len(self._prefix)] == self._prefix - } - return metrics - - def _describe(self) -> Dict[str, Any]: - return dict( - prefix=self._prefix, - run_id=self._run_id, - registered_model_name=self._registered_model_name, - registered_model_version=self._registered_model_version, - ) diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/model.py b/kedro-datasets/kedro_datasets/databricks/mlflow/model.py deleted file mode 100644 index c5f2356a2..000000000 --- a/kedro-datasets/kedro_datasets/databricks/mlflow/model.py +++ /dev/null @@ -1,75 +0,0 @@ -import importlib -import logging -from typing import Any, Dict - -from kedro.io.core import AbstractDataSet -from mlflow.models.signature import ModelSignature - -from .common import ModelOpsException, parse_model_uri - -logger = logging.getLogger(__name__) - - -class MLFlowModel(AbstractDataSet): - def __init__( - self, - flavor: str, - model_name: str, - signature: Dict[str, Dict[str, str]] = None, - input_example: Dict[str, Any] = None, - load_version: str = None, - ): - self._flavor = flavor - self._model_name = model_name - - if signature: - self._signature = ModelSignature.from_dict(signature) - else: - self._signature = None - self._input_example = input_example - - self._load_version = load_version - - def _save(self, model: Any) -> None: - if self._load_version is not None: - msg = ( - f"Trying to save an MLFlowModel::{self._describe} which " - f"was initialized with load_version={self._load_version}. " - f"This can lead to inconsistency between saved and loaded " - f"versions, therefore disallowed. Please create separate " - f"catalog entries for saved and loaded datasets." 
- ) - raise ModelOpsException(msg) - - importlib.import_module(self._flavor).log_model( - model, - self._model_name, - registered_model_name=self._model_name, - signature=self._signature, - input_example=self._input_example, - ) - - def _load(self) -> Any: - *_, latest_version = parse_model_uri(f"models:/{self._model_name}") - - model_version = self._load_version or latest_version - - logger.info(f"Loading model '{self._model_name}' version '{model_version}'") - - if model_version != latest_version: - logger.warning(f"Newer version {latest_version} exists in repo") - - model = importlib.import_module(self._flavor).load_model( - f"models:/{self._model_name}/{model_version}" - ) - - return model - - def _describe(self) -> Dict[str, Any]: - return dict( - flavor=self._flavor, - model_name=self._model_name, - signature=self._signature, - input_example=self._input_example, - load_version=self._load_version, - ) diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/model_metadata.py b/kedro-datasets/kedro_datasets/databricks/mlflow/model_metadata.py deleted file mode 100644 index 3c160cec4..000000000 --- a/kedro-datasets/kedro_datasets/databricks/mlflow/model_metadata.py +++ /dev/null @@ -1,49 +0,0 @@ -import logging -from typing import Any, Dict, Union - -import mlflow -from kedro.io.core import AbstractDataSet - -from .common import ModelOpsException, parse_model_uri - -logger = logging.getLogger(__name__) - - -class MLFlowModelMetadata(AbstractDataSet): - def __init__( - self, registered_model_name: str, registered_model_version: str = None - ): - self._model_name = registered_model_name - self._model_version = registered_model_version - - def _save(self, tags: Dict[str, Union[str, float, int]]) -> None: - raise NotImplementedError() - - def _load(self) -> Any: - if self._model_version is None: - model_uri = f"models:/{self._model_name}" - else: - model_uri = f"models:/{self._model_name}/{self._model_version}" - _, _, load_version = parse_model_uri(model_uri) - - if load_version is None: - raise ModelOpsException( - f"No model with version " f"'{self._model_version}'" - ) - - pyfunc_model = mlflow.pyfunc.load_model( - f"models:/{self._model_name}/{load_version}" - ) - all_metadata = pyfunc_model._model_meta - model_metadata = { - "model_name": self._model_name, - "model_version": int(load_version), - "run_id": all_metadata.run_id, - } - return model_metadata - - def _describe(self) -> Dict[str, Any]: - return dict( - registered_model_name=self._model_name, - registered_model_version=self._model_version, - ) diff --git a/kedro-datasets/kedro_datasets/databricks/mlflow/tags.py b/kedro-datasets/kedro_datasets/databricks/mlflow/tags.py deleted file mode 100644 index 153810ae4..000000000 --- a/kedro-datasets/kedro_datasets/databricks/mlflow/tags.py +++ /dev/null @@ -1,94 +0,0 @@ -import logging -from typing import Any, Dict, Union - -import mlflow -from kedro.io.core import AbstractDataSet -from mlflow.exceptions import MlflowException -from mlflow.tracking import MlflowClient - -from .common import ModelOpsException - -logger = logging.getLogger(__name__) - - -class MLFlowTags(AbstractDataSet): - def __init__( - self, - prefix: str = None, - run_id: str = None, - registered_model_name: str = None, - registered_model_version: str = None, - ): - if None in (registered_model_name, registered_model_version): - if registered_model_name or registered_model_version: - raise ModelOpsException( - "'registered_model_name' and " - "'registered_model_version' should be " - "set together" - ) - - if 
run_id and registered_model_name: - raise ModelOpsException( - "'run_id' cannot be passed when " "'registered_model_name' is set" - ) - - self._prefix = prefix - self._run_id = run_id - self._registered_model_name = registered_model_name - self._registered_model_version = registered_model_version - - if registered_model_name: - self._version = f"{registered_model_name}/{registered_model_version}" - else: - self._version = run_id - - def _save(self, tags: Dict[str, Union[str, float, int]]) -> None: - if self._prefix is not None: - tags = {f"{self._prefix}_{key}": value for key, value in tags.items()} - - mlflow.set_tags(tags) - - run_id = mlflow.active_run().info.run_id - if self._version is not None: - logger.warning( - f"Ignoring version {self._version.save} set " - f"earlier, will use version='{run_id}' for loading" - ) - self._version = run_id - - def _load(self) -> Any: - if self._version is None: - msg = ( - "Could not determine the version to load. " - "Please specify either 'run_id' or 'registered_model_name' " - "along with 'registered_model_version' explicitly in " - "MLFlowTags constructor" - ) - raise MlflowException(msg) - - client = MlflowClient() - - if "/" in self._version: - model_uri = f"models:/{self._version}" - model = mlflow.pyfunc.load_model(model_uri) - run_id = model._model_meta.run_id - else: - run_id = self._version - - run = client.get_run(run_id) - tags = run.data.tags - if self._prefix is not None: - tags = { - key[len(self._prefix) + 1 :]: value - for key, value in tags.items() - if key[: len(self._prefix)] == self._prefix - } - return tags - - def _describe(self) -> Dict[str, Any]: - return dict( - prefix=self._prefix, - run_id=self._run_id, - registered_model_name=self._registered_model_name, - registered_model_version=self._registered_model_version, - ) diff --git a/kedro-datasets/kedro_datasets/databricks/unity/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/unity/managed_table_dataset.py index b46122197..f0f04b7be 100644 --- a/kedro-datasets/kedro_datasets/databricks/unity/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/unity/managed_table_dataset.py @@ -1,22 +1,26 @@ import logging -from typing import Any, Dict, List, Union import pandas as pd +from operator import attrgetter +from functools import partial +from cachetools.keys import hashkey +from typing import Any, Dict, List, Union +from cachetools import Cache, cachedmethod from kedro.io.core import ( AbstractVersionedDataSet, DataSetError, + Version, VersionNotFoundError, ) from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType -from pyspark.sql.utils import AnalysisException from cachetools import Cache logger = logging.getLogger(__name__) class ManagedTableDataSet(AbstractVersionedDataSet): - """``ManagedTableDataSet`` loads data into Unity managed tables.""" + """``ManagedTableDataSet`` loads and saves data into managed delta tables.""" # this dataset cannot be used with ``ParallelRunner``, # therefore it has the attribute ``_SINGLE_PROCESS = True`` @@ -34,7 +38,7 @@ def __init__( write_mode: str = "overwrite", dataframe_type: str = "spark", primary_key: Union[str, List[str]] = None, - version: int = None, + version: Version = None, *, # the following parameters are used by the hook to create or update unity schema: Dict[str, Any] = None, # pylint: disable=unused-argument @@ -73,9 +77,8 @@ def __init__( ) self._primary_key = primary_key - - self._version = version self._version_cache = Cache(maxsize=2) + self._version = 
version self._schema = None if schema is not None: @@ -83,24 +86,16 @@ def __init__( def _get_spark(self) -> SparkSession: return ( - SparkSession.builder.config( - "spark.jars.packages", "io.delta:delta-core_2.12:1.2.1" - ) - .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") - .config( - "spark.sql.catalog.spark_catalog", - "org.apache.spark.sql.delta.catalog.DeltaCatalog", - ) - .getOrCreate() + SparkSession.builder.getOrCreate() ) def _load(self) -> Union[DataFrame, pd.DataFrame]: - if self._version is not None and self._version >= 0: + if self._version and self._version.load >= 0: try: data = ( self._get_spark() .read.format("delta") - .option("versionAsOf", self._version) + .option("versionAsOf", self._version.load) .table(self._full_table_address) ) except: diff --git a/kedro-datasets/tests/databricks/conftest.py b/kedro-datasets/tests/databricks/conftest.py index d360ffb68..26d63b056 100644 --- a/kedro-datasets/tests/databricks/conftest.py +++ b/kedro-datasets/tests/databricks/conftest.py @@ -6,7 +6,6 @@ """ import pytest from pyspark.sql import SparkSession -from delta.pip_utils import configure_spark_with_delta_pip @pytest.fixture(scope="class", autouse=True) diff --git a/kedro-datasets/tests/databricks/test_unity.py b/kedro-datasets/tests/databricks/test_unity.py index 471f81f57..0d54e29e4 100644 --- a/kedro-datasets/tests/databricks/test_unity.py +++ b/kedro-datasets/tests/databricks/test_unity.py @@ -1,5 +1,5 @@ import pytest -from kedro.io.core import DataSetError, VersionNotFoundError +from kedro.io.core import DataSetError, VersionNotFoundError, Version from pyspark.sql.types import IntegerType, StringType, StructField, StructType from pyspark.sql import DataFrame, SparkSession import pandas as pd @@ -195,6 +195,7 @@ def test_describe(self): "dataframe_type": "spark", "primary_key": None, "version": None, + "owner_group": None } def test_invalid_write_mode(self): @@ -413,7 +414,7 @@ def test_load_spark_no_version(self, sample_spark_df: DataFrame): unity_ds.save(sample_spark_df) delta_ds = ManagedTableDataSet( - database="test", table="test_load_spark", version=2 + database="test", table="test_load_spark", version=Version(2,None) ) with pytest.raises(VersionNotFoundError): _ = delta_ds.load() @@ -426,7 +427,7 @@ def test_load_version(self, sample_spark_df: DataFrame, append_spark_df: DataFra unity_ds.save(append_spark_df) loaded_ds = ManagedTableDataSet( - database="test", table="test_load_version", version=0 + database="test", table="test_load_version", version=Version(0,None) ) loaded_df = loaded_ds.load() From 5de5fd9c43231996a45a43d0989ffd54bc16d3ff Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 20 Feb 2023 05:08:25 -0500 Subject: [PATCH 15/74] Pass the `kedro_init_version` to `ProjectMetadata` (#119) Signed-off-by: Deepyaman Datta Signed-off-by: Danny Farah --- kedro-airflow/kedro_airflow/__init__.py | 2 +- kedro-airflow/tests/conftest.py | 1 + kedro-docker/kedro_docker/__init__.py | 2 +- kedro-telemetry/kedro_telemetry/__init__.py | 2 +- kedro-telemetry/tests/test_masking.py | 1 + kedro-telemetry/tests/test_plugin.py | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/kedro-airflow/kedro_airflow/__init__.py b/kedro-airflow/kedro_airflow/__init__.py index b8b71c285..44d4aab54 100644 --- a/kedro-airflow/kedro_airflow/__init__.py +++ b/kedro-airflow/kedro_airflow/__init__.py @@ -1,3 +1,3 @@ -""" Kedro plugin for running a project with Airflow """ +"""Kedro plugin for running a project with Airflow.""" __version__ = 
"0.5.1" diff --git a/kedro-airflow/tests/conftest.py b/kedro-airflow/tests/conftest.py index de8c4b0e2..c23cc5916 100644 --- a/kedro-airflow/tests/conftest.py +++ b/kedro-airflow/tests/conftest.py @@ -34,4 +34,5 @@ def metadata(cli_runner): # pylint: disable=unused-argument project_path, kedro_version, project_path / "src", + kedro_version, ) diff --git a/kedro-docker/kedro_docker/__init__.py b/kedro-docker/kedro_docker/__init__.py index e29633f8a..efe4970f8 100644 --- a/kedro-docker/kedro_docker/__init__.py +++ b/kedro-docker/kedro_docker/__init__.py @@ -1,3 +1,3 @@ -""" Kedro plugin for packaging a project with Docker """ +"""Kedro plugin for packaging a project with Docker.""" __version__ = "0.3.1" diff --git a/kedro-telemetry/kedro_telemetry/__init__.py b/kedro-telemetry/kedro_telemetry/__init__.py index 3b4ac5a6f..2315fc339 100644 --- a/kedro-telemetry/kedro_telemetry/__init__.py +++ b/kedro-telemetry/kedro_telemetry/__init__.py @@ -1,3 +1,3 @@ -"""Kedro Telemetry plugin for collecting Kedro usage data.""" +"""Kedro plugin for collecting Kedro usage data.""" __version__ = "0.2.3" diff --git a/kedro-telemetry/tests/test_masking.py b/kedro-telemetry/tests/test_masking.py index 981a07a05..74773e2f4 100644 --- a/kedro-telemetry/tests/test_masking.py +++ b/kedro-telemetry/tests/test_masking.py @@ -56,6 +56,7 @@ def fake_metadata(fake_root_dir): fake_root_dir / REPO_NAME, kedro_version, fake_root_dir / REPO_NAME / "src", + kedro_version, ) return metadata diff --git a/kedro-telemetry/tests/test_plugin.py b/kedro-telemetry/tests/test_plugin.py index 26ed0be6e..b53e6da1e 100644 --- a/kedro-telemetry/tests/test_plugin.py +++ b/kedro-telemetry/tests/test_plugin.py @@ -34,6 +34,7 @@ def fake_metadata(tmp_path): tmp_path / REPO_NAME, kedro_version, tmp_path / REPO_NAME / "src", + kedro_version, ) return metadata From cd3b6f3a62c3db076ee95f84c764905f71995584 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Tue, 21 Feb 2023 12:05:43 -0500 Subject: [PATCH 16/74] Keep Kedro-Airflow plugin docstring from appearing in `kedro -h` (#118) Signed-off-by: Danny Farah --- kedro-airflow/kedro_airflow/plugin.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kedro-airflow/kedro_airflow/plugin.py b/kedro-airflow/kedro_airflow/plugin.py index f5aebf2f5..c1a62b0f3 100644 --- a/kedro-airflow/kedro_airflow/plugin.py +++ b/kedro-airflow/kedro_airflow/plugin.py @@ -12,8 +12,7 @@ @click.group(name="Kedro-Airflow") -def commands(): - """Kedro plugin for running a project with Airflow""" +def commands(): # pylint: disable=missing-function-docstring pass From a56257c85e7673bfff7a7c1a9f3bbc2449ac2294 Mon Sep 17 00:00:00 2001 From: Yassine Alouini Date: Mon, 27 Feb 2023 10:42:00 +0100 Subject: [PATCH 17/74] Make the SQLQueryDataSet compatible with mssql. 
(#101) * [kedro-docker] Layers size optimization (#92) * [kedro-docker] Layers size optimization Signed-off-by: Mariusz Strzelecki * Adjust test requirements Signed-off-by: Mariusz Strzelecki * Skip coverage check on tests dir (some do not execute on Windows) Signed-off-by: Mariusz Strzelecki * Update .coveragerc with the setup Signed-off-by: Mariusz Strzelecki * Fix bandit so it does not scan kedro-datasets Signed-off-by: Mariusz Strzelecki * Fixed existence test Signed-off-by: Mariusz Strzelecki * Check why dir is not created Signed-off-by: Mariusz Strzelecki * Kedro starters are fixed now Signed-off-by: Mariusz Strzelecki * Increased no-output-timeout for long spark image build Signed-off-by: Mariusz Strzelecki * Spark image optimized Signed-off-by: Mariusz Strzelecki * Linting Signed-off-by: Mariusz Strzelecki * Switch to slim image always Signed-off-by: Mariusz Strzelecki * Trigger build Signed-off-by: Mariusz Strzelecki * Use textwrap.dedent for nicer indentation Signed-off-by: Mariusz Strzelecki * Revert "Use textwrap.dedent for nicer indentation" This reverts commit 3a1e3f855a29c6a1b118db3e844e5f9b67ade363. Signed-off-by: Mariusz Strzelecki * Revert "Revert "Use textwrap.dedent for nicer indentation"" This reverts commit d322d353b25d414cdfdef8ee12185e5a1d9baa2c. Signed-off-by: Mariusz Strzelecki * Make tests read more lines (to skip all deprecation warnings) Signed-off-by: Mariusz Strzelecki Signed-off-by: Mariusz Strzelecki Signed-off-by: Mariusz Strzelecki Signed-off-by: Yassine Alouini * Release Kedro-Docker 0.3.1 (#94) * Add release notes for kedro-docker 0.3.1 Signed-off-by: Jannic Holzer * Update version in kedro_docker module Signed-off-by: Jannic Holzer Signed-off-by: Jannic Holzer Signed-off-by: Yassine Alouini * Bump version and update release notes (#96) Signed-off-by: Merel Theisen Signed-off-by: Yassine Alouini * Make the SQLQueryDataSet compatible with mssql. Signed-off-by: Yassine Alouini * Add one test + update RELEASE.md. Signed-off-by: Yassine Alouini * Add missing pyodbc for tests. Signed-off-by: Yassine Alouini * Mock connection as well. Signed-off-by: Yassine Alouini * Add more dates parsing for mssql backend (thanks to fgaudindelrieu@idmog.com) Signed-off-by: Yassine Alouini * Fix an error in docstring of MetricsDataSet (#98) Signed-off-by: Yassine Alouini * Bump relax pyarrow version to work the same way as Pandas (#100) * Bump relax pyarrow version to work the same way as Pandas We only use PyArrow for `pandas.ParquetDataSet` as such I suggest we keep our versions pinned to the same range as [Pandas does](https://github.com/pandas-dev/pandas/blob/96fc51f5ec678394373e2c779ccff37ddb966e75/pyproject.toml#L100) for the same reason. As such I suggest we remove the upper bound as we have users requesting later versions in [support channels](https://kedro-org.slack.com/archives/C03RKP2LW64/p1674040509133529) * Updated release notes Signed-off-by: Yassine Alouini * Add missing type in catalog example. Signed-off-by: Yassine Alouini * Add one more unit tests for adapt_mssql. Signed-off-by: Yassine Alouini * [FIX] Add missing mocker from date test. Signed-off-by: Yassine Alouini * [TEST] Add a wrong input test. Signed-off-by: Yassine Alouini * Add pyodbc dependency. Signed-off-by: Yassine Alouini * [FIX] Remove dict() in tests. 
Signed-off-by: Yassine Alouini * Change check to check on plugin name (#103) Signed-off-by: Merel Theisen Signed-off-by: Yassine Alouini * Set coverage in pyproject.toml (#105) Signed-off-by: Merel Theisen Signed-off-by: Yassine Alouini * Move coverage settings to pyproject.toml (#106) Signed-off-by: Merel Theisen Signed-off-by: Yassine Alouini * Replace kedro.pipeline with modular_pipeline.pipeline factory (#99) * Add non-spark related test changes Replace kedro.pipeline.Pipeline with kedro.pipeline.modular_pipeline.pipeline factory. This is for symmetry with changes made to the main kedro library. Signed-off-by: Adam Farley Signed-off-by: Yassine Alouini * Fix outdated links in Kedro Datasets (#111) * fix links * fix dill links Signed-off-by: Yassine Alouini * Fix docs formatting and phrasing for some datasets (#107) * Fix docs formatting and phrasing for some datasets Signed-off-by: Deepyaman Datta * Manually fix files not resolved with patch command Signed-off-by: Deepyaman Datta * Apply fix from #98 Signed-off-by: Deepyaman Datta --------- Signed-off-by: Deepyaman Datta Signed-off-by: Yassine Alouini * Release `kedro-datasets` `version 1.0.2` (#112) * bump version and update release notes * fix pylint errors Signed-off-by: Yassine Alouini * Bump pytest to 7.2 (#113) Signed-off-by: Merel Theisen Signed-off-by: Yassine Alouini * Prefix Docker plugin name with "Kedro-" in usage message (#57) * Prefix Docker plugin name with "Kedro-" in usage message Signed-off-by: Deepyaman Datta Signed-off-by: Yassine Alouini * Keep Kedro-Docker plugin docstring from appearing in `kedro -h` (#56) * Keep Kedro-Docker plugin docstring from appearing in `kedro -h` Signed-off-by: Deepyaman Datta Signed-off-by: Yassine Alouini * [kedro-datasets ] Add `Polars.CSVDataSet` (#95) Signed-off-by: wmoreiraa Signed-off-by: Yassine Alouini * Remove deprecated `test_requires` from `setup.py` in Kedro-Docker (#54) Signed-off-by: Deepyaman Datta Signed-off-by: Yassine Alouini * [FIX] Fix ds to data_set. Signed-off-by: Yassine Alouini --------- Signed-off-by: Mariusz Strzelecki Signed-off-by: Mariusz Strzelecki Signed-off-by: Yassine Alouini Signed-off-by: Jannic Holzer Signed-off-by: Merel Theisen Signed-off-by: Deepyaman Datta Co-authored-by: Mariusz Strzelecki Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Co-authored-by: OKA Naoya Co-authored-by: Joel <35801847+datajoely@users.noreply.github.com> Co-authored-by: adamfrly <45516720+adamfrly@users.noreply.github.com> Co-authored-by: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Co-authored-by: Deepyaman Datta Co-authored-by: Walber Moreira <58264877+wmoreiraa@users.noreply.github.com> Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 2 +- .../kedro_datasets/pandas/sql_dataset.py | 69 +++++++++++++++++++ kedro-datasets/setup.py | 2 +- kedro-datasets/test_requirements.txt | 1 + .../tests/pandas/test_sql_dataset.py | 54 +++++++++++++++ 5 files changed, 126 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 3b51df818..412fe9f9c 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -11,7 +11,7 @@ | `polars.CSVDataSet` | A `CSVDataSet` backed by [polars](https://www.pola.rs/), a lighting fast dataframe package built entirely using Rust. | `kedro_datasets.polars` | ## Bug fixes and other changes - +* Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. 
# Release 1.0.2: diff --git a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py index 1400e4981..dd5d636a1 100644 --- a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py @@ -1,6 +1,7 @@ """``SQLDataSet`` to load and save data to a SQL backend.""" import copy +import datetime as dt import re from pathlib import PurePosixPath from typing import Any, Dict, NoReturn, Optional @@ -22,6 +23,7 @@ "psycopg2": "psycopg2", "mysqldb": "mysqlclient", "cx_Oracle": "cx_Oracle", + "mssql": "pyodbc", } DRIVER_ERROR_MESSAGE = """ @@ -321,7 +323,49 @@ class SQLQueryDataSet(AbstractDataSet[None, pd.DataFrame]): >>> credentials=credentials) >>> >>> sql_data = data_set.load() + >>> + Example of usage for mssql: + :: + + + >>> credentials = {"server": "localhost", "port": "1433", + >>> "database": "TestDB", "user": "SA", + >>> "password": "StrongPassword"} + >>> def _make_mssql_connection_str( + >>> server: str, port: str, database: str, user: str, password: str + >>> ) -> str: + >>> import pyodbc # noqa + >>> from sqlalchemy.engine import URL # noqa + >>> + >>> driver = pyodbc.drivers()[-1] + >>> connection_str = (f"DRIVER={driver};SERVER={server},{port};DATABASE={database};" + >>> f"ENCRYPT=yes;UID={user};PWD={password};" + >>> "TrustServerCertificate=yes;") + >>> return URL.create("mssql+pyodbc", query={"odbc_connect": connection_str}) + >>> connection_str = _make_mssql_connection_str(**credentials) + >>> data_set = SQLQueryDataSet(credentials={"con": connection_str}, + >>> sql="SELECT TOP 5 * FROM TestTable;") + >>> df = data_set.load() + + In addition, here is an example of a catalog with dates parsing: + :: + + >>> mssql_dataset: + >>> type: kedro_datasets.pandas.SQLQueryDataSet + >>> credentials: mssql_credentials + >>> sql: > + >>> SELECT * + >>> FROM DateTable + >>> WHERE date >= ? AND date <= ? + >>> ORDER BY date + >>> load_args: + >>> params: + >>> - ${begin} + >>> - ${end} + >>> index_col: date + >>> parse_dates: + >>> date: "%Y-%m-%d %H:%M:%S.%f0 %z" """ # using Any because of Sphinx but it should be @@ -413,6 +457,8 @@ def __init__( # pylint: disable=too-many-arguments self._connection_str = credentials["con"] self._execution_options = execution_options or {} self.create_connection(self._connection_str) + if "mssql" in self._connection_str: + self.adapt_mssql_date_params() @classmethod def create_connection(cls, connection_str: str) -> None: @@ -456,3 +502,26 @@ def _load(self) -> pd.DataFrame: def _save(self, data: None) -> NoReturn: raise DataSetError("'save' is not supported on SQLQueryDataSet") + + # For mssql only + def adapt_mssql_date_params(self) -> None: + """We need to change the format of datetime parameters. + MSSQL expects datetime in the exact format %y-%m-%dT%H:%M:%S. + Here, we also accept plain dates. + `pyodbc` does not accept named parameters, they must be provided as a list.""" + params = self._load_args.get("params", []) + if not isinstance(params, list): + raise DataSetError( + "Unrecognized `params` format. 
It can be only a `list`, " + f"got {type(params)!r}" + ) + new_load_args = [] + for value in params: + try: + as_date = dt.date.fromisoformat(value) + new_val = dt.datetime.combine(as_date, dt.time.min) + new_load_args.append(new_val.strftime("%Y-%m-%dT%H:%M:%S")) + except (TypeError, ValueError): + new_load_args.append(value) + if new_load_args: + self._load_args["params"] = new_load_args diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 4b9a05f1a..c6f5915fc 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -67,7 +67,7 @@ def _collect_requirements(requires): "pandas.JSONDataSet": [PANDAS], "pandas.ParquetDataSet": [PANDAS, "pyarrow>=6.0"], "pandas.SQLTableDataSet": [PANDAS, "SQLAlchemy~=1.2"], - "pandas.SQLQueryDataSet": [PANDAS, "SQLAlchemy~=1.2"], + "pandas.SQLQueryDataSet": [PANDAS, "SQLAlchemy~=1.2", "pyodbc~=4.0"], "pandas.XMLDataSet": [PANDAS, "lxml~=4.6"], "pandas.GenericDataSet": [PANDAS], } diff --git a/kedro-datasets/test_requirements.txt b/kedro-datasets/test_requirements.txt index b4424ce7d..c6fc16a3e 100644 --- a/kedro-datasets/test_requirements.txt +++ b/kedro-datasets/test_requirements.txt @@ -39,6 +39,7 @@ pre-commit>=2.9.2, <3.0 # The hook `mypy` requires pre-commit version 2.9.2. psutil==5.8.0 pyarrow>=1.0, <7.0 pylint>=2.5.2, <3.0 +pyodbc~=4.0.35 pyproj~=3.0 pyspark>=2.2, <4.0 pytest-cov~=3.0 diff --git a/kedro-datasets/tests/pandas/test_sql_dataset.py b/kedro-datasets/tests/pandas/test_sql_dataset.py index a1c6839d6..aa9fe8d17 100644 --- a/kedro-datasets/tests/pandas/test_sql_dataset.py +++ b/kedro-datasets/tests/pandas/test_sql_dataset.py @@ -11,6 +11,7 @@ TABLE_NAME = "table_a" CONNECTION = "sqlite:///kedro.db" +MSSQL_CONNECTION = "mssql+pyodbc://?odbc_connect=DRIVER%3DODBC+Driver+for+SQL" SQL_QUERY = "SELECT * FROM table_a" EXECUTION_OPTIONS = {"stream_results": True} FAKE_CONN_STR = "some_sql://scott:tiger@localhost/foo" @@ -417,3 +418,56 @@ def test_create_connection_only_once(self, mocker): assert mock_engine.call_count == 2 assert fourth.engines == first.engines assert len(first.engines) == 2 + + def test_adapt_mssql_date_params_called(self, mocker): + """Test that the adapt_mssql_date_params + function is called when mssql backend is used. + """ + mock_adapt_mssql_date_params = mocker.patch( + "kedro_datasets.pandas.sql_dataset.SQLQueryDataSet.adapt_mssql_date_params" + ) + mock_engine = mocker.patch("kedro_datasets.pandas.sql_dataset.create_engine") + ds = SQLQueryDataSet(sql=SQL_QUERY, credentials={"con": MSSQL_CONNECTION}) + mock_engine.assert_called_once_with(MSSQL_CONNECTION) + assert mock_adapt_mssql_date_params.call_count == 1 + assert len(ds.engines) == 1 + + def test_adapt_mssql_date_params(self, mocker): + """Test that the adapt_mssql_date_params + function transforms the params as expected, i.e. + making datetime date into the format %Y-%m-%dT%H:%M:%S + and ignoring the other values. 
+ """ + mocker.patch("kedro_datasets.pandas.sql_dataset.create_engine") + load_args = { + "params": ["2023-01-01", "2023-01-01T20:26", "2023", "test", 1.0, 100] + } + ds = SQLQueryDataSet( + sql=SQL_QUERY, credentials={"con": MSSQL_CONNECTION}, load_args=load_args + ) + assert ds._load_args["params"] == [ + "2023-01-01T00:00:00", + "2023-01-01T20:26", + "2023", + "test", + 1.0, + 100, + ] + + def test_adapt_mssql_date_params_wrong_input(self, mocker): + """Test that the adapt_mssql_date_params + function fails with the correct error message + when given a wrong input + """ + mocker.patch("kedro_datasets.pandas.sql_dataset.create_engine") + load_args = {"params": {"value": 1000}} + pattern = ( + "Unrecognized `params` format. It can be only a `list`, " + "got " + ) + with pytest.raises(DataSetError, match=pattern): + SQLQueryDataSet( + sql=SQL_QUERY, + credentials={"con": MSSQL_CONNECTION}, + load_args=load_args, + ) From e14f5b49505320d23838b91317bfa76bc3dfdc3d Mon Sep 17 00:00:00 2001 From: Jannic <37243923+jmholzer@users.noreply.github.com> Date: Mon, 6 Mar 2023 12:58:09 +0000 Subject: [PATCH 18/74] Add warning when `SparkDataSet` is used on Databricks without a valid file path (#114) * Add databricks deployment check and automatic DBFS path addition Signed-off-by: Jannic Holzer * Add newline at end of file Signed-off-by: Jannic Holzer * Remove spurious 'not' Signed-off-by: Jannic Holzer * Move dbfs utility functions from SparkDataSet Signed-off-by: Jannic Holzer * Add edge case logic to _build_dbfs_path Signed-off-by: Jannic Holzer * Add test for dbfs path construction Signed-off-by: Jannic Holzer * Linting Signed-off-by: Jannic Holzer * Remove spurious print statement :) Signed-off-by: Jannic Holzer * Add pylint disable too-many-public-methods Signed-off-by: Jannic Holzer * Move tests into single method to appease linter Signed-off-by: Jannic Holzer * Modify prefix check to /dbfs/ Signed-off-by: Jannic Holzer * Modify prefix check to /dbfs/ Signed-off-by: Jannic Holzer * Make warning message clearer Signed-off-by: Jannic Holzer * Add release note Signed-off-by: Jannic Holzer * Fix linting Signed-off-by: Jannic Holzer * Update warning message Signed-off-by: Jannic Holzer * Modify log warning level to error Signed-off-by: Jannic Holzer * Modify message back to warning, refer to undefined behaviour Signed-off-by: Jannic Holzer * Modify required prefix to /dbfs/ Signed-off-by: Jannic Holzer * Modify doc string Signed-off-by: Jannic Holzer * Modify warning message Signed-off-by: Jannic Holzer * Split tests and add filepath to warning Signed-off-by: Jannic Holzer * Modify f string in logging call Signed-off-by: Jannic Holzer * Fix tests Signed-off-by: Jannic Holzer * Lint Signed-off-by: Jannic Holzer --------- Signed-off-by: Jannic Holzer Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 3 ++ .../kedro_datasets/spark/spark_dataset.py | 29 +++++++++++++++--- .../tests/spark/test_spark_dataset.py | 30 +++++++++++++++++++ 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 412fe9f9c..36b2d6c12 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -13,6 +13,9 @@ ## Bug fixes and other changes * Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. +## Bug fixes and other changes +* Added a warning when the user tries to use `SparkDataSet` on Databricks without specifying a file path with the `/dbfs/` prefix. 
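To make the convention behind the new warning concrete, here is a minimal sketch; the parquet path is an illustrative placeholder, while the relative path mirrors the one used in the new tests further down:

```python
# Minimal sketch of the /dbfs/ convention enforced by the new warning; paths are illustrative.
from kedro_datasets.spark import SparkDataSet

# On Databricks, a filepath that starts with "/dbfs/" is accepted silently.
weather = SparkDataSet(
    filepath="/dbfs/mnt/raw/weather.parquet",  # placeholder mount path
    file_format="parquet",
)

# A bare workspace-relative path now logs the warning about a known source of error.
processed = SparkDataSet(
    filepath="my_project/data/02_intermediate/processed_data",  # same path as in the new tests
    file_format="parquet",
)
```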
+ # Release 1.0.2: ## Bug fixes and other changes diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index ca923c72e..d366eae08 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -2,6 +2,8 @@ ``pyspark`` """ import json +import logging +import os from copy import deepcopy from fnmatch import fnmatch from functools import partial @@ -23,6 +25,8 @@ from pyspark.sql.utils import AnalysisException from s3fs import S3FileSystem +logger = logging.getLogger(__name__) + def _parse_glob_pattern(pattern: str) -> str: special = ("*", "?", "[") @@ -114,6 +118,20 @@ def _dbfs_exists(pattern: str, dbutils: Any) -> bool: return False +def _deployed_on_databricks() -> bool: + """Check if running on Databricks.""" + return "DATABRICKS_RUNTIME_VERSION" in os.environ + + +def _path_has_dbfs_prefix(path: str) -> bool: + """Check if a file path has a valid dbfs prefix. + + Args: + path: File path to check. + """ + return path.startswith("/dbfs/") + + class KedroHdfsInsecureClient(InsecureClient): """Subclasses ``hdfs.InsecureClient`` and implements ``hdfs_exists`` and ``hdfs_glob`` methods required by ``SparkDataSet``""" @@ -240,9 +258,7 @@ def __init__( # pylint: disable=too-many-arguments Args: filepath: Filepath in POSIX format to a Spark dataframe. When using Databricks - and working with data written to mount path points, - specify ``filepath``s for (versioned) ``SparkDataSet``s - starting with ``/dbfs/mnt``. + specify ``filepath``s starting with ``/dbfs/``. file_format: File format used during load and save operations. These are formats supported by the running SparkContext include parquet, csv, delta. For a list of supported @@ -304,7 +320,12 @@ def __init__( # pylint: disable=too-many-arguments else: path = PurePosixPath(filepath) - + if _deployed_on_databricks() and not _path_has_dbfs_prefix(filepath): + logger.warning( + "Using SparkDataSet on Databricks without the `/dbfs/` prefix in the " + "filepath is a known source of error. You must add this prefix to %s", + filepath, + ) if filepath.startswith("/dbfs"): dbutils = _get_dbutils(self._get_spark()) if dbutils: diff --git a/kedro-datasets/tests/spark/test_spark_dataset.py b/kedro-datasets/tests/spark/test_spark_dataset.py index d02f99bff..74c5ee2bf 100644 --- a/kedro-datasets/tests/spark/test_spark_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_dataset.py @@ -1,3 +1,4 @@ +# pylint: disable=too-many-lines import re import sys import tempfile @@ -161,6 +162,7 @@ def isDir(self): return "." not in self.path.split("/")[-1] +# pylint: disable=too-many-public-methods class TestSparkDataSet: def test_load_parquet(self, tmp_path, sample_pandas_df): temp_path = (tmp_path / "data").as_posix() @@ -440,6 +442,34 @@ def test_copy(self): assert spark_dataset_copy._file_format == "csv" assert spark_dataset_copy._save_args == {"mode": "overwrite"} + def test_dbfs_prefix_warning_no_databricks(self, caplog): + # test that warning is not raised when not on Databricks + filepath = "my_project/data/02_intermediate/processed_data" + expected_message = ( + "Using SparkDataSet on Databricks without the `/dbfs/` prefix in the " + f"filepath is a known source of error. You must add this prefix to {filepath}." 
+ ) + SparkDataSet(filepath="my_project/data/02_intermediate/processed_data") + assert expected_message not in caplog.text + + def test_dbfs_prefix_warning_on_databricks_with_prefix(self, monkeypatch, caplog): + # test that warning is not raised when on Databricks and filepath has /dbfs prefix + filepath = "/dbfs/my_project/data/02_intermediate/processed_data" + monkeypatch.setenv("DATABRICKS_RUNTIME_VERSION", "7.3") + SparkDataSet(filepath=filepath) + assert caplog.text == "" + + def test_dbfs_prefix_warning_on_databricks_no_prefix(self, monkeypatch, caplog): + # test that warning is raised when on Databricks and filepath does not have /dbfs prefix + filepath = "my_project/data/02_intermediate/processed_data" + expected_message = ( + "Using SparkDataSet on Databricks without the `/dbfs/` prefix in the " + f"filepath is a known source of error. You must add this prefix to {filepath}" + ) + monkeypatch.setenv("DATABRICKS_RUNTIME_VERSION", "7.3") + SparkDataSet(filepath=filepath) + assert expected_message in caplog.text + class TestSparkDataSetVersionedLocal: def test_no_version(self, versioned_dataset_local): From 524535abc83203f4337c15c40d6793a56ee99bb8 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 8 Mar 2023 15:14:37 -0500 Subject: [PATCH 19/74] cleaned up mlflow references from setup.py for initial release Signed-off-by: Danny Farah --- kedro-datasets/setup.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index c6f5915fc..0d0a8505f 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -39,13 +39,7 @@ def _collect_requirements(requires): biosequence_require = {"biosequence.BioSequenceDataSet": ["biopython~=1.73"]} dask_require = {"dask.ParquetDataSet": ["dask[complete]~=2021.10", "triad>=0.6.7, <1.0"]} databricks_require = { - "databricks.ManagedTableDataSet": [SPARK, PANDAS], - "databricks.MLFlowModel":[SPARK, PANDAS, "mlflow>=2.0.0"], - "databricks.MLFlowArtifact":[SPARK, PANDAS, "mlflow>=2.0.0"], - "databricks.MLFlowDataSet":[SPARK, PANDAS, "mlflow>=2.0.0"], - "databricks.MLFlowMetrics":[SPARK, PANDAS, "mlflow>=2.0.0"], - "databricks.MLFlowModelMetadata":[SPARK, PANDAS, "mlflow>=2.0.0"], - "databricks.MLFlowTags":[SPARK, PANDAS, "mlflow>=2.0.0"] + "databricks.ManagedTableDataSet": [SPARK, PANDAS] } geopandas_require = { "geopandas.GeoJSONDataSet": ["geopandas>=0.6.0, <1.0", "pyproj~=3.0"] From 9389aa457248f8ec915b1c7b8b7192c9d7b60454 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 8 Mar 2023 15:16:51 -0500 Subject: [PATCH 20/74] fixed deps in setup.py Signed-off-by: Danny Farah --- kedro-datasets/setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 0d0a8505f..44bb97185 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -14,6 +14,7 @@ HDFS = "hdfs>=2.5.8, <3.0" S3FS = "s3fs>=0.3.0, <0.5" POLARS = "polars~=0.15.16" +DELTA = "delta-spark~=1.2.1" with open("requirements.txt", "r", encoding="utf-8") as f: install_requires = [x.strip() for x in f if x.strip()] @@ -39,7 +40,7 @@ def _collect_requirements(requires): biosequence_require = {"biosequence.BioSequenceDataSet": ["biopython~=1.73"]} dask_require = {"dask.ParquetDataSet": ["dask[complete]~=2021.10", "triad>=0.6.7, <1.0"]} databricks_require = { - "databricks.ManagedTableDataSet": [SPARK, PANDAS] + "databricks.ManagedTableDataSet": [SPARK, PANDAS, DELTA] } geopandas_require = { "geopandas.GeoJSONDataSet": ["geopandas>=0.6.0, <1.0", 
"pyproj~=3.0"] From e6157a541709161d1d41a3037a315c59a70f122c Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Mon, 13 Mar 2023 18:08:37 -0400 Subject: [PATCH 21/74] adding comments before intiial PR Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/__init__.py | 7 +- .../databricks/managed_table_dataset.py | 342 ++++++++++++++++++ .../databricks/unity/__init__.py | 1 - .../databricks/unity/managed_table_dataset.py | 198 ---------- .../kedro_datasets/pandas/generic_dataset.py | 2 - .../kedro_datasets/spark/spark_dataset.py | 2 - .../spark/spark_jdbc_dataset.py | 1 - ...unity.py => test_managed_table_dataset.py} | 33 +- .../matplotlib/test_matplotlib_writer.py | 2 - .../tests/polars/test_csv_dataset.py | 1 - 10 files changed, 366 insertions(+), 223 deletions(-) create mode 100644 kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py delete mode 100644 kedro-datasets/kedro_datasets/databricks/unity/__init__.py delete mode 100644 kedro-datasets/kedro_datasets/databricks/unity/managed_table_dataset.py rename kedro-datasets/tests/databricks/{test_unity.py => test_managed_table_dataset.py} (94%) diff --git a/kedro-datasets/kedro_datasets/databricks/__init__.py b/kedro-datasets/kedro_datasets/databricks/__init__.py index 7819a2e06..d416ac291 100644 --- a/kedro-datasets/kedro_datasets/databricks/__init__.py +++ b/kedro-datasets/kedro_datasets/databricks/__init__.py @@ -1,3 +1,8 @@ """Provides interface to Unity Catalog Tables.""" -from .unity import ManagedTableDataSet +__all__ = ["ManagedTableDataSet"] + +from contextlib import suppress + +with suppress(ImportError): + from .managed_table_dataset import ManagedTableDataSet diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py new file mode 100644 index 000000000..1b9e0c737 --- /dev/null +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -0,0 +1,342 @@ +"""``ManagedTableDataSet`` implementation to access managed delta tables +in Databricks. +""" +import dataclasses +import logging +from functools import partial +from operator import attrgetter +from typing import Any, Dict, List, Union + +import pandas as pd +from cachetools import Cache, cachedmethod +from cachetools.keys import hashkey +from kedro.io.core import ( + AbstractVersionedDataSet, + DataSetError, + Version, + VersionNotFoundError, +) +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql.types import StructType +from pyspark.sql.utils import AnalysisException, ParseException + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class Table: # pylint: disable=R0902 + """Stores the definition of a managed table""" + + database: str + catalog: str + table: str + full_table_location: str + write_mode: str + dataframe_type: str + primary_key: str + owner_group: str + partition_columns: str | List[str] + + +class ManagedTableDataSet(AbstractVersionedDataSet): + """``ManagedTableDataSet`` loads and saves data into managed delta tables on Databricks. + + Example usage for the + `YAML API `_: + .. 
code-block:: yaml + + names_and_ages@spark: + type: databricks.ManagedTableDataSet + table: names_and_ages + + names_and_ages@pandas: + type: databricks.ManagedTableDataSet + table: names_and_ages + dataframe_type: pandas + + Example usage for the + `Python API `_: + :: + Launch a pyspark session with the following configs: + % pyspark --packages io.delta:delta-core_2.12:1.2.1 + --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" + --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" + + >>> from pyspark.sql import SparkSession + >>> from pyspark.sql.types import (StructField, StringType, + IntegerType, StructType) + >>> from kedro_datasets.databricks import ManagedTableDataSet + >>> schema = StructType([StructField("name", StringType(), True), + StructField("age", IntegerType(), True)]) + >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) + >>> data_set = ManagedTableDataSet(table="names_and_ages") + >>> data_set.save(spark_df) + >>> reloaded = data_set.load() + >>> reloaded.take(4)""" + + # this dataset cannot be used with ``ParallelRunner``, + # therefore it has the attribute ``_SINGLE_PROCESS = True`` + # for parallelism within a Spark pipeline please consider + # using ``ThreadRunner`` instead + _SINGLE_PROCESS = True + _VALID_WRITE_MODES = ["overwrite", "upsert", "append"] + _VALID_DATAFRAME_TYPES = ["spark", "pandas"] + + def __init__( # pylint: disable=R0913 + self, + table: str, + catalog: str = None, + database: str = "default", + write_mode: str = "overwrite", + dataframe_type: str = "spark", + primary_key: Union[str, List[str]] = None, + version: Version = None, + *, + # the following parameters are used by project hooks + # to create or update table properties + schema: Dict[str, Any] = None, + partition_columns: List[str] = None, + owner_group: str = None, + ) -> None: + """Creates a new instance of ``ManagedTableDataSet``.""" + + full_table_location = None + if catalog and database and table: + full_table_location = f"{catalog}.{database}.{table}" + elif table: + full_table_location = f"{database}.{table}" + if write_mode not in self._VALID_WRITE_MODES: + valid_modes = ", ".join(self._VALID_WRITE_MODES) + raise DataSetError( + f"Invalid `write_mode` provided: {write_mode}. " + f"`write_mode` must be one of: {valid_modes}" + ) + if dataframe_type not in self._VALID_DATAFRAME_TYPES: + valid_types = ", ".join(self._VALID_DATAFRAME_TYPES) + raise DataSetError(f"`dataframe_type` must be one of {valid_types}") + if primary_key is None or len(primary_key) == 0: + if write_mode == "upsert": + raise DataSetError( + f"`primary_key` must be provided for" f"`write_mode` {write_mode}" + ) + self._table = Table( + database=database, + catalog=catalog, + table=table, + full_table_location=full_table_location, + write_mode=write_mode, + dataframe_type=dataframe_type, + primary_key=primary_key, + owner_group=owner_group, + partition_columns=partition_columns, + ) + + self._version_cache = Cache(maxsize=2) + self._version = version + + self._schema = None + if schema is not None: + self._schema = StructType.fromJson(schema) + + super().__init__( + filepath=None, + version=version, + exists_function=self._exists, + ) + + @cachedmethod(cache=attrgetter("_version_cache"), key=partial(hashkey, "load")) + def _fetch_latest_load_version(self) -> int: + # When load version is unpinned, fetch the most recent existing + # version from the given path. 
+ latest_history = ( + self._get_spark() + .sql(f"DESCRIBE HISTORY {self._table.full_table_location} LIMIT 1") + .collect() + ) + if len(latest_history) != 1: + raise VersionNotFoundError( + f"Did not find any versions for {self._table.full_table_location}" + ) + return latest_history[0].version + + # 'key' is set to prevent cache key overlapping for load and save: + # https://cachetools.readthedocs.io/en/stable/#cachetools.cachedmethod + @cachedmethod(cache=attrgetter("_version_cache"), key=partial(hashkey, "save")) + def _fetch_latest_save_version(self) -> int: + """Generate and cache the current save version""" + return None + + @staticmethod + def _get_spark() -> SparkSession: + return SparkSession.builder.getOrCreate() + + def _load(self) -> Union[DataFrame, pd.DataFrame]: + """Loads the version of data in the format defined in the init + (spark|pandas dataframe) + + Raises: + VersionNotFoundError: if the version defined in + the init doesn't exist + + Returns: + Union[DataFrame, pd.DataFrame]: Returns a dataframe + in the format defined in the init + """ + if self._version and self._version.load >= 0: + try: + data = ( + self._get_spark() + .read.format("delta") + .option("versionAsOf", self._version.load) + .table(self._table.full_table_location) + ) + except Exception as exc: + raise VersionNotFoundError(self._version) from exc + else: + data = self._get_spark().table(self._table.full_table_location) + if self._table.dataframe_type == "pandas": + data = data.toPandas() + return data + + def _save_append(self, data: DataFrame) -> None: + """Saves the data to the table by appending it + to the location defined in the init + + Args: + data (DataFrame): the Spark dataframe to append to the table + """ + data.write.format("delta").mode("append").saveAsTable( + self._table.full_table_location + ) + + def _save_overwrite(self, data: DataFrame) -> None: + """Overwrites the data in the table with the data provided. + (this is the default save mode) + + Args: + data (DataFrame): the Spark dataframe to overwrite the table with. + """ + delta_table = data.write.format("delta") + if self._table.write_mode == "overwrite": + delta_table = delta_table.mode("overwrite").option( + "overwriteSchema", "true" + ) + delta_table.saveAsTable(self._table.full_table_location) + + def _save_upsert(self, update_data: DataFrame) -> None: + """Upserts the data by joining on primary_key columns or column. + If table doesn't exist at save, the data is inserted to a new table. + + Args: + update_data (DataFrame): the Spark dataframe to upsert + """ + if self._exists(): + base_data = self._get_spark().table(self._table.full_table_location) + base_columns = base_data.columns + update_columns = update_data.columns + + if set(update_columns) != set(base_columns): + raise DataSetError( + f"Upsert requires tables to have identical columns. 
" + f"Delta table {self._table.full_table_location} " + f"has columns: {base_columns}, whereas " + f"dataframe has columns {update_columns}" + ) + + where_expr = "" + if isinstance(self._table.primary_key, str): + where_expr = ( + f"base.{self._table.primary_key}=update.{self._table.primary_key}" + ) + elif isinstance(self._table.primary_key, list): + where_expr = " AND ".join( + f"base.{col}=update.{col}" for col in self._table.primary_key + ) + + update_data.createOrReplaceTempView("update") + self._get_spark().conf.set( + "fullTableAddress", self._table.full_table_location + ) + self._get_spark().conf.set("whereExpr", where_expr) + upsert_sql = """MERGE INTO ${fullTableAddress} base USING update ON ${whereExpr} + WHEN MATCHED THEN UPDATE SET * WHEN NOT MATCHED THEN INSERT *""" + self._get_spark().sql(upsert_sql) + else: + self._save_append(update_data) + + def _save(self, data: Union[DataFrame, pd.DataFrame]) -> None: + """Saves the data based on the write_mode and dataframe_type in the init. + If write_mode is pandas, Spark dataframe is created first. + If schema is provided, data is matched to schema before saving + (columns will be sorted and truncated). + + Args: + data (Any): Spark or pandas dataframe to save to the table location + """ + # filter columns specified in schema and match their ordering + if self._schema: + cols = self._schema.fieldNames() + if self._table.dataframe_type == "pandas": + data = self._get_spark().createDataFrame( + data.loc[:, cols], schema=self._schema + ) + else: + data = data.select(*cols) + else: + if self._table.dataframe_type == "pandas": + data = self._get_spark().createDataFrame(data) + if self._table.write_mode == "overwrite": + self._save_overwrite(data) + elif self._table.write_mode == "upsert": + self._save_upsert(data) + elif self._table.write_mode == "append": + self._save_append(data) + + def _describe(self) -> Dict[str, str]: + """Returns a description of the instance of ManagedTableDataSet + + Returns: + Dict[str, str]: Dict with the details of the dataset + """ + return { + "catalog": self._table.catalog, + "database": self._table.database, + "table": self._table.table, + "write_mode": self._table.write_mode, + "dataframe_type": self._table.dataframe_type, + "primary_key": self._table.primary_key, + "version": self._version, + "owner_group": self._table.owner_group, + "partition_columns": self._table.partition_columns, + } + + def _exists(self) -> bool: + """Checks to see if the table exists + + Returns: + bool: boolean of whether the table defined + in the dataset instance exists in the Spark session + """ + if self._table.catalog: + try: + self._get_spark().sql(f"USE CATALOG {self._table.catalog}") + except (ParseException, AnalysisException) as exc: + logger.warning( + "catalog %s not found or unity not enabled. 
Error message: %s", + self._table.catalog, + exc, + ) + try: + return ( + self._get_spark() + .sql(f"SHOW TABLES IN `{self._table.database}`") + .filter(f"tableName = '{self._table.table}'") + .count() + > 0 + ) + except (ParseException, AnalysisException) as exc: + logger.warning("error occured while trying to find table: %s", exc) + return False diff --git a/kedro-datasets/kedro_datasets/databricks/unity/__init__.py b/kedro-datasets/kedro_datasets/databricks/unity/__init__.py deleted file mode 100644 index ab452e146..000000000 --- a/kedro-datasets/kedro_datasets/databricks/unity/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .managed_table_dataset import ManagedTableDataSet \ No newline at end of file diff --git a/kedro-datasets/kedro_datasets/databricks/unity/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/unity/managed_table_dataset.py deleted file mode 100644 index f0f04b7be..000000000 --- a/kedro-datasets/kedro_datasets/databricks/unity/managed_table_dataset.py +++ /dev/null @@ -1,198 +0,0 @@ -import logging -import pandas as pd - -from operator import attrgetter -from functools import partial -from cachetools.keys import hashkey -from typing import Any, Dict, List, Union -from cachetools import Cache, cachedmethod -from kedro.io.core import ( - AbstractVersionedDataSet, - DataSetError, - Version, - VersionNotFoundError, -) -from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.types import StructType -from cachetools import Cache - -logger = logging.getLogger(__name__) - - -class ManagedTableDataSet(AbstractVersionedDataSet): - """``ManagedTableDataSet`` loads and saves data into managed delta tables.""" - - # this dataset cannot be used with ``ParallelRunner``, - # therefore it has the attribute ``_SINGLE_PROCESS = True`` - # for parallelism within a Spark pipeline please consider - # using ``ThreadRunner`` instead - _SINGLE_PROCESS = True - _VALID_WRITE_MODES = ["overwrite", "upsert", "append"] - _VALID_DATAFRAME_TYPES = ["spark", "pandas"] - - def __init__( - self, - table: str, - catalog: str = None, - database: str = "default", - write_mode: str = "overwrite", - dataframe_type: str = "spark", - primary_key: Union[str, List[str]] = None, - version: Version = None, - *, - # the following parameters are used by the hook to create or update unity - schema: Dict[str, Any] = None, # pylint: disable=unused-argument - partition_columns: List[str] = None, # pylint: disable=unused-argument - owner_group: str = None, - ) -> None: - """Creates a new instance of ``ManagedTableDataSet``.""" - - self._database = database - self._catalog = catalog - self._table = table - self._owner_group = owner_group - self._partition_columns = partition_columns - if catalog and database and table: - self._full_table_address = f"{catalog}.{database}.{table}" - elif table: - self._full_table_address = f"{database}.{table}" - - if write_mode not in self._VALID_WRITE_MODES: - valid_modes = ", ".join(self._VALID_WRITE_MODES) - raise DataSetError( - f"Invalid `write_mode` provided: {write_mode}. 
" - f"`write_mode` must be one of: {valid_modes}" - ) - self._write_mode = write_mode - - if dataframe_type not in self._VALID_DATAFRAME_TYPES: - valid_types = ", ".join(self._VALID_DATAFRAME_TYPES) - raise DataSetError(f"`dataframe_type` must be one of {valid_types}") - self._dataframe_type = dataframe_type - - if primary_key is None or len(primary_key) == 0: - if write_mode == "upsert": - raise DataSetError( - f"`primary_key` must be provided for" f"`write_mode` {write_mode}" - ) - - self._primary_key = primary_key - self._version_cache = Cache(maxsize=2) - self._version = version - - self._schema = None - if schema is not None: - self._schema = StructType.fromJson(schema) - - def _get_spark(self) -> SparkSession: - return ( - SparkSession.builder.getOrCreate() - ) - - def _load(self) -> Union[DataFrame, pd.DataFrame]: - if self._version and self._version.load >= 0: - try: - data = ( - self._get_spark() - .read.format("delta") - .option("versionAsOf", self._version.load) - .table(self._full_table_address) - ) - except: - raise VersionNotFoundError - else: - data = self._get_spark().table(self._full_table_address) - if self._dataframe_type == "pandas": - data = data.toPandas() - return data - - def _save_append(self, data: DataFrame) -> None: - data.write.format("delta").mode("append").saveAsTable(self._full_table_address) - - def _save_overwrite(self, data: DataFrame) -> None: - delta_table = data.write.format("delta") - if self._write_mode == "overwrite": - delta_table = delta_table.mode("overwrite").option( - "overwriteSchema", "true" - ) - delta_table.saveAsTable(self._full_table_address) - - def _save_upsert(self, update_data: DataFrame) -> None: - if self._exists(): - base_data = self._get_spark().table(self._full_table_address) - base_columns = base_data.columns - update_columns = update_data.columns - - if set(update_columns) != set(base_columns): - raise DataSetError( - f"Upsert requires tables to have identical columns. 
" - f"Delta table {self._full_table_address} " - f"has columns: {base_columns}, whereas " - f"dataframe has columns {update_columns}" - ) - - where_expr = "" - if isinstance(self._primary_key, str): - where_expr = f"base.{self._primary_key}=update.{self._primary_key}" - elif isinstance(self._primary_key, list): - where_expr = " AND ".join( - f"base.{col}=update.{col}" for col in self._primary_key - ) - - update_data.createOrReplaceTempView("update") - - upsert_sql = f"""MERGE INTO {self._full_table_address} base USING update - ON {where_expr} WHEN MATCHED THEN UPDATE SET * WHEN NOT MATCHED THEN INSERT * - """ - self._get_spark().sql(upsert_sql) - else: - self._save_append(update_data) - - def _save(self, data: Any) -> None: - # filter columns specified in schema and match their ordering - if self._schema: - cols = self._schema.fieldNames() - if self._dataframe_type == "pandas": - data = self._get_spark().createDataFrame( - data.loc[:, cols], schema=self._schema - ) - else: - data = data.select(*cols) - else: - if self._dataframe_type == "pandas": - data = self._get_spark().createDataFrame(data) - if self._write_mode == "overwrite": - self._save_overwrite(data) - elif self._write_mode == "upsert": - self._save_upsert(data) - elif self._write_mode == "append": - self._save_append(data) - - def _describe(self) -> Dict[str, str]: - return dict( - catalog=self._catalog, - database=self._database, - table=self._table, - write_mode=self._write_mode, - dataframe_type=self._dataframe_type, - primary_key=self._primary_key, - version=self._version, - owner_group=self._owner_group, - ) - - def _exists(self) -> bool: - if self._catalog: - try: - self._get_spark().sql(f"USE CATALOG {self._catalog}") - except: - logger.warn(f"catalog {self._catalog} not found") - try: - return ( - self._get_spark() - .sql(f"SHOW TABLES IN `{self._database}`") - .filter(f"tableName = '{self._table}'") - .count() - > 0 - ) - except: - return False diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index 86e347d70..d9702c7b8 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -181,7 +181,6 @@ def _ensure_file_system_target(self) -> None: ) def _load(self) -> pd.DataFrame: - self._ensure_file_system_target() load_path = get_filepath_str(self._get_load_path(), self._protocol) @@ -196,7 +195,6 @@ def _load(self) -> pd.DataFrame: ) def _save(self, data: pd.DataFrame) -> None: - self._ensure_file_system_target() save_path = get_filepath_str(self._get_save_path(), self._protocol) diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index d366eae08..3a77e08ff 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -359,7 +359,6 @@ def __init__( # pylint: disable=too-many-arguments @staticmethod def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: - filepath = schema.get("filepath") if not filepath: raise DataSetError( @@ -375,7 +374,6 @@ def _load_schema_from_file(schema: Dict[str, Any]) -> StructType: # Open schema file with file_system.open(load_path) as fs_file: - try: return StructType.fromJson(json.loads(fs_file.read())) except Exception as exc: diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index aab501f26..dcb4185e7 100644 --- 
a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -126,7 +126,6 @@ def __init__( # Update properties in load_args and save_args with credentials. if credentials is not None: - # Check credentials for bad inputs. for cred_key, cred_value in credentials.items(): if cred_value is None: diff --git a/kedro-datasets/tests/databricks/test_unity.py b/kedro-datasets/tests/databricks/test_managed_table_dataset.py similarity index 94% rename from kedro-datasets/tests/databricks/test_unity.py rename to kedro-datasets/tests/databricks/test_managed_table_dataset.py index 0d54e29e4..f5bc494a1 100644 --- a/kedro-datasets/tests/databricks/test_unity.py +++ b/kedro-datasets/tests/databricks/test_managed_table_dataset.py @@ -1,8 +1,9 @@ +import pandas as pd import pytest -from kedro.io.core import DataSetError, VersionNotFoundError, Version -from pyspark.sql.types import IntegerType, StringType, StructField, StructType +from kedro.io.core import DataSetError, Version, VersionNotFoundError from pyspark.sql import DataFrame, SparkSession -import pandas as pd +from pyspark.sql.types import IntegerType, StringType, StructField, StructType + from kedro_datasets.databricks import ManagedTableDataSet @@ -171,19 +172,16 @@ def expected_upsert_multiple_primary_spark_df(spark_session: SparkSession): class TestManagedTableDataSet: def test_full_table(self): unity_ds = ManagedTableDataSet(catalog="test", database="test", table="test") - assert unity_ds._full_table_address == "test.test.test" + assert unity_ds._table.full_table_location == "test.test.test" - def test_database_table(self): unity_ds = ManagedTableDataSet(database="test", table="test") - assert unity_ds._full_table_address == "test.test" + assert unity_ds._table.full_table_location == "test.test" - def test_table_only(self): unity_ds = ManagedTableDataSet(table="test") - assert unity_ds._full_table_address == "default.test" + assert unity_ds._table.full_table_location == "default.test" - def test_table_missing(self): with pytest.raises(TypeError): - ManagedTableDataSet() + ManagedTableDataSet() # pylint: disable=no-value-for-parameter def test_describe(self): unity_ds = ManagedTableDataSet(table="test") @@ -195,7 +193,8 @@ def test_describe(self): "dataframe_type": "spark", "primary_key": None, "version": None, - "owner_group": None + "owner_group": None, + "partition_columns": None, } def test_invalid_write_mode(self): @@ -240,7 +239,9 @@ def test_schema(self): assert unity_ds._schema == expected_schema def test_catalog_exists(self): - unity_ds = ManagedTableDataSet(catalog="test", database="invalid", table="test_not_there") + unity_ds = ManagedTableDataSet( + catalog="test", database="invalid", table="test_not_there" + ) assert not unity_ds._exists() def test_table_does_not_exist(self): @@ -251,7 +252,9 @@ def test_save_default(self, sample_spark_df: DataFrame): unity_ds = ManagedTableDataSet(database="test", table="test_save") unity_ds.save(sample_spark_df) saved_table = unity_ds.load() - assert unity_ds.exists() and sample_spark_df.exceptAll(saved_table).count() == 0 + assert ( + unity_ds._exists() and sample_spark_df.exceptAll(saved_table).count() == 0 + ) def test_save_schema_spark( self, subset_spark_df: DataFrame, subset_expected_df: DataFrame @@ -414,7 +417,7 @@ def test_load_spark_no_version(self, sample_spark_df: DataFrame): unity_ds.save(sample_spark_df) delta_ds = ManagedTableDataSet( - database="test", table="test_load_spark", version=Version(2,None) + 
database="test", table="test_load_spark", version=Version(2, None) ) with pytest.raises(VersionNotFoundError): _ = delta_ds.load() @@ -427,7 +430,7 @@ def test_load_version(self, sample_spark_df: DataFrame, append_spark_df: DataFra unity_ds.save(append_spark_df) loaded_ds = ManagedTableDataSet( - database="test", table="test_load_version", version=Version(0,None) + database="test", table="test_load_version", version=Version(0, None) ) loaded_df = loaded_ds.load() diff --git a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py index 0745452c6..4086e127e 100644 --- a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py +++ b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py @@ -170,7 +170,6 @@ def test_dict_save(self, tmp_path, mock_dict_plot, plot_writer, mocked_s3_bucket plot_writer.save(mock_dict_plot) for colour in COLOUR_LIST: - download_path = tmp_path / "downloaded_image.png" actual_filepath = tmp_path / "locally_saved.png" @@ -361,7 +360,6 @@ def test_list_save(self, tmp_path, mock_list_plot, versioned_plot_writer): versioned_plot_writer.save(mock_list_plot) for index in range(5): - test_path = tmp_path / "test_image.png" versioned_filepath = str(versioned_plot_writer._get_load_path()) diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py index 8b05a2025..d79183539 100644 --- a/kedro-datasets/tests/polars/test_csv_dataset.py +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -77,7 +77,6 @@ def mocked_dataframe(): @pytest.fixture def mocked_csv_in_s3(mocked_s3_bucket, mocked_dataframe: pl.DataFrame): - binarycsv = mocked_dataframe.write_csv()[:-1] mocked_s3_bucket.put_object( From a314685c9aa24a32275ebcf5623f24458cc5c583 Mon Sep 17 00:00:00 2001 From: Vladimir Filimonov <69304033+Vladimir-Filimonov@users.noreply.github.com> Date: Thu, 9 Mar 2023 10:25:40 +0100 Subject: [PATCH 22/74] Snowpark (Snowflake) dataset for kedro (#104) * Add Snowpark datasets Signed-off-by: Vladimir Filimonov Signed-off-by: heber-urdaneta Signed-off-by: Danny Farah --- Makefile | 4 + kedro-datasets/RELEASE.md | 1 + .../kedro_datasets/snowflake/__init__.py | 8 + .../snowflake/snowpark_dataset.py | 232 ++++++++++++++++++ .../kedro_datasets/video/video_dataset.py | 1 - kedro-datasets/pyproject.toml | 2 +- kedro-datasets/setup.py | 4 + kedro-datasets/test_requirements.txt | 3 +- kedro-datasets/tests/snowflake/README.md | 34 +++ kedro-datasets/tests/snowflake/__init__.py | 0 kedro-datasets/tests/snowflake/conftest.py | 24 ++ .../tests/snowflake/test_snowpark_dataset.py | 166 +++++++++++++ 12 files changed, 476 insertions(+), 3 deletions(-) create mode 100644 kedro-datasets/kedro_datasets/snowflake/__init__.py create mode 100644 kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py create mode 100644 kedro-datasets/tests/snowflake/README.md create mode 100644 kedro-datasets/tests/snowflake/__init__.py create mode 100644 kedro-datasets/tests/snowflake/conftest.py create mode 100644 kedro-datasets/tests/snowflake/test_snowpark_dataset.py diff --git a/Makefile b/Makefile index 0b6bd723b..86daa6313 100644 --- a/Makefile +++ b/Makefile @@ -56,3 +56,7 @@ test-no-spark: test-no-spark-sequential: cd kedro-datasets && pytest tests --no-cov --ignore tests/spark + +# kedro-datasets/snowflake tests skipped from default scope +test-snowflake-only: + cd kedro-datasets && pytest tests --no-cov --numprocesses 1 --dist loadfile -m snowflake diff --git a/kedro-datasets/RELEASE.md 
b/kedro-datasets/RELEASE.md index 36b2d6c12..3e108e7f4 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -9,6 +9,7 @@ | Type | Description | Location | | ------------------------------------ | -------------------------------------------------------------------------- | ----------------------------- | | `polars.CSVDataSet` | A `CSVDataSet` backed by [polars](https://www.pola.rs/), a lighting fast dataframe package built entirely using Rust. | `kedro_datasets.polars` | +| `snowflake.SnowparkTableDataSet` | Work with [Snowpark](https://www.snowflake.com/en/data-cloud/snowpark/) DataFrames from tables in Snowflake. | `kedro_datasets.snowflake` | ## Bug fixes and other changes * Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. diff --git a/kedro-datasets/kedro_datasets/snowflake/__init__.py b/kedro-datasets/kedro_datasets/snowflake/__init__.py new file mode 100644 index 000000000..fdcd16af2 --- /dev/null +++ b/kedro-datasets/kedro_datasets/snowflake/__init__.py @@ -0,0 +1,8 @@ +"""Provides I/O modules for Snowflake.""" + +__all__ = ["SnowparkTableDataSet"] + +from contextlib import suppress + +with suppress(ImportError): + from .snowpark_dataset import SnowparkTableDataSet diff --git a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py new file mode 100644 index 000000000..e0ea1c1db --- /dev/null +++ b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py @@ -0,0 +1,232 @@ +"""``AbstractDataSet`` implementation to access Snowflake using Snowpark dataframes +""" +import logging +from copy import deepcopy +from typing import Any, Dict + +import snowflake.snowpark as sp +from kedro.io.core import AbstractDataSet, DataSetError + +logger = logging.getLogger(__name__) + + +class SnowparkTableDataSet(AbstractDataSet): + """``SnowparkTableDataSet`` loads and saves Snowpark dataframes. + + As of Mar-2023, the snowpark connector only works with Python 3.8. + + Example usage for the + `YAML API `_: + + .. code-block:: yaml + weather: + type: kedro_datasets.snowflake.SnowparkTableDataSet + table_name: "weather_data" + database: "meteorology" + schema: "observations" + credentials: db_credentials + save_args: + mode: overwrite + column_order: name + table_type: '' + + You can skip everything but "table_name" if the database and + schema are provided via credentials. That way catalog entries can be shorter + if, for example, all used Snowflake tables live in same database/schema. + Values in the dataset definition take priority over those defined in credentials. + + Example: + Credentials file provides all connection attributes, catalog entry + "weather" reuses credentials parameters, "polygons" catalog entry reuses + all credentials parameters except providing a different schema name. + Second example of credentials file uses ``externalbrowser`` authentication. + + catalog.yml + + .. code-block:: yaml + weather: + type: kedro_datasets.snowflake.SnowparkTableDataSet + table_name: "weather_data" + database: "meteorology" + schema: "observations" + credentials: snowflake_client + save_args: + mode: overwrite + column_order: name + table_type: '' + + polygons: + type: kedro_datasets.snowflake.SnowparkTableDataSet + table_name: "geopolygons" + credentials: snowflake_client + schema: "geodata" + + credentials.yml + + .. 
code-block:: yaml + snowflake_client: + account: 'ab12345.eu-central-1' + port: 443 + warehouse: "datascience_wh" + database: "detailed_data" + schema: "observations" + user: "service_account_abc" + password: "supersecret" + + credentials.yml (with externalbrowser authenticator) + + .. code-block:: yaml + snowflake_client: + account: 'ab12345.eu-central-1' + port: 443 + warehouse: "datascience_wh" + database: "detailed_data" + schema: "observations" + user: "john_doe@wdomain.com" + authenticator: "externalbrowser" + """ + + # this dataset cannot be used with ``ParallelRunner``, + # therefore it has the attribute ``_SINGLE_PROCESS = True`` + # for parallelism within a pipeline please consider + # ``ThreadRunner`` instead + _SINGLE_PROCESS = True + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + + def __init__( # pylint: disable=too-many-arguments + self, + table_name: str, + schema: str = None, + database: str = None, + load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, + credentials: Dict[str, Any] = None, + ) -> None: + """Creates a new instance of ``SnowparkTableDataSet``. + + Args: + table_name: The table name to load or save data to. + schema: Name of the schema where ``table_name`` is. + Optional as can be provided as part of ``credentials`` + dictionary. Argument value takes priority over one provided + in ``credentials`` if any. + database: Name of the database where ``schema`` is. + Optional as can be provided as part of ``credentials`` + dictionary. Argument value takes priority over one provided + in ``credentials`` if any. + load_args: Currently not used + save_args: Provided to underlying snowpark ``save_as_table`` + To find all supported arguments, see here: + https://docs.snowflake.com/en/developer-guide/snowpark/reference/python/api/snowflake.snowpark.DataFrameWriter.saveAsTable.html + credentials: A dictionary with a snowpark connection string. + To find all supported arguments, see here: + https://docs.snowflake.com/en/user-guide/python-connector-api.html#connect + """ + + if not table_name: + raise DataSetError("'table_name' argument cannot be empty.") + + if not credentials: + raise DataSetError("'credentials' argument cannot be empty.") + + if not database: + if not ("database" in credentials and credentials["database"]): + raise DataSetError( + "'database' must be provided by credentials or dataset." + ) + database = credentials["database"] + + if not schema: + if not ("schema" in credentials and credentials["schema"]): + raise DataSetError( + "'schema' must be provided by credentials or dataset." 
+ ) + schema = credentials["schema"] + # Handle default load and save arguments + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + self._table_name = table_name + self._database = database + self._schema = schema + + connection_parameters = credentials + connection_parameters.update( + {"database": self._database, "schema": self._schema} + ) + self._connection_parameters = connection_parameters + self._session = self._get_session(self._connection_parameters) + + def _describe(self) -> Dict[str, Any]: + return { + "table_name": self._table_name, + "database": self._database, + "schema": self._schema, + } + + @staticmethod + def _get_session(connection_parameters) -> sp.Session: + """Given a connection string, create singleton connection + to be used across all instances of `SnowparkTableDataSet` that + need to connect to the same source. + connection_parameters is a dictionary of any values + supported by snowflake python connector: + https://docs.snowflake.com/en/user-guide/python-connector-api.html#connect + example: + connection_parameters = { + "account": "", + "user": "", + "password": "", (optional) + "role": "", (optional) + "warehouse": "", (optional) + "database": "", (optional) + "schema": "", (optional) + "authenticator: "" (optional) + } + """ + try: + logger.debug("Trying to reuse active snowpark session...") + session = sp.context.get_active_session() + except sp.exceptions.SnowparkSessionException: + logger.debug("No active snowpark session found. Creating") + session = sp.Session.builder.configs(connection_parameters).create() + return session + + def _load(self) -> sp.DataFrame: + table_name = [ + self._database, + self._schema, + self._table_name, + ] + + sp_df = self._session.table(".".join(table_name)) + return sp_df + + def _save(self, data: sp.DataFrame) -> None: + table_name = [ + self._database, + self._schema, + self._table_name, + ] + + data.write.save_as_table(table_name, **self._save_args) + + def _exists(self) -> bool: + session = self._session + query = "SELECT COUNT(*) FROM {database}.INFORMATION_SCHEMA.TABLES \ + WHERE TABLE_SCHEMA = '{schema}' \ + AND TABLE_NAME = '{table_name}'" + rows = session.sql( + query.format( + database=self._database, + schema=self._schema, + table_name=self._table_name, + ) + ).collect() + return rows[0][0] == 1 diff --git a/kedro-datasets/kedro_datasets/video/video_dataset.py b/kedro-datasets/kedro_datasets/video/video_dataset.py index 07f0e1c8f..03311146d 100644 --- a/kedro-datasets/kedro_datasets/video/video_dataset.py +++ b/kedro-datasets/kedro_datasets/video/video_dataset.py @@ -258,7 +258,6 @@ class VideoDataSet(AbstractDataSet[AbstractVideo, AbstractVideo]): """ - # pylint: disable=too-many-arguments def __init__( self, filepath: str, diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index a32898cf6..6df7bd372 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -34,7 +34,7 @@ min-public-methods = 1 [tool.coverage.report] fail_under = 100 show_missing = true -omit = ["tests/*", "kedro_datasets/holoviews/*"] +omit = ["tests/*", "kedro_datasets/holoviews/*", "kedro_datasets/snowflake/*"] exclude_lines = ["pragma: no cover", "raise NotImplementedError"] [tool.pytest.ini_options] diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 44bb97185..26e583574 100644 --- 
a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -82,6 +82,9 @@ def _collect_requirements(requires): "spark.SparkJDBCDataSet": [SPARK, HDFS, S3FS], "spark.DeltaTableDataSet": [SPARK, HDFS, S3FS, "delta-spark~=1.0"], } +snowpark_require = { + "snowflake.SnowparkTableDataSet": ["snowflake-snowpark-python~=1.0.0", "pyarrow~=8.0"] +} svmlight_require = {"svmlight.SVMLightDataSet": ["scikit-learn~=1.0.2", "scipy~=1.7.3"]} tensorflow_required = { "tensorflow.TensorflowModelDataset": [ @@ -136,6 +139,7 @@ def _collect_requirements(requires): **video_require, **plotly_require, **spark_require, + **snowpark_require, **svmlight_require, **tensorflow_required, **yaml_require, diff --git a/kedro-datasets/test_requirements.txt b/kedro-datasets/test_requirements.txt index c6fc16a3e..48c3b511b 100644 --- a/kedro-datasets/test_requirements.txt +++ b/kedro-datasets/test_requirements.txt @@ -37,7 +37,7 @@ plotly>=4.8.0, <6.0 polars~=0.15.13 pre-commit>=2.9.2, <3.0 # The hook `mypy` requires pre-commit version 2.9.2. psutil==5.8.0 -pyarrow>=1.0, <7.0 +pyarrow~=8.0 pylint>=2.5.2, <3.0 pyodbc~=4.0.35 pyproj~=3.0 @@ -52,6 +52,7 @@ requests~=2.20 s3fs>=0.3.0, <0.5 # Needs to be at least 0.3.0 to make use of `cachable` attribute on S3FileSystem. scikit-learn~=1.0.2 scipy~=1.7.3 +snowflake-snowpark-python~=1.0.0; python_version == '3.8' SQLAlchemy~=1.2 tables~=3.6.0; platform_system == "Windows" and python_version<'3.9' tables~=3.6; platform_system != "Windows" diff --git a/kedro-datasets/tests/snowflake/README.md b/kedro-datasets/tests/snowflake/README.md new file mode 100644 index 000000000..69fde3fd9 --- /dev/null +++ b/kedro-datasets/tests/snowflake/README.md @@ -0,0 +1,34 @@ +# Snowpark connector testing + +Execution of automated tests for Snowpark connector requires real Snowflake instance access. Therefore tests located in this folder are **disabled** by default from pytest execution scope using [conftest.py](conftest.py). + +[Makefile](/Makefile) provides separate argument ``test-snowflake-only`` to run only tests related to Snowpark connector. To run tests one need to provide Snowflake connection parameters via environment variables: +* SNOWSQL_ACCOUNT - Snowflake account name with region. Ex `ab12345.eu-central-2` +* SNOWSQL_WAREHOUSE - Snowflake virtual warehouse to use +* SNOWSQL_DATABASE - Database to use +* SNOWSQL_SCHEMA - Schema to use when creating tables for tests +* SNOWSQL_ROLE - Role to use for connection +* SNOWSQL_USER - Username to use for connection +* SNOWSQL_PWD - Plain password to use for connection + +All environment variables need to be provided for tests to run. + +Here is example shell command to run snowpark tests via make utility: +```bash +SNOWSQL_ACCOUNT='ab12345.eu-central-2' SNOWSQL_WAREHOUSE='DEV_WH' SNOWSQL_DATABASE='DEV_DB' SNOWSQL_ROLE='DEV_ROLE' SNOWSQL_USER='DEV_USER' SNOWSQL_SCHEMA='DATA' SNOWSQL_PWD='supersecret' make test-snowflake-only +``` + +Currently running tests supports only simple username & password authentication and not SSO/MFA. + +As of Mar-2023, the snowpark connector only works with Python 3.8. 
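As a rough sketch of how these variables end up being consumed, condensed from the `get_connection` helper and dataset calls in the test module below (the table name is a placeholder, and the dataset opens a Snowflake session as soon as it is constructed):

```python
# Condensed from the test helpers that follow; the table name is a placeholder.
import os

from kedro_datasets.snowflake import SnowparkTableDataSet

credentials = {
    "account": os.environ["SNOWSQL_ACCOUNT"],
    "warehouse": os.environ["SNOWSQL_WAREHOUSE"],
    "database": os.environ["SNOWSQL_DATABASE"],
    "role": os.environ["SNOWSQL_ROLE"],
    "user": os.environ["SNOWSQL_USER"],
    "schema": os.environ["SNOWSQL_SCHEMA"],
    "password": os.environ["SNOWSQL_PWD"],
}

# Creating the dataset connects to Snowflake; load() returns a snowflake.snowpark.DataFrame.
weather = SnowparkTableDataSet(table_name="weather_data", credentials=credentials)
df = weather.load()
```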
+ +## Snowflake permissions required +Credentials provided via environment variables should have following permissions granted to run tests successfully: +* Create tables in a given schema +* Drop tables in a given schema +* Insert rows into tables in a given schema +* Query tables in a given schema +* Query `INFORMATION_SCHEMA.TABLES` of respective database + +## Extending tests +Contributors adding new tests should add `@pytest.mark.snowflake` decorator to each test. Exclusion of Snowpark-related pytests from overall execution scope in [conftest.py](conftest.py) works based on markers. diff --git a/kedro-datasets/tests/snowflake/__init__.py b/kedro-datasets/tests/snowflake/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/kedro-datasets/tests/snowflake/conftest.py b/kedro-datasets/tests/snowflake/conftest.py new file mode 100644 index 000000000..f6188da76 --- /dev/null +++ b/kedro-datasets/tests/snowflake/conftest.py @@ -0,0 +1,24 @@ +""" +We disable execution of tests that require real Snowflake instance +to run by default. Providing -m snowflake option explicitly to +pytest will make these and only these tests run +""" +import pytest + + +def pytest_collection_modifyitems(config, items): + markers_arg = config.getoption("-m") + + # Naive implementation to handle basic marker expressions + # Will not work if someone will (ever) run pytest with complex marker + # expressions like "-m spark and not (snowflake or pandas)" + if ( + "snowflake" in markers_arg.lower() + and "not snowflake" not in markers_arg.lower() + ): + return + + skip_snowflake = pytest.mark.skip(reason="need -m snowflake option to run") + for item in items: + if "snowflake" in item.keywords: + item.add_marker(skip_snowflake) diff --git a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py new file mode 100644 index 000000000..2133953b5 --- /dev/null +++ b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py @@ -0,0 +1,166 @@ +import datetime +import os + +import pytest +from kedro.io import DataSetError + +try: + import snowflake.snowpark as sp + + from kedro_datasets.snowflake import SnowparkTableDataSet as spds +except ImportError: + pass # this is only for test discovery to succeed on Python <> 3.8 + + +def get_connection(): + account = os.getenv("SNOWSQL_ACCOUNT") + warehouse = os.getenv("SNOWSQL_WAREHOUSE") + database = os.getenv("SNOWSQL_DATABASE") + role = os.getenv("SNOWSQL_ROLE") + user = os.getenv("SNOWSQL_USER") + schema = os.getenv("SNOWSQL_SCHEMA") + password = os.getenv("SNOWSQL_PWD") + + if not ( + account and warehouse and database and role and user and schema and password + ): + raise DataSetError( + "Snowflake connection environment variables provided not in full" + ) + + conn = { + "account": account, + "warehouse": warehouse, + "database": database, + "role": role, + "user": user, + "schema": schema, + "password": password, + } + return conn + + +def sf_setup_db(sf_session): + # For table exists test + run_query(sf_session, 'CREATE TABLE KEDRO_PYTEST_TESTEXISTS ("name" VARCHAR)') + + # For load test + query = 'CREATE TABLE KEDRO_PYTEST_TESTLOAD ("name" VARCHAR\ + , "age" INTEGER\ + , "bday" date\ + , "height" float\ + , "insert_dttm" timestamp)' + run_query(sf_session, query) + + query = "INSERT INTO KEDRO_PYTEST_TESTLOAD VALUES ('John'\ + , 23\ + , to_date('1999-12-02','YYYY-MM-DD')\ + , 6.5\ + , to_timestamp_ntz('2022-12-02 13:20:01',\ + 'YYYY-MM-DD hh24:mi:ss'))" + run_query(sf_session, query) + + query = 
"INSERT INTO KEDRO_PYTEST_TESTLOAD VALUES ('Jane'\ + , 41\ + , to_date('1981-01-03','YYYY-MM-DD')\ + , 5.7\ + , to_timestamp_ntz('2022-12-02 13:21:11',\ + 'YYYY-MM-DD hh24:mi:ss'))" + run_query(sf_session, query) + + +def sf_db_cleanup(sf_session): + run_query(sf_session, "DROP TABLE IF EXISTS KEDRO_PYTEST_TESTEXISTS") + run_query(sf_session, "DROP TABLE IF EXISTS KEDRO_PYTEST_TESTLOAD") + run_query(sf_session, "DROP TABLE IF EXISTS KEDRO_PYTEST_TESTSAVE") + + +def run_query(session, query): + df = session.sql(query) + df.collect() + return df + + +def df_equals_ignore_dtype(df1, df2): + # Pytest will show respective stdout only if test fails + # this will help to debug what was exactly not matching right away + + c1 = df1.to_pandas().values.tolist() + c2 = df2.to_pandas().values.tolist() + + print(c1) + print("--- comparing to ---") + print(c2) + + for i, row in enumerate(c1): + for j, column in enumerate(row): + if not column == c2[i][j]: + print(f"{column} not equal to {c2[i][j]}") + return False + return True + + +@pytest.fixture +def sample_sp_df(sf_session): + return sf_session.create_dataframe( + [ + [ + "John", + 23, + datetime.date(1999, 12, 2), + 6.5, + datetime.datetime(2022, 12, 2, 13, 20, 1), + ], + [ + "Jane", + 41, + datetime.date(1981, 1, 3), + 5.7, + datetime.datetime(2022, 12, 2, 13, 21, 11), + ], + ], + schema=["name", "age", "bday", "height", "insert_dttm"], + ) + + +@pytest.fixture +def sf_session(): + sf_session = sp.Session.builder.configs(get_connection()).create() + + # Running cleanup in case previous run was interrupted w/o proper cleanup + sf_db_cleanup(sf_session) + sf_setup_db(sf_session) + + yield sf_session + sf_db_cleanup(sf_session) + sf_session.close() + + +class TestSnowparkTableDataSet: + @pytest.mark.snowflake + def test_save(self, sample_sp_df, sf_session): + sp_df = spds(table_name="KEDRO_PYTEST_TESTSAVE", credentials=get_connection()) + sp_df._save(sample_sp_df) + sp_df_saved = sf_session.table("KEDRO_PYTEST_TESTSAVE") + assert sp_df_saved.count() == 2 + + @pytest.mark.snowflake + def test_load(self, sample_sp_df, sf_session): + print(sf_session) + sp_df = spds( + table_name="KEDRO_PYTEST_TESTLOAD", credentials=get_connection() + )._load() + + # Ignoring dtypes as ex. age can be int8 vs int64 and pandas.compare + # fails on that + assert df_equals_ignore_dtype(sample_sp_df, sp_df) is True + + @pytest.mark.snowflake + def test_exists(self, sf_session): + print(sf_session) + df_e = spds(table_name="KEDRO_PYTEST_TESTEXISTS", credentials=get_connection()) + df_ne = spds( + table_name="KEDRO_PYTEST_TESTNEXISTS", credentials=get_connection() + ) + assert df_e._exists() is True + assert df_ne._exists() is False From cb738044cd62fc770a437fc9dea6b828c80a66f8 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Mon, 13 Mar 2023 21:47:50 -0400 Subject: [PATCH 23/74] moved validation to dataclass Signed-off-by: Danny Farah --- .../databricks/managed_table_dataset.py | 168 +++++++++++++----- .../databricks/test_managed_table_dataset.py | 8 +- 2 files changed, 129 insertions(+), 47 deletions(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 1b9e0c737..aeef0d1af 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -1,8 +1,9 @@ """``ManagedTableDataSet`` implementation to access managed delta tables in Databricks. 
""" -import dataclasses import logging +import re +from dataclasses import dataclass from functools import partial from operator import attrgetter from typing import Any, Dict, List, Union @@ -21,21 +22,127 @@ from pyspark.sql.utils import AnalysisException, ParseException logger = logging.getLogger(__name__) +NAMING_REGEX = r"\b[0-9a-zA-Z_]{1,32}\b" +_VALID_WRITE_MODES = ["overwrite", "upsert", "append"] +_VALID_DATAFRAME_TYPES = ["spark", "pandas"] -@dataclasses.dataclass -class Table: # pylint: disable=R0902 +@dataclass(frozen=True) +class ManagedTable: # pylint: disable=R0902 """Stores the definition of a managed table""" database: str catalog: str table: str - full_table_location: str write_mode: str dataframe_type: str primary_key: str owner_group: str partition_columns: str | List[str] + json_schema: StructType + + def __post_init__(self): + """Run validation methods if declared. + The validation method can be a simple check + that raises ValueError or a transformation to + the field value. + The validation is performed by calling a function named: + `validate_(self, value) -> raises DataSetError` + """ + for name, _ in self.__dataclass_fields__.items(): # pylint: disable=E1101 + if method := getattr(self, f"validate_{name}", None): + method() + + def validate_table(self): + """validates table name + + Raises: + DataSetError: + """ + if not re.fullmatch(NAMING_REGEX, self.table): + raise DataSetError( + "table does not conform to naming and is a required field" + ) + + def validate_database(self): + """validates database name + + Raises: + DataSetError: + """ + if self.database: + if not re.fullmatch(NAMING_REGEX, self.database): + raise DataSetError("database does not conform to naming") + + def validate_catalog(self): + """validates catalog name + + Raises: + DataSetError: + """ + if self.catalog: + if not re.fullmatch(NAMING_REGEX, self.catalog): + raise DataSetError("catalog does not conform to naming") + + def validate_write_mode(self): + """validates the write mode + + Raises: + DataSetError: + """ + if self.write_mode not in _VALID_WRITE_MODES: + valid_modes = ", ".join(_VALID_WRITE_MODES) + raise DataSetError( + f"Invalid `write_mode` provided: {self.write_mode}. 
" + f"`write_mode` must be one of: {valid_modes}" + ) + + def validate_dataframe_type(self): + """validates the dataframe type + + Raises: + DataSetError: + """ + if self.dataframe_type not in _VALID_DATAFRAME_TYPES: + valid_types = ", ".join(_VALID_DATAFRAME_TYPES) + raise DataSetError(f"`dataframe_type` must be one of {valid_types}") + + def validate_primary_key(self): + """validates the primary key of the table + + Raises: + DataSetError: + """ + if self.primary_key is None or len(self.primary_key) == 0: + if self.write_mode == "upsert": + raise DataSetError( + f"`primary_key` must be provided for" + f"`write_mode` {self.write_mode}" + ) + + def full_table_location(self) -> str: + """Returns the full table location + + Returns: + str: table location in the format catalog.database.table + """ + full_table_location = None + if self.catalog and self.database and self.table: + full_table_location = f"{self.catalog}.{self.database}.{self.table}" + elif self.table: + full_table_location = f"{self.database}.{self.table}" + return full_table_location + + def schema(self) -> StructType: + """Returns the Spark schema of the table if it exists + + Returns: + StructType: + """ + schema = None + if self.json_schema is not None: + schema = StructType.fromJson(self.json_schema) + return schema class ManagedTableDataSet(AbstractVersionedDataSet): @@ -82,8 +189,6 @@ class ManagedTableDataSet(AbstractVersionedDataSet): # for parallelism within a Spark pipeline please consider # using ``ThreadRunner`` instead _SINGLE_PROCESS = True - _VALID_WRITE_MODES = ["overwrite", "upsert", "append"] - _VALID_DATAFRAME_TYPES = ["spark", "pandas"] def __init__( # pylint: disable=R0913 self, @@ -103,44 +208,21 @@ def __init__( # pylint: disable=R0913 ) -> None: """Creates a new instance of ``ManagedTableDataSet``.""" - full_table_location = None - if catalog and database and table: - full_table_location = f"{catalog}.{database}.{table}" - elif table: - full_table_location = f"{database}.{table}" - if write_mode not in self._VALID_WRITE_MODES: - valid_modes = ", ".join(self._VALID_WRITE_MODES) - raise DataSetError( - f"Invalid `write_mode` provided: {write_mode}. " - f"`write_mode` must be one of: {valid_modes}" - ) - if dataframe_type not in self._VALID_DATAFRAME_TYPES: - valid_types = ", ".join(self._VALID_DATAFRAME_TYPES) - raise DataSetError(f"`dataframe_type` must be one of {valid_types}") - if primary_key is None or len(primary_key) == 0: - if write_mode == "upsert": - raise DataSetError( - f"`primary_key` must be provided for" f"`write_mode` {write_mode}" - ) - self._table = Table( + self._table = ManagedTable( database=database, catalog=catalog, table=table, - full_table_location=full_table_location, write_mode=write_mode, dataframe_type=dataframe_type, primary_key=primary_key, owner_group=owner_group, partition_columns=partition_columns, + json_schema=schema, ) self._version_cache = Cache(maxsize=2) self._version = version - self._schema = None - if schema is not None: - self._schema = StructType.fromJson(schema) - super().__init__( filepath=None, version=version, @@ -153,12 +235,12 @@ def _fetch_latest_load_version(self) -> int: # version from the given path. 
latest_history = ( self._get_spark() - .sql(f"DESCRIBE HISTORY {self._table.full_table_location} LIMIT 1") + .sql(f"DESCRIBE HISTORY {self._table.full_table_location()} LIMIT 1") .collect() ) if len(latest_history) != 1: raise VersionNotFoundError( - f"Did not find any versions for {self._table.full_table_location}" + f"Did not find any versions for {self._table.full_table_location()}" ) return latest_history[0].version @@ -191,12 +273,12 @@ def _load(self) -> Union[DataFrame, pd.DataFrame]: self._get_spark() .read.format("delta") .option("versionAsOf", self._version.load) - .table(self._table.full_table_location) + .table(self._table.full_table_location()) ) except Exception as exc: raise VersionNotFoundError(self._version) from exc else: - data = self._get_spark().table(self._table.full_table_location) + data = self._get_spark().table(self._table.full_table_location()) if self._table.dataframe_type == "pandas": data = data.toPandas() return data @@ -209,7 +291,7 @@ def _save_append(self, data: DataFrame) -> None: data (DataFrame): the Spark dataframe to append to the table """ data.write.format("delta").mode("append").saveAsTable( - self._table.full_table_location + self._table.full_table_location() ) def _save_overwrite(self, data: DataFrame) -> None: @@ -224,7 +306,7 @@ def _save_overwrite(self, data: DataFrame) -> None: delta_table = delta_table.mode("overwrite").option( "overwriteSchema", "true" ) - delta_table.saveAsTable(self._table.full_table_location) + delta_table.saveAsTable(self._table.full_table_location()) def _save_upsert(self, update_data: DataFrame) -> None: """Upserts the data by joining on primary_key columns or column. @@ -234,14 +316,14 @@ def _save_upsert(self, update_data: DataFrame) -> None: update_data (DataFrame): the Spark dataframe to upsert """ if self._exists(): - base_data = self._get_spark().table(self._table.full_table_location) + base_data = self._get_spark().table(self._table.full_table_location()) base_columns = base_data.columns update_columns = update_data.columns if set(update_columns) != set(base_columns): raise DataSetError( f"Upsert requires tables to have identical columns. 
" - f"Delta table {self._table.full_table_location} " + f"Delta table {self._table.full_table_location()} " f"has columns: {base_columns}, whereas " f"dataframe has columns {update_columns}" ) @@ -258,7 +340,7 @@ def _save_upsert(self, update_data: DataFrame) -> None: update_data.createOrReplaceTempView("update") self._get_spark().conf.set( - "fullTableAddress", self._table.full_table_location + "fullTableAddress", self._table.full_table_location() ) self._get_spark().conf.set("whereExpr", where_expr) upsert_sql = """MERGE INTO ${fullTableAddress} base USING update ON ${whereExpr} @@ -277,11 +359,11 @@ def _save(self, data: Union[DataFrame, pd.DataFrame]) -> None: data (Any): Spark or pandas dataframe to save to the table location """ # filter columns specified in schema and match their ordering - if self._schema: - cols = self._schema.fieldNames() + if self._table.schema(): + cols = self._table.schema().fieldNames() if self._table.dataframe_type == "pandas": data = self._get_spark().createDataFrame( - data.loc[:, cols], schema=self._schema + data.loc[:, cols], schema=self._table.schema() ) else: data = data.select(*cols) diff --git a/kedro-datasets/tests/databricks/test_managed_table_dataset.py b/kedro-datasets/tests/databricks/test_managed_table_dataset.py index f5bc494a1..4520042ab 100644 --- a/kedro-datasets/tests/databricks/test_managed_table_dataset.py +++ b/kedro-datasets/tests/databricks/test_managed_table_dataset.py @@ -172,13 +172,13 @@ def expected_upsert_multiple_primary_spark_df(spark_session: SparkSession): class TestManagedTableDataSet: def test_full_table(self): unity_ds = ManagedTableDataSet(catalog="test", database="test", table="test") - assert unity_ds._table.full_table_location == "test.test.test" + assert unity_ds._table.full_table_location() == "test.test.test" unity_ds = ManagedTableDataSet(database="test", table="test") - assert unity_ds._table.full_table_location == "test.test" + assert unity_ds._table.full_table_location() == "test.test" unity_ds = ManagedTableDataSet(table="test") - assert unity_ds._table.full_table_location == "default.test" + assert unity_ds._table.full_table_location() == "default.test" with pytest.raises(TypeError): ManagedTableDataSet() # pylint: disable=no-value-for-parameter @@ -236,7 +236,7 @@ def test_schema(self): StructField("age", IntegerType(), True), ] ) - assert unity_ds._schema == expected_schema + assert unity_ds._table.schema() == expected_schema def test_catalog_exists(self): unity_ds = ManagedTableDataSet( From 19da1c06dd671e1a403fc2ae82dd20827206b7aa Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Thu, 2 Feb 2023 09:45:31 +0000 Subject: [PATCH 24/74] Release `kedro-datasets` `version 1.0.2` (#112) * bump version and update release notes * fix pylint errors Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 3e108e7f4..14a6d2e38 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -14,15 +14,10 @@ ## Bug fixes and other changes * Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. -## Bug fixes and other changes -* Added a warning when the user tries to use `SparkDataSet` on Databricks without specifying a file path with the `/dbfs/` prefix. 
- -# Release 1.0.2: - -## Bug fixes and other changes +# Upcoming Release: * Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. + * Relaxed PyArrow range in line with Pandas -* Fixed outdated links to the dill package documentation # Release 1.0.1: From cdb563f6999a738327db3b648f76f00fd0fec238 Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Mon, 20 Mar 2023 15:13:27 +0000 Subject: [PATCH 25/74] Fix bandit check by adding timeout to requests.post calls (#133) Signed-off-by: Merel Theisen Signed-off-by: Danny Farah --- tools/circleci/circleci_release.py | 2 +- tools/circleci/github_release.py | 2 +- tools/circleci/utils/check_no_version_pypi.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/circleci/circleci_release.py b/tools/circleci/circleci_release.py index 28b5b7bc5..88c4ed1d0 100755 --- a/tools/circleci/circleci_release.py +++ b/tools/circleci/circleci_release.py @@ -32,7 +32,7 @@ def circleci_release(project_slug, payload, circle_endpoint, circle_release_toke headers["Content-Type"] = "application/json" headers["Circle-Token"] = circle_release_token - resp = requests.post(circle_endpoint, headers=headers, json=payload) + resp = requests.post(circle_endpoint, headers=headers, json=payload, timeout=10) print(f"Status Code: {resp.status_code}") if resp.status_code == 201: print("Creating CircleCI Pipeline successfully") diff --git a/tools/circleci/github_release.py b/tools/circleci/github_release.py index 51abfe843..d5bc3115c 100755 --- a/tools/circleci/github_release.py +++ b/tools/circleci/github_release.py @@ -36,7 +36,7 @@ def github_release( headers = CaseInsensitiveDict() headers["Content-Type"] = "application/json" headers["Authorization"] = f"token {github_tagging_token}" - resp = requests.post(github_endpoint, headers=headers, json=payload) + resp = requests.post(github_endpoint, headers=headers, json=payload, timeout=10) if resp.status_code == 200: print("Create GitHub release successfully") print(resp.content) diff --git a/tools/circleci/utils/check_no_version_pypi.py b/tools/circleci/utils/check_no_version_pypi.py index 98b945f01..777f09c9a 100644 --- a/tools/circleci/utils/check_no_version_pypi.py +++ b/tools/circleci/utils/check_no_version_pypi.py @@ -3,7 +3,7 @@ def check_no_version_pypi(pypi_endpoint, package_name, package_version): print("Check if {package_name} {package_version} is on pypi") - response = requests.get(pypi_endpoint) + response = requests.get(pypi_endpoint, timeout=10) if response.status_code == 404: # Not exist on Pypi - do release print(f"Starting the release of {package_name} {package_version}") From a227c36e3654dc835746768141e42e5291265527 Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Mon, 20 Mar 2023 17:36:24 +0000 Subject: [PATCH 26/74] Bump version (#132) Signed-off-by: Merel Theisen Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 6 +++++- kedro-datasets/kedro_datasets/__init__.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 14a6d2e38..8ac93608d 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,6 +1,10 @@ +# Upcoming Release: + +## Major features and improvements: -# Upcoming Release 1.1.0: +## Bug fixes and other changes +# Release 1.1.0: ## Major features and improvements: diff --git a/kedro-datasets/kedro_datasets/__init__.py 
b/kedro-datasets/kedro_datasets/__init__.py index d8bcc2d13..f2729f0ce 100644 --- a/kedro-datasets/kedro_datasets/__init__.py +++ b/kedro-datasets/kedro_datasets/__init__.py @@ -1,3 +1,3 @@ """``kedro_datasets`` is where you can find all of Kedro's data connectors.""" -__version__ = "1.0.2" +__version__ = "1.1.0" From 6d419567d5875ac50bb2195fec62c1323d5dc08f Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Tue, 21 Mar 2023 12:58:31 -0400 Subject: [PATCH 27/74] bug fix in type of partition column and cleanup Signed-off-by: Danny Farah --- .../databricks/managed_table_dataset.py | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index aeef0d1af..ee82253e3 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -22,15 +22,16 @@ from pyspark.sql.utils import AnalysisException, ParseException logger = logging.getLogger(__name__) -NAMING_REGEX = r"\b[0-9a-zA-Z_]{1,32}\b" -_VALID_WRITE_MODES = ["overwrite", "upsert", "append"] -_VALID_DATAFRAME_TYPES = ["spark", "pandas"] @dataclass(frozen=True) class ManagedTable: # pylint: disable=R0902 """Stores the definition of a managed table""" + # regex for tables, catalogs and schemas + _NAMING_REGEX = r"\b[0-9a-zA-Z_]{1,32}\b" + _VALID_WRITE_MODES = ["overwrite", "upsert", "append"] + _VALID_DATAFRAME_TYPES = ["spark", "pandas"] database: str catalog: str table: str @@ -38,14 +39,13 @@ class ManagedTable: # pylint: disable=R0902 dataframe_type: str primary_key: str owner_group: str - partition_columns: str | List[str] + partition_columns: Union[str, List[str]] json_schema: StructType def __post_init__(self): """Run validation methods if declared. The validation method can be a simple check - that raises ValueError or a transformation to - the field value. + that raises DataSetError. The validation is performed by calling a function named: `validate_(self, value) -> raises DataSetError` """ @@ -59,10 +59,8 @@ def validate_table(self): Raises: DataSetError: """ - if not re.fullmatch(NAMING_REGEX, self.table): - raise DataSetError( - "table does not conform to naming and is a required field" - ) + if not re.fullmatch(self._NAMING_REGEX, self.table): + raise DataSetError("table does not conform to naming") def validate_database(self): """validates database name @@ -71,7 +69,7 @@ def validate_database(self): DataSetError: """ if self.database: - if not re.fullmatch(NAMING_REGEX, self.database): + if not re.fullmatch(self._NAMING_REGEX, self.database): raise DataSetError("database does not conform to naming") def validate_catalog(self): @@ -81,7 +79,7 @@ def validate_catalog(self): DataSetError: """ if self.catalog: - if not re.fullmatch(NAMING_REGEX, self.catalog): + if not re.fullmatch(self._NAMING_REGEX, self.catalog): raise DataSetError("catalog does not conform to naming") def validate_write_mode(self): @@ -90,8 +88,8 @@ def validate_write_mode(self): Raises: DataSetError: """ - if self.write_mode not in _VALID_WRITE_MODES: - valid_modes = ", ".join(_VALID_WRITE_MODES) + if self.write_mode not in self._VALID_WRITE_MODES: + valid_modes = ", ".join(self._VALID_WRITE_MODES) raise DataSetError( f"Invalid `write_mode` provided: {self.write_mode}. 
" f"`write_mode` must be one of: {valid_modes}" @@ -103,8 +101,8 @@ def validate_dataframe_type(self): Raises: DataSetError: """ - if self.dataframe_type not in _VALID_DATAFRAME_TYPES: - valid_types = ", ".join(_VALID_DATAFRAME_TYPES) + if self.dataframe_type not in self._VALID_DATAFRAME_TYPES: + valid_types = ", ".join(self._VALID_DATAFRAME_TYPES) raise DataSetError(f"`dataframe_type` must be one of {valid_types}") def validate_primary_key(self): @@ -140,8 +138,11 @@ def schema(self) -> StructType: StructType: """ schema = None - if self.json_schema is not None: - schema = StructType.fromJson(self.json_schema) + try: + if self.json_schema is not None: + schema = StructType.fromJson(self.json_schema) + except ParseException as exc: + raise DataSetError(exc) from exc return schema From ee5f4244abe57328d49eddc75c008ccc780b77b1 Mon Sep 17 00:00:00 2001 From: Jannic <37243923+jmholzer@users.noreply.github.com> Date: Tue, 21 Mar 2023 18:19:09 +0000 Subject: [PATCH 28/74] Fix malformed doc strings causing RTD builds to fail on Kedro (#136) * Fix missing blank line before yaml code block declaration Signed-off-by: Jannic Holzer * Fix bullet list Signed-off-by: Jannic Holzer * Add full stop to first bullet line Signed-off-by: Jannic Holzer * Make bullet list in line Signed-off-by: Jannic Holzer * Fix line too long Signed-off-by: Jannic Holzer * Add release note Signed-off-by: Jannic Holzer * Fix release note Signed-off-by: Jannic Holzer --------- Signed-off-by: Jannic Holzer Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 2 ++ kedro-datasets/kedro_datasets/json/json_dataset.py | 1 + kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py | 7 +++---- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 8ac93608d..39915c362 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -4,6 +4,8 @@ ## Bug fixes and other changes +* Fixed problematic docstrings causing RTD builds on Kedro to fail. + # Release 1.1.0: ## Major features and improvements: diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index ad86c9a17..a39e7aff5 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -23,6 +23,7 @@ class JSONDataSet(AbstractVersionedDataSet[Any, Any]): Example usage for the `YAML API `_: + .. code-block:: yaml cars: diff --git a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py index 08b0666ea..a676dd784 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py @@ -20,11 +20,10 @@ class SparkHiveDataSet(AbstractDataSet[DataFrame, DataFrame]): This DataSet has some key assumptions: - Schemas do not change during the pipeline run (defined PKs must be present for the - duration of the pipeline) + duration of the pipeline). - Tables are not being externally modified during upserts. The upsert method is NOT ATOMIC - - to external changes to the target table while executing. - Upsert methodology works by leveraging Spark DataFrame execution plan checkpointing. + to external changes to the target table while executing. Upsert methodology works by + leveraging Spark DataFrame execution plan checkpointing. 
Example usage for the `YAML API Date: Tue, 21 Mar 2023 14:53:51 -0400 Subject: [PATCH 29/74] updated docstring for ManagedTableDataSet Signed-off-by: Danny Farah --- .../databricks/managed_table_dataset.py | 74 ++++++++++++++----- 1 file changed, 56 insertions(+), 18 deletions(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index ee82253e3..fd5cd5e03 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -148,10 +148,18 @@ def schema(self) -> StructType: class ManagedTableDataSet(AbstractVersionedDataSet): """``ManagedTableDataSet`` loads and saves data into managed delta tables on Databricks. + Load and save can be in Spark or Pandas dataframes, specified in dataframe_type. + When saving data, you can specify one of three modes: overwtire(default), append, + or upsert. Upsert requires you to specify the primary_column parameter which + will be used as part of the join condition. This dataset works best with + the databricks kedro starter. That starter comes with hooks that allow this + dataset to function properly. Follow the instructions in that starter to + setup your project for this dataset. Example usage for the `YAML API `_: + .. code-block:: yaml names_and_ages@spark: @@ -167,23 +175,24 @@ class ManagedTableDataSet(AbstractVersionedDataSet): `Python API `_: :: - Launch a pyspark session with the following configs: - % pyspark --packages io.delta:delta-core_2.12:1.2.1 - --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" - --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" - - >>> from pyspark.sql import SparkSession - >>> from pyspark.sql.types import (StructField, StringType, - IntegerType, StructType) - >>> from kedro_datasets.databricks import ManagedTableDataSet - >>> schema = StructType([StructField("name", StringType(), True), - StructField("age", IntegerType(), True)]) - >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] - >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) - >>> data_set = ManagedTableDataSet(table="names_and_ages") - >>> data_set.save(spark_df) - >>> reloaded = data_set.load() - >>> reloaded.take(4)""" + + % pyspark --packages io.delta:delta-core_2.12:1.2.1 + --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" + --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" + + >>> from pyspark.sql import SparkSession + >>> from pyspark.sql.types import (StructField, StringType, + IntegerType, StructType) + >>> from kedro_datasets.databricks import ManagedTableDataSet + >>> schema = StructType([StructField("name", StringType(), True), + StructField("age", IntegerType(), True)]) + >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) + >>> data_set = ManagedTableDataSet(table="names_and_ages") + >>> data_set.save(spark_df) + >>> reloaded = data_set.load() + >>> reloaded.take(4) + """ # this dataset cannot be used with ``ParallelRunner``, # therefore it has the attribute ``_SINGLE_PROCESS = True`` @@ -207,7 +216,36 @@ def __init__( # pylint: disable=R0913 partition_columns: List[str] = None, owner_group: str = None, ) -> None: - """Creates a new instance of ``ManagedTableDataSet``.""" + """Creates a new instance of 
``ManagedTableDataSet`` + + Args: + table (str): the name of the table + catalog (str, optional): the name of the catalog in Unity. + Defaults to None. + database (str, optional): the name of the database + (also referred to as schema). Defaults to "default". + write_mode (str, optional): the mode to write the data into the table. + Options are:["overwrite", "append", "upsert"]. + "upsert" mode requires primary_key field to be populated. + Defaults to "overwrite". + dataframe_type (str, optional): "pandas" or "spark" dataframe. + Defaults to "spark". + primary_key (Union[str, List[str]], optional): the primary key of the table. + Can be in the form of a list. Defaults to None. + version (Version, optional): kedro.io.core.Version instance to load the data. + Defaults to None. + schema (Dict[str, Any], optional): the schema of the table in JSON form. + Dataframes will be truncated to match the schema if provided. + Used by the hooks to create the table if the schema is provided + Defaults to None. + partition_columns (List[str], optional): the columns to use for partitioning the table. + Used by the hooks. Defaults to None. + owner_group (str, optional): if table access control is enabled in your workspace, + specifying owner_group will transfer ownership of the table and database to + this owner. All databases should have the same owner_group. Defaults to None. + Raises: + DataSetError: Invalid configuration supplied (through ManagedTable validation) + """ self._table = ManagedTable( database=database, From 13193366417b4248e4921f9473749fbba99954a1 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Wed, 1 Feb 2023 11:59:07 -0500 Subject: [PATCH 30/74] Fix docs formatting and phrasing for some datasets (#107) * Fix docs formatting and phrasing for some datasets Signed-off-by: Deepyaman Datta * Manually fix files not resolved with patch command Signed-off-by: Deepyaman Datta * Apply fix from #98 Signed-off-by: Deepyaman Datta --------- Signed-off-by: Deepyaman Datta Signed-off-by: Danny Farah --- kedro-datasets/kedro_datasets/json/json_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index a39e7aff5..ad86c9a17 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -23,7 +23,6 @@ class JSONDataSet(AbstractVersionedDataSet[Any, Any]): Example usage for the `YAML API `_: - .. code-block:: yaml cars: From b80525fc3098f05c2b4ab35a605c19fe82c34921 Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Thu, 2 Feb 2023 09:45:31 +0000 Subject: [PATCH 31/74] Release `kedro-datasets` `version 1.0.2` (#112) * bump version and update release notes * fix pylint errors Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 39915c362..6416889ce 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -21,9 +21,13 @@ * Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. # Upcoming Release: -* Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. +# Release 1.0.2: + +## Bug fixes and other changes +* Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. 
* Relaxed PyArrow range in line with Pandas +* Fixed outdated links to the dill package documentation # Release 1.0.1: From f516cc22f18c89b77aa98fdad4a3cf736bb98e0e Mon Sep 17 00:00:00 2001 From: Walber Moreira <58264877+wmoreiraa@users.noreply.github.com> Date: Thu, 9 Feb 2023 10:50:38 -0300 Subject: [PATCH 32/74] [kedro-datasets ] Add `Polars.CSVDataSet` (#95) Signed-off-by: wmoreiraa Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 6416889ce..b55ae1257 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -20,7 +20,19 @@ ## Bug fixes and other changes * Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. -# Upcoming Release: +# Upcoming Release 1.1.0: + + +## Major features and improvements: + +* Added the following new datasets: + +| Type | Description | Location | +| ------------------------------------ | -------------------------------------------------------------------------- | ----------------------------- | +| `polars.CSVDataSet` | A `CSVDataSet` backed by [polars](https://www.pola.rs/), a lighting fast dataframe package built entirely using Rust. | `kedro_datasets.polars` | + +## Bug fixes and other changes + # Release 1.0.2: From 51487722ccf6364149ceae04069870ba6ef77025 Mon Sep 17 00:00:00 2001 From: Yassine Alouini Date: Mon, 27 Feb 2023 10:42:00 +0100 Subject: [PATCH 33/74] Make the SQLQueryDataSet compatible with mssql. (#101) * [kedro-docker] Layers size optimization (#92) * [kedro-docker] Layers size optimization Signed-off-by: Mariusz Strzelecki * Adjust test requirements Signed-off-by: Mariusz Strzelecki * Skip coverage check on tests dir (some do not execute on Windows) Signed-off-by: Mariusz Strzelecki * Update .coveragerc with the setup Signed-off-by: Mariusz Strzelecki * Fix bandit so it does not scan kedro-datasets Signed-off-by: Mariusz Strzelecki * Fixed existence test Signed-off-by: Mariusz Strzelecki * Check why dir is not created Signed-off-by: Mariusz Strzelecki * Kedro starters are fixed now Signed-off-by: Mariusz Strzelecki * Increased no-output-timeout for long spark image build Signed-off-by: Mariusz Strzelecki * Spark image optimized Signed-off-by: Mariusz Strzelecki * Linting Signed-off-by: Mariusz Strzelecki * Switch to slim image always Signed-off-by: Mariusz Strzelecki * Trigger build Signed-off-by: Mariusz Strzelecki * Use textwrap.dedent for nicer indentation Signed-off-by: Mariusz Strzelecki * Revert "Use textwrap.dedent for nicer indentation" This reverts commit 3a1e3f855a29c6a1b118db3e844e5f9b67ade363. Signed-off-by: Mariusz Strzelecki * Revert "Revert "Use textwrap.dedent for nicer indentation"" This reverts commit d322d353b25d414cdfdef8ee12185e5a1d9baa2c. Signed-off-by: Mariusz Strzelecki * Make tests read more lines (to skip all deprecation warnings) Signed-off-by: Mariusz Strzelecki Signed-off-by: Mariusz Strzelecki Signed-off-by: Mariusz Strzelecki Signed-off-by: Yassine Alouini * Release Kedro-Docker 0.3.1 (#94) * Add release notes for kedro-docker 0.3.1 Signed-off-by: Jannic Holzer * Update version in kedro_docker module Signed-off-by: Jannic Holzer Signed-off-by: Jannic Holzer Signed-off-by: Yassine Alouini * Bump version and update release notes (#96) Signed-off-by: Merel Theisen Signed-off-by: Yassine Alouini * Make the SQLQueryDataSet compatible with mssql. Signed-off-by: Yassine Alouini * Add one test + update RELEASE.md. 
Signed-off-by: Yassine Alouini * Add missing pyodbc for tests. Signed-off-by: Yassine Alouini * Mock connection as well. Signed-off-by: Yassine Alouini * Add more dates parsing for mssql backend (thanks to fgaudindelrieu@idmog.com) Signed-off-by: Yassine Alouini * Fix an error in docstring of MetricsDataSet (#98) Signed-off-by: Yassine Alouini * Bump relax pyarrow version to work the same way as Pandas (#100) * Bump relax pyarrow version to work the same way as Pandas We only use PyArrow for `pandas.ParquetDataSet` as such I suggest we keep our versions pinned to the same range as [Pandas does](https://github.com/pandas-dev/pandas/blob/96fc51f5ec678394373e2c779ccff37ddb966e75/pyproject.toml#L100) for the same reason. As such I suggest we remove the upper bound as we have users requesting later versions in [support channels](https://kedro-org.slack.com/archives/C03RKP2LW64/p1674040509133529) * Updated release notes Signed-off-by: Yassine Alouini * Add missing type in catalog example. Signed-off-by: Yassine Alouini * Add one more unit tests for adapt_mssql. Signed-off-by: Yassine Alouini * [FIX] Add missing mocker from date test. Signed-off-by: Yassine Alouini * [TEST] Add a wrong input test. Signed-off-by: Yassine Alouini * Add pyodbc dependency. Signed-off-by: Yassine Alouini * [FIX] Remove dict() in tests. Signed-off-by: Yassine Alouini * Change check to check on plugin name (#103) Signed-off-by: Merel Theisen Signed-off-by: Yassine Alouini * Set coverage in pyproject.toml (#105) Signed-off-by: Merel Theisen Signed-off-by: Yassine Alouini * Move coverage settings to pyproject.toml (#106) Signed-off-by: Merel Theisen Signed-off-by: Yassine Alouini * Replace kedro.pipeline with modular_pipeline.pipeline factory (#99) * Add non-spark related test changes Replace kedro.pipeline.Pipeline with kedro.pipeline.modular_pipeline.pipeline factory. This is for symmetry with changes made to the main kedro library. Signed-off-by: Adam Farley Signed-off-by: Yassine Alouini * Fix outdated links in Kedro Datasets (#111) * fix links * fix dill links Signed-off-by: Yassine Alouini * Fix docs formatting and phrasing for some datasets (#107) * Fix docs formatting and phrasing for some datasets Signed-off-by: Deepyaman Datta * Manually fix files not resolved with patch command Signed-off-by: Deepyaman Datta * Apply fix from #98 Signed-off-by: Deepyaman Datta --------- Signed-off-by: Deepyaman Datta Signed-off-by: Yassine Alouini * Release `kedro-datasets` `version 1.0.2` (#112) * bump version and update release notes * fix pylint errors Signed-off-by: Yassine Alouini * Bump pytest to 7.2 (#113) Signed-off-by: Merel Theisen Signed-off-by: Yassine Alouini * Prefix Docker plugin name with "Kedro-" in usage message (#57) * Prefix Docker plugin name with "Kedro-" in usage message Signed-off-by: Deepyaman Datta Signed-off-by: Yassine Alouini * Keep Kedro-Docker plugin docstring from appearing in `kedro -h` (#56) * Keep Kedro-Docker plugin docstring from appearing in `kedro -h` Signed-off-by: Deepyaman Datta Signed-off-by: Yassine Alouini * [kedro-datasets ] Add `Polars.CSVDataSet` (#95) Signed-off-by: wmoreiraa Signed-off-by: Yassine Alouini * Remove deprecated `test_requires` from `setup.py` in Kedro-Docker (#54) Signed-off-by: Deepyaman Datta Signed-off-by: Yassine Alouini * [FIX] Fix ds to data_set. 
Signed-off-by: Yassine Alouini --------- Signed-off-by: Mariusz Strzelecki Signed-off-by: Mariusz Strzelecki Signed-off-by: Yassine Alouini Signed-off-by: Jannic Holzer Signed-off-by: Merel Theisen Signed-off-by: Deepyaman Datta Co-authored-by: Mariusz Strzelecki Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Co-authored-by: OKA Naoya Co-authored-by: Joel <35801847+datajoely@users.noreply.github.com> Co-authored-by: adamfrly <45516720+adamfrly@users.noreply.github.com> Co-authored-by: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Co-authored-by: Deepyaman Datta Co-authored-by: Walber Moreira <58264877+wmoreiraa@users.noreply.github.com> Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index b55ae1257..31876c484 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -32,7 +32,7 @@ | `polars.CSVDataSet` | A `CSVDataSet` backed by [polars](https://www.pola.rs/), a lighting fast dataframe package built entirely using Rust. | `kedro_datasets.polars` | ## Bug fixes and other changes - +* Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. # Release 1.0.2: From 0f6da60cde5574f2b29b10baeb8ef661978b3895 Mon Sep 17 00:00:00 2001 From: Jannic <37243923+jmholzer@users.noreply.github.com> Date: Mon, 6 Mar 2023 12:58:09 +0000 Subject: [PATCH 34/74] Add warning when `SparkDataSet` is used on Databricks without a valid file path (#114) * Add databricks deployment check and automatic DBFS path addition Signed-off-by: Jannic Holzer * Add newline at end of file Signed-off-by: Jannic Holzer * Remove spurious 'not' Signed-off-by: Jannic Holzer * Move dbfs utility functions from SparkDataSet Signed-off-by: Jannic Holzer * Add edge case logic to _build_dbfs_path Signed-off-by: Jannic Holzer * Add test for dbfs path construction Signed-off-by: Jannic Holzer * Linting Signed-off-by: Jannic Holzer * Remove spurious print statement :) Signed-off-by: Jannic Holzer * Add pylint disable too-many-public-methods Signed-off-by: Jannic Holzer * Move tests into single method to appease linter Signed-off-by: Jannic Holzer * Modify prefix check to /dbfs/ Signed-off-by: Jannic Holzer * Modify prefix check to /dbfs/ Signed-off-by: Jannic Holzer * Make warning message clearer Signed-off-by: Jannic Holzer * Add release note Signed-off-by: Jannic Holzer * Fix linting Signed-off-by: Jannic Holzer * Update warning message Signed-off-by: Jannic Holzer * Modify log warning level to error Signed-off-by: Jannic Holzer * Modify message back to warning, refer to undefined behaviour Signed-off-by: Jannic Holzer * Modify required prefix to /dbfs/ Signed-off-by: Jannic Holzer * Modify doc string Signed-off-by: Jannic Holzer * Modify warning message Signed-off-by: Jannic Holzer * Split tests and add filepath to warning Signed-off-by: Jannic Holzer * Modify f string in logging call Signed-off-by: Jannic Holzer * Fix tests Signed-off-by: Jannic Holzer * Lint Signed-off-by: Jannic Holzer --------- Signed-off-by: Jannic Holzer Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 31876c484..0278babbb 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -34,6 +34,9 @@ ## Bug fixes and other changes * Add `mssql` backend to the 
`SQLQueryDataSet` DataSet using `pyodbc` library. +## Bug fixes and other changes +* Added a warning when the user tries to use `SparkDataSet` on Databricks without specifying a file path with the `/dbfs/` prefix. + # Release 1.0.2: ## Bug fixes and other changes From 4c07c9bab7c6155fbf5ad0d3031b6a4c1e775cd4 Mon Sep 17 00:00:00 2001 From: Vladimir Filimonov <69304033+Vladimir-Filimonov@users.noreply.github.com> Date: Thu, 9 Mar 2023 10:25:40 +0100 Subject: [PATCH 35/74] Snowpark (Snowflake) dataset for kedro (#104) * Add Snowpark datasets Signed-off-by: Vladimir Filimonov Signed-off-by: heber-urdaneta Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 0278babbb..74526a91e 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -30,6 +30,7 @@ | Type | Description | Location | | ------------------------------------ | -------------------------------------------------------------------------- | ----------------------------- | | `polars.CSVDataSet` | A `CSVDataSet` backed by [polars](https://www.pola.rs/), a lighting fast dataframe package built entirely using Rust. | `kedro_datasets.polars` | +| `snowflake.SnowparkTableDataSet` | Work with [Snowpark](https://www.snowflake.com/en/data-cloud/snowpark/) DataFrames from tables in Snowflake. | `kedro_datasets.snowflake` | ## Bug fixes and other changes * Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. From 83e8388b9579b6a5b3246678b9a2df5fba3eea59 Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Mon, 20 Mar 2023 17:36:24 +0000 Subject: [PATCH 36/74] Bump version (#132) Signed-off-by: Merel Theisen Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 74526a91e..285918d5a 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -4,8 +4,6 @@ ## Bug fixes and other changes -* Fixed problematic docstrings causing RTD builds on Kedro to fail. - # Release 1.1.0: ## Major features and improvements: @@ -19,23 +17,6 @@ ## Bug fixes and other changes * Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. - -# Upcoming Release 1.1.0: - - -## Major features and improvements: - -* Added the following new datasets: - -| Type | Description | Location | -| ------------------------------------ | -------------------------------------------------------------------------- | ----------------------------- | -| `polars.CSVDataSet` | A `CSVDataSet` backed by [polars](https://www.pola.rs/), a lighting fast dataframe package built entirely using Rust. | `kedro_datasets.polars` | -| `snowflake.SnowparkTableDataSet` | Work with [Snowpark](https://www.snowflake.com/en/data-cloud/snowpark/) DataFrames from tables in Snowflake. | `kedro_datasets.snowflake` | - -## Bug fixes and other changes -* Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library. - -## Bug fixes and other changes * Added a warning when the user tries to use `SparkDataSet` on Databricks without specifying a file path with the `/dbfs/` prefix. 
# Release 1.0.2: From 4f7bac107f0d6fbd9b0f6bf2fdbd082bde7bfca2 Mon Sep 17 00:00:00 2001 From: Jannic <37243923+jmholzer@users.noreply.github.com> Date: Tue, 21 Mar 2023 18:19:09 +0000 Subject: [PATCH 37/74] Fix malformed doc strings causing RTD builds to fail on Kedro (#136) * Fix missing blank line before yaml code block declaration Signed-off-by: Jannic Holzer * Fix bullet list Signed-off-by: Jannic Holzer * Add full stop to first bullet line Signed-off-by: Jannic Holzer * Make bullet list in line Signed-off-by: Jannic Holzer * Fix line too long Signed-off-by: Jannic Holzer * Add release note Signed-off-by: Jannic Holzer * Fix release note Signed-off-by: Jannic Holzer --------- Signed-off-by: Jannic Holzer Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 2 ++ kedro-datasets/kedro_datasets/json/json_dataset.py | 1 + 2 files changed, 3 insertions(+) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 285918d5a..8d501d1fc 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -4,6 +4,8 @@ ## Bug fixes and other changes +* Fixed problematic docstrings causing RTD builds on Kedro to fail. + # Release 1.1.0: ## Major features and improvements: diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index ad86c9a17..a39e7aff5 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -23,6 +23,7 @@ class JSONDataSet(AbstractVersionedDataSet[Any, Any]): Example usage for the `YAML API `_: + .. code-block:: yaml cars: From a6454a8b9fbaa178314c2859686793dc20275856 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 5 Apr 2023 14:56:25 -0400 Subject: [PATCH 38/74] added backticks to catalog Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/managed_table_dataset.py | 4 ++-- .../tests/databricks/test_managed_table_dataset.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index fd5cd5e03..015daff93 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -126,9 +126,9 @@ def full_table_location(self) -> str: """ full_table_location = None if self.catalog and self.database and self.table: - full_table_location = f"{self.catalog}.{self.database}.{self.table}" + full_table_location = f"`{self.catalog}`.`{self.database}`.`{self.table}`" elif self.table: - full_table_location = f"{self.database}.{self.table}" + full_table_location = f"`{self.database}`.`{self.table}`" return full_table_location def schema(self) -> StructType: diff --git a/kedro-datasets/tests/databricks/test_managed_table_dataset.py b/kedro-datasets/tests/databricks/test_managed_table_dataset.py index 4520042ab..000aa8d6e 100644 --- a/kedro-datasets/tests/databricks/test_managed_table_dataset.py +++ b/kedro-datasets/tests/databricks/test_managed_table_dataset.py @@ -172,13 +172,13 @@ def expected_upsert_multiple_primary_spark_df(spark_session: SparkSession): class TestManagedTableDataSet: def test_full_table(self): unity_ds = ManagedTableDataSet(catalog="test", database="test", table="test") - assert unity_ds._table.full_table_location() == "test.test.test" + assert unity_ds._table.full_table_location() == "`test`.`test`.`test`" unity_ds = ManagedTableDataSet(database="test", table="test") - assert 
unity_ds._table.full_table_location() == "test.test" + assert unity_ds._table.full_table_location() == "`test`.`test`" unity_ds = ManagedTableDataSet(table="test") - assert unity_ds._table.full_table_location() == "default.test" + assert unity_ds._table.full_table_location() == "`default`.`test`" with pytest.raises(TypeError): ManagedTableDataSet() # pylint: disable=no-value-for-parameter From 9873d18dba223d8d08921a1f16ca48d8677387f4 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Tue, 11 Apr 2023 09:30:47 -0400 Subject: [PATCH 39/74] fixing regex to allow hyphens Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/managed_table_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 015daff93..41fd7d2e5 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -29,7 +29,7 @@ class ManagedTable: # pylint: disable=R0902 """Stores the definition of a managed table""" # regex for tables, catalogs and schemas - _NAMING_REGEX = r"\b[0-9a-zA-Z_]{1,32}\b" + _NAMING_REGEX = r"\b[0-9a-zA-Z_-]{1,32}\b" _VALID_WRITE_MODES = ["overwrite", "upsert", "append"] _VALID_DATAFRAME_TYPES = ["spark", "pandas"] database: str From f0c9e2e6086e7a4db2fda8413294846658038ab3 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 3 May 2023 10:59:33 -0700 Subject: [PATCH 40/74] Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> --- .../kedro_datasets/databricks/managed_table_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 41fd7d2e5..9fc2f3f30 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -66,7 +66,7 @@ def validate_database(self): """validates database name Raises: - DataSetError: + DataSetError: If the table name does not conform to naming constraints. """ if self.database: if not re.fullmatch(self._NAMING_REGEX, self.database): From a8bd47d8cff8c61afc7c98f644bf3d00c2e4b711 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 3 May 2023 10:59:42 -0700 Subject: [PATCH 41/74] Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/managed_table_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 9fc2f3f30..6e895ba51 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -86,7 +86,7 @@ def validate_write_mode(self): """validates the write mode Raises: - DataSetError: + DataSetError: If an invalid `write_mode` is passed. 
""" if self.write_mode not in self._VALID_WRITE_MODES: valid_modes = ", ".join(self._VALID_WRITE_MODES) From e9adb763681d4be52e2fa49daeb04dc7db72661e Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 3 May 2023 10:59:56 -0700 Subject: [PATCH 42/74] Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/managed_table_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 6e895ba51..25fe150ba 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -109,7 +109,7 @@ def validate_primary_key(self): """validates the primary key of the table Raises: - DataSetError: + DataSetError: If no `primary_key` is specified. """ if self.primary_key is None or len(self.primary_key) == 0: if self.write_mode == "upsert": From 3f85f731ed78ff07086a90f2fa26fd33be86f831 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 3 May 2023 11:01:50 -0700 Subject: [PATCH 43/74] Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/managed_table_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 25fe150ba..5e8214a8f 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -76,7 +76,7 @@ def validate_catalog(self): """validates catalog name Raises: - DataSetError: + DataSetError: If the catalog name does not conform to naming constraints. 
""" if self.catalog: if not re.fullmatch(self._NAMING_REGEX, self.catalog): From 00b4eaf466db3531198b3b26746c3eb64246b847 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 3 May 2023 11:01:59 -0700 Subject: [PATCH 44/74] Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/managed_table_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 5e8214a8f..790df3def 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -99,7 +99,7 @@ def validate_dataframe_type(self): """validates the dataframe type Raises: - DataSetError: + DataSetError: If an invalid `dataframe_type` is passed """ if self.dataframe_type not in self._VALID_DATAFRAME_TYPES: valid_types = ", ".join(self._VALID_DATAFRAME_TYPES) From fe5440e97993efed9c7676c9f51a834969d5b39c Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 3 May 2023 11:02:09 -0700 Subject: [PATCH 45/74] Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/managed_table_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 790df3def..08f62f769 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -222,7 +222,7 @@ def __init__( # pylint: disable=R0913 table (str): the name of the table catalog (str, optional): the name of the catalog in Unity. Defaults to None. - database (str, optional): the name of the database + database (str, optional): the name of the database. (also referred to as schema). Defaults to "default". write_mode (str, optional): the mode to write the data into the table. Options are:["overwrite", "append", "upsert"]. From 16215781f0eb3ef038962c6814ef9374da1f653f Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 3 May 2023 11:02:17 -0700 Subject: [PATCH 46/74] Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/managed_table_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 08f62f769..4e2da3cf6 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -149,7 +149,7 @@ def schema(self) -> StructType: class ManagedTableDataSet(AbstractVersionedDataSet): """``ManagedTableDataSet`` loads and saves data into managed delta tables on Databricks. Load and save can be in Spark or Pandas dataframes, specified in dataframe_type. - When saving data, you can specify one of three modes: overwtire(default), append, + When saving data, you can specify one of three modes: overwrite(default), append, or upsert. 
Upsert requires you to specify the primary_column parameter which will be used as part of the join condition. This dataset works best with the databricks kedro starter. That starter comes with hooks that allow this From 787ed0d6afc35272b2a9a4f00521e2937568734f Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 3 May 2023 11:11:51 -0700 Subject: [PATCH 47/74] Update kedro-datasets/test_requirements.txt Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah --- kedro-datasets/test_requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/test_requirements.txt b/kedro-datasets/test_requirements.txt index b95006aba..29b4c0d09 100644 --- a/kedro-datasets/test_requirements.txt +++ b/kedro-datasets/test_requirements.txt @@ -24,7 +24,8 @@ lxml~=4.6 matplotlib>=3.0.3, <3.4; python_version < '3.10' # 3.4.0 breaks holoviews matplotlib>=3.5, <3.6; python_version == '3.10' memory_profiler>=0.50.0, <1.0 -mlflow==2.2.1 +mlflow~=2.2.1; python_version>='3.8' +mlflow~=1.30.0; python_version=='3.7' moto==1.3.7; python_version < '3.10' moto==3.0.4; python_version == '3.10' networkx~=2.4 From b1c6832859e05cc6cd46cd9d605586cc92c55a84 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 3 May 2023 11:12:03 -0700 Subject: [PATCH 48/74] Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/managed_table_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 4e2da3cf6..d26c0b189 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -207,7 +207,7 @@ def __init__( # pylint: disable=R0913 database: str = "default", write_mode: str = "overwrite", dataframe_type: str = "spark", - primary_key: Union[str, List[str]] = None, + primary_key: Optional[Union[str, List[str]]] = None, version: Version = None, *, # the following parameters are used by project hooks From 085dea98e4d5ea5eff7ad9bb6f2e6cb5078f7f4f Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 3 May 2023 11:12:16 -0700 Subject: [PATCH 49/74] Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/managed_table_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index d26c0b189..e562650e9 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -429,7 +429,7 @@ def _describe(self) -> Dict[str, str]: "write_mode": self._table.write_mode, "dataframe_type": self._table.dataframe_type, "primary_key": self._table.primary_key, - "version": self._version, + "version": str(self._version), "owner_group": self._table.owner_group, "partition_columns": self._table.partition_columns, } From 2c2e960a0617581c73693cd3c3273cf92fa3e4e3 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 3 May 2023 11:16:39 -0700 Subject: [PATCH 50/74] Update 
kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah --- .../databricks/managed_table_dataset.py | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index e562650e9..6266bfc35 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -174,24 +174,20 @@ class ManagedTableDataSet(AbstractVersionedDataSet): Example usage for the `Python API `_: - :: + .. code-block:: python - % pyspark --packages io.delta:delta-core_2.12:1.2.1 - --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" - --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" - - >>> from pyspark.sql import SparkSession - >>> from pyspark.sql.types import (StructField, StringType, + from pyspark.sql import SparkSession + from pyspark.sql.types import (StructField, StringType, IntegerType, StructType) - >>> from kedro_datasets.databricks import ManagedTableDataSet - >>> schema = StructType([StructField("name", StringType(), True), - StructField("age", IntegerType(), True)]) - >>> data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] - >>> spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) - >>> data_set = ManagedTableDataSet(table="names_and_ages") - >>> data_set.save(spark_df) - >>> reloaded = data_set.load() - >>> reloaded.take(4) + from kedro_datasets.databricks import ManagedTableDataSet + schema = StructType([StructField("name", StringType(), True), + StructField("age", IntegerType(), True)]) + data = [('Alex', 31), ('Bob', 12), ('Clarke', 65), ('Dave', 29)] + spark_df = SparkSession.builder.getOrCreate().createDataFrame(data, schema) + data_set = ManagedTableDataSet(table="names_and_ages") + data_set.save(spark_df) + reloaded = data_set.load() + reloaded.take(4) """ # this dataset cannot be used with ``ParallelRunner``, From 267c9efc411679b14828ec26377c2814bdd1436c Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Thu, 6 Apr 2023 15:44:14 +0100 Subject: [PATCH 51/74] Sync delta-spark requirements (#160) * Update setup.py * Update releases notes Signed-off-by: Nok --------- Signed-off-by: Nok Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 1 + kedro-datasets/setup.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index f7d480c29..01a3b92dc 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -5,6 +5,7 @@ * Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). ## Bug fixes and other changes +* Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. 
# Release 1.2.0: diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 9115c4ed9..c1bf274ea 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -80,7 +80,7 @@ def _collect_requirements(requires): "spark.SparkDataSet": [SPARK, HDFS, S3FS], "spark.SparkHiveDataSet": [SPARK, HDFS, S3FS], "spark.SparkJDBCDataSet": [SPARK, HDFS, S3FS], - "spark.DeltaTableDataSet": [SPARK, HDFS, S3FS, "delta-spark~=1.0"], + "spark.DeltaTableDataSet": [SPARK, HDFS, S3FS, "delta-spark>=1.0, <3.0"], } svmlight_require = {"svmlight.SVMLightDataSet": ["scikit-learn~=1.0.2", "scipy~=1.7.3"]} tensorflow_require = { From 0febe06fa551834951135d2c6111e4387b98bdc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Tue, 11 Apr 2023 12:05:04 +0200 Subject: [PATCH 52/74] Fix links on GitHub issue templates (#150) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Danny Farah --- .github/ISSUE_TEMPLATE/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index af7ecdbe0..53557f844 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,10 +1,10 @@ blank_issues_enabled: false contact_links: - - name: Discord server + - name: Slack workspace about: Come chat with the community! - url: https://discord.gg/akJDeVaxnB + url: https://slack.kedro.org - name: Documentation - url: https://kedro.readthedocs.io/en/stable/ + url: https://docs.kedro.org about: To learn more about how Kedro works - name: Case studies, articles and video tutorials url: https://github.com/kedro-org/kedro-community From c190e3189720453d94f8e8ab09f6c5c34035bf75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Wed, 12 Apr 2023 14:22:14 +0200 Subject: [PATCH 53/74] Migrate most of `kedro-datasets` metadata to `pyproject.toml` (#161) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Include missing requirements files in sdist Fix gh-86. Signed-off-by: Juan Luis Cano Rodríguez * Migrate most project metadata to `pyproject.toml` See https://github.com/kedro-org/kedro/issues/2334. Signed-off-by: Juan Luis Cano Rodríguez * Move requirements to `pyproject.toml` Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Danny Farah --- Makefile | 2 +- kedro-datasets/pyproject.toml | 25 +++++++++++++++++++++++ kedro-datasets/requirements.txt | 1 - kedro-datasets/setup.py | 35 +-------------------------------- 4 files changed, 27 insertions(+), 36 deletions(-) delete mode 100644 kedro-datasets/requirements.txt diff --git a/Makefile b/Makefile index 86daa6313..be653ed59 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ package: cd $(plugin);\ rm -Rf dist;\ - python setup.py sdist bdist_wheel + python -m build pypi: python -m pip install twine -U diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 6df7bd372..0f0ad2fc3 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -1,3 +1,28 @@ +[project] +name = "kedro-datasets" +authors = [ + {name = "Kedro"} +] +description = "Kedro-Datasets is where you can find all of Kedro's data connectors." 
+requires-python = ">=3.7, <3.11" +license = {text = "Apache Software License (Apache 2.0)"} +dependencies = [ + "kedro~=0.18.4", +] +dynamic = ["readme", "version", "optional-dependencies"] + +[project.urls] +Source = "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-datasets" +Documentation = "https://docs.kedro.org" +Tracker = "https://github.com/kedro-org/kedro-plugins/issues" + +[tool.setuptools.packages.find] +include = ["kedro_datasets*"] + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro_datasets.__version__"} + [tool.black] [tool.isort] diff --git a/kedro-datasets/requirements.txt b/kedro-datasets/requirements.txt deleted file mode 100644 index b5edbb617..000000000 --- a/kedro-datasets/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -kedro~=0.18.4 diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index c1bf274ea..ef2fe3fe6 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -1,12 +1,6 @@ -import re -from codecs import open from itertools import chain -from os import path -from setuptools import find_packages, setup - -name = "kedro-datasets" -here = path.abspath(path.dirname(__file__)) +from setuptools import setup # at least 1.3 to be able to use XMLDataSet and pandas integration with fsspec PANDAS = "pandas>=1.3, <3.0" @@ -16,21 +10,6 @@ POLARS = "polars~=0.15.16" DELTA = "delta-spark~=1.2.1" -with open("requirements.txt", "r", encoding="utf-8") as f: - install_requires = [x.strip() for x in f if x.strip()] - -with open("test_requirements.txt", "r", encoding="utf-8") as f: - tests_require = [x.strip() for x in f if x.strip() and not x.startswith("-r")] - -# get package version -package_name = name.replace("-", "_") -with open(path.join(here, package_name, "__init__.py"), encoding="utf-8") as f: - version = re.search(r'__version__ = ["\']([^"\']+)', f.read()).group(1) - -# Get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - def _collect_requirements(requires): return sorted(set(chain.from_iterable(requires.values()))) @@ -151,17 +130,5 @@ def _collect_requirements(requires): extras_require["all"] = _collect_requirements(extras_require) setup( - name=name, - version=version, - description="Kedro-Datasets is where you can find all of Kedro's data connectors.", - long_description=readme, - long_description_content_type="text/markdown", - url="https://github.com/kedro-org/kedro-plugins/tree/main/kedro-datasets", - install_requires=install_requires, - tests_require=tests_require, - author="Kedro", - python_requires=">=3.7, <3.11", - license="Apache Software License (Apache 2.0)", - packages=find_packages(exclude=["tests*"]), extras_require=extras_require, ) From 7d648e6800865c4d26334a610233bc59a48b65ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Mon, 17 Apr 2023 10:48:36 +0200 Subject: [PATCH 54/74] Upgrade Polars (#171) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Upgrade Polars Signed-off-by: Juan Luis Cano Rodríguez * Update Polars to 0.17.x --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Danny Farah --- kedro-datasets/setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index ef2fe3fe6..3733c68af 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -7,10 +7,11 @@ SPARK = "pyspark>=2.2, <4.0" HDFS = 
"hdfs>=2.5.8, <3.0" S3FS = "s3fs>=0.3.0, <0.5" -POLARS = "polars~=0.15.16" +POLARS = "polars~=0.17.0" DELTA = "delta-spark~=1.2.1" + def _collect_requirements(requires): return sorted(set(chain.from_iterable(requires.values()))) From 18d93501a0204a75d33f05aa2fcac423dba978e4 Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Mon, 17 Apr 2023 14:47:16 +0100 Subject: [PATCH 55/74] if release is failed, it return exit code and fail the CI (#158) Signed-off-by: Danny Farah --- tools/circleci/circleci_release.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tools/circleci/circleci_release.py b/tools/circleci/circleci_release.py index 88c4ed1d0..dd05d4c5a 100755 --- a/tools/circleci/circleci_release.py +++ b/tools/circleci/circleci_release.py @@ -4,6 +4,7 @@ """ import os +import sys import requests from requests.structures import CaseInsensitiveDict @@ -33,12 +34,6 @@ def circleci_release(project_slug, payload, circle_endpoint, circle_release_toke headers["Circle-Token"] = circle_release_token resp = requests.post(circle_endpoint, headers=headers, json=payload, timeout=10) - print(f"Status Code: {resp.status_code}") - if resp.status_code == 201: - print("Creating CircleCI Pipeline successfully") - print(resp.content) - else: - print("Failed to create CircleCI Pipeline") return resp @@ -70,6 +65,14 @@ def circleci_release(project_slug, payload, circle_endpoint, circle_release_toke print(package_name, package_version) if check_no_version_pypi(pypi_endpoint, package_name, package_version): - circleci_release( + res = circleci_release( PROJECT_SLUG, payload, circleci_endpoint, CIRCLE_RELEASE_TOKEN ) + print(f"Status Code: {resp.status_code}") + if resp.status_code == 201: + print("Creating CircleCI Pipeline successfully") + else: + print("Failed to create CircleCI Pipeline") + print(resp.content) + if resp.status_code != 201: + sys.exit(1) From 2eb53ac7caa5adf52aa7bae9e205d93d1a11c534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Tue, 18 Apr 2023 13:25:21 +0200 Subject: [PATCH 56/74] Migrate `kedro-airflow` to static metadata (#172) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate kedro-airflow to static metadata See https://github.com/kedro-org/kedro/issues/2334. 
Signed-off-by: Juan Luis Cano Rodríguez * Add explicit PEP 518 build requirements for kedro-datasets Signed-off-by: Juan Luis Cano Rodríguez * Typos Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Signed-off-by: Juan Luis Cano Rodríguez * Remove dangling reference to requirements.txt Signed-off-by: Juan Luis Cano Rodríguez * Add release notes Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Danny Farah --- kedro-airflow/MANIFEST.in | 1 - kedro-airflow/RELEASE.md | 1 + kedro-airflow/pyproject.toml | 48 +++++++++++++++++++++++++++++ kedro-airflow/requirements.txt | 3 -- kedro-airflow/setup.cfg | 10 ------ kedro-airflow/setup.py | 41 ------------------------ kedro-airflow/test_requirements.txt | 1 - kedro-datasets/pyproject.toml | 4 +++ 8 files changed, 53 insertions(+), 56 deletions(-) delete mode 100644 kedro-airflow/requirements.txt delete mode 100644 kedro-airflow/setup.cfg delete mode 100644 kedro-airflow/setup.py diff --git a/kedro-airflow/MANIFEST.in b/kedro-airflow/MANIFEST.in index 523166e84..ed984822f 100644 --- a/kedro-airflow/MANIFEST.in +++ b/kedro-airflow/MANIFEST.in @@ -1,4 +1,3 @@ include README.md include LICENSE.md -include requirements.txt include kedro_airflow/airflow_dag_template.j2 diff --git a/kedro-airflow/RELEASE.md b/kedro-airflow/RELEASE.md index 75e4654e6..c2e0615b4 100755 --- a/kedro-airflow/RELEASE.md +++ b/kedro-airflow/RELEASE.md @@ -1,5 +1,6 @@ # Upcoming release 0.5.2 * Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. +* Migrate all project metadata to static `pyproject.toml`. # Release 0.5.1 * Added additional CLI argument `--jinja-file` to provide a path to a custom Jinja2 template. 
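A note on the `circleci_release.py` change in PATCH 55 above: the new module-level block binds the response to `res` but then reads `resp.status_code` and `resp.content`, and no `resp` appears to be defined in that scope, so as written the block would likely fail with a NameError. A corrected tail could look like the sketch below; this is an illustration of the apparent intent, not part of the patch series:

    if check_no_version_pypi(pypi_endpoint, package_name, package_version):
        resp = circleci_release(
            PROJECT_SLUG, payload, circleci_endpoint, CIRCLE_RELEASE_TOKEN
        )
        print(f"Status Code: {resp.status_code}")
        if resp.status_code == 201:
            print("Creating CircleCI Pipeline successfully")
        else:
            print("Failed to create CircleCI Pipeline")
            print(resp.content)
            sys.exit(1)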
diff --git a/kedro-airflow/pyproject.toml b/kedro-airflow/pyproject.toml index 4f3292f55..42fe8974b 100644 --- a/kedro-airflow/pyproject.toml +++ b/kedro-airflow/pyproject.toml @@ -1,3 +1,51 @@ +[build-system] +requires = ["setuptools>=61.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "kedro-airflow" +authors = [ + {name = "Kedro"} +] +description = "Kedro-Airflow makes it easy to deploy Kedro projects to Airflow" +requires-python = ">=3.7, <3.11" +license = {text = "Apache Software License (Apache 2.0)"} +dependencies = [ + "kedro>=0.17.5", + "python-slugify>=4.0", + "semver~=2.10", # Needs to be at least 2.10.0 to get VersionInfo.match +] +dynamic = ["readme", "version"] + +[project.urls] +Source = "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-airflow" +Documentation = "https://github.com/kedro-org/kedro-plugins/blob/main/kedro-airflow/README.md" +Tracker = "https://github.com/kedro-org/kedro-plugins/issues" + +[project.entry-points."kedro.project_commands"] +airflow = "kedro_airflow.plugin:commands" + +[tool.setuptools] +include-package-data = true +packages = ["kedro_airflow"] +zip-safe = false + +[tool.setuptools.package-data] +kedro_airflow = ["kedro_airflow/airflow_dag_template.j2"] + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro_airflow.__version__"} + +[tool.pytest.ini_options] +addopts = """ + --cov-report xml:coverage.xml + --cov-report term-missing + --cov kedro_airflow + --cov tests + --no-cov-on-fail + -ra""" + [tool.black] exclude=".*template.py" diff --git a/kedro-airflow/requirements.txt b/kedro-airflow/requirements.txt deleted file mode 100644 index d1731ba85..000000000 --- a/kedro-airflow/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -kedro>=0.17.5 -python-slugify>=4.0 -semver~=2.10 # Needs to be at least 2.10.0 to get VersionInfo.match diff --git a/kedro-airflow/setup.cfg b/kedro-airflow/setup.cfg deleted file mode 100644 index 7fa30d2d0..000000000 --- a/kedro-airflow/setup.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[metadata] -description-file=README.md - -[tool:pytest] -addopts=--cov-report xml:coverage.xml - --cov-report term-missing - --cov kedro_airflow - --cov tests - --no-cov-on-fail - -ra diff --git a/kedro-airflow/setup.py b/kedro-airflow/setup.py deleted file mode 100644 index 85bb25b8a..000000000 --- a/kedro-airflow/setup.py +++ /dev/null @@ -1,41 +0,0 @@ -import re -from codecs import open -from os import path - -from setuptools import setup - -name = "kedro-airflow" -here = path.abspath(path.dirname(__file__)) - -# get package version -package_name = name.replace("-", "_") -with open(path.join(here, package_name, "__init__.py"), encoding="utf-8") as f: - version = re.search(r'__version__ = ["\']([^"\']+)', f.read()).group(1) - -# get the dependencies and installs -with open("requirements.txt", "r", encoding="utf-8") as f: - requires = [x.strip() for x in f if x.strip()] - -# get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - -setup( - name=name, - version=version, - description="Kedro-Airflow makes it easy to deploy Kedro projects to Airflow", - long_description=readme, - long_description_content_type="text/markdown", - url="https://github.com/kedro-org/kedro-plugins/tree/main/kedro-airflow", - author="Kedro", - python_requires=">=3.7, <3.11", - install_requires=requires, - license="Apache Software License (Apache 2.0)", - packages=["kedro_airflow"], - 
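The metadata migrations in PATCH 53, 56 and 57 (and PATCH 59 below) all follow the same pattern: the hand-rolled setup.py logic (regex-parsing `__version__` out of `__init__.py`, reading README.md for the long description) is replaced by `dynamic` fields that setuptools resolves from the `[tool.setuptools.dynamic]` table. On an installed plugin the result can be inspected without any setup.py, roughly as follows (requires Python 3.8+ and assumes the package has been installed, for example with `pip install -e`):

    from importlib.metadata import metadata, version

    # the version is resolved at build time from `{attr = "kedro_airflow.__version__"}`,
    # replacing the regex the deleted setup.py ran over __init__.py
    print(version("kedro-airflow"))

    # the rest of the static metadata is exposed the same way
    print(metadata("kedro-airflow")["Summary"])
    print(metadata("kedro-airflow")["Requires-Python"])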
package_data={"kedro_airflow": ["kedro_airflow/airflow_dag_template.j2"]}, - include_package_data=True, - zip_safe=False, - entry_points={ - "kedro.project_commands": ["airflow = kedro_airflow.plugin:commands"] - }, -) diff --git a/kedro-airflow/test_requirements.txt b/kedro-airflow/test_requirements.txt index 4ced2ca4c..cdea520c7 100644 --- a/kedro-airflow/test_requirements.txt +++ b/kedro-airflow/test_requirements.txt @@ -1,4 +1,3 @@ --r requirements.txt apache-airflow<3.0 bandit>=1.6.2, <2.0 behave diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 0f0ad2fc3..a5f494106 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -1,3 +1,7 @@ +[build-system] +requires = ["setuptools>=61.2"] +build-backend = "setuptools.build_meta" + [project] name = "kedro-datasets" authors = [ From 494fa5f389446b5c560ca86fbe099ba5cacdaee6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Tue, 18 Apr 2023 13:26:53 +0200 Subject: [PATCH 57/74] Migrate `kedro-telemetry` to static metadata (#174) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate kedro-telemetry to static metadata See kedro-org/kedro#2334. Signed-off-by: Juan Luis Cano Rodríguez * Add release notes Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Danny Farah --- kedro-telemetry/RELEASE.md | 3 ++ kedro-telemetry/pyproject.toml | 38 +++++++++++++++++++++++++ kedro-telemetry/requirements.txt | 2 -- kedro-telemetry/setup.py | 41 --------------------------- kedro-telemetry/test_requirements.txt | 1 - 5 files changed, 41 insertions(+), 44 deletions(-) delete mode 100644 kedro-telemetry/requirements.txt delete mode 100644 kedro-telemetry/setup.py diff --git a/kedro-telemetry/RELEASE.md b/kedro-telemetry/RELEASE.md index 7cdb93100..bbd32f424 100644 --- a/kedro-telemetry/RELEASE.md +++ b/kedro-telemetry/RELEASE.md @@ -1,3 +1,6 @@ +# Upcoming release +* Migrate all project metadata to static `pyproject.toml`. + # Release 0.2.4 * Added consent checking for collecting project statistics. 
diff --git a/kedro-telemetry/pyproject.toml b/kedro-telemetry/pyproject.toml index 07449ad97..0cc754854 100644 --- a/kedro-telemetry/pyproject.toml +++ b/kedro-telemetry/pyproject.toml @@ -1,3 +1,41 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "kedro-telemetry" +authors = [ + {name = "Kedro"} +] +description = "Kedro-Telemetry" +requires-python = ">=3.7, <3.11" +license = {text = "Apache Software License (Apache 2.0)"} +dependencies = [ + "kedro~=0.18.0", + "requests~=2.20", +] +dynamic = ["readme", "version"] + +[project.urls] +Source = "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-telemetry" +Documentation = "https://github.com/kedro-org/kedro-plugins/blob/main/kedro-telemetry/README.md" +Tracker = "https://github.com/kedro-org/kedro-plugins/issues" + +[project.entry-points."kedro.cli_hooks"] +kedro-telemetry = "kedro_telemetry.plugin:cli_hooks" + +[project.entry-points."kedro.hooks"] +kedro-telemetry = "kedro_telemetry.plugin:project_hooks" + +[tool.setuptools] +include-package-data = true +packages = ["kedro_telemetry"] +zip-safe = false + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro_telemetry.__version__"} + [tool.isort] multi_line_output = 3 include_trailing_comma = true diff --git a/kedro-telemetry/requirements.txt b/kedro-telemetry/requirements.txt deleted file mode 100644 index c59cb8a9c..000000000 --- a/kedro-telemetry/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -kedro~=0.18.0 -requests~=2.20 diff --git a/kedro-telemetry/setup.py b/kedro-telemetry/setup.py deleted file mode 100644 index db6a976d2..000000000 --- a/kedro-telemetry/setup.py +++ /dev/null @@ -1,41 +0,0 @@ -import re -from codecs import open -from os import path - -from setuptools import setup - -name = "kedro-telemetry" -here = path.abspath(path.dirname(__file__)) - -# get package version -package_name = name.replace("-", "_") -with open(path.join(here, package_name, "__init__.py"), encoding="utf-8") as f: - version = re.search(r'__version__ = ["\']([^"\']+)', f.read()).group(1) - -# get the dependencies and installs -with open("requirements.txt", "r", encoding="utf-8") as f: - requires = [x.strip() for x in f if x.strip()] - -# Get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - -setup( - name=name, - version=version, - description="Kedro-Telemetry", - long_description=readme, - long_description_content_type="text/markdown", - url="https://github.com/kedro-org/kedro-plugins/tree/main/kedro-telemetry", - author="Kedro", - python_requires=">=3.7, <3.11", - install_requires=requires, - license="Apache Software License (Apache 2.0)", - packages=["kedro_telemetry"], - include_package_data=True, - zip_safe=False, - entry_points={ - "kedro.cli_hooks": ["kedro-telemetry = kedro_telemetry.plugin:cli_hooks"], - "kedro.hooks": ["kedro-telemetry = kedro_telemetry.plugin:project_hooks"] - }, -) diff --git a/kedro-telemetry/test_requirements.txt b/kedro-telemetry/test_requirements.txt index 4f39e717a..fb187d672 100644 --- a/kedro-telemetry/test_requirements.txt +++ b/kedro-telemetry/test_requirements.txt @@ -1,4 +1,3 @@ --r requirements.txt bandit>=1.6.2, <2.0 behave black~=22.0 From 45151ec503ccf1a71ae39d2baf351dbd34800347 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Wed, 19 Apr 2023 15:21:17 +0100 Subject: [PATCH 58/74] ci: port lint, unit test, 
and e2e tests to Actions (#155) * Add unit test + lint test on GA * trigger GA - will revert Signed-off-by: Ankita Katiyar * Fix lint Signed-off-by: Ankita Katiyar * Add end to end tests * Add cache key Signed-off-by: Ankita Katiyar * Add cache action Signed-off-by: Ankita Katiyar * Rename workflow files Signed-off-by: Ankita Katiyar * Lint + add comment + default bash Signed-off-by: Ankita Katiyar * Add windows test Signed-off-by: Ankita Katiyar * Update workflow name + revert changes to READMEs Signed-off-by: Ankita Katiyar * Add kedro-telemetry/RELEASE.md to trufflehog ignore Signed-off-by: Ankita Katiyar * Add pytables to test_requirements remove from workflow Signed-off-by: Ankita Katiyar * Revert "Add pytables to test_requirements remove from workflow" This reverts commit 8203daa6405d325c74ec2097c9d0c5859bae8257. * Separate pip freeze step Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar Signed-off-by: Danny Farah --- .github/workflows/check-plugin.yml | 134 ++++++++++++++++++++++++++ .github/workflows/kedro-airflow.yml | 16 +++ .github/workflows/kedro-datasets.yml | 16 +++ .github/workflows/kedro-docker.yml | 16 +++ .github/workflows/kedro-telemetry.yml | 16 +++ trufflehog-ignore.txt | 2 + 6 files changed, 200 insertions(+) create mode 100644 .github/workflows/check-plugin.yml create mode 100644 .github/workflows/kedro-airflow.yml create mode 100644 .github/workflows/kedro-datasets.yml create mode 100644 .github/workflows/kedro-docker.yml create mode 100644 .github/workflows/kedro-telemetry.yml diff --git a/.github/workflows/check-plugin.yml b/.github/workflows/check-plugin.yml new file mode 100644 index 000000000..a32c0f651 --- /dev/null +++ b/.github/workflows/check-plugin.yml @@ -0,0 +1,134 @@ +name: Running tests and linter + +on: + workflow_call: + inputs: + plugin: + type: string + +jobs: + unit-tests: + defaults: + run: + shell: bash + strategy: + matrix: + os: [ ubuntu-latest, windows-latest ] + python-version: [ "3.7", "3.8", "3.9", "3.10" ] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python ${{matrix.python-version}} + uses: actions/setup-python@v3 + with: + python-version: ${{matrix.python-version}} + - name: Cache python packages for Linux + if: matrix.os == 'ubuntu-latest' + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.plugin}}-${{matrix.os}}-python-${{matrix.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Cache python packages for Windows + if: matrix.os == 'windows-latest' + uses: actions/cache@v3 + with: + path: ~\AppData\Local\pip\Cache + key: ${{inputs.plugin}}-${{matrix.os}}-python-${{matrix.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Install Kedro + run: pip install git+https://github.com/kedro-org/kedro@main + - name: Install dependencies + run: | + cd ${{ inputs.plugin }} + pip install -r test_requirements.txt + - name: Install pytables (only for kedro-datasets on windows) + if: matrix.os == 'windows-latest' && inputs.plugin == 'kedro-datasets' + run: pip install tables + - name: pip freeze + run: pip freeze + - name: Run unit tests for Linux / all plugins + if: matrix.os != 'windows-latest' + run: make plugin=${{ inputs.plugin }} test + - name: Run unit tests for Windows / kedro-airflow, kedro-docker, kedro-telemetry + if: matrix.os == 'windows-latest' && inputs.plugin != 'kedro-datasets' + run: | + cd ${{ inputs.plugin }} + pytest tests + - name: Run unit tests for Windows / kedro-datasets / no spark sequential + if: 
matrix.os == 'windows-latest' && inputs.plugin == 'kedro-datasets' && matrix.python-version == '3.10' + run: | + make test-no-spark-sequential + - name: Run unit tests for Windows / kedro-datasets / no spark parallel + if: matrix.os == 'windows-latest' && inputs.plugin == 'kedro-datasets' && matrix.python-version != '3.10' + run: | + make test-no-spark + + lint: + defaults: + run: + shell: bash + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python 3.8 + uses: actions/setup-python@v3 + with: + python-version: 3.8 + - name: Cache python packages + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.plugin}}-${{matrix.os}}-python-${{matrix.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Install dependencies + run: | + cd ${{ inputs.plugin }} + pip install git+https://github.com/kedro-org/kedro@main + pip install -r test_requirements.txt + pip freeze + - name: Install pre-commit hooks + run: | + cd ${{ inputs.plugin }} + pre-commit install --install-hooks + pre-commit install --hook-type pre-push + - name: Run linter + run: make plugin=${{ inputs.plugin }} lint + + e2e-tests: + if: inputs.plugin != 'kedro-datasets' + defaults: + run: + shell: bash + strategy: + matrix: + os: [ ubuntu-latest ] + python-version: [ "3.7", "3.8", "3.9", "3.10" ] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python ${{matrix.python-version}} + uses: actions/setup-python@v3 + with: + python-version: ${{matrix.python-version}} + - name: Cache python packages + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.plugin}}-${{matrix.os}}-python-${{matrix.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Install dependencies + run: | + cd ${{ inputs.plugin }} + pip install git+https://github.com/kedro-org/kedro@main + pip install -r test_requirements.txt + - name: pip freeze + run: pip freeze + - name: Run end to end tests + # Custom shell to run kedro-docker e2e-tests because -it flag for `docker run` + # isn't supported on Github Actions. 
See https://github.com/actions/runner/issues/241 + shell: 'script -q -e -c "bash {0}"' + run: make plugin=${{ inputs.plugin }} e2e-tests diff --git a/.github/workflows/kedro-airflow.yml b/.github/workflows/kedro-airflow.yml new file mode 100644 index 000000000..b68fcce30 --- /dev/null +++ b/.github/workflows/kedro-airflow.yml @@ -0,0 +1,16 @@ +name: Run checks on kedro-airflow + +on: + push: + paths: + - "kedro-airflow/**" + pull_request: + paths: + - "kedro-airflow/**" + types: [ synchronize ] + +jobs: + airflow-test: + uses: ./.github/workflows/check-plugin.yml + with: + plugin: kedro-airflow diff --git a/.github/workflows/kedro-datasets.yml b/.github/workflows/kedro-datasets.yml new file mode 100644 index 000000000..9ff4802b6 --- /dev/null +++ b/.github/workflows/kedro-datasets.yml @@ -0,0 +1,16 @@ +name: Run checks on kedro-datasets + +on: + push: + paths: + - "kedro-datasets/**" + pull_request: + paths: + - "kedro-datasets/**" + types: [ synchronize ] + +jobs: + datasets-test: + uses: ./.github/workflows/check-plugin.yml + with: + plugin: kedro-datasets diff --git a/.github/workflows/kedro-docker.yml b/.github/workflows/kedro-docker.yml new file mode 100644 index 000000000..1812a3a93 --- /dev/null +++ b/.github/workflows/kedro-docker.yml @@ -0,0 +1,16 @@ +name: Run checks on kedro-docker + +on: + push: + paths: + - "kedro-docker/**" + pull_request: + paths: + - "kedro-docker/**" + types: [ synchronize ] + +jobs: + docker-test: + uses: ./.github/workflows/check-plugin.yml + with: + plugin: kedro-docker diff --git a/.github/workflows/kedro-telemetry.yml b/.github/workflows/kedro-telemetry.yml new file mode 100644 index 000000000..fd75e8a71 --- /dev/null +++ b/.github/workflows/kedro-telemetry.yml @@ -0,0 +1,16 @@ +name: Run checks on kedro-telemetry + +on: + push: + paths: + - "kedro-telemetry/**" + pull_request: + paths: + - "kedro-telemetry/**" + types: [ synchronize ] + +jobs: + telemetry-test: + uses: ./.github/workflows/check-plugin.yml + with: + plugin: kedro-telemetry diff --git a/trufflehog-ignore.txt b/trufflehog-ignore.txt index 041fc7ffd..1929a2634 100644 --- a/trufflehog-ignore.txt +++ b/trufflehog-ignore.txt @@ -1 +1,3 @@ kedro-telemetry/README.md +kedro-telemetry/RELEASE.md +kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py From 942fd01bbf0254a5c9ea626a89208aa80e49ff5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?= Date: Wed, 19 Apr 2023 17:08:42 +0200 Subject: [PATCH 59/74] Migrate `kedro-docker` to static metadata (#173) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate kedro-docker to static metadata See https://github.com/kedro-org/kedro/issues/2334. 
Signed-off-by: Juan Luis Cano Rodríguez * Address packaging warning Signed-off-by: Juan Luis Cano Rodríguez * Fix tests Signed-off-by: Juan Luis Cano Rodríguez * Actually install current plugin with dependencies Signed-off-by: Juan Luis Cano Rodríguez * Add release notes Signed-off-by: Juan Luis Cano Rodríguez --------- Signed-off-by: Juan Luis Cano Rodríguez Signed-off-by: Danny Farah --- .circleci/continue_config.yml | 1 + kedro-docker/MANIFEST.in | 1 + kedro-docker/RELEASE.md | 1 + kedro-docker/features/environment.py | 2 +- kedro-docker/pyproject.toml | 55 ++++++++++++++++++++++++++++ kedro-docker/requirements.txt | 3 -- kedro-docker/setup.cfg | 10 ----- kedro-docker/setup.py | 44 ---------------------- kedro-docker/test_requirements.txt | 1 - 9 files changed, 59 insertions(+), 59 deletions(-) create mode 100644 kedro-docker/MANIFEST.in delete mode 100644 kedro-docker/requirements.txt delete mode 100644 kedro-docker/setup.cfg delete mode 100644 kedro-docker/setup.py diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml index 5a1d78015..82653758e 100644 --- a/.circleci/continue_config.yml +++ b/.circleci/continue_config.yml @@ -69,6 +69,7 @@ commands: command: | cd <> pip install git+https://github.com/kedro-org/kedro@main + pip install . pip install -r test_requirements.txt - run: name: Install pre-commit hooks diff --git a/kedro-docker/MANIFEST.in b/kedro-docker/MANIFEST.in new file mode 100644 index 000000000..451642d6f --- /dev/null +++ b/kedro-docker/MANIFEST.in @@ -0,0 +1 @@ +recursive-include kedro_docker/template * diff --git a/kedro-docker/RELEASE.md b/kedro-docker/RELEASE.md index eeb2f0e41..4bd5b8bbd 100644 --- a/kedro-docker/RELEASE.md +++ b/kedro-docker/RELEASE.md @@ -1,4 +1,5 @@ # Upcoming release +* Migrate all project metadata to static `pyproject.toml`. ## Major features and improvements diff --git a/kedro-docker/features/environment.py b/kedro-docker/features/environment.py index 04a5f25cf..930f97a7d 100644 --- a/kedro-docker/features/environment.py +++ b/kedro-docker/features/environment.py @@ -51,7 +51,7 @@ def before_all(context): ) # install the plugin - call([context.python, "setup.py", "install"], env=context.env) + call([context.python, "-m", "pip", "install", "."], env=context.env) def _setup_context_with_venv(context, venv_dir): diff --git a/kedro-docker/pyproject.toml b/kedro-docker/pyproject.toml index 0b54e6e31..cdd273509 100644 --- a/kedro-docker/pyproject.toml +++ b/kedro-docker/pyproject.toml @@ -1,3 +1,58 @@ +[build-system] +requires = ["setuptools>=61.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "kedro-docker" +authors = [ + {name = "Kedro"} +] +description = "Kedro-Docker makes it easy to package Kedro projects with Docker." 
+requires-python = ">=3.7, <3.11" +license = {text = "Apache Software License (Apache 2.0)"} +dependencies = [ + "anyconfig~=0.10.0", # not directly required, pinned by Snyk to avoid a vulnerability + "kedro>=0.16.0", + "semver~=2.10", # Needs to be at least 2.10.0 to get VersionInfo.match +] +dynamic = ["readme", "version"] + +[project.urls] +Source = "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker" +Documentation = "https://github.com/kedro-org/kedro-plugins/blob/main/kedro-docker/README.md" +Tracker = "https://github.com/kedro-org/kedro-plugins/issues" + +[project.entry-points."kedro.project_commands"] +docker = "kedro_docker.plugin:commands" + +[tool.setuptools] +include-package-data = true +zip-safe = false + +[tool.setuptools.packages.find] +include = ["kedro_docker*"] +namespaces = true # To include the template files + +[tool.setuptools.package-data] +kedro_docker = [ + "template/Dockerfile.*", + "template/.dockerignore", + "template/.dive-ci", +] + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro_docker.__version__"} + +[tool.pytest.ini_options] +addopts = """ + --cov-report xml:coverage.xml + --cov-report term-missing + --cov kedro_docker + --cov tests + --no-cov-on-fail + -ra""" + [tool.black] [tool.isort] diff --git a/kedro-docker/requirements.txt b/kedro-docker/requirements.txt deleted file mode 100644 index 86c576113..000000000 --- a/kedro-docker/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -anyconfig~=0.10.0 # not directly required, pinned by Snyk to avoid a vulnerability -kedro>=0.16.0 -semver~=2.10 # Needs to be at least 2.10.0 to get VersionInfo.match diff --git a/kedro-docker/setup.cfg b/kedro-docker/setup.cfg deleted file mode 100644 index 9ba92fe11..000000000 --- a/kedro-docker/setup.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[metadata] -description-file=README.md - -[tool:pytest] -addopts=--cov-report xml:coverage.xml - --cov-report term-missing - --cov kedro_docker - --cov tests - --no-cov-on-fail - -ra diff --git a/kedro-docker/setup.py b/kedro-docker/setup.py deleted file mode 100644 index b2ef23ca3..000000000 --- a/kedro-docker/setup.py +++ /dev/null @@ -1,44 +0,0 @@ -import re -from codecs import open -from os import path - -from setuptools import setup - -name = "kedro-docker" -here = path.abspath(path.dirname(__file__)) - -# get package version -package_name = name.replace("-", "_") -with open(path.join(here, package_name, "__init__.py"), encoding="utf-8") as f: - version = re.search(r'__version__ = ["\']([^"\']+)', f.read()).group(1) - -# get the dependencies and installs -with open("requirements.txt", "r", encoding="utf-8") as f: - requires = [x.strip() for x in f if x.strip()] - -# get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - -setup( - name=name, - version=version, - description="Kedro-Docker makes it easy to package Kedro projects with Docker.", - long_description=readme, - long_description_content_type="text/markdown", - url="https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker", - license="Apache Software License (Apache 2.0)", - python_requires=">=3.7, <3.11", - install_requires=requires, - author="Kedro", - packages=["kedro_docker"], - package_data={ - "kedro_docker": [ - "template/Dockerfile.*", - "template/.dockerignore", - "template/.dive-ci", - ] - }, - zip_safe=False, - entry_points={"kedro.project_commands": ["docker = kedro_docker.plugin:commands"]}, -) diff --git 
a/kedro-docker/test_requirements.txt b/kedro-docker/test_requirements.txt index 771ee88a6..01af755ac 100644 --- a/kedro-docker/test_requirements.txt +++ b/kedro-docker/test_requirements.txt @@ -1,4 +1,3 @@ --r requirements.txt bandit>=1.6.2, <2.0 behave>=1.2.6, <2.0 black~=22.0 From 2d8eb28156d819523f8b8b376e498b6aae52fff2 Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Fri, 21 Apr 2023 16:32:23 +0100 Subject: [PATCH 60/74] Introdcuing .gitpod.yml to kedro-plugins (#185) Currently opening gitpod will installed a Python 3.11 which breaks everything because we don't support it set. This PR introduce a simple .gitpod.yml to get it started. Signed-off-by: Danny Farah --- .gitpod.yml | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .gitpod.yml diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 000000000..70738f4c0 --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,33 @@ +# Learn more from ready-to-use templates: https://www.gitpod.io/docs/introduction/getting-started/quickstart +image: gitpod/workspace-python-3.10:2023-04-20-16-32-37 + + +tasks: + # We want packages installed during the pre-build init steps to go to /workspace + # rather than ~ so that they are persisted. Gitpod sets PIP_USER=yes to ensure this, + # but pre-commit requires PIP_USER=no. Hence we set PIP_USER=no and use + # pip install --user to install to /workspace. + - name: kedro-plugins + before: | + echo PIP_USER=no >> ~/.bashrc && export PIP_USER=no + init: | + make sign-off + command: | + pre-commit install --install-hooks + clear + + +github: + prebuilds: + # enable for the master/default branch (defaults to true) + master: true + # enable for all branches in this repo (defaults to false) + branches: true + # enable for pull requests coming from this repo (defaults to true) + pullRequests: true + # enable for pull requests coming from forks (defaults to false) + pullRequestsFromForks: true + # add a "Review in Gitpod" button as a comment to pull requests (defaults to true) + addComment: false + # add a "Review in Gitpod" button to pull requests (defaults to false) + addBadge: true From ce1138ec8cef5166f1d3e7c5574fa520d8e85e5b Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Mon, 24 Apr 2023 13:32:52 +0100 Subject: [PATCH 61/74] sync APIDataSet from kedro's `develop` (#184) * Update APIDataSet Signed-off-by: Nok Chan * Sync ParquetDataSet Signed-off-by: Nok Chan * Sync Test Signed-off-by: Nok Chan * Linting Signed-off-by: Nok Chan * Revert Unnecessary ParquetDataSet Changes Signed-off-by: Nok Chan * Sync release notes Signed-off-by: Nok Chan --------- Signed-off-by: Nok Chan Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 2 +- .../kedro_datasets/api/api_dataset.py | 111 +++---- kedro-datasets/tests/api/test_api_dataset.py | 273 ++++++++++++------ 3 files changed, 242 insertions(+), 144 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 01a3b92dc..e1185b54d 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -3,7 +3,7 @@ ## Major features and improvements: * Added pandas 2.0 support. * Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). - +* Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. 
## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index 4f0ffb4cc..cb8f80d37 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -1,12 +1,17 @@ """``APIDataSet`` loads the data from HTTP(S) APIs. It uses the python requests library: https://requests.readthedocs.io/en/latest/ """ -from typing import Any, Dict, Iterable, List, NoReturn, Union +from typing import Any, Dict, List, NoReturn, Tuple, Union import requests from kedro.io.core import AbstractDataSet, DataSetError +from requests import Session, sessions from requests.auth import AuthBase +# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0. +# Any contribution to datasets should be made in kedro-datasets +# in kedro-plugins (https://github.com/kedro-org/kedro-plugins) + class APIDataSet(AbstractDataSet[None, requests.Response]): """``APIDataSet`` loads the data from HTTP(S) APIs. @@ -34,88 +39,89 @@ class APIDataSet(AbstractDataSet[None, requests.Response]): data_catalog.html#use-the-data-catalog-with-the-code-api>`_: :: - >>> from kedro_datasets.api import APIDataSet + >>> from kedro.extras.datasets.api import APIDataSet >>> >>> >>> data_set = APIDataSet( >>> url="https://quickstats.nass.usda.gov", - >>> params={ - >>> "key": "SOME_TOKEN", - >>> "format": "JSON", - >>> "commodity_desc": "CORN", - >>> "statisticcat_des": "YIELD", - >>> "agg_level_desc": "STATE", - >>> "year": 2000 - >>> } + >>> load_args={ + >>> "params": { + >>> "key": "SOME_TOKEN", + >>> "format": "JSON", + >>> "commodity_desc": "CORN", + >>> "statisticcat_des": "YIELD", + >>> "agg_level_desc": "STATE", + >>> "year": 2000 + >>> } + >>> }, + >>> credentials=("username", "password") >>> ) >>> data = data_set.load() """ - # pylint: disable=too-many-arguments def __init__( self, url: str, method: str = "GET", - data: Any = None, - params: Dict[str, Any] = None, - headers: Dict[str, Any] = None, - auth: Union[Iterable[str], AuthBase] = None, - json: Union[List, Dict[str, Any]] = None, - timeout: int = 60, - credentials: Union[Iterable[str], AuthBase] = None, + load_args: Dict[str, Any] = None, + credentials: Union[Tuple[str, str], List[str], AuthBase] = None, ) -> None: """Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint. Args: url: The API URL endpoint. method: The Method of the request, GET, POST, PUT, DELETE, HEAD, etc... - data: The request payload, used for POST, PUT, etc requests - https://requests.readthedocs.io/en/latest/user/quickstart/#more-complicated-post-requests - params: The url parameters of the API. - https://requests.readthedocs.io/en/latest/user/quickstart/#passing-parameters-in-urls - headers: The HTTP headers. - https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers - auth: Anything ``requests`` accepts. Normally it's either ``('login', 'password')``, - or ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases. Any - iterable will be cast to a tuple. - json: The request payload, used for POST, PUT, etc requests, passed in - to the json kwarg in the requests object. - https://requests.readthedocs.io/en/latest/user/quickstart/#more-complicated-post-requests - timeout: The wait time in seconds for a response, defaults to 1 minute. - https://requests.readthedocs.io/en/latest/user/quickstart/#timeouts - credentials: same as ``auth``. 
Allows specifying ``auth`` secrets in - credentials.yml. - + load_args: Additional parameters to be fed to requests.request. + https://requests.readthedocs.io/en/latest/api/#requests.request + credentials: Allows specifying secrets in credentials.yml. + Expected format is ``('login', 'password')`` if given as a tuple or list. + An ``AuthBase`` instance can be provided for more complex cases. Raises: - ValueError: if both ``credentials`` and ``auth`` are specified. + ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are specified. """ super().__init__() - if credentials is not None and auth is not None: + self._load_args = load_args or {} + self._load_args_auth = self._load_args.pop("auth", None) + + if credentials is not None and self._load_args_auth is not None: raise ValueError("Cannot specify both auth and credentials.") - auth = credentials or auth + self._auth = credentials or self._load_args_auth + + if "cert" in self._load_args: + self._load_args["cert"] = self._convert_type(self._load_args["cert"]) - if isinstance(auth, Iterable): - auth = tuple(auth) + if "timeout" in self._load_args: + self._load_args["timeout"] = self._convert_type(self._load_args["timeout"]) self._request_args: Dict[str, Any] = { "url": url, "method": method, - "data": data, - "params": params, - "headers": headers, - "auth": auth, - "json": json, - "timeout": timeout, + "auth": self._convert_type(self._auth), + **self._load_args, } + @staticmethod + def _convert_type(value: Any): + """ + From the Data Catalog, iterables are provided as Lists. + However, for some parameters in the Python requests library, + only Tuples are allowed. + """ + if isinstance(value, List): + return tuple(value) + return value + def _describe(self) -> Dict[str, Any]: - return {**self._request_args} + # prevent auth from logging + request_args_cp = self._request_args.copy() + request_args_cp.pop("auth", None) + return request_args_cp - def _execute_request(self) -> requests.Response: + def _execute_request(self, session: Session) -> requests.Response: try: - response = requests.request(**self._request_args) + response = session.request(**self._request_args) response.raise_for_status() except requests.exceptions.HTTPError as exc: raise DataSetError("Failed to fetch data", exc) from exc @@ -125,12 +131,13 @@ def _execute_request(self) -> requests.Response: return response def _load(self) -> requests.Response: - return self._execute_request() + with sessions.Session() as session: + return self._execute_request(session) def _save(self, data: None) -> NoReturn: raise DataSetError(f"{self.__class__.__name__} is a read only data set type") def _exists(self) -> bool: - response = self._execute_request() - + with sessions.Session() as session: + response = self._execute_request(session) return response.ok diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index c84290750..848020041 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -1,11 +1,11 @@ # pylint: disable=no-member -import json +import base64 import socket import pytest import requests -import requests_mock from kedro.io.core import DataSetError +from requests.auth import HTTPBasicAuth from kedro_datasets.api import APIDataSet @@ -13,96 +13,190 @@ TEST_URL = "http://example.com/api/test" TEST_TEXT_RESPONSE_DATA = "This is a response." 
-TEST_JSON_RESPONSE_DATA = [{"key": "value"}] +TEST_JSON_REQUEST_DATA = [{"key": "value"}] TEST_PARAMS = {"param": "value"} TEST_URL_WITH_PARAMS = TEST_URL + "?param=value" - +TEST_METHOD = "GET" TEST_HEADERS = {"key": "value"} -@pytest.mark.parametrize("method", POSSIBLE_METHODS) class TestAPIDataSet: - @pytest.fixture - def requests_mocker(self): - with requests_mock.Mocker() as mock: - yield mock + @pytest.mark.parametrize("method", POSSIBLE_METHODS) + def test_request_method(self, requests_mock, method): + api_data_set = APIDataSet(url=TEST_URL, method=method) + requests_mock.register_uri(method, TEST_URL, text=TEST_TEXT_RESPONSE_DATA) + + response = api_data_set.load() + assert response.text == TEST_TEXT_RESPONSE_DATA - def test_successfully_load_with_response(self, requests_mocker, method): + @pytest.mark.parametrize( + "parameters_in, url_postfix", + [ + ({"param": "value"}, "?param=value"), + (bytes("a=1", "latin-1"), "?a=1"), + ], + ) + def test_params_in_request(self, requests_mock, parameters_in, url_postfix): api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url=TEST_URL, method=TEST_METHOD, load_args={"params": parameters_in} ) - requests_mocker.register_uri( - method, - TEST_URL_WITH_PARAMS, - headers=TEST_HEADERS, - text=TEST_TEXT_RESPONSE_DATA, + requests_mock.register_uri( + TEST_METHOD, TEST_URL + url_postfix, text=TEST_TEXT_RESPONSE_DATA ) response = api_data_set.load() assert isinstance(response, requests.Response) assert response.text == TEST_TEXT_RESPONSE_DATA - def test_successful_json_load_with_response(self, requests_mocker, method): + def test_json_in_request(self, requests_mock): api_data_set = APIDataSet( url=TEST_URL, - method=method, - json=TEST_JSON_RESPONSE_DATA, - headers=TEST_HEADERS, + method=TEST_METHOD, + load_args={"json": TEST_JSON_REQUEST_DATA}, ) - requests_mocker.register_uri( - method, + requests_mock.register_uri(TEST_METHOD, TEST_URL) + + response = api_data_set.load() + assert response.request.json() == TEST_JSON_REQUEST_DATA + + def test_headers_in_request(self, requests_mock): + api_data_set = APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"headers": TEST_HEADERS} + ) + requests_mock.register_uri(TEST_METHOD, TEST_URL, headers={"pan": "cake"}) + + response = api_data_set.load() + + assert response.request.headers["key"] == "value" + assert response.headers["pan"] == "cake" + + def test_api_cookies(self, requests_mock): + api_data_set = APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"cookies": {"pan": "cake"}} + ) + requests_mock.register_uri(TEST_METHOD, TEST_URL, text="text") + + response = api_data_set.load() + assert response.request.headers["Cookie"] == "pan=cake" + + def test_credentials_auth_error(self): + """ + If ``auth`` in ``load_args`` and ``credentials`` are both provided, + the constructor should raise a ValueError. 
+ """ + with pytest.raises(ValueError, match="both auth and credentials"): + APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"auth": []}, credentials={} + ) + + @staticmethod + def _basic_auth(username, password): + encoded = base64.b64encode(f"{username}:{password}".encode("latin-1")) + return f"Basic {encoded.decode('latin-1')}" + + @pytest.mark.parametrize( + "auth_kwarg", + [ + {"load_args": {"auth": ("john", "doe")}}, + {"load_args": {"auth": ["john", "doe"]}}, + {"load_args": {"auth": HTTPBasicAuth("john", "doe")}}, + {"credentials": ("john", "doe")}, + {"credentials": ["john", "doe"]}, + {"credentials": HTTPBasicAuth("john", "doe")}, + ], + ) + def test_auth_sequence(self, requests_mock, auth_kwarg): + api_data_set = APIDataSet(url=TEST_URL, method=TEST_METHOD, **auth_kwarg) + requests_mock.register_uri( + TEST_METHOD, TEST_URL, - headers=TEST_HEADERS, - text=json.dumps(TEST_JSON_RESPONSE_DATA), + text=TEST_TEXT_RESPONSE_DATA, ) response = api_data_set.load() assert isinstance(response, requests.Response) - assert response.json() == TEST_JSON_RESPONSE_DATA + assert response.request.headers["Authorization"] == TestAPIDataSet._basic_auth( + "john", "doe" + ) + assert response.text == TEST_TEXT_RESPONSE_DATA - def test_http_error(self, requests_mocker, method): + @pytest.mark.parametrize( + "timeout_in, timeout_out", + [ + (1, 1), + ((1, 2), (1, 2)), + ([1, 2], (1, 2)), + ], + ) + def test_api_timeout(self, requests_mock, timeout_in, timeout_out): api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url=TEST_URL, method=TEST_METHOD, load_args={"timeout": timeout_in} ) - requests_mocker.register_uri( - method, - TEST_URL_WITH_PARAMS, - headers=TEST_HEADERS, - text="Nope, not found", - status_code=requests.codes.FORBIDDEN, + requests_mock.register_uri(TEST_METHOD, TEST_URL) + response = api_data_set.load() + assert response.request.timeout == timeout_out + + def test_stream(self, requests_mock): + text = "I am being streamed." + + api_data_set = APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"stream": True} ) - with pytest.raises(DataSetError, match="Failed to fetch data"): - api_data_set.load() + requests_mock.register_uri(TEST_METHOD, TEST_URL, text=text) + + response = api_data_set.load() + assert isinstance(response, requests.Response) + assert response.request.stream + + chunks = list(response.iter_content(chunk_size=2, decode_unicode=True)) + assert chunks == ["I ", "am", " b", "ei", "ng", " s", "tr", "ea", "me", "d."] - def test_socket_error(self, requests_mocker, method): + def test_proxy(self, requests_mock): api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url="ftp://example.com/api/test", + method=TEST_METHOD, + load_args={"proxies": {"ftp": "ftp://127.0.0.1:3000"}}, + ) + requests_mock.register_uri( + TEST_METHOD, + "ftp://example.com/api/test", ) - requests_mocker.register_uri(method, TEST_URL_WITH_PARAMS, exc=socket.error) - with pytest.raises(DataSetError, match="Failed to connect"): - api_data_set.load() + response = api_data_set.load() + assert response.request.proxies.get("ftp") == "ftp://127.0.0.1:3000" - def test_read_only_mode(self, method): - """ - Saving is disabled on the data set. 
- """ - api_data_set = APIDataSet(url=TEST_URL, method=method) - with pytest.raises(DataSetError, match="is a read only data set type"): - api_data_set.save({}) + @pytest.mark.parametrize( + "cert_in, cert_out", + [ + (("cert.pem", "privkey.pem"), ("cert.pem", "privkey.pem")), + (["cert.pem", "privkey.pem"], ("cert.pem", "privkey.pem")), + ("some/path/to/file.pem", "some/path/to/file.pem"), + (None, None), + ], + ) + def test_certs(self, requests_mock, cert_in, cert_out): + api_data_set = APIDataSet( + url=TEST_URL, method=TEST_METHOD, load_args={"cert": cert_in} + ) + requests_mock.register_uri(TEST_METHOD, TEST_URL) - def test_exists_http_error(self, requests_mocker, method): + response = api_data_set.load() + assert response.request.cert == cert_out + + def test_exists_http_error(self, requests_mock): """ In case of an unexpected HTTP error, ``exists()`` should not silently catch it. """ api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url=TEST_URL, + method=TEST_METHOD, + load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, ) - requests_mocker.register_uri( - method, + requests_mock.register_uri( + TEST_METHOD, TEST_URL_WITH_PARAMS, headers=TEST_HEADERS, text="Nope, not found", @@ -111,16 +205,18 @@ def test_exists_http_error(self, requests_mocker, method): with pytest.raises(DataSetError, match="Failed to fetch data"): api_data_set.exists() - def test_exists_ok(self, requests_mocker, method): + def test_exists_ok(self, requests_mock): """ If the file actually exists and server responds 200, ``exists()`` should return True """ api_data_set = APIDataSet( - url=TEST_URL, method=method, params=TEST_PARAMS, headers=TEST_HEADERS + url=TEST_URL, + method=TEST_METHOD, + load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, ) - requests_mocker.register_uri( - method, + requests_mock.register_uri( + TEST_METHOD, TEST_URL_WITH_PARAMS, headers=TEST_HEADERS, text=TEST_TEXT_RESPONSE_DATA, @@ -128,43 +224,38 @@ def test_exists_ok(self, requests_mocker, method): assert api_data_set.exists() - def test_credentials_auth_error(self, method): - """ - If ``auth`` and ``credentials`` are both provided, - the constructor should raise a ValueError. - """ - with pytest.raises(ValueError, match="both auth and credentials"): - APIDataSet(url=TEST_URL, method=method, auth=[], credentials=[]) - - @pytest.mark.parametrize("auth_kwarg", ["auth", "credentials"]) - @pytest.mark.parametrize( - "auth_seq", - [ - ("username", "password"), - ["username", "password"], - (e for e in ["username", "password"]), # Generator. - ], - ) - def test_auth_sequence(self, requests_mocker, method, auth_seq, auth_kwarg): - """ - ``auth`` and ``credentials`` should be able to be any Iterable. 
- """ - kwargs = { - "url": TEST_URL, - "method": method, - "params": TEST_PARAMS, - "headers": TEST_HEADERS, - auth_kwarg: auth_seq, - } - - api_data_set = APIDataSet(**kwargs) - requests_mocker.register_uri( - method, + def test_http_error(self, requests_mock): + api_data_set = APIDataSet( + url=TEST_URL, + method=TEST_METHOD, + load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, + ) + requests_mock.register_uri( + TEST_METHOD, TEST_URL_WITH_PARAMS, headers=TEST_HEADERS, - text=TEST_TEXT_RESPONSE_DATA, + text="Nope, not found", + status_code=requests.codes.FORBIDDEN, ) - response = api_data_set.load() - assert isinstance(response, requests.Response) - assert response.text == TEST_TEXT_RESPONSE_DATA + with pytest.raises(DataSetError, match="Failed to fetch data"): + api_data_set.load() + + def test_socket_error(self, requests_mock): + api_data_set = APIDataSet( + url=TEST_URL, + method=TEST_METHOD, + load_args={"params": TEST_PARAMS, "headers": TEST_HEADERS}, + ) + requests_mock.register_uri(TEST_METHOD, TEST_URL_WITH_PARAMS, exc=socket.error) + + with pytest.raises(DataSetError, match="Failed to connect"): + api_data_set.load() + + def test_read_only_mode(self): + """ + Saving is disabled on the data set. + """ + api_data_set = APIDataSet(url=TEST_URL, method=TEST_METHOD) + with pytest.raises(DataSetError, match="is a read only data set type"): + api_data_set.save({}) From f3e361a603c50b567190523909e279413ba8845a Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Tue, 25 Apr 2023 10:39:25 +0100 Subject: [PATCH 62/74] [kedro-datasets] Bump version of `tables` in `test_requirements.txt` (#182) * bump tables version and remove step in workflow Signed-off-by: Ankita Katiyar * revert version for linux Signed-off-by: Ankita Katiyar * change version to 3.7 Signed-off-by: Ankita Katiyar * remove extra line Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar Signed-off-by: Danny Farah --- .github/workflows/check-plugin.yml | 3 --- kedro-datasets/test_requirements.txt | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/check-plugin.yml b/.github/workflows/check-plugin.yml index a32c0f651..4a3cf8827 100644 --- a/.github/workflows/check-plugin.yml +++ b/.github/workflows/check-plugin.yml @@ -43,9 +43,6 @@ jobs: run: | cd ${{ inputs.plugin }} pip install -r test_requirements.txt - - name: Install pytables (only for kedro-datasets on windows) - if: matrix.os == 'windows-latest' && inputs.plugin == 'kedro-datasets' - run: pip install tables - name: pip freeze run: pip freeze - name: Run unit tests for Linux / all plugins diff --git a/kedro-datasets/test_requirements.txt b/kedro-datasets/test_requirements.txt index 29b4c0d09..d2231136b 100644 --- a/kedro-datasets/test_requirements.txt +++ b/kedro-datasets/test_requirements.txt @@ -55,8 +55,7 @@ scikit-learn~=1.0.2 scipy~=1.7.3 snowflake-snowpark-python~=1.0.0; python_version == '3.8' SQLAlchemy>=1.4, <3.0 # The `Inspector.has_table()` method replaces the `Engine.has_table()` method in version 1.4. 
-tables~=3.6.0; platform_system == "Windows" and python_version < '3.9' -tables~=3.6; platform_system != "Windows" +tables~=3.7 tensorflow-macos~=2.0; platform_system == "Darwin" and platform_machine == "arm64" tensorflow~=2.0; platform_system != "Darwin" or platform_machine != "arm64" triad>=0.6.7, <1.0 From 99e3a4189b38b9c25bc406a5ecfdf61e13810830 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Wed, 26 Apr 2023 05:56:47 -0400 Subject: [PATCH 63/74] ci: ensure title matches Conventional Commits spec (#187) * Create validate-pr-title.yaml * ci: add `ready_for_review` to the PR type triggers * Update validate-pr-title.yaml * revert: drop the `ready_for_review` type from list * ci: restrict the set of scopes to the plugin names Signed-off-by: Danny Farah --- .github/workflows/validate-pr-title.yaml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/validate-pr-title.yaml diff --git a/.github/workflows/validate-pr-title.yaml b/.github/workflows/validate-pr-title.yaml new file mode 100644 index 000000000..b6e6fc808 --- /dev/null +++ b/.github/workflows/validate-pr-title.yaml @@ -0,0 +1,23 @@ +name: Lint PR + +on: + pull_request: + types: + - opened + - edited + - synchronize + +jobs: + main: + name: Validate PR title + runs-on: ubuntu-latest + steps: + - uses: amannn/action-semantic-pull-request@v5 + with: + scopes: | + airflow + datasets + docker + telemetry + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From fdd205c0b1ecde3dd42237b7edd3a4d29f1eb385 Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+merelcht@users.noreply.github.com> Date: Wed, 26 Apr 2023 15:21:59 +0100 Subject: [PATCH 64/74] Use PEP 526 syntax for variable type annotations (#190) Signed-off-by: Merel Theisen Signed-off-by: Danny Farah --- .../kedro_datasets/biosequence/biosequence_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/dask/parquet_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/email/message_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py | 2 +- kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py | 2 +- kedro-datasets/kedro_datasets/json/json_dataset.py | 2 +- .../kedro_datasets/matplotlib/matplotlib_writer.py | 2 +- kedro-datasets/kedro_datasets/networkx/gml_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/networkx/graphml_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/networkx/json_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/pandas/csv_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/pandas/feather_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/pandas/gbq_dataset.py | 6 +++--- kedro-datasets/kedro_datasets/pandas/generic_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/pandas/hdf_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/pandas/json_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/pandas/parquet_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/pandas/sql_dataset.py | 2 +- kedro-datasets/kedro_datasets/pandas/xml_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/pickle/pickle_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/pillow/image_dataset.py | 2 +- kedro-datasets/kedro_datasets/plotly/json_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/polars/csv_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/redis/redis_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/spark/spark_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py | 2 +- kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py 
| 4 ++-- kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py | 4 ++-- .../kedro_datasets/tensorflow/tensorflow_model_dataset.py | 4 ++-- kedro-datasets/kedro_datasets/yaml/yaml_dataset.py | 2 +- 31 files changed, 55 insertions(+), 55 deletions(-) diff --git a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py index 7c45743da..e9dd924a6 100644 --- a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py +++ b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py @@ -36,8 +36,8 @@ class BioSequenceDataSet(AbstractDataSet[List, List]): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index f02144892..f3c00e265 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -83,8 +83,8 @@ class ParquetDataSet(AbstractDataSet[dd.DataFrame, dd.DataFrame]): col3: [[int32]] """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {"write_index": False} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {"write_index": False} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/email/message_dataset.py b/kedro-datasets/kedro_datasets/email/message_dataset.py index 0b8623f63..2faf3bb5d 100644 --- a/kedro-datasets/kedro_datasets/email/message_dataset.py +++ b/kedro-datasets/kedro_datasets/email/message_dataset.py @@ -52,8 +52,8 @@ class EmailMessageDataSet( """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py index ba9237909..75a9f8357 100644 --- a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py +++ b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py @@ -44,7 +44,7 @@ class GeoJSONDataSet( """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} DEFAULT_SAVE_ARGS = {"driver": "GeoJSON"} # pylint: disable=too-many-arguments diff --git a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py index 7f61909b9..9a17dbe7b 100644 --- a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py +++ b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py @@ -37,7 +37,7 @@ class HoloviewsWriter(AbstractVersionedDataSet[HoloViews, NoReturn]): """ - DEFAULT_SAVE_ARGS = {"fmt": "png"} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS: Dict[str, Any] = {"fmt": "png"} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index a39e7aff5..53239ece3 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -49,7 +49,7 @@ class JSONDataSet(AbstractVersionedDataSet[Any, Any]): """ - DEFAULT_SAVE_ARGS = {"indent": 2} # type: Dict[str, 
Any] + DEFAULT_SAVE_ARGS: Dict[str, Any] = {"indent": 2} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py index 3fc396cb1..d7aaf6a02 100644 --- a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py +++ b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py @@ -104,7 +104,7 @@ class MatplotlibWriter( """ - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py index bc8d4f86f..4dd88cb22 100644 --- a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py @@ -36,8 +36,8 @@ class GMLDataSet(AbstractVersionedDataSet[networkx.Graph, networkx.Graph]): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py index 2105fb67f..ca12b6bae 100644 --- a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py @@ -35,8 +35,8 @@ class GraphMLDataSet(AbstractVersionedDataSet[networkx.Graph, networkx.Graph]): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/networkx/json_dataset.py b/kedro-datasets/kedro_datasets/networkx/json_dataset.py index 8cc436721..3fdf9f253 100644 --- a/kedro-datasets/kedro_datasets/networkx/json_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/json_dataset.py @@ -36,8 +36,8 @@ class JSONDataSet(AbstractVersionedDataSet[networkx.Graph, networkx.Graph]): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 3c65d49b5..336aff406 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -65,8 +65,8 @@ class CSVDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {"index": False} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {"index": False} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index 1116d4168..be261d42a 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -65,8 +65,8 @@ class FeatherDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + 
DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index c0122a6c0..ebfadf249 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -62,8 +62,8 @@ class GBQTableDataSet(AbstractDataSet[None, pd.DataFrame]): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {"progress_bar": False} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {"progress_bar": False} # pylint: disable=too-many-arguments def __init__( @@ -203,7 +203,7 @@ class GBQQueryDataSet(AbstractDataSet[None, pd.DataFrame]): >>> """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index d9702c7b8..91229edcf 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -81,8 +81,8 @@ class GenericDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index f11fe320f..b821f17da 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -56,8 +56,8 @@ class HDFDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): # _lock is a class attribute that will be shared across all the instances. # It is used to make dataset safe for threads. 
_lock = Lock() - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index d29ef57bd..76d1cca0a 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -60,8 +60,8 @@ class JSONDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index acb478bd9..b41d468c3 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -71,8 +71,8 @@ class ParquetDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py index 489e2627a..029dc6939 100644 --- a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py @@ -433,7 +433,7 @@ def __init__( # pylint: disable=too-many-arguments "provide a SQLAlchemy connection string." 
) - default_load_args = {} # type: Dict[str, Any] + default_load_args: Dict[str, Any] = {} self._load_args = ( {**default_load_args, **load_args} diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index ca8fc0dd2..59f96e441 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -43,8 +43,8 @@ class XMLDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {"index": False} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {"index": False} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index 436fba29a..11ee512c1 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -68,8 +68,8 @@ class PickleDataSet(AbstractVersionedDataSet[Any, Any]): >>> assert data.equals(reloaded) """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments,too-many-locals def __init__( diff --git a/kedro-datasets/kedro_datasets/pillow/image_dataset.py b/kedro-datasets/kedro_datasets/pillow/image_dataset.py index ca939b722..6dd94635e 100644 --- a/kedro-datasets/kedro_datasets/pillow/image_dataset.py +++ b/kedro-datasets/kedro_datasets/pillow/image_dataset.py @@ -33,7 +33,7 @@ class ImageDataSet(AbstractVersionedDataSet[Image.Image, Image.Image]): """ - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/plotly/json_dataset.py b/kedro-datasets/kedro_datasets/plotly/json_dataset.py index f819dd338..f99fe8ac4 100644 --- a/kedro-datasets/kedro_datasets/plotly/json_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/json_dataset.py @@ -49,8 +49,8 @@ class JSONDataSet( >>> assert fig == reloaded """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index 60a0d456a..6bbc721c4 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -63,8 +63,8 @@ class CSVDataSet(AbstractVersionedDataSet[pl.DataFrame, pl.DataFrame]): """ - DEFAULT_LOAD_ARGS = {"rechunk": True} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {"rechunk": True} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index 6d2f80df9..ce5aa741f 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -56,8 +56,8 @@ class PickleDataSet(AbstractDataSet[Any, Any]): """ DEFAULT_REDIS_URL = os.getenv("REDIS_URL", "redis://127.0.0.1:6379") - DEFAULT_LOAD_ARGS = {} # type: 
Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py index c9587d6fe..e1adc50c0 100644 --- a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py +++ b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py @@ -97,8 +97,8 @@ class SnowparkTableDataSet(AbstractDataSet): # for parallelism within a pipeline please consider # ``ThreadRunner`` instead _SINGLE_PROCESS = True - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} def __init__( # pylint: disable=too-many-arguments self, diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index 166c921cc..a0d099350 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -233,8 +233,8 @@ class SparkDataSet(AbstractVersionedDataSet[DataFrame, DataFrame]): # for parallelism within a Spark pipeline please consider # ``ThreadRunner`` instead _SINGLE_PROCESS = True - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} def __init__( # pylint: disable=too-many-arguments self, diff --git a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py index a676dd784..3ea2fb0a1 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py @@ -63,7 +63,7 @@ class SparkHiveDataSet(AbstractDataSet[DataFrame, DataFrame]): >>> reloaded.take(4) """ - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint:disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index dcb4185e7..c90c5f958 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -63,8 +63,8 @@ class SparkJDBCDataSet(AbstractDataSet[DataFrame, DataFrame]): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py index f909c1976..c08555aa1 100644 --- a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py +++ b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py @@ -86,8 +86,8 @@ class SVMLightDataSet(AbstractVersionedDataSet[_DI, _DO]): """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index 544aadb06..1395297e9 100644 --- 
a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -60,8 +60,8 @@ class TensorFlowModelDataset(AbstractVersionedDataSet[tf.keras.Model, tf.keras.M """ - DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] - DEFAULT_SAVE_ARGS = {"save_format": "tf"} # type: Dict[str, Any] + DEFAULT_LOAD_ARGS: Dict[str, Any] = {} + DEFAULT_SAVE_ARGS: Dict[str, Any] = {"save_format": "tf"} # pylint: disable=too-many-arguments def __init__( diff --git a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py index f2a3c2696..a576f439a 100644 --- a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py +++ b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py @@ -46,7 +46,7 @@ class YAMLDataSet(AbstractVersionedDataSet[Dict, Dict]): """ - DEFAULT_SAVE_ARGS = {"default_flow_style": False} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS: Dict[str, Any] = {"default_flow_style": False} # pylint: disable=too-many-arguments def __init__( From c0dd796b8bc5602549d81578a363d19e77ae6f0c Mon Sep 17 00:00:00 2001 From: Brian Cechmanek Date: Fri, 28 Apr 2023 12:22:36 +0100 Subject: [PATCH 65/74] fix(datasets): Refactor TensorFlowModelDataset to DataSet (#186) * refactor TensorFlowModelDataset to Set matching consistency of all other kedro-datasets, DataSet should be camelcase. will be reverted in 0.19.0 Signed-off-by: BrianCechmanek * Introdcuing .gitpod.yml to kedro-plugins (#185) Currently opening gitpod will installed a Python 3.11 which breaks everything because we don't support it set. This PR introduce a simple .gitpod.yml to get it started. Signed-off-by: BrianCechmanek * sync APIDataSet from kedro's `develop` (#184) * Update APIDataSet Signed-off-by: Nok Chan * Sync ParquetDataSet Signed-off-by: Nok Chan * Sync Test Signed-off-by: Nok Chan * Linting Signed-off-by: Nok Chan * Revert Unnecessary ParquetDataSet Changes Signed-off-by: Nok Chan * Sync release notes Signed-off-by: Nok Chan --------- Signed-off-by: Nok Chan Signed-off-by: BrianCechmanek * [kedro-datasets] Bump version of `tables` in `test_requirements.txt` (#182) * bump tables version and remove step in workflow Signed-off-by: Ankita Katiyar * revert version for linux Signed-off-by: Ankita Katiyar * change version to 3.7 Signed-off-by: Ankita Katiyar * remove extra line Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar Signed-off-by: BrianCechmanek * refactor tensorflowModelDataset casing in datasets setup.py Signed-off-by: BrianCechmanek * add tensorflowmodeldataset bugfix to release.md Signed-off-by: BrianCechmanek * Update all the doc reference with TensorFlowModelDataSet Signed-off-by: Nok --------- Signed-off-by: BrianCechmanek Signed-off-by: Nok Chan Signed-off-by: Ankita Katiyar Signed-off-by: Nok Co-authored-by: Nok Lam Chan Co-authored-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Co-authored-by: Nok Signed-off-by: Danny Farah --- kedro-datasets/RELEASE.md | 7 +++++ .../kedro_datasets/tensorflow/README.md | 8 ++--- .../kedro_datasets/tensorflow/__init__.py | 4 +-- .../tensorflow/tensorflow_model_dataset.py | 14 ++++----- kedro-datasets/setup.py | 2 +- .../test_tensorflow_model_dataset.py | 30 +++++++++---------- 6 files changed, 36 insertions(+), 29 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index e1185b54d..ddc06407c 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -7,6 +7,13 @@ ## Bug fixes and other changes * Relaxed 
`delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. +# Release 1.2.1: + +## Major features and improvements: + +## Bug fixes and other changes +* Renamed `TensorFlowModelDataset` to `TensorFlowModelDataSet` to be consistent with all other plugins in kedro-datasets. + # Release 1.2.0: ## Major features and improvements: diff --git a/kedro-datasets/kedro_datasets/tensorflow/README.md b/kedro-datasets/kedro_datasets/tensorflow/README.md index eba54cd7c..8b682094d 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/README.md +++ b/kedro-datasets/kedro_datasets/tensorflow/README.md @@ -1,4 +1,4 @@ -# TensorFlowModelDataset +# TensorFlowModelDataSet ``TensorflowModelDataset`` loads and saves TensorFlow models. The underlying functionality is supported by, and passes input arguments to TensorFlow 2.X load_model and save_model methods. Only TF2 is currently supported for saving and loading, V1 requires HDF5 and serialises differently. @@ -8,9 +8,9 @@ The underlying functionality is supported by, and passes input arguments to Tens import numpy as np import tensorflow as tf -from kedro_datasets.tensorflow import TensorFlowModelDataset +from kedro_datasets.tensorflow import TensorFlowModelDataSet -data_set = TensorFlowModelDataset("tf_model_dirname") +data_set = TensorFlowModelDataSet("tf_model_dirname") model = tf.keras.Model() predictions = model.predict([...]) @@ -25,7 +25,7 @@ np.testing.assert_allclose(predictions, new_predictions, rtol=1e-6, atol=1e-6) #### Example catalog.yml: ```yaml example_tensorflow_data: - type: tensorflow.TensorFlowModelDataset + type: tensorflow.TensorFlowModelDataSet filepath: data/08_reporting/tf_model_dirname load_args: tf_device: "/CPU:0" # optional diff --git a/kedro-datasets/kedro_datasets/tensorflow/__init__.py b/kedro-datasets/kedro_datasets/tensorflow/__init__.py index 20e1311de..6a13f9fe4 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/__init__.py +++ b/kedro-datasets/kedro_datasets/tensorflow/__init__.py @@ -1,8 +1,8 @@ """Provides I/O for TensorFlow Models.""" -__all__ = ["TensorFlowModelDataset"] +__all__ = ["TensorFlowModelDataSet"] from contextlib import suppress with suppress(ImportError): - from .tensorflow_model_dataset import TensorFlowModelDataset + from .tensorflow_model_dataset import TensorFlowModelDataSet diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index 1395297e9..42b550737 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -1,4 +1,4 @@ -"""``TensorflowModelDataset`` is a data set implementation which can save and load +"""``TensorFlowModelDataSet`` is a data set implementation which can save and load TensorFlow models. """ import copy @@ -19,8 +19,8 @@ TEMPORARY_H5_FILE = "tmp_tensorflow_model.h5" -class TensorFlowModelDataset(AbstractVersionedDataSet[tf.keras.Model, tf.keras.Model]): - """``TensorflowModelDataset`` loads and saves TensorFlow models. +class TensorFlowModelDataSet(AbstractVersionedDataSet[tf.keras.Model, tf.keras.Model]): + """``TensorFlowModelDataSet`` loads and saves TensorFlow models. The underlying functionality is supported by, and passes input arguments through to, TensorFlow 2.X load_model and save_model methods. @@ -31,7 +31,7 @@ class TensorFlowModelDataset(AbstractVersionedDataSet[tf.keras.Model, tf.keras.M .. 
code-block:: yaml tensorflow_model: - type: tensorflow.TensorFlowModelDataset + type: tensorflow.TensorFlowModelDataSet filepath: data/06_models/tensorflow_model.h5 load_args: compile: False @@ -45,11 +45,11 @@ class TensorFlowModelDataset(AbstractVersionedDataSet[tf.keras.Model, tf.keras.M data_catalog.html#use-the-data-catalog-with-the-code-api>`_: :: - >>> from kedro_datasets.tensorflow import TensorFlowModelDataset + >>> from kedro_datasets.tensorflow import TensorFlowModelDataSet >>> import tensorflow as tf >>> import numpy as np >>> - >>> data_set = TensorFlowModelDataset("data/06_models/tensorflow_model.h5") + >>> data_set = TensorFlowModelDataSet("data/06_models/tensorflow_model.h5") >>> model = tf.keras.Model() >>> predictions = model.predict([...]) >>> @@ -73,7 +73,7 @@ def __init__( credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, ) -> None: - """Creates a new instance of ``TensorFlowModelDataset``. + """Creates a new instance of ``TensorFlowModelDataSet``. Args: filepath: Filepath in POSIX format to a TensorFlow model directory prefixed with a diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py index 3733c68af..3f3558bf1 100644 --- a/kedro-datasets/setup.py +++ b/kedro-datasets/setup.py @@ -64,7 +64,7 @@ def _collect_requirements(requires): } svmlight_require = {"svmlight.SVMLightDataSet": ["scikit-learn~=1.0.2", "scipy~=1.7.3"]} tensorflow_require = { - "tensorflow.TensorflowModelDataset": [ + "tensorflow.TensorFlowModelDataSet": [ # currently only TensorFlow V2 supported for saving and loading. # V1 requires HDF5 and serialises differently "tensorflow~=2.0; platform_system != 'Darwin' or platform_machine != 'arm64'", diff --git a/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py b/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py index 26d421853..b469820a3 100644 --- a/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py +++ b/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py @@ -11,7 +11,7 @@ from s3fs import S3FileSystem -# In this test module, we wrap tensorflow and TensorFlowModelDataset imports into a module-scoped +# In this test module, we wrap tensorflow and TensorFlowModelDataSet imports into a module-scoped # fixtures to avoid them being evaluated immediately when a new test process is spawned. # Specifically: # - ParallelRunner spawns a new subprocess. 
@@ -34,9 +34,9 @@ def tf(): @pytest.fixture(scope="module") def tensorflow_model_dataset(): - from kedro_datasets.tensorflow import TensorFlowModelDataset + from kedro_datasets.tensorflow import TensorFlowModelDataSet - return TensorFlowModelDataset + return TensorFlowModelDataSet @pytest.fixture @@ -134,7 +134,7 @@ def call(self, inputs, training=None, mask=None): # pragma: no cover return model -class TestTensorFlowModelDataset: +class TestTensorFlowModelDataSet: """No versioning passed to creator""" def test_save_and_load(self, tf_model_dataset, dummy_tf_base_model, dummy_x_test): @@ -152,7 +152,7 @@ def test_save_and_load(self, tf_model_dataset, dummy_tf_base_model, dummy_x_test def test_load_missing_model(self, tf_model_dataset): """Test error message when trying to load missing model.""" pattern = ( - r"Failed while loading data from data set TensorFlowModelDataset\(.*\)" + r"Failed while loading data from data set TensorFlowModelDataSet\(.*\)" ) with pytest.raises(DataSetError, match=pattern): tf_model_dataset.load() @@ -166,7 +166,7 @@ def test_exists(self, tf_model_dataset, dummy_tf_base_model): def test_hdf5_save_format( self, dummy_tf_base_model, dummy_x_test, filepath, tensorflow_model_dataset ): - """Test TensorflowModelDataset can save TF graph models in HDF5 format""" + """Test TensorFlowModelDataSet can save TF graph models in HDF5 format""" hdf5_dataset = tensorflow_model_dataset( filepath=filepath, save_args={"save_format": "h5"} ) @@ -187,7 +187,7 @@ def test_unused_subclass_model_hdf5_save_format( filepath, tensorflow_model_dataset, ): - """Test TensorflowModelDataset cannot save subclassed user models in HDF5 format + """Test TensorFlowModelDataSet cannot save subclassed user models in HDF5 format Subclassed model @@ -277,8 +277,8 @@ def test_save_and_overwrite_existing_model( assert len(dummy_tf_base_model_new.layers) == len(reloaded.layers) -class TestTensorFlowModelDatasetVersioned: - """Test suite with versioning argument passed into TensorFlowModelDataset creator""" +class TestTensorFlowModelDataSetVersioned: + """Test suite with versioning argument passed into TensorFlowModelDataSet creator""" @pytest.mark.parametrize( "load_version,save_version", @@ -320,7 +320,7 @@ def test_hdf5_save_format( load_version, save_version, ): - """Test versioned TensorflowModelDataset can save TF graph models in + """Test versioned TensorFlowModelDataSet can save TF graph models in HDF5 format""" hdf5_dataset = tensorflow_model_dataset( filepath=filepath, @@ -340,7 +340,7 @@ def test_prevent_overwrite(self, dummy_tf_base_model, versioned_tf_model_dataset corresponding file for a given save version already exists.""" versioned_tf_model_dataset.save(dummy_tf_base_model) pattern = ( - r"Save path \'.+\' for TensorFlowModelDataset\(.+\) must " + r"Save path \'.+\' for TensorFlowModelDataSet\(.+\) must " r"not exist if versioning is enabled\." 
) with pytest.raises(DataSetError, match=pattern): @@ -362,7 +362,7 @@ def test_save_version_warning( the subsequent load path.""" pattern = ( rf"Save version '{save_version}' did not match load version '{load_version}' " - rf"for TensorFlowModelDataset\(.+\)" + rf"for TensorFlowModelDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): versioned_tf_model_dataset.save(dummy_tf_base_model) @@ -383,7 +383,7 @@ def test_exists(self, versioned_tf_model_dataset, dummy_tf_base_model): def test_no_versions(self, versioned_tf_model_dataset): """Check the error if no versions are available for load.""" - pattern = r"Did not find any versions for TensorFlowModelDataset\(.+\)" + pattern = r"Did not find any versions for TensorFlowModelDataSet\(.+\)" with pytest.raises(DataSetError, match=pattern): versioned_tf_model_dataset.load() @@ -408,7 +408,7 @@ def test_versioning_existing_dataset( self, tf_model_dataset, versioned_tf_model_dataset, dummy_tf_base_model ): """Check behavior when attempting to save a versioned dataset on top of an - already existing (non-versioned) dataset. Note: because TensorFlowModelDataset + already existing (non-versioned) dataset. Note: because TensorFlowModelDataSet saves to a directory even if non-versioned, an error is not expected.""" tf_model_dataset.save(dummy_tf_base_model) assert tf_model_dataset.exists() @@ -425,7 +425,7 @@ def test_save_and_load_with_device( load_version, save_version, ): - """Test versioned TensorflowModelDataset can load models using an explicit tf_device""" + """Test versioned TensorFlowModelDataSet can load models using an explicit tf_device""" hdf5_dataset = tensorflow_model_dataset( filepath=filepath, load_args={"tf_device": "/CPU:0"}, From 074429cea151adfe238d377e4d09894197995a36 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 3 May 2023 11:28:10 -0700 Subject: [PATCH 66/74] Update kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py Co-authored-by: Jannic <37243923+jmholzer@users.noreply.github.com> Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/managed_table_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 6266bfc35..7d0490d45 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -37,7 +37,7 @@ class ManagedTable: # pylint: disable=R0902 table: str write_mode: str dataframe_type: str - primary_key: str + primary_key: Optional[str] owner_group: str partition_columns: Union[str, List[str]] json_schema: StructType From c0bb229d860572c397e5c9a1d225fc6430feb7d2 Mon Sep 17 00:00:00 2001 From: Danny Farah Date: Wed, 3 May 2023 11:29:37 -0700 Subject: [PATCH 67/74] adding backticks to catalog Signed-off-by: Danny Farah --- .../kedro_datasets/databricks/managed_table_dataset.py | 4 ++-- .../tests/databricks/test_managed_table_dataset.py | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 7d0490d45..5b9e83e57 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -6,7 +6,7 @@ from dataclasses import dataclass from functools import partial from operator import attrgetter -from typing 
import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union import pandas as pd from cachetools import Cache, cachedmethod @@ -439,7 +439,7 @@ def _exists(self) -> bool: """ if self._table.catalog: try: - self._get_spark().sql(f"USE CATALOG {self._table.catalog}") + self._get_spark().sql(f"USE CATALOG `{self._table.catalog}`") except (ParseException, AnalysisException) as exc: logger.warning( "catalog %s not found or unity not enabled. Error message: %s", diff --git a/kedro-datasets/tests/databricks/test_managed_table_dataset.py b/kedro-datasets/tests/databricks/test_managed_table_dataset.py index 000aa8d6e..fbdbaaebc 100644 --- a/kedro-datasets/tests/databricks/test_managed_table_dataset.py +++ b/kedro-datasets/tests/databricks/test_managed_table_dataset.py @@ -174,6 +174,11 @@ def test_full_table(self): unity_ds = ManagedTableDataSet(catalog="test", database="test", table="test") assert unity_ds._table.full_table_location() == "`test`.`test`.`test`" + unity_ds = ManagedTableDataSet( + catalog="test-test", database="test", table="test" + ) + assert unity_ds._table.full_table_location() == "`test-test`.`test`.`test`" + unity_ds = ManagedTableDataSet(database="test", table="test") assert unity_ds._table.full_table_location() == "`test`.`test`" @@ -192,7 +197,7 @@ def test_describe(self): "write_mode": "overwrite", "dataframe_type": "spark", "primary_key": None, - "version": None, + "version": "None", "owner_group": None, "partition_columns": None, } From fa52c47461233017095720f943e9c32cafa0cc52 Mon Sep 17 00:00:00 2001 From: Jannic Holzer Date: Thu, 4 May 2023 14:49:19 +0100 Subject: [PATCH 68/74] Require pandas < 2.0 for compatibility with spark < 3.4 Signed-off-by: Jannic Holzer --- kedro-datasets/.gitignore | 2 +- kedro-datasets/test_requirements.txt | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kedro-datasets/.gitignore b/kedro-datasets/.gitignore index 3725bd847..721e13f70 100644 --- a/kedro-datasets/.gitignore +++ b/kedro-datasets/.gitignore @@ -147,4 +147,4 @@ docs/tmp-build-artifacts docs/build spark-warehouse metastore_db/ -derby.log \ No newline at end of file +derby.log diff --git a/kedro-datasets/test_requirements.txt b/kedro-datasets/test_requirements.txt index d2231136b..a35bfdf44 100644 --- a/kedro-datasets/test_requirements.txt +++ b/kedro-datasets/test_requirements.txt @@ -24,15 +24,15 @@ lxml~=4.6 matplotlib>=3.0.3, <3.4; python_version < '3.10' # 3.4.0 breaks holoviews matplotlib>=3.5, <3.6; python_version == '3.10' memory_profiler>=0.50.0, <1.0 -mlflow~=2.2.1; python_version>='3.8' mlflow~=1.30.0; python_version=='3.7' +mlflow~=2.2.1; python_version>='3.8' moto==1.3.7; python_version < '3.10' moto==3.0.4; python_version == '3.10' networkx~=2.4 opencv-python~=4.5.5.64 openpyxl>=3.0.3, <4.0 pandas-gbq>=0.12.0, <0.18.0 -pandas>=1.3 # 1.3 for read_xml/to_xml +pandas>=1.3, <2 # 1.3 for read_xml/to_xml, <2 for compatibility with Spark < 3.4 Pillow~=9.0 plotly>=4.8.0, <6.0 polars~=0.15.13 From 5b0b84b8492cb0b035cb91d416c1c8ef86a454db Mon Sep 17 00:00:00 2001 From: Jannic Holzer Date: Thu, 4 May 2023 15:24:40 +0100 Subject: [PATCH 69/74] Replace use of walrus operator Signed-off-by: Jannic Holzer --- .../kedro_datasets/databricks/managed_table_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 5b9e83e57..68f14ebc9 100644 --- 
a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -50,7 +50,8 @@ def __post_init__(self): `validate_(self, value) -> raises DataSetError` """ for name, _ in self.__dataclass_fields__.items(): # pylint: disable=E1101 - if method := getattr(self, f"validate_{name}", None): + method = getattr(self, f"validate_{name}", None) + if method: method() def validate_table(self): From 8d0c00d507b38413d3de10c4247b4e8c004841d0 Mon Sep 17 00:00:00 2001 From: Jannic Holzer Date: Fri, 5 May 2023 00:26:11 +0100 Subject: [PATCH 70/74] Add test coverage for validation methods Signed-off-by: Jannic Holzer --- .../tests/databricks/test_managed_table_dataset.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kedro-datasets/tests/databricks/test_managed_table_dataset.py b/kedro-datasets/tests/databricks/test_managed_table_dataset.py index fbdbaaebc..8a71a4a0f 100644 --- a/kedro-datasets/tests/databricks/test_managed_table_dataset.py +++ b/kedro-datasets/tests/databricks/test_managed_table_dataset.py @@ -214,6 +214,18 @@ def test_missing_primary_key_upsert(self): with pytest.raises(DataSetError): ManagedTableDataSet(table="test", write_mode="upsert") + def test_invalid_table_name(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="invalid!") + + def test_invalid_database(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="test", database="invalid!") + + def test_invalid_catalog(self): + with pytest.raises(DataSetError): + ManagedTableDataSet(table="test", catalog="invalid!") + def test_schema(self): unity_ds = ManagedTableDataSet( table="test", From adaf2f6f9cab4d4b64bfe8b88d3c4e44a3901d59 Mon Sep 17 00:00:00 2001 From: Jannic Holzer Date: Fri, 5 May 2023 00:27:27 +0100 Subject: [PATCH 71/74] Remove unused versioning functions Signed-off-by: Jannic Holzer --- .../databricks/managed_table_dataset.py | 29 +------------------ 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 68f14ebc9..8cd0d803b 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -4,13 +4,9 @@ import logging import re from dataclasses import dataclass -from functools import partial -from operator import attrgetter from typing import Any, Dict, List, Optional, Union import pandas as pd -from cachetools import Cache, cachedmethod -from cachetools.keys import hashkey from kedro.io.core import ( AbstractVersionedDataSet, DataSetError, @@ -256,7 +252,6 @@ def __init__( # pylint: disable=R0913 json_schema=schema, ) - self._version_cache = Cache(maxsize=2) self._version = version super().__init__( @@ -265,28 +260,6 @@ def __init__( # pylint: disable=R0913 exists_function=self._exists, ) - @cachedmethod(cache=attrgetter("_version_cache"), key=partial(hashkey, "load")) - def _fetch_latest_load_version(self) -> int: - # When load version is unpinned, fetch the most recent existing - # version from the given path. 
- latest_history = ( - self._get_spark() - .sql(f"DESCRIBE HISTORY {self._table.full_table_location()} LIMIT 1") - .collect() - ) - if len(latest_history) != 1: - raise VersionNotFoundError( - f"Did not find any versions for {self._table.full_table_location()}" - ) - return latest_history[0].version - - # 'key' is set to prevent cache key overlapping for load and save: - # https://cachetools.readthedocs.io/en/stable/#cachetools.cachedmethod - @cachedmethod(cache=attrgetter("_version_cache"), key=partial(hashkey, "save")) - def _fetch_latest_save_version(self) -> int: - """Generate and cache the current save version""" - return None - @staticmethod def _get_spark() -> SparkSession: return SparkSession.builder.getOrCreate() @@ -312,7 +285,7 @@ def _load(self) -> Union[DataFrame, pd.DataFrame]: .table(self._table.full_table_location()) ) except Exception as exc: - raise VersionNotFoundError(self._version) from exc + raise VersionNotFoundError(self._version.load) from exc else: data = self._get_spark().table(self._table.full_table_location()) if self._table.dataframe_type == "pandas": From ae5235f78b193a291341e75223aa6b456b3f5ecd Mon Sep 17 00:00:00 2001 From: Jannic Holzer Date: Fri, 5 May 2023 01:13:43 +0100 Subject: [PATCH 72/74] Fix exception catching for invalid schema, add test for invalid schema Signed-off-by: Jannic Holzer --- .../databricks/managed_table_dataset.py | 2 +- .../tests/databricks/test_managed_table_dataset.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py index 8cd0d803b..1fb2729b7 100644 --- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py +++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py @@ -138,7 +138,7 @@ def schema(self) -> StructType: try: if self.json_schema is not None: schema = StructType.fromJson(self.json_schema) - except ParseException as exc: + except (KeyError, ValueError) as exc: raise DataSetError(exc) from exc return schema diff --git a/kedro-datasets/tests/databricks/test_managed_table_dataset.py b/kedro-datasets/tests/databricks/test_managed_table_dataset.py index 8a71a4a0f..7f015c6a2 100644 --- a/kedro-datasets/tests/databricks/test_managed_table_dataset.py +++ b/kedro-datasets/tests/databricks/test_managed_table_dataset.py @@ -255,6 +255,20 @@ def test_schema(self): ) assert unity_ds._table.schema() == expected_schema + def test_invalid_schema(self): + with pytest.raises(DataSetError): + ManagedTableDataSet( + table="test", + schema={ + "fields": [ + { + "invalid": "schema", + } + ], + "type": "struct", + }, + )._table.schema() + def test_catalog_exists(self): unity_ds = ManagedTableDataSet( catalog="test", database="invalid", table="test_not_there" From 76b593cc8e464e61db21be7773d60c0d2c250e5c Mon Sep 17 00:00:00 2001 From: Jannic Holzer Date: Fri, 5 May 2023 01:29:37 +0100 Subject: [PATCH 73/74] Add pylint ignore Signed-off-by: Jannic Holzer --- kedro-datasets/tests/databricks/test_managed_table_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro-datasets/tests/databricks/test_managed_table_dataset.py b/kedro-datasets/tests/databricks/test_managed_table_dataset.py index 7f015c6a2..9aae08707 100644 --- a/kedro-datasets/tests/databricks/test_managed_table_dataset.py +++ b/kedro-datasets/tests/databricks/test_managed_table_dataset.py @@ -169,6 +169,7 @@ def expected_upsert_multiple_primary_spark_df(spark_session: SparkSession): 
return spark_session.createDataFrame(data, schema) +# pylint: disable=too-many-public-methods class TestManagedTableDataSet: def test_full_table(self): unity_ds = ManagedTableDataSet(catalog="test", database="test", table="test") From 2ced5a934e706236d34506d77038c7e808888c2f Mon Sep 17 00:00:00 2001 From: Jannic Holzer Date: Fri, 12 May 2023 14:10:09 +0100 Subject: [PATCH 74/74] Add tests/databricks to ignore for no-spark tests Signed-off-by: Jannic Holzer --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index be653ed59..4e0b4e640 100644 --- a/Makefile +++ b/Makefile @@ -52,10 +52,10 @@ sign-off: # kedro-datasets related only test-no-spark: - cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --numprocesses 4 --dist loadfile + cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks --numprocesses 4 --dist loadfile test-no-spark-sequential: - cd kedro-datasets && pytest tests --no-cov --ignore tests/spark + cd kedro-datasets && pytest tests --no-cov --ignore tests/spark --ignore tests/databricks # kedro-datasets/snowflake tests skipped from default scope test-snowflake-only:
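
The `APIDataSet` refactor earlier in this series routes most `requests.request` options through `load_args` and accepts `credentials` as the alternative to an `auth` entry in `load_args` (supplying both raises `ValueError`). A minimal usage sketch follows; it is not itself part of any patch, the URL, parameter and credential values are illustrative assumptions, and the exact constructor surface should be checked against the released `kedro_datasets.api.APIDataSet`.

```python
# Minimal usage sketch for the refactored APIDataSet (illustrative values only).
from kedro_datasets.api import APIDataSet

data_set = APIDataSet(
    url="https://example.com/api/test",      # hypothetical endpoint
    method="GET",
    load_args={
        # everything here is forwarded to requests.request
        "params": {"param": "value"},
        "headers": {"key": "value"},
        "timeout": [10, 30],                 # lists are converted to tuples internally
    },
    credentials=("login", "password"),       # or put "auth" inside load_args, not both
)

response = data_set.load()                   # returns a requests.Response
assert response.ok
# data_set.save({})  # would raise DataSetError: the dataset is read-only
```

Passing arguments through `load_args` rather than as individual constructor parameters exposes the full `requests.request` surface (cookies, proxies, streaming, certificates) without the dataset having to enumerate each option, which is exactly what the rewritten tests above exercise.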
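
The Databricks patches above converge on `ManagedTableDataSet`, with backtick-quoted table locations, schema validation and an upsert mode that requires a primary key. A hedged sketch of how the dataset might be instantiated follows; the sketch is not part of any patch, and the import path plus all catalog, database, table and column names are illustrative assumptions rather than values taken from the series.

```python
# Minimal usage sketch for ManagedTableDataSet (illustrative names only).
from kedro_datasets.databricks import ManagedTableDataSet  # assumed export path

# Overwrite a Unity Catalog managed table with a Spark DataFrame.
features = ManagedTableDataSet(
    catalog="dev",            # optional; without it the table resolves as database.table
    database="analytics",
    table="features",
    write_mode="overwrite",
    dataframe_type="spark",   # or "pandas" to load/save pandas DataFrames
)

# Upsert mode needs a primary key, otherwise the constructor raises DataSetError.
scores = ManagedTableDataSet(
    database="analytics",
    table="scores",
    write_mode="upsert",
    primary_key="id",         # assumed column name
)

df = features.load()          # Spark DataFrame (pandas if dataframe_type="pandas")
features.save(df)
```

The Makefile change in the final patch keeps `tests/databricks` out of the `test-no-spark` targets, since those tests require a running Spark session.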