diff --git a/kedro/extras/datasets/email/message_dataset.py b/kedro/extras/datasets/email/message_dataset.py index c92f3206ae..01b602cc51 100644 --- a/kedro/extras/datasets/email/message_dataset.py +++ b/kedro/extras/datasets/email/message_dataset.py @@ -50,7 +50,6 @@ class EmailMessageDataSet( >>> msg["From"] = '"sin studly17"' >>> msg["To"] = '"strong bad"' >>> - >>> # data_set = EmailMessageDataSet(filepath="gcs://bucket/test") >>> data_set = EmailMessageDataSet(filepath="test") >>> data_set.save(msg) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/geopandas/geojson_dataset.py b/kedro/extras/datasets/geopandas/geojson_dataset.py index a06988e6fd..11e39b2e76 100644 --- a/kedro/extras/datasets/geopandas/geojson_dataset.py +++ b/kedro/extras/datasets/geopandas/geojson_dataset.py @@ -41,10 +41,7 @@ class GeoJSONDataSet( >>> >>> data = gpd.GeoDataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}, geometry=[Point(1,1), Point(2,4)]) - >>> # data_set = GeoJSONDataSet(filepath="gcs://bucket/test.geojson", - >>> save_args=None) - >>> data_set = GeoJSONDataSet(filepath="test.geojson", - >>> save_args=None) + >>> data_set = GeoJSONDataSet(filepath="test.geojson", save_args=None) >>> data_set.save(data) >>> reloaded = data_set.load() >>> diff --git a/kedro/extras/datasets/json/json_dataset.py b/kedro/extras/datasets/json/json_dataset.py index a2bccab757..b9d0a9598e 100644 --- a/kedro/extras/datasets/json/json_dataset.py +++ b/kedro/extras/datasets/json/json_dataset.py @@ -32,8 +32,6 @@ class JSONDataSet(AbstractVersionedDataSet[Any, Any]): >>> json_dataset: >>> type: json.JSONDataSet >>> filepath: data/01_raw/location.json - >>> load_args: - >>> lines: True >>> >>> cars: >>> type: json.JSONDataSet @@ -41,8 +39,6 @@ class JSONDataSet(AbstractVersionedDataSet[Any, Any]): >>> fs_args: >>> project: my-project >>> credentials: my_gcp_credentials - >>> load_args: - >>> lines: True Example using Python API: :: @@ -51,7 +47,6 @@ class JSONDataSet(AbstractVersionedDataSet[Any, Any]): >>> >>> data = {'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]} >>> - >>> # data_set = JSONDataSet(filepath="gcs://bucket/test.json") >>> data_set = JSONDataSet(filepath="test.json") >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pandas/csv_dataset.py b/kedro/extras/datasets/pandas/csv_dataset.py index 8cbcd9f715..bf0b2560e0 100644 --- a/kedro/extras/datasets/pandas/csv_dataset.py +++ b/kedro/extras/datasets/pandas/csv_dataset.py @@ -64,7 +64,6 @@ class CSVDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = CSVDataSet(filepath="gcs://bucket/test.csv") >>> data_set = CSVDataSet(filepath="test.csv") >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pandas/excel_dataset.py b/kedro/extras/datasets/pandas/excel_dataset.py index 33fc89fcb0..b7ca422a80 100644 --- a/kedro/extras/datasets/pandas/excel_dataset.py +++ b/kedro/extras/datasets/pandas/excel_dataset.py @@ -63,7 +63,6 @@ class ExcelDataSet( >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = ExcelDataSet(filepath="gcs://bucket/test.xlsx") >>> data_set = ExcelDataSet(filepath="test.xlsx") >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pandas/feather_dataset.py b/kedro/extras/datasets/pandas/feather_dataset.py index 4247644f29..972e3f40e3 100644 --- a/kedro/extras/datasets/pandas/feather_dataset.py +++ b/kedro/extras/datasets/pandas/feather_dataset.py @@ -41,7 +41,6 @@ class FeatherDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = FeatherDataSet(filepath="gcs://bucket/test.feather") >>> data_set = FeatherDataSet(filepath="test.feather") >>> >>> data_set.save(data) diff --git a/kedro/extras/datasets/pandas/generic_dataset.py b/kedro/extras/datasets/pandas/generic_dataset.py index 9d912947d8..8e2688ec65 100644 --- a/kedro/extras/datasets/pandas/generic_dataset.py +++ b/kedro/extras/datasets/pandas/generic_dataset.py @@ -78,7 +78,6 @@ class GenericDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = GenericDataSet(filepath="s3://test.csv", file_format='csv') >>> data_set = GenericDataSet(filepath="test.csv", file_format='csv') >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pandas/hdf_dataset.py b/kedro/extras/datasets/pandas/hdf_dataset.py index 20ab56e363..16c44b9d04 100644 --- a/kedro/extras/datasets/pandas/hdf_dataset.py +++ b/kedro/extras/datasets/pandas/hdf_dataset.py @@ -49,7 +49,6 @@ class HDFDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = HDFDataSet(filepath="gcs://bucket/test.hdf", key='data') >>> data_set = HDFDataSet(filepath="test.h5", key='data') >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pandas/json_dataset.py b/kedro/extras/datasets/pandas/json_dataset.py index 06af2144ac..689eed14e7 100644 --- a/kedro/extras/datasets/pandas/json_dataset.py +++ b/kedro/extras/datasets/pandas/json_dataset.py @@ -56,7 +56,6 @@ class JSONDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = JSONDataSet(filepath="gcs://bucket/test.json") >>> data_set = JSONDataSet(filepath="test.json") >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pandas/parquet_dataset.py b/kedro/extras/datasets/pandas/parquet_dataset.py index 1639f61f15..01d77c14dd 100644 --- a/kedro/extras/datasets/pandas/parquet_dataset.py +++ b/kedro/extras/datasets/pandas/parquet_dataset.py @@ -68,7 +68,6 @@ class ParquetDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = ParquetDataSet(filepath="gcs://bucket/test.parquet") >>> data_set = ParquetDataSet(filepath="test.parquet") >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pandas/xml_dataset.py b/kedro/extras/datasets/pandas/xml_dataset.py index 787de929d9..bfcaf05fc1 100644 --- a/kedro/extras/datasets/pandas/xml_dataset.py +++ b/kedro/extras/datasets/pandas/xml_dataset.py @@ -39,7 +39,6 @@ class XMLDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = XMLDataSet(filepath="gcs://bucket/test.xml") >>> data_set = XMLDataSet(filepath="test.xml") >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/pickle/pickle_dataset.py b/kedro/extras/datasets/pickle/pickle_dataset.py index b474154094..11aca8988c 100644 --- a/kedro/extras/datasets/pickle/pickle_dataset.py +++ b/kedro/extras/datasets/pickle/pickle_dataset.py @@ -57,13 +57,11 @@ class PickleDataSet(AbstractVersionedDataSet[Any, Any]): >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) >>> - >>> # data_set = PickleDataSet(filepath="gcs://bucket/test.pkl") >>> data_set = PickleDataSet(filepath="test.pkl", backend="pickle") >>> data_set.save(data) >>> reloaded = data_set.load() >>> assert data.equals(reloaded) >>> - >>> # Add "compress_pickle[lz4]" to requirements.txt >>> data_set = PickleDataSet(filepath="test.pickle.lz4", >>> backend="compress_pickle", >>> load_args={"compression":"lz4"}, diff --git a/kedro/extras/datasets/pillow/image_dataset.py b/kedro/extras/datasets/pillow/image_dataset.py index a36be99a80..a547589e96 100644 --- a/kedro/extras/datasets/pillow/image_dataset.py +++ b/kedro/extras/datasets/pillow/image_dataset.py @@ -30,7 +30,6 @@ class ImageDataSet(AbstractVersionedDataSet[Image.Image, Image.Image]): >>> from kedro.extras.datasets.pillow import ImageDataSet >>> - >>> # data_set = ImageDataSet(filepath="gcs://bucket/test.png") >>> data_set = ImageDataSet(filepath="test.png") >>> image = data_set.load() >>> image.show() diff --git a/kedro/extras/datasets/plotly/plotly_dataset.py b/kedro/extras/datasets/plotly/plotly_dataset.py index 56d514f8a9..6b3a3a81a0 100644 --- a/kedro/extras/datasets/plotly/plotly_dataset.py +++ b/kedro/extras/datasets/plotly/plotly_dataset.py @@ -27,21 +27,22 @@ class PlotlyDataSet(JSONDataSet): the JSON file directly from a pandas DataFrame through ``plotly_args``. Example configuration for a PlotlyDataSet in the catalog: - :: + + .. code-block:: yaml >>> bar_plot: - >>> type: plotly.PlotlyDataSet - >>> filepath: data/08_reporting/bar_plot.json - >>> plotly_args: - >>> type: bar - >>> fig: - >>> x: features - >>> y: importance - >>> orientation: h - >>> layout: - >>> xaxis_title: x - >>> yaxis_title: y - >>> title: Test + >>> type: plotly.PlotlyDataSet + >>> filepath: data/08_reporting/bar_plot.json + >>> plotly_args: + >>> type: bar + >>> fig: + >>> x: features + >>> y: importance + >>> orientation: h + >>> layout: + >>> xaxis_title: x + >>> yaxis_title: y + >>> title: Title """ # pylint: disable=too-many-arguments diff --git a/kedro/extras/datasets/redis/redis_dataset.py b/kedro/extras/datasets/redis/redis_dataset.py index 325f32617a..26fdf00bf4 100644 --- a/kedro/extras/datasets/redis/redis_dataset.py +++ b/kedro/extras/datasets/redis/redis_dataset.py @@ -47,6 +47,7 @@ class PickleDataSet(AbstractDataSet[Any, Any]): :: >>> from kedro.extras.datasets.redis import PickleDataSet + >>> import pandas as pd >>> >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], >>> 'col3': [5, 6]}) diff --git a/kedro/extras/datasets/text/text_dataset.py b/kedro/extras/datasets/text/text_dataset.py index e2440e140f..9baeef3c97 100644 --- a/kedro/extras/datasets/text/text_dataset.py +++ b/kedro/extras/datasets/text/text_dataset.py @@ -31,7 +31,6 @@ class TextDataSet(AbstractVersionedDataSet[str, str]): >>> >>> string_to_write = "This will go in a file." >>> - >>> # data_set = TextDataSet(filepath="gcs://bucket/test.md") >>> data_set = TextDataSet(filepath="test.md") >>> data_set.save(string_to_write) >>> reloaded = data_set.load() diff --git a/kedro/extras/datasets/tracking/json_dataset.py b/kedro/extras/datasets/tracking/json_dataset.py index f1c795a18d..f29c68b544 100644 --- a/kedro/extras/datasets/tracking/json_dataset.py +++ b/kedro/extras/datasets/tracking/json_dataset.py @@ -25,7 +25,6 @@ class JSONDataSet(JDS): >>> >>> data = {'col1': 1, 'col2': 0.23, 'col3': 0.002} >>> - >>> # data_set = JSONDataSet(filepath="gcs://bucket/test.json") >>> data_set = JSONDataSet(filepath="test.json") >>> data_set.save(data) diff --git a/kedro/extras/datasets/tracking/metrics_dataset.py b/kedro/extras/datasets/tracking/metrics_dataset.py index 9543772442..8d2fdcc8b4 100644 --- a/kedro/extras/datasets/tracking/metrics_dataset.py +++ b/kedro/extras/datasets/tracking/metrics_dataset.py @@ -27,7 +27,6 @@ class MetricsDataSet(JSONDataSet): >>> >>> data = {'col1': 1, 'col2': 0.23, 'col3': 0.002} >>> - >>> # data_set = MetricsDataSet(filepath="gcs://bucket/test.json") >>> data_set = MetricsDataSet(filepath="test.json") >>> data_set.save(data) diff --git a/kedro/extras/datasets/yaml/yaml_dataset.py b/kedro/extras/datasets/yaml/yaml_dataset.py index 9ba7db5556..a0a90868c5 100644 --- a/kedro/extras/datasets/yaml/yaml_dataset.py +++ b/kedro/extras/datasets/yaml/yaml_dataset.py @@ -32,7 +32,6 @@ class YAMLDataSet(AbstractVersionedDataSet[Dict, Dict]): >>> >>> data = {'col1': [1, 2], 'col2': [4, 5], 'col3': [5, 6]} >>> - >>> # data_set = YAMLDataSet(filepath="gcs://bucket/test.yaml") >>> data_set = YAMLDataSet(filepath="test.yaml") >>> data_set.save(data) >>> reloaded = data_set.load() diff --git a/setup.py b/setup.py index 37a3727081..edb7cd3e78 100644 --- a/setup.py +++ b/setup.py @@ -78,6 +78,7 @@ def _collect_requirements(requires): "pandas.XMLDataSet": [PANDAS, "lxml~=4.6"], "pandas.GenericDataSet": [PANDAS], } +pickle_require = {"pickle.PickleDataSet": ["compress-pickle[lz4]~=2.1.0"]} pillow_require = {"pillow.ImageDataSet": ["Pillow~=9.0"]} plotly_require = { "plotly.PlotlyDataSet": [PANDAS, "plotly>=4.8.0, <6.0"], @@ -121,6 +122,7 @@ def _collect_requirements(requires): "holoviews": _collect_requirements(holoviews_require), "networkx": _collect_requirements(networkx_require), "pandas": _collect_requirements(pandas_require), + "pickle": _collect_requirements(pickle_require), "pillow": _collect_requirements(pillow_require), "plotly": _collect_requirements(plotly_require), "redis": _collect_requirements(redis_require), @@ -135,6 +137,7 @@ def _collect_requirements(requires): **holoviews_require, **networkx_require, **pandas_require, + **pickle_require, **pillow_require, **plotly_require, **spark_require,