Improve DataSet API documentation with YAML examples (#1844)
Co-authored-by: Nok Lam Chan <nok_lam_chan@mckinsey.com>
Signed-off-by: Merel Theisen <merel.theisen@quantumblack.com>
levimjoseph and noklam authored Nov 10, 2022
1 parent f52249b commit bc6f3fe
Showing 8 changed files with 117 additions and 12 deletions.
22 changes: 21 additions & 1 deletion kedro/extras/datasets/api/api_dataset.py
@@ -17,7 +17,27 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
"""``APIDataSet`` loads the data from HTTP(S) APIs.
It uses the python requests library: https://requests.readthedocs.io/en/latest/
Example:
Example adding a catalog entry with
`YAML API
<https://kedro.readthedocs.io/en/stable/data/\
data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_:
.. code-block:: yaml
>>> usda:
>>>   type: api.APIDataSet
>>>   url: https://quickstats.nass.usda.gov
>>>   params:
>>>     key: SOME_TOKEN
>>>     format: JSON
>>>     commodity_desc: CORN
>>>     statisticcat_des: YIELD
>>>     agg_level_desc: STATE
>>>     year: 2000
>>>
Example using Python API:
::
>>> from kedro.extras.datasets.api import APIDataSet
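The `params` block in the catalog entry above is forwarded to the `requests` library, which URL-encodes it into the query string of the request. As a rough stdlib sketch of that encoding (the URL and token are the placeholder values from the example, not real credentials):

```python
from urllib.parse import urlencode

# The catalog entry's `params` block, as a Python dict.
params = {
    "key": "SOME_TOKEN",
    "format": "JSON",
    "commodity_desc": "CORN",
    "statisticcat_des": "YIELD",
    "agg_level_desc": "STATE",
    "year": 2000,
}

# requests encodes `params` into the query string roughly like this:
query = urlencode(params)
url = "https://quickstats.nass.usda.gov?" + query
print(url)
```

This is why stray commas in the YAML values would be harmful: in block-style YAML they become part of the string and would be encoded into the query verbatim.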
21 changes: 20 additions & 1 deletion kedro/extras/datasets/dask/parquet_dataset.py
@@ -19,7 +19,26 @@ class ParquetDataSet(AbstractDataSet[dd.DataFrame, dd.DataFrame]):
remote data services to handle the corresponding load and save operations:
https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html
Example (AWS S3):
Example adding a catalog entry with
`YAML API
<https://kedro.readthedocs.io/en/stable/data/\
data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_:
.. code-block:: yaml
>>> cars:
>>>   type: dask.ParquetDataSet
>>>   filepath: s3://bucket_name/path/to/folder
>>>   save_args:
>>>     compression: GZIP
>>>   credentials:
>>>     client_kwargs:
>>>       aws_access_key_id: YOUR_KEY
>>>       aws_secret_access_key: YOUR_SECRET
>>>
Example using Python API (AWS S3):
::
>>> from kedro.extras.datasets.dask import ParquetDataSet
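The `credentials` block above is passed through to the underlying fsspec/s3fs filesystem. A minimal sketch of how that catalog fragment maps onto the `storage_options` argument that Dask's remote-data reads accept (the key names are the placeholders from the example; the exact plumbing inside the dataset is an assumption here):

```python
# Catalog fields from the YAML example above, as Python dicts.
credentials = {
    "client_kwargs": {
        "aws_access_key_id": "YOUR_KEY",
        "aws_secret_access_key": "YOUR_SECRET",
    }
}
filepath = "s3://bucket_name/path/to/folder"

# Conceptually, the dataset's load is roughly equivalent to:
#   dd.read_parquet(filepath, storage_options=credentials)
call_kwargs = {"storage_options": credentials}
print(call_kwargs["storage_options"]["client_kwargs"]["aws_access_key_id"])
```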
2 changes: 0 additions & 2 deletions kedro/extras/datasets/pandas/csv_dataset.py
@@ -52,8 +52,6 @@ class CSVDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]):
>>>   type: pandas.CSVDataSet
>>>   filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.csv
>>>   credentials: dev_s3
>>>
Example using Python API:
::
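`CSVDataSet` delegates its load and save to `pandas.read_csv` and `DataFrame.to_csv`. As a stdlib-only sketch of the same round-trip idea (using the `csv` module instead of pandas, with made-up rows, so it runs without pandas installed):

```python
import csv
import io

# Hypothetical rows standing in for the motorbikes data.
rows = [
    {"make": "Yamaha", "year": "2001"},
    {"make": "Honda", "year": "2005"},
]

# Save: write the rows out as CSV text (CSVDataSet does this via pandas).
buf = io.StringIO()
writer = csv.DictWriter(buf, fieldnames=["make", "year"])
writer.writeheader()
writer.writerows(rows)

# Load: parse the CSV text back into rows.
buf.seek(0)
loaded = list(csv.DictReader(buf))
print(loaded)
```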
23 changes: 22 additions & 1 deletion kedro/extras/datasets/pandas/feather_dataset.py
@@ -32,7 +32,28 @@ class FeatherDataSet(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]):
is supported by pandas, so it supports all allowed pandas options
for loading and saving Feather files.
Example:
Example adding a catalog entry with
`YAML API
<https://kedro.readthedocs.io/en/stable/data/\
data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_:
.. code-block:: yaml
>>> cars:
>>>   type: pandas.FeatherDataSet
>>>   filepath: data/01_raw/company/cars.feather
>>>   load_args:
>>>     columns: ['col1', 'col2', 'col3']
>>>     use_threads: True
>>>
>>> motorbikes:
>>>   type: pandas.FeatherDataSet
>>>   filepath: s3://your_bucket/data/02_intermediate/company/motorbikes.feather
>>>   credentials: dev_s3
>>>
Example using Python API:
::
>>> from kedro.extras.datasets.pandas import FeatherDataSet
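The `load_args` in the `cars` entry above are forwarded to `pandas.read_feather`; in particular, `columns` restricts which columns are read. A plain-Python sketch of that selection, with hypothetical rows standing in for the Feather contents:

```python
# Hypothetical rows as a full read might return them (all columns present).
rows = [
    {"col1": 1, "col2": "a", "col3": 2.0, "extra": None},
    {"col1": 2, "col2": "b", "col3": 3.0, "extra": None},
]

# With load_args `columns: ['col1', 'col2', 'col3']`, only those columns
# are materialised -- conceptually this selection:
columns = ["col1", "col2", "col3"]
selected = [{c: row[c] for c in columns} for row in rows]
print(selected)
```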
18 changes: 16 additions & 2 deletions kedro/extras/datasets/tensorflow/tensorflow_model_dataset.py
@@ -29,14 +29,28 @@ class TensorFlowModelDataset(AbstractVersionedDataSet[tf.keras.Model, tf.keras.M
The underlying functionality is supported by, and passes input arguments through to,
TensorFlow 2.X load_model and save_model methods.
Example:
.. code-block:: yaml
>>> tensorflow_model:
>>>   type: tensorflow.TensorFlowModelDataset
>>>   filepath: data/06_models/tensorflow_model.h5
>>>   load_args:
>>>     compile: False
>>>   save_args:
>>>     overwrite: True
>>>     include_optimizer: False
>>>   credentials: tf_creds
>>>
Example using Python API:
::
>>> from kedro.extras.datasets.tensorflow import TensorFlowModelDataset
>>> import tensorflow as tf
>>> import numpy as np
>>>
>>> data_set = TensorFlowModelDataset("saved_model_path")
>>> data_set = TensorFlowModelDataset("data/06_models/tensorflow_model.h5")
>>> model = tf.keras.Model()
>>> predictions = model.predict([...])
>>>
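As the docstring notes, `load_args` and `save_args` are passed straight through to TensorFlow's `load_model` and `save_model`. A sketch of that pass-through using a stand-in function (so it runs without TensorFlow installed; `fake_load_model` is purely illustrative, not a kedro or TensorFlow API):

```python
# Catalog load_args / save_args from the YAML example above.
load_args = {"compile": False}
save_args = {"overwrite": True, "include_optimizer": False}

# Conceptually the dataset calls:
#   tf.keras.models.load_model(path, **load_args)
#   tf.keras.models.save_model(model, path, **save_args)
def fake_load_model(path, compile=True):
    """Stand-in for tf.keras.models.load_model, recording what it received."""
    return {"path": path, "compile": compile}

loaded = fake_load_model("data/06_models/tensorflow_model.h5", **load_args)
print(loaded)
```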
15 changes: 13 additions & 2 deletions kedro/extras/datasets/tracking/json_dataset.py
@@ -18,8 +18,19 @@ class JSONDataSet(JDS):
The ``JSONDataSet`` is part of Kedro Experiment Tracking.
The dataset is write-only and it is versioned by default.
Example:
::
Example adding a catalog entry with
`YAML API
<https://kedro.readthedocs.io/en/stable/data/\
data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_:
.. code-block:: yaml
>>> cars:
>>>   type: tracking.JSONDataSet
>>>   filepath: data/09_tracking/cars.json
Example using Python API:
::
>>> from kedro.extras.datasets.tracking import JSONDataSet
>>>
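Since the tracking `JSONDataSet` is write-only, what matters in practice is the JSON payload it persists. A stdlib sketch of that serialization, with hypothetical tracked values:

```python
import json

# Data a node might return for experiment tracking (hypothetical values).
tracked = {"r2_score": 0.87, "parameters": {"learning_rate": 0.01}}

# On save, the dict is serialized to JSON text roughly like this:
payload = json.dumps(tracked, indent=2)
print(payload)
```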
15 changes: 13 additions & 2 deletions kedro/extras/datasets/tracking/metrics_dataset.py
@@ -20,8 +20,19 @@ class MetricsDataSet(JSONDataSet):
``MetricsDataSet`` is part of Kedro Experiment Tracking. The dataset is write-only,
it is versioned by default and only takes metrics of numeric values.
Example:
::
Example adding a catalog entry with
`YAML API
<https://kedro.readthedocs.io/en/stable/data/\
data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_:
.. code-block:: yaml
>>> cars:
>>>   type: tracking.MetricsDataSet
>>>   filepath: data/09_tracking/cars.json
Example using Python API:
::
>>> from kedro.extras.datasets.tracking import MetricsDataSet
>>>
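The docstring states that `MetricsDataSet` only takes metrics of numeric values. A minimal sketch of such a validation (the real kedro check on save may differ in detail and raises its own error type; `validate_metrics` is illustrative only):

```python
def validate_metrics(data: dict) -> None:
    """Reject non-numeric metric values, as MetricsDataSet does on save."""
    for key, value in data.items():
        if not isinstance(value, (int, float)):
            raise ValueError(f"Metric '{key}' is not numeric: {value!r}")

validate_metrics({"mae": 1.5, "epochs": 10})  # numeric values pass

try:
    validate_metrics({"model_name": "linear"})  # a string is rejected
except ValueError as err:
    print(err)
```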
13 changes: 12 additions & 1 deletion kedro/extras/datasets/yaml/yaml_dataset.py
@@ -25,7 +25,18 @@ class YAMLDataSet(AbstractVersionedDataSet[Dict, Dict]):
"""``YAMLDataSet`` loads/saves data from/to a YAML file using an underlying
filesystem (e.g.: local, S3, GCS). It uses PyYAML to handle the YAML file.
Example:
Example adding a catalog entry with
`YAML API
<https://kedro.readthedocs.io/en/stable/data/\
data_catalog.html#use-the-data-catalog-with-the-yaml-api>`_:
.. code-block:: yaml
>>> cars:
>>>   type: yaml.YAMLDataSet
>>>   filepath: cars.yaml
Example using Python API:
::
>>> from kedro.extras.datasets.yaml import YAMLDataSet
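`YAMLDataSet` uses PyYAML to handle the file, so a save/load round-trip is essentially a YAML dump and parse. A short sketch with a hypothetical payload (requires PyYAML; `safe_dump`/`safe_load` are used here for illustration, the dataset's exact dump options may differ):

```python
import yaml  # PyYAML, which YAMLDataSet uses under the hood

cars = {"wheels": 4, "colour": "red"}  # hypothetical payload

# save() dumps the dict to YAML text; load() parses it back into a dict.
text = yaml.safe_dump(cars)
print(text)
```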
