From 05c1481c67280b3b72c36199c1e7fc8f6630a85c Mon Sep 17 00:00:00 2001 From: Moritz Meister <8422705+moritzmeister@users.noreply.github.com> Date: Tue, 4 Oct 2022 09:44:50 +0200 Subject: [PATCH] [FSTORE-345] Update documentation to reflect supported methods in hsfs engines (#814) --- auto_doc.py | 6 ++++ docs/templates/api/storage_connector_api.md | 12 ++++++- python/hsfs/constructor/query.py | 8 +++++ python/hsfs/feature_group.py | 35 ++++++++++++++++++++- python/hsfs/feature_view.py | 9 ++++-- python/hsfs/storage_connector.py | 6 +++- python/hsfs/training_dataset.py | 3 ++ 7 files changed, 74 insertions(+), 5 deletions(-) diff --git a/auto_doc.py b/auto_doc.py index 5a59ce5474..db5baea4a4 100644 --- a/auto_doc.py +++ b/auto_doc.py @@ -119,6 +119,12 @@ "bigquery_properties": keras_autodoc.get_properties( "hsfs.storage_connector.BigQueryConnector" ), + "kafka_methods": keras_autodoc.get_methods( + "hsfs.storage_connector.KafkaConnector", exclude=["from_response_json"] + ), + "kafka_properties": keras_autodoc.get_properties( + "hsfs.storage_connector.KafkaConnector" + ), }, "api/statistics_config_api.md": { "statistics_config": ["hsfs.statistics_config.StatisticsConfig"], diff --git a/docs/templates/api/storage_connector_api.md b/docs/templates/api/storage_connector_api.md index 0f40513304..1b390e72ad 100644 --- a/docs/templates/api/storage_connector_api.md +++ b/docs/templates/api/storage_connector_api.md @@ -80,7 +80,6 @@ Read more about encryption on [Google Documentation.](https://cloud.google.com/s The storage connector uses the Google `gcs-connector-hadoop` behind the scenes. 
For more information, check out [Google Cloud Storage Connector for Spark and Hadoop]( https://github.com/GoogleCloudDataproc/hadoop-connectors/tree/master/gcs#google-cloud-storage-connector-for-spark-and-hadoop 'google-cloud-storage-connector-for-spark-and-hadoop') - ### Properties {{gcs_properties}} @@ -100,6 +99,7 @@ on service accounts and creating keyfile in GCP, read [Google Cloud documentatio The storage connector uses the Google `spark-bigquery-connector` behind the scenes. To read more about the spark connector, like the spark options or usage, check [Apache Spark SQL connector for Google BigQuery.](https://github.com/GoogleCloudDataproc/spark-bigquery-connector#usage 'github.com/GoogleCloudDataproc/spark-bigquery-connector') + ### Properties {{bigquery_properties}} @@ -107,3 +107,13 @@ To read more about the spark connector, like the spark options or usage, check [ ### Methods {{bigquery_methods}} + +## Kafka + +### Properties + +{{kafka_properties}} + +### Methods + +{{kafka_methods}} diff --git a/python/hsfs/constructor/query.py b/python/hsfs/constructor/query.py index 8f64e6c493..68b462d92a 100644 --- a/python/hsfs/constructor/query.py +++ b/python/hsfs/constructor/query.py @@ -86,6 +86,14 @@ def read( It is possible to specify the storage (online/offline) to read from and the type of the output DataFrame (Spark, Pandas, Numpy, Python Lists). + !!! warning "External Feature Group Engine Support" + **Spark only** + + Reading a Query containing an External Feature Group directly into a + Pandas Dataframe using Python/Pandas as Engine is not supported; + however, you can use the Query API to create Feature Views/Training + Data containing External Feature Groups. + # Arguments online: Read from online storage. Defaults to `False`. dataframe_type: DataFrame type to return. Defaults to `"default"`. 
diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index 4e01e9914a..39603a281f 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -1125,6 +1125,16 @@ def insert_stream( [q.name for q in sqm.active] ``` + !!! warning "Engine Support" + **Spark only** + + Stream ingestion using Pandas/Python as engine is currently not supported. + Python/Pandas has no notion of streaming. + + !!! warning "Data Validation Support" + `insert_stream` does not perform any data validation using Great Expectations + even when an expectation suite is attached. + # Arguments features: Features in Streaming Dataframe to be saved. query_name: It is possible to optionally specify a name for the query to @@ -1680,7 +1690,30 @@ def save(self): self._statistics_engine.compute_statistics(self, self.read()) def read(self, dataframe_type="default"): - """Get the feature group as a DataFrame.""" + """Get the feature group as a DataFrame. + + !!! warning "Engine Support" + **Spark only** + + Reading an External Feature Group directly into a Pandas Dataframe using + Python/Pandas as Engine is not supported; however, you can use the + Query API to create Feature Views/Training Data containing External + Feature Groups. + + # Arguments + dataframe_type: str, optional. Possible values are `"default"`, `"spark"`, + `"pandas"`, `"numpy"` or `"python"`, defaults to `"default"`. + + # Returns + `DataFrame`: The spark dataframe containing the feature data. + `pyspark.DataFrame`. A Spark DataFrame. + `pandas.DataFrame`. A Pandas DataFrame. + `numpy.ndarray`. A two-dimensional Numpy array. + `list`. A two-dimensional Python list. + + # Raises + `RestAPIError`. 
+ """ engine.get_instance().set_job_group( "Fetching Feature group", "Getting feature group: {} from the featurestore {}".format( diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 7b63e99c16..04ce983c08 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -986,8 +986,13 @@ def get_training_data( Get training data from storage or feature groups. !!! info - If a materialised training data has deleted. Use `recreate_training_dataset()` to - recreate the training data. + If a materialised training data has deleted. Use `recreate_training_dataset()` to + recreate the training data. + + !!! warning "External Storage Support" + Reading training data that was written to external storage using a Storage + Connector other than S3 can currently not be read using HSFS APIs with + Python as Engine, instead you will have to use the storage's native client. # Arguments version: training dataset version diff --git a/python/hsfs/storage_connector.py b/python/hsfs/storage_connector.py index 900db5562d..bf12e96799 100644 --- a/python/hsfs/storage_connector.py +++ b/python/hsfs/storage_connector.py @@ -889,7 +889,11 @@ def read_stream( ): """Reads a Kafka stream from a topic or multiple topics into a Dataframe. - Currently, this method is only supported for Spark engines. + !!! warning "Engine Support" + **Spark only** + + Reading from data streams using Pandas/Python as engine is currently not supported. + Python/Pandas has no notion of streaming. # Arguments topic: Name or pattern of the topic(s) to subscribe to. diff --git a/python/hsfs/training_dataset.py b/python/hsfs/training_dataset.py index 4042f4fc54..8d0b5141db 100644 --- a/python/hsfs/training_dataset.py +++ b/python/hsfs/training_dataset.py @@ -262,6 +262,9 @@ def save( lists or Numpy ndarrays. From v2.5 onward, filters are saved along with the `Query`. + !!! warning "Engine Support" + Creating Training Datasets from Dataframes is only supported using Spark as Engine. 
+ # Arguments features: Feature data to be materialized. write_options: Additional write options as key-value pairs, defaults to `{}`.