[FSTORE-345] Update documentation to reflect supported methods in hsfs engines (#814)
logicalclocks · Oct 4, 2022 · 05c1481 · 05c1481
1 parent 9b79da4
commit 05c1481
Show file tree

Hide file tree

Showing 7 changed files with 74 additions and 5 deletions.
diff --git a/auto_doc.py b/auto_doc.py
@@ -119,6 +119,12 @@
         "bigquery_properties": keras_autodoc.get_properties(
             "hsfs.storage_connector.BigQueryConnector"
         ),
+        "kafka_methods": keras_autodoc.get_methods(
+            "hsfs.storage_connector.KafkaConnector", exclude=["from_response_json"]
+        ),
+        "kafka_properties": keras_autodoc.get_properties(
+            "hsfs.storage_connector.KafkaConnector"
+        ),
     },
     "api/statistics_config_api.md": {
         "statistics_config": ["hsfs.statistics_config.StatisticsConfig"],

diff --git a/docs/templates/api/storage_connector_api.md b/docs/templates/api/storage_connector_api.md
@@ -80,7 +80,6 @@ Read more about encryption on [Google Documentation.](https://cloud.google.com/s
 The storage connector uses the Google `gcs-connector-hadoop` behind the scenes. For more information, check out [Google Cloud Storage Connector for Spark and Hadoop](
 https://github.com/GoogleCloudDataproc/hadoop-connectors/tree/master/gcs#google-cloud-storage-connector-for-spark-and-hadoop 'google-cloud-storage-connector-for-spark-and-hadoop')
 
-
 ### Properties
 
 {{gcs_properties}}
@@ -100,10 +99,21 @@ on service accounts and creating keyfile in GCP, read [Google Cloud documentatio
 The storage connector uses the Google `spark-bigquery-connector` behind the scenes.
 To read more about the spark connector, like the spark options or usage, check [Apache Spark SQL connector for Google BigQuery.](https://github.com/GoogleCloudDataproc/spark-bigquery-connector#usage
 'github.com/GoogleCloudDataproc/spark-bigquery-connector')
+
 ### Properties
 
 {{bigquery_properties}}
 
 ### Methods
 
 {{bigquery_methods}}
+
+## Kafka
+
+### Properties
+
+{{kafka_properties}}
+
+### Methods
+
+{{kafka_methods}}
diff --git a/python/hsfs/constructor/query.py b/python/hsfs/constructor/query.py
@@ -86,6 +86,14 @@ def read(
         It is possible to specify the storage (online/offline) to read from and the
         type of the output DataFrame (Spark, Pandas, Numpy, Python Lists).
 
+        !!! warning "External Feature Group Engine Support"
+            **Spark only**
+
+            Reading a Query containing an External Feature Group directly into a
+            Pandas DataFrame using Python/Pandas as Engine is not supported.
+            However, you can use the Query API to create Feature Views/Training
+            Data containing External Feature Groups.
+
         # Arguments
             online: Read from online storage. Defaults to `False`.
             dataframe_type: DataFrame type to return. Defaults to `"default"`.

diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py
@@ -1125,6 +1125,16 @@ def insert_stream(
         [q.name for q in sqm.active]
         ```
 
+        !!! warning "Engine Support"
+            **Spark only**
+
+            Stream ingestion using Pandas/Python as engine is currently not supported.
+            Python/Pandas has no notion of streaming.
+
+        !!! warning "Data Validation Support"
+            `insert_stream` does not perform any data validation using Great Expectations
+            even when an expectation suite is attached.
+
         # Arguments
             features: Features in Streaming Dataframe to be saved.
             query_name: It is possible to optionally specify a name for the query to
@@ -1680,7 +1690,30 @@ def save(self):
             self._statistics_engine.compute_statistics(self, self.read())
 
     def read(self, dataframe_type="default"):
-        """Get the feature group as a DataFrame."""
+        """Get the feature group as a DataFrame.
+
+        !!! warning "Engine Support"
+            **Spark only**
+
+            Reading an External Feature Group directly into a Pandas DataFrame using
+            Python/Pandas as Engine is not supported. However, you can use the
+            Query API to create Feature Views/Training Data containing External
+            Feature Groups.
+
+        # Arguments
+            dataframe_type: str, optional. Possible values are `"default"`, `"spark"`,
+                `"pandas"`, `"numpy"` or `"python"`, defaults to `"default"`.
+
+        # Returns
+            `DataFrame`: The feature data, returned as one of the following types.
+            `pyspark.DataFrame`. A Spark DataFrame.
+            `pandas.DataFrame`. A Pandas DataFrame.
+            `numpy.ndarray`. A two-dimensional Numpy array.
+            `list`. A two-dimensional Python list.
+
+        # Raises
+            `RestAPIError`.
+        """
         engine.get_instance().set_job_group(
             "Fetching Feature group",
             "Getting feature group: {} from the featurestore {}".format(

diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py
@@ -986,8 +986,13 @@ def get_training_data(
         Get training data from storage or feature groups.
 
         !!! info
-        If a materialised training data has deleted. Use `recreate_training_dataset()` to
-        recreate the training data.
+            If the materialised training data has been deleted, use `recreate_training_dataset()` to
+            recreate the training data.
+
+        !!! warning "External Storage Support"
+            Training data that was written to external storage using a Storage
+            Connector other than S3 can currently not be read using HSFS APIs with
+            Python as Engine. Instead, you will have to use the storage's native client.
 
         # Arguments
             version: training dataset version

diff --git a/python/hsfs/storage_connector.py b/python/hsfs/storage_connector.py
@@ -889,7 +889,11 @@ def read_stream(
     ):
         """Reads a Kafka stream from a topic or multiple topics into a Dataframe.
 
-        Currently, this method is only supported for Spark engines.
+        !!! warning "Engine Support"
+            **Spark only**
+
+            Reading from data streams using Pandas/Python as engine is currently not supported.
+            Python/Pandas has no notion of streaming.
 
         # Arguments
             topic: Name or pattern of the topic(s) to subscribe to.

diff --git a/python/hsfs/training_dataset.py b/python/hsfs/training_dataset.py
@@ -262,6 +262,9 @@ def save(
         lists or Numpy ndarrays.
         From v2.5 onward, filters are saved along with the `Query`.
 
+        !!! warning "Engine Support"
+            Creating Training Datasets from Dataframes is only supported using Spark as Engine.
+
         # Arguments
             features: Feature data to be materialized.
             write_options: Additional write options as key-value pairs, defaults to `{}`.