From 05c1481c67280b3b72c36199c1e7fc8f6630a85c Mon Sep 17 00:00:00 2001 From: Moritz Meister <8422705+moritzmeister@users.noreply.github.com> Date: Tue, 4 Oct 2022 09:44:50 +0200 Subject: [PATCH] [FSTORE-345] Update documentation to reflect supported methods in hsfs engines (#814) --- auto_doc.py | 6 ++++ docs/templates/api/storage_connector_api.md | 12 ++++++- python/hsfs/constructor/query.py | 8 +++++ python/hsfs/feature_group.py | 35 ++++++++++++++++++++- python/hsfs/feature_view.py | 9 ++++-- python/hsfs/storage_connector.py | 6 +++- python/hsfs/training_dataset.py | 3 ++ 7 files changed, 74 insertions(+), 5 deletions(-) diff --git a/auto_doc.py b/auto_doc.py index 5a59ce5474..db5baea4a4 100644 --- a/auto_doc.py +++ b/auto_doc.py @@ -119,6 +119,12 @@ "bigquery_properties": keras_autodoc.get_properties( "hsfs.storage_connector.BigQueryConnector" ), + "kafka_methods": keras_autodoc.get_methods( + "hsfs.storage_connector.KafkaConnector", exclude=["from_response_json"] + ), + "kafka_properties": keras_autodoc.get_properties( + "hsfs.storage_connector.KafkaConnector" + ), }, "api/statistics_config_api.md": { "statistics_config": ["hsfs.statistics_config.StatisticsConfig"], diff --git a/docs/templates/api/storage_connector_api.md b/docs/templates/api/storage_connector_api.md index 0f40513304..1b390e72ad 100644 --- a/docs/templates/api/storage_connector_api.md +++ b/docs/templates/api/storage_connector_api.md @@ -80,7 +80,6 @@ Read more about encryption on [Google Documentation.](https://cloud.google.com/s The storage connector uses the Google `gcs-connector-hadoop` behind the scenes. 
For more information, check out [Google Cloud Storage Connector for Spark and Hadoop]( https://github.com/GoogleCloudDataproc/hadoop-connectors/tree/master/gcs#google-cloud-storage-connector-for-spark-and-hadoop 'google-cloud-storage-connector-for-spark-and-hadoop') - ### Properties {{gcs_properties}} @@ -100,6 +99,7 @@ on service accounts and creating keyfile in GCP, read [Google Cloud documentatio The storage connector uses the Google `spark-bigquery-connector` behind the scenes. To read more about the spark connector, like the spark options or usage, check [Apache Spark SQL connector for Google BigQuery.](https://github.com/GoogleCloudDataproc/spark-bigquery-connector#usage 'github.com/GoogleCloudDataproc/spark-bigquery-connector') + ### Properties {{bigquery_properties}} @@ -107,3 +107,13 @@ To read more about the spark connector, like the spark options or usage, check [ ### Methods {{bigquery_methods}} + +## Kafka + +### Properties + +{{kafka_properties}} + +### Methods + +{{kafka_methods}} diff --git a/python/hsfs/constructor/query.py b/python/hsfs/constructor/query.py index 8f64e6c493..68b462d92a 100644 --- a/python/hsfs/constructor/query.py +++ b/python/hsfs/constructor/query.py @@ -86,6 +86,14 @@ def read( It is possible to specify the storage (online/offline) to read from and the type of the output DataFrame (Spark, Pandas, Numpy, Python Lists). + !!! warning "External Feature Group Engine Support" + **Spark only** + + Reading a Query containing an External Feature Group directly into a + Pandas Dataframe using Python/Pandas as Engine is not supported; + however, you can use the Query API to create Feature Views/Training + Data containing External Feature Groups. + # Arguments online: Read from online storage. Defaults to `False`. dataframe_type: DataFrame type to return. Defaults to `"default"`. 
diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index 4e01e9914a..39603a281f 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -1125,6 +1125,16 @@ def insert_stream( [q.name for q in sqm.active] ``` + !!! warning "Engine Support" + **Spark only** + + Stream ingestion using Pandas/Python as engine is currently not supported. + Python/Pandas has no notion of streaming. + + !!! warning "Data Validation Support" + `insert_stream` does not perform any data validation using Great Expectations + even when an expectation suite is attached. + # Arguments features: Features in Streaming Dataframe to be saved. query_name: It is possible to optionally specify a name for the query to @@ -1680,7 +1690,30 @@ def save(self): self._statistics_engine.compute_statistics(self, self.read()) def read(self, dataframe_type="default"): - """Get the feature group as a DataFrame.""" + """Get the feature group as a DataFrame. + + !!! warning "Engine Support" + **Spark only** + + Reading an External Feature Group directly into a Pandas Dataframe using + Python/Pandas as Engine is not supported; however, you can use the + Query API to create Feature Views/Training Data containing External + Feature Groups. + + # Arguments + dataframe_type: str, optional. Possible values are `"default"`, `"spark"`, + `"pandas"`, `"numpy"` or `"python"`, defaults to `"default"`. + + # Returns + `DataFrame`: The spark dataframe containing the feature data. + `pyspark.DataFrame`. A Spark DataFrame. + `pandas.DataFrame`. A Pandas DataFrame. + `numpy.ndarray`. A two-dimensional Numpy array. + `list`. A two-dimensional Python list. + + # Raises + `RestAPIError`. 
+ """ engine.get_instance().set_job_group( "Fetching Feature group", "Getting feature group: {} from the featurestore {}".format( diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 7b63e99c16..04ce983c08 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -986,8 +986,13 @@ def get_training_data( Get training data from storage or feature groups. !!! info - If a materialised training data has deleted. Use `recreate_training_dataset()` to - recreate the training data. + If a materialised training data has deleted. Use `recreate_training_dataset()` to + recreate the training data. + + !!! warning "External Storage Support" + Reading training data that was written to external storage using a Storage + Connector other than S3 can currently not be read using HSFS APIs with + Python as Engine, instead you will have to use the storage's native client. # Arguments version: training dataset version diff --git a/python/hsfs/storage_connector.py b/python/hsfs/storage_connector.py index 900db5562d..bf12e96799 100644 --- a/python/hsfs/storage_connector.py +++ b/python/hsfs/storage_connector.py @@ -889,7 +889,11 @@ def read_stream( ): """Reads a Kafka stream from a topic or multiple topics into a Dataframe. - Currently, this method is only supported for Spark engines. + !!! warning "Engine Support" + **Spark only** + + Reading from data streams using Pandas/Python as engine is currently not supported. + Python/Pandas has no notion of streaming. # Arguments topic: Name or pattern of the topic(s) to subscribe to. diff --git a/python/hsfs/training_dataset.py b/python/hsfs/training_dataset.py index 4042f4fc54..8d0b5141db 100644 --- a/python/hsfs/training_dataset.py +++ b/python/hsfs/training_dataset.py @@ -262,6 +262,9 @@ def save( lists or Numpy ndarrays. From v2.5 onward, filters are saved along with the `Query`. + !!! warning "Engine Support" + Creating Training Datasets from Dataframes is only supported using Spark as Engine. 
+ # Arguments features: Feature data to be materialized. write_options: Additional write options as key-value pairs, defaults to `{}`.