[HOPSWORKS-2091] Label/Prediction feature metadata for training datasets #126

Merged 2 commits on Nov 6, 2020
auto_doc.py (7 additions, 0 deletions)
@@ -88,6 +88,13 @@ def generate(dest_dir):
         project_url="https://github.com/logicalclocks/feature-store-api/blob/master/python",
         template_dir="./docs/templates",
         titles_size="###",
+        extra_aliases={
+            "hsfs.core.query.Query": "hsfs.Query",
+            "hsfs.storage_connector.StorageConnector": "hsfs.StorageConnector",
+            "hsfs.statistics_config.StatisticsConfig": "hsfs.StatisticsConfig",
+            "hsfs.training_dataset_feature.TrainingDatasetFeature": "hsfs.TrainingDatasetFeature",
+            "pandas.core.frame.DataFrame": "pandas.DataFrame",
+        },
     )
     shutil.copyfile(hsfs_dir / "CONTRIBUTING.md", dest_dir / "CONTRIBUTING.md")
     shutil.copyfile(hsfs_dir / "README.md", dest_dir / "index.md")
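For context, `extra_aliases` lets the docs generator render fully qualified class paths under their shorter public aliases, so cross-referenced types read as `hsfs.Query` rather than `hsfs.core.query.Query`. A minimal sketch of how such a mapping is consumed, assuming `auto_doc.py` builds a keras-autodoc `DocumentationGenerator` (the `pages` value below is invented for illustration):

```python
from keras_autodoc import DocumentationGenerator

# Hypothetical page layout; only the extra_aliases entry mirrors this PR.
doc_generator = DocumentationGenerator(
    pages={"training_dataset.md": ["hsfs.feature_store.FeatureStore.create_training_dataset"]},
    template_dir="./docs/templates",
    titles_size="###",
    # Signatures that mention hsfs.core.query.Query now render as hsfs.Query.
    extra_aliases={"hsfs.core.query.Query": "hsfs.Query"},
)
doc_generator.generate("./docs/generated")
```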
python/hsfs/core/feature_group_engine.py (4 additions, 2 deletions)
@@ -69,7 +69,9 @@ def save(self, feature_group, feature_dataframe, write_options):
             feature_group,
             feature_dataframe,
             self.APPEND,
-            hudi_engine.HudiEngine.HUDI_BULK_INSERT if feature_group.time_travel_format == "HUDI" else None,
+            hudi_engine.HudiEngine.HUDI_BULK_INSERT
+            if feature_group.time_travel_format == "HUDI"
+            else None,
             feature_group.online_enabled,
             None,
             offline_write_options,
@@ -134,7 +136,7 @@ def commit_delete(feature_group, delete_df, write_options):

     def update_statistics_config(self, feature_group):
         """Update the statistics configuration of a feature group."""
-        self._feature_group_api.update_statistics_config(
+        self._feature_group_api.update_metadata(
             feature_group, feature_group, "updateStatsSettings"
        )

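The second hunk swaps the specialized `update_statistics_config` API call for a generic `update_metadata` endpoint toggled by a query parameter. A hedged sketch of what that API method plausibly looks like, following the request-building style of `training_dataset_api.py` below; the signature is inferred solely from this call site, not from the verified implementation:

```python
from hsfs import client


class FeatureGroupApi:
    def update_metadata(self, fg_instance, fg_copy, query_parameter):
        """Sketch: PUT the updated copy, with the update type as a query parameter.

        Inferred from the call update_metadata(feature_group, feature_group,
        "updateStatsSettings") above; illustrative only.
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            fg_instance.feature_store_id,
            "featuregroups",
            fg_instance.id,
        ]
        return fg_instance.update_from_response_json(
            _client._send_request(
                "PUT",
                path_params,
                query_params={query_parameter: True},
                headers={"content-type": "application/json"},
                data=fg_copy.json(),
            )
        )
```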
python/hsfs/core/training_dataset_api.py (3 additions, 2 deletions)
@@ -55,7 +55,7 @@ def get(self, name, version):
             _client._send_request("GET", path_params, query_params)[0],
         )

-    def get_query(self, training_dataset_instance):
+    def get_query(self, training_dataset_instance, with_label):
         _client = client.get_instance()
         path_params = [
             "project",
@@ -66,4 +66,5 @@ def get_query(self, training_dataset_instance):
             training_dataset_instance.id,
             "query",
         ]
-        return _client._send_request("GET", path_params)
+        query_params = {"withLabel": with_label}
+        return _client._send_request("GET", path_params, query_params)
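Illustratively, the label flag now travels as a `withLabel` query parameter on the existing query endpoint; the ids in the comment below are invented:

```python
# Sketch of the call path (project/feature store/dataset ids are made up):
# GET /project/119/featurestores/67/trainingdatasets/9/query?withLabel=True
response = training_dataset_api.get_query(td, with_label=True)
offline_sql = response["query"]       # batch SQL, label columns included
online_sql = response["queryOnline"]  # online-storage variant
```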
python/hsfs/core/training_dataset_engine.py (2 additions, 2 deletions)
@@ -74,8 +74,8 @@ def read(self, training_dataset, split, user_read_options):
             path,
         )

-    def query(self, training_dataset, online):
-        return self._training_dataset_api.get_query(training_dataset)[
+    def query(self, training_dataset, online, with_label):
+        return self._training_dataset_api.get_query(training_dataset, with_label)[
            "queryOnline" if online else "query"
        ]

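Presumably the public `TrainingDataset.get_query` forwards through this engine method; a hedged usage sketch (the public parameter wiring is assumed, as it is not shown in this diff):

```python
# Assumed wiring: TrainingDataset.get_query -> engine.query -> api.get_query
td = fs.get_training_dataset("sales_model_td", version=1)
training_sql = td.get_query(online=False, with_label=True)  # keep label for training
serving_sql = td.get_query(online=True, with_label=False)   # drop label at inference
```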
python/hsfs/feature.py (12 additions, 0 deletions)
@@ -55,6 +55,7 @@ def from_response_json(cls, json_dict):

     @property
     def name(self):
+        """Name of the feature."""
         return self._name

     @name.setter
@@ -63,6 +64,13 @@ def name(self, name):

     @property
     def type(self):
+        """Data type of the feature in the feature store.
+
+        !!! danger "Not a Python type"
+            This type property is not to be confused with Python types.
+            The type property represents the actual data type of the feature in
+            the feature store.
+        """
         return self._type

     @type.setter
@@ -71,6 +79,7 @@ def type(self, type):

     @property
     def primary(self):
+        """Whether the feature is part of the primary key of the feature group."""
         return self._primary

     @primary.setter
@@ -79,6 +88,7 @@ def primary(self, primary):

     @property
     def partition(self):
+        """Whether the feature is part of the partition key of the feature group."""
         return self._partition

     @partition.setter
@@ -87,6 +97,8 @@ def partition(self, partition):

     @property
     def default_value(self):
+        """Default value of the feature as string, if the feature was appended to the
+        feature group."""
         return self._default_value

     @default_value.setter
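With the properties now documented, inspecting feature metadata looks roughly like this (the feature group name is invented for illustration):

```python
fg = fs.get_feature_group("sales_fg", version=1)
for f in fg.features:
    # f.type is the feature store's data type (e.g. "bigint"), not a Python type
    print(f.name, f.type, f.primary, f.partition, f.default_value)
```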
python/hsfs/feature_store.py (83 additions, 13 deletions)
@@ -16,9 +16,15 @@

 import warnings
 import humps
-from typing import Optional, Union, List
+from typing import Optional, Union, List, Dict

-from hsfs import training_dataset, feature_group, util, training_dataset_feature
+from hsfs import (
+    training_dataset,
+    feature_group,
+    util,
+    storage_connector,
+    training_dataset_feature,
+)
 from hsfs.core import (
     feature_group_api,
     storage_connector_api,
@@ -163,10 +169,10 @@ def create_feature_group(
         DataFrame.

         # Arguments
-            name: Name of the feature group to get.
+            name: Name of the feature group to create.
             version: Version of the feature group to retrieve, defaults to `None` and
                 will create the feature group with incremented version from the last
-                verison in the feature store.
+                version in the feature store.
             description: A string describing the contents of the feature group to
                 improve discoverability for Data Scientists, defaults to empty string
                 `""`.
@@ -209,16 +215,79 @@

     def create_training_dataset(
         self,
-        name,
-        version=None,
-        description="",
-        data_format="tfrecords",
-        storage_connector=None,
-        splits={},
-        location="",
-        seed=None,
-        statistics_config=None,
+        name: str,
+        version: Optional[int] = None,
+        description: Optional[str] = "",
+        data_format: Optional[str] = "tfrecords",
+        storage_connector: Optional[storage_connector.StorageConnector] = None,
+        splits: Optional[Dict[str, float]] = {},
+        location: Optional[str] = "",
+        seed: Optional[int] = None,
+        statistics_config: Optional[Union[StatisticsConfig, bool, dict]] = None,
+        label: Optional[List[str]] = [],
     ):
+        """Create a training dataset metadata object.
+
+        !!! note "Lazy"
+            This method is lazy and does not persist any metadata or feature data in the
+            feature store on its own. To materialize the training dataset and save
+            feature data along with the metadata in the feature store, call the `save()`
+            method with a `DataFrame` or `Query`.
+
+        !!! info "Data Formats"
+            The feature store currently supports the following data formats for
+            training datasets:
+
+            1. tfrecord
+            2. csv
+            3. tsv
+            4. parquet
+            5. avro
+            6. orc
+
+            The petastorm, hdf5 and npy file formats are currently not supported.
+
+        # Arguments
+            name: Name of the training dataset to create.
+            version: Version of the training dataset to retrieve, defaults to `None` and
+                will create the training dataset with incremented version from the last
+                version in the feature store.
+            description: A string describing the contents of the training dataset to
+                improve discoverability for Data Scientists, defaults to empty string
+                `""`.
+            data_format: The data format used to save the training dataset,
+                defaults to `"tfrecords"`.
+            storage_connector: Storage connector defining the sink location for the
+                training dataset, defaults to `None`, and materializes the training
+                dataset on HopsFS.
+            splits: A dictionary defining training dataset splits to be created. Keys in
+                the dictionary define the name of the split as `str`, values represent
+                the percentage of samples in the split as `float`. Currently, only
+                random splits are supported. Defaults to empty dict `{}`, creating only
+                a single training dataset without splits.
+            location: Path to complement the sink storage connector with, e.g. if the
+                storage connector points to an S3 bucket, this path can be used to
+                define a sub-directory inside the bucket to place the training dataset.
+                Defaults to `""`, saving the training dataset at the root defined by the
+                storage connector.
+            seed: Optionally, define a seed to create the random splits with, in order
+                to guarantee reproducibility, defaults to `None`.
+            statistics_config: A configuration object, or a dictionary with keys
+                `"enabled"` to generally enable descriptive statistics computation for
+                this training dataset, `"correlations"` to turn on feature correlation
+                computation and `"histograms"` to compute feature value frequencies. The
+                values should be booleans indicating the setting. To fully turn off
+                statistics computation pass `statistics_config=False`. Defaults to
+                `None` and will compute only descriptive statistics.
+            label: A list of feature names constituting the prediction label/feature of
+                the training dataset. When replaying a `Query` during model inference,
+                the label features can be omitted from the feature vector retrieval.
+                Defaults to `[]`, no label.
+
+        # Returns
+            `TrainingDataset`: The training dataset metadata object.
+        """
         return training_dataset.TrainingDataset(
             name=name,
             version=version,
@@ -230,4 +230,5 @@ def create_training_dataset(
             splits=splits,
             seed=seed,
             statistics_config=statistics_config,
+            label=label,
         )
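A hedged end-to-end sketch of the new `label` argument (feature group, feature, and dataset names are invented; the surrounding calls follow the usual hsfs flow):

```python
import hsfs

connection = hsfs.connection()
fs = connection.get_feature_store()
sales_fg = fs.get_feature_group("sales_fg", version=1)

# Mark "weekly_sales" as the prediction target of this training dataset.
td = fs.create_training_dataset(
    name="sales_model_td",
    version=1,
    data_format="tfrecords",
    splits={"train": 0.8, "test": 0.2},
    seed=42,
    label=["weekly_sales"],
)
td.save(sales_fg.select_all())  # materialize from a Query

# At serving time, the query can be replayed without the label columns.
print(td.get_query(online=True, with_label=False))
```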