[HOPSWORKS-2077] add support for reading csv files as tf data object …

…and rename feeder to tf_data (#113)
logicalclocks · Oct 28, 2020 · 0b27bdb · 0b27bdb
1 parent 33ab960
commit 0b27bdb
Show file tree

Hide file tree

Showing 13 changed files with 786 additions and 371 deletions.
diff --git a/auto_doc.py b/auto_doc.py
@@ -35,6 +35,24 @@
             ],
         ),
     },
+    "training_dataset.md": {
+        "td_create": ["hsfs.feature_store.FeatureStore.create_training_dataset"],
+        "td_get": ["hsfs.feature_store.FeatureStore.get_training_dataset"],
+        "td_properties": keras_autodoc.get_properties(
+            "hsfs.training_dataset.TrainingDataset"
+        ),
+        "td_methods": keras_autodoc.get_methods(
+            "hsfs.training_dataset.TrainingDataset",
+            exclude=[
+                "from_response_json",
+                "update_from_response_json",
+                "json",
+                "to_dict",
+            ],
+        ),
+        "tf_record_dataset": ["hsfs.core.tfdata_engine.TFDataEngine.tf_record_dataset"],
+        "tf_csv_dataset": ["hsfs.core.tfdata_engine.TFDataEngine.tf_csv_dataset"],
+    },
     "api/connection_api.md": {
         "connection": ["hsfs.connection.Connection"],
         "connection_properties": keras_autodoc.get_properties(

diff --git a/docs/assets/images/attach_tags.gif b/docs/assets/images/attach_tags.gif
diff --git a/docs/assets/images/creating_tags.gif b/docs/assets/images/creating_tags.gif
diff --git a/docs/templates/training_dataset.md b/docs/templates/training_dataset.md
@@ -0,0 +1,54 @@
+# Training Dataset
+
+The training dataset abstraction in Hopsworks Feature Store allows users to group a set of features (potentially from
+multiple different feature groups) with labels for training a model to do a particular prediction task. The training
+dataset is a versioned and managed dataset and is stored in HopsFS as `tfrecords`, `parquet`, `csv`, or `tsv` files.
+
+## Versioning
+
+Training datasets can be versioned. Data scientists should use the version to indicate changes to the model, as well as
+to the schema or the feature engineering logic of the features associated with this training dataset.
+
+## Creation
+
+To create a training dataset, the user supplies a Pandas, NumPy or Spark dataframe with features and labels
+together with metadata. Once the training dataset has been created, the dataset is discoverable in the feature registry
+and users can use it to train models.
+
+{{td_create}}
+
+## Tagging Training Datasets
+The feature store enables users to attach tags to training datasets in order to make them discoverable across feature
+stores. A tag is a simple {key: value} association that provides additional information about the data, for example its
+geographic origin. This is useful in an organization because it makes data easier for data scientists to discover and
+reduces duplicated work, for example in data preparation. The tagging feature is only available in the enterprise version.
+
+#### Define tags that can be attached
+The first step is to define a set of tags that can be attached, for example “Country” to tag data as being from
+a certain geographic location and “Sport” to further associate a type of sport with the data.
+
+![Define tags that can be attached](../../assets/images/creating_tags.gif)
+
+#### Attach tags using the UI
+Tags can then be attached using the feature store UI or programmatically using the API.
+The example below shows tags being attached to a feature group.
+
+![Attach tags using the UI](../../assets/images/attach_tags.gif)
+
+## Retrieval
+
+{{td_get}}
+
+## Properties
+
+{{td_properties}}
+
+## Methods
+
+{{td_methods}}
+
+## TFData engine
+
+{{tf_record_dataset}}
+
+{{tf_csv_dataset}}
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -22,7 +22,7 @@ nav:
     - Feature Store: generated/feature_store.md
     - Feature Group: generated/feature_group.md
     - Feature: guides/feature.md
-    - Training Dataset: guides/training_dataset.md
+    - Training Dataset: generated/training_dataset.md
     - Dataframe vs. Query: guides/programming_interface.md
     - Statistics: guides/statistics.md
     - Data Validation: guides/data_validation.md