[HOPSWORKS-2091] Label/Prediction feature metadata for training datasets #126

Merged 2 commits on Nov 6, 2020
auto_doc.py (7 additions, 0 deletions)
@@ -88,6 +88,13 @@ def generate(dest_dir):
         project_url="https://github.com/logicalclocks/feature-store-api/blob/master/python",
         template_dir="./docs/templates",
         titles_size="###",
+        extra_aliases={
+            "hsfs.core.query.Query": "hsfs.Query",
+            "hsfs.storage_connector.StorageConnector": "hsfs.StorageConnector",
+            "hsfs.statistics_config.StatisticsConfig": "hsfs.StatisticsConfig",
+            "hsfs.training_dataset_feature.TrainingDatasetFeature": "hsfs.TrainingDatasetFeature",
+            "pandas.core.frame.DataFrame": "pandas.DataFrame",
+        },
     )
     shutil.copyfile(hsfs_dir / "CONTRIBUTING.md", dest_dir / "CONTRIBUTING.md")
     shutil.copyfile(hsfs_dir / "README.md", dest_dir / "index.md")
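For context, `extra_aliases` lets the docs generator render fully qualified class paths under their shorter public aliases, so cross-referenced types read as `hsfs.Query` rather than `hsfs.core.query.Query`. A minimal sketch of how such a mapping is consumed, assuming `auto_doc.py` builds a keras-autodoc `DocumentationGenerator` (the `pages` value below is invented for illustration):

```python
from keras_autodoc import DocumentationGenerator

# Hypothetical page layout; only the extra_aliases entry mirrors this PR.
doc_generator = DocumentationGenerator(
    pages={"training_dataset.md": ["hsfs.feature_store.FeatureStore.create_training_dataset"]},
    template_dir="./docs/templates",
    titles_size="###",
    # Signatures that mention hsfs.core.query.Query now render as hsfs.Query.
    extra_aliases={"hsfs.core.query.Query": "hsfs.Query"},
)
doc_generator.generate("./docs/generated")
```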
python/hsfs/core/feature_group_engine.py (4 additions, 2 deletions)
@@ -69,7 +69,9 @@ def save(self, feature_group, feature_dataframe, write_options):
             feature_group,
             feature_dataframe,
             self.APPEND,
-            hudi_engine.HudiEngine.HUDI_BULK_INSERT if feature_group.time_travel_format == "HUDI" else None,
+            hudi_engine.HudiEngine.HUDI_BULK_INSERT
+            if feature_group.time_travel_format == "HUDI"
+            else None,
             feature_group.online_enabled,
             None,
             offline_write_options,
@@ -134,7 +136,7 @@ def commit_delete(feature_group, delete_df, write_options):

     def update_statistics_config(self, feature_group):
         """Update the statistics configuration of a feature group."""
-        self._feature_group_api.update_statistics_config(
+        self._feature_group_api.update_metadata(
             feature_group, feature_group, "updateStatsSettings"
        )

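The second hunk swaps the specialized `update_statistics_config` API call for a generic `update_metadata` endpoint toggled by a query parameter. A hedged sketch of what that API method plausibly looks like, following the request-building style of `training_dataset_api.py` below; the signature is inferred solely from this call site, not from the verified implementation:

```python
from hsfs import client


class FeatureGroupApi:
    def update_metadata(self, fg_instance, fg_copy, query_parameter):
        """Sketch: PUT the updated copy, with the update type as a query parameter.

        Inferred from the call update_metadata(feature_group, feature_group,
        "updateStatsSettings") above; illustrative only.
        """
        _client = client.get_instance()
        path_params = [
            "project",
            _client._project_id,
            "featurestores",
            fg_instance.feature_store_id,
            "featuregroups",
            fg_instance.id,
        ]
        return fg_instance.update_from_response_json(
            _client._send_request(
                "PUT",
                path_params,
                query_params={query_parameter: True},
                headers={"content-type": "application/json"},
                data=fg_copy.json(),
            )
        )
```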
python/hsfs/core/training_dataset_api.py (3 additions, 2 deletions)
@@ -55,7 +55,7 @@ def get(self, name, version):
             _client._send_request("GET", path_params, query_params)[0],
         )

-    def get_query(self, training_dataset_instance):
+    def get_query(self, training_dataset_instance, with_label):
         _client = client.get_instance()
         path_params = [
             "project",
@@ -66,4 +66,5 @@ def get_query(self, training_dataset_instance):
             training_dataset_instance.id,
             "query",
         ]
-        return _client._send_request("GET", path_params)
+        query_params = {"withLabel": with_label}
+        return _client._send_request("GET", path_params, query_params)
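Illustratively, the label flag now travels as a `withLabel` query parameter on the existing query endpoint; the ids in the comment below are invented:

```python
# Sketch of the call path (project/feature store/dataset ids are made up):
# GET /project/119/featurestores/67/trainingdatasets/9/query?withLabel=True
response = training_dataset_api.get_query(td, with_label=True)
offline_sql = response["query"]       # batch SQL, label columns included
online_sql = response["queryOnline"]  # online-storage variant
```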
python/hsfs/core/training_dataset_engine.py (2 additions, 2 deletions)
@@ -74,8 +74,8 @@ def read(self, training_dataset, split, user_read_options):
             path,
         )

-    def query(self, training_dataset, online):
-        return self._training_dataset_api.get_query(training_dataset)[
+    def query(self, training_dataset, online, with_label):
+        return self._training_dataset_api.get_query(training_dataset, with_label)[
            "queryOnline" if online else "query"
        ]

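Presumably the public `TrainingDataset.get_query` forwards through this engine method; a hedged usage sketch (the public parameter wiring is assumed, as it is not shown in this diff):

```python
# Assumed wiring: TrainingDataset.get_query -> engine.query -> api.get_query
td = fs.get_training_dataset("sales_model_td", version=1)
training_sql = td.get_query(online=False, with_label=True)  # keep label for training
serving_sql = td.get_query(online=True, with_label=False)   # drop label at inference
```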
python/hsfs/feature.py (12 additions, 0 deletions)
@@ -55,6 +55,7 @@ def from_response_json(cls, json_dict):

     @property
     def name(self):
+        """Name of the feature."""
         return self._name

     @name.setter
@@ -63,6 +64,13 @@ def name(self, name):

     @property
     def type(self):
+        """Data type of the feature in the feature store.
+
+        !!! danger "Not a Python type"
+            This type property is not to be confused with Python types.
+            The type property represents the actual data type of the feature in
+            the feature store.
+        """
         return self._type

     @type.setter
@@ -71,6 +79,7 @@ def type(self, type):

     @property
     def primary(self):
+        """Whether the feature is part of the primary key of the feature group."""
         return self._primary

     @primary.setter
@@ -79,6 +88,7 @@ def primary(self, primary):

     @property
     def partition(self):
+        """Whether the feature is part of the partition key of the feature group."""
         return self._partition

     @partition.setter
@@ -87,6 +97,8 @@ def partition(self, partition):

     @property
     def default_value(self):
+        """Default value of the feature as string, if the feature was appended to the
+        feature group."""
         return self._default_value

     @default_value.setter
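With the properties now documented, inspecting feature metadata looks roughly like this (the feature group name is invented for illustration):

```python
fg = fs.get_feature_group("sales_fg", version=1)
for f in fg.features:
    # f.type is the feature store's data type (e.g. "bigint"), not a Python type
    print(f.name, f.type, f.primary, f.partition, f.default_value)
```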
python/hsfs/feature_store.py (83 additions, 13 deletions)
@@ -16,9 +16,15 @@

 import warnings
 import humps
-from typing import Optional, Union, List
+from typing import Optional, Union, List, Dict

-from hsfs import training_dataset, feature_group, util, training_dataset_feature
+from hsfs import (
+    training_dataset,
+    feature_group,
+    util,
+    storage_connector,
+    training_dataset_feature,
+)
 from hsfs.core import (
     feature_group_api,
     storage_connector_api,
@@ -163,10 +169,10 @@ def create_feature_group(
         DataFrame.

         # Arguments
-            name: Name of the feature group to get.
+            name: Name of the feature group to create.
             version: Version of the feature group to retrieve, defaults to `None` and
                 will create the feature group with incremented version from the last
-                verison in the feature store.
+                version in the feature store.
             description: A string describing the contents of the feature group to
                 improve discoverability for Data Scientists, defaults to empty string
                 `""`.
@@ -209,16 +215,79 @@

     def create_training_dataset(
         self,
-        name,
-        version=None,
-        description="",
-        data_format="tfrecords",
-        storage_connector=None,
-        splits={},
-        location="",
-        seed=None,
-        statistics_config=None,
+        name: str,
+        version: Optional[int] = None,
+        description: Optional[str] = "",
+        data_format: Optional[str] = "tfrecords",
+        storage_connector: Optional[storage_connector.StorageConnector] = None,
+        splits: Optional[Dict[str, float]] = {},
+        location: Optional[str] = "",
+        seed: Optional[int] = None,
+        statistics_config: Optional[Union[StatisticsConfig, bool, dict]] = None,
+        label: Optional[List[str]] = [],
     ):
+        """Create a training dataset metadata object.
+
+        !!! note "Lazy"
+            This method is lazy and does not persist any metadata or feature data in the
+            feature store on its own. To materialize the training dataset and save
+            feature data along with the metadata in the feature store, call the `save()`
+            method with a `DataFrame` or `Query`.
+
+        !!! info "Data Formats"
+            The feature store currently supports the following data formats for
+            training datasets:
+
+            1. tfrecord
+            2. csv
+            3. tsv
+            4. parquet
+            5. avro
+            6. orc
+
+            The petastorm, hdf5 and npy file formats are currently not supported.
+
+        # Arguments
+            name: Name of the training dataset to create.
+            version: Version of the training dataset to retrieve, defaults to `None` and
+                will create the training dataset with incremented version from the last
+                version in the feature store.
+            description: A string describing the contents of the training dataset to
+                improve discoverability for Data Scientists, defaults to empty string
+                `""`.
+            data_format: The data format used to save the training dataset,
+                defaults to `"tfrecords"`.
+            storage_connector: Storage connector defining the sink location for the
+                training dataset, defaults to `None`, and materializes the training
+                dataset on HopsFS.
+            splits: A dictionary defining training dataset splits to be created. Keys in
+                the dictionary define the name of the split as `str`, values represent
+                the percentage of samples in the split as `float`. Currently, only
+                random splits are supported. Defaults to empty dict `{}`, creating only
+                a single training dataset without splits.
+            location: Path to complement the sink storage connector with, e.g. if the
+                storage connector points to an S3 bucket, this path can be used to
+                define a sub-directory inside the bucket to place the training dataset.
+                Defaults to `""`, saving the training dataset at the root defined by the
+                storage connector.
+            seed: Optionally, define a seed to create the random splits with, in order
+                to guarantee reproducibility, defaults to `None`.
+            statistics_config: A configuration object, or a dictionary with keys
+                `"enabled"` to generally enable descriptive statistics computation for
+                this training dataset, `"correlations"` to turn on feature correlation
+                computation and `"histograms"` to compute feature value frequencies. The
+                values should be booleans indicating the setting. To fully turn off
+                statistics computation pass `statistics_config=False`. Defaults to
+                `None` and will compute only descriptive statistics.
+            label: A list of feature names constituting the prediction label/feature of
+                the training dataset. When replaying a `Query` during model inference,
+                the label features can be omitted from the feature vector retrieval.
+                Defaults to `[]`, no label.
+
+        # Returns
+            `TrainingDataset`: The training dataset metadata object.
+        """
         return training_dataset.TrainingDataset(
             name=name,
             version=version,
@@ -230,4 +230,5 @@ def create_training_dataset(
             splits=splits,
             seed=seed,
             statistics_config=statistics_config,
+            label=label,
         )
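A hedged end-to-end sketch of the new `label` argument (feature group, feature, and dataset names are invented; the surrounding calls follow the usual hsfs flow):

```python
import hsfs

connection = hsfs.connection()
fs = connection.get_feature_store()
sales_fg = fs.get_feature_group("sales_fg", version=1)

# Mark "weekly_sales" as the prediction target of this training dataset.
td = fs.create_training_dataset(
    name="sales_model_td",
    version=1,
    data_format="tfrecords",
    splits={"train": 0.8, "test": 0.2},
    seed=42,
    label=["weekly_sales"],
)
td.save(sales_fg.select_all())  # materialize from a Query

# At serving time, the query can be replayed without the label columns.
print(td.get_query(online=True, with_label=False))
```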