
Commit 3fea11d

add more docs
moritzmeister committed Oct 29, 2020
1 parent 0d22cd6 commit 3fea11d
Showing 1 changed file with 57 additions and 2 deletions.
python/hsfs/training_dataset.py: 57 additions & 2 deletions
@@ -180,7 +180,41 @@ def save(
         )
         return self
 
-    def insert(self, features, overwrite, write_options={}):
+    def insert(
+        self,
+        features: Union[
+            query.Query,
+            pd.DataFrame,
+            TypeVar("pyspark.sql.DataFrame"),  # noqa: F821
+            TypeVar("pyspark.RDD"),  # noqa: F821
+            np.ndarray,
+            List[list],
+        ],
+        overwrite: bool,
+        write_options: Optional[Dict[Any, Any]] = {},
+    ):
+        """Insert additional feature data into the training dataset.
+
+        This method appends data to the training dataset either from a Feature Store
+        `Query`, a Spark or Pandas `DataFrame`, a Spark RDD, two-dimensional Python
+        lists or Numpy ndarrays. The schemas must match for this operation.
+
+        This can also be used to overwrite all data in an existing training dataset.
+
+        # Arguments
+            features: Feature data to be materialized.
+            overwrite: Whether to overwrite the entire data in the training dataset.
+            write_options: Additional write options as key/value pairs.
+                Defaults to `{}`.
+
+        # Returns
+            `TrainingDataset`: The updated training dataset metadata object; the
+                previous `TrainingDataset` object on which you call `save` is also
+                updated.
+
+        # Raises
+            `RestAPIError`: Unable to create training dataset metadata.
+        """
         if isinstance(features, query.Query):
            feature_dataframe = features.read()
         else:
@@ -194,6 +228,18 @@ def insert(self, features, overwrite, write_options={}):
         self.compute_statistics()
 
     def read(self, split=None, read_options={}):
+        """Read the training dataset into a dataframe.
+
+        It is also possible to read only a specific split.
+
+        # Arguments
+            split: Name of the split to read, defaults to `None`, reading the entire
+                training dataset.
+            read_options: Additional read options as key/value pairs, defaults to `{}`.
+
+        # Returns
+            `DataFrame`: The Spark dataframe containing the feature data of the
+                training dataset.
+        """
         return self._training_dataset_engine.read(self, split, read_options)

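A short sketch of reading the materialized data back, reusing the assumed `td` handle from the example above; the split name "train" is illustrative.

# Read the entire training dataset as a Spark dataframe.
df_all = td.read()

# Read a single split only; "train" is an assumed split name.
df_train = td.read(split="train")

Since `read_options` defaults to `{}`, it can be omitted; any key/value pairs passed are handed through to the training dataset engine.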
     def compute_statistics(self):

@@ -237,7 +283,16 @@ def tf_data(
             cycle_length=cycle_length,
         )

-    def show(self, n, split=None):
+    def show(self, n: int, split: str = None):
+        """Show the first `n` rows of the training dataset.
+
+        You can specify a split from which to retrieve the rows.
+
+        # Arguments
+            n: Number of rows to show.
+            split: Name of the split to show, defaults to `None`, showing the first rows
+                when taking all splits together.
+        """
         self.read(split).show(n)
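A sketch of the preview helper, again with the assumed `td` handle; the split name is illustrative.

# Show the first 10 rows across all splits taken together.
td.show(10)

# Show the first 10 rows of one split.
td.show(10, split="test")

As the one-line body above shows, `show` simply delegates to `read(split)` and calls `.show(n)` on the resulting dataframe.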

     def add_tag(self, name: str, value: str = None):
