diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index aa7a919c566..b69a4203e0a 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -804,6 +804,12 @@ def from_pandas( contains `None/nan` objects, the type is set to `null`. This behavior can be avoided by constructing explicit features and passing it to this function. + Important: a dataset created with from_pandas() lives in memory + and therefore doesn't have an associated cache directory. + This may change in the future, but in the meantime if you + want to reduce memory usage you should write it to disk + and reload using e.g. save_to_disk / load_from_disk. + Args: df (`pandas.DataFrame`): Dataframe that contains the dataset. @@ -898,6 +904,12 @@ def from_dict( """ Convert `dict` to a `pyarrow.Table` to create a [`Dataset`]. + Important: a dataset created with from_dict() lives in memory + and therefore doesn't have an associated cache directory. + This may change in the future, but in the meantime if you + want to reduce memory usage you should write it to disk + and reload using e.g. save_to_disk / load_from_disk. + Args: mapping (`Mapping`): Mapping of strings to Arrays or Python lists. @@ -957,6 +969,12 @@ def from_list( Note that the keys of the first entry will be used to determine the dataset columns, regardless of what is passed to features. + Important: a dataset created with from_list() lives in memory + and therefore doesn't have an associated cache directory. + This may change in the future, but in the meantime if you + want to reduce memory usage you should write it to disk + and reload using e.g. save_to_disk / load_from_disk. + Args: mapping (`List[dict]`): A list of mappings of strings to row values. features (`Features`, optional): Dataset features.