✨ Add files_metadata option to repo_info #951

Merged
14 commits merged on Aug 9, 2022
8 changes: 8 additions & 0 deletions docs/source/package_reference/hf_api.mdx
@@ -27,6 +27,14 @@ Using the `HfApi` class directly enables you to set a different endpoint to that

[[autodoc]] HfApi

[[autodoc]] huggingface_hub.hf_api.ModelInfo

[[autodoc]] huggingface_hub.hf_api.DatasetInfo

[[autodoc]] huggingface_hub.hf_api.SpaceInfo

[[autodoc]] huggingface_hub.hf_api.RepoFile

## Hugging Face local storage

`huggingface_hub` stores the authentication information locally so that it may be re-used in subsequent
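As a quick usage sketch of the classes documented above (the repo id is a placeholder and the snippet assumes the default huggingface.co endpoint):

```python
from huggingface_hub import HfApi

api = HfApi()

# Fetch model info together with per-file metadata (size, blob OID, LFS info).
info = api.model_info("user/repo", files_metadata=True)  # "user/repo" is a placeholder

for file in info.siblings:  # each entry is a huggingface_hub.hf_api.RepoFile
    print(file.rfilename, file.size, file.blob_id)
    if file.lfs is not None:  # only populated for files stored with Git LFS
        print("  LFS sha256:", file.lfs["sha256"])
```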
110 changes: 86 additions & 24 deletions src/huggingface_hub/hf_api.py
@@ -166,10 +166,26 @@ class RepoFile:
file name, relative to the repo root. This is the only attribute
that's guaranteed to be here, but under certain conditions there can
be other attributes as well.
size (`int`, *optional*):
The file's size, in bytes. This attribute is present when `files_metadata` argument
of [`repo_info`] is set to `True`. It's `None` otherwise.
blob_id (`str`, *optional*):
The file's git OID. This attribute is present when `files_metadata` argument
of [`repo_info`] is set to `True`. It's `None` otherwise.
lfs (`dict`, *optional*):
The file's LFS metadata (has two keys: `sha256` and `size`). This attribute is present when
`files_metadata` argument of [`repo_info`] is set to `True` and the file is stored
with Git LFS. It's `None` otherwise.
"""

def __init__(self, rfilename: str, **kwargs):
self.rfilename = rfilename # filename relative to the repo root

# Optional file metadata
self.size: Optional[int] = kwargs.pop("size", None)
self.blob_id: Optional[str] = kwargs.pop("blobId", None)
self.lfs: Optional[dict] = kwargs.pop("lfs", None)

for k, v in kwargs.items():
setattr(self, k, v)
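To illustrate the constructor above, here is a small sketch of how one entry of the server payload would map onto `RepoFile` attributes (the payload values below are made up):

```python
from huggingface_hub.hf_api import RepoFile

# Illustrative payload entry, shaped like one item of the "siblings" list
# returned by the Hub when blobs=True is requested.
payload = {
    "rfilename": "pytorch_model.bin",
    "size": 437983985,
    "blobId": "0123456789abcdef0123456789abcdef01234567",
    "lfs": {"sha256": "0" * 64, "size": 437983985},
}

file = RepoFile(**payload)
print(file.rfilename)  # "pytorch_model.bin"
print(file.size)       # taken from "size"
print(file.blob_id)    # taken from "blobId"
print(file.lfs)        # dict with "sha256" and "size" for LFS-tracked files
```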

@@ -182,7 +198,7 @@ class ModelInfo:
"""
Info about a model accessible from huggingface.co

Args:
Attributes:
modelId (`str`, *optional*):
ID of model repository.
sha (`str`, *optional*):
@@ -193,8 +209,8 @@
List of tags.
pipeline_tag (`str`, *optional*):
Pipeline tag to identify the correct widget.
siblings (`List[Dict]`, *optional*):
list of files that constitute the Space
siblings (`List[RepoFile]`, *optional*):
list of [`huggingface_hub.hf_api.RepoFile`] objects that constitute the model
private (`bool`, *optional*):
is the repo private
author (`str`, *optional*):
@@ -250,7 +266,7 @@ class DatasetInfo:
"""
Info about a dataset accessible from huggingface.co

Args:
Attributes:
id (`str`, *optional*):
ID of dataset repository.
sha (`str`, *optional*):
@@ -259,8 +275,8 @@
date of last commit to repo
tags (`List[str]`, *optional*):
List of tags.
siblings (`List[Dict]`, *optional*):
list of files that constitute the Space
siblings (`List[RepoFile]`, *optional*):
list of [`huggingface_hub.hf_api.RepoFile`] objects that constitute the dataset
private (`bool`, *optional*):
is the repo private
author (`str`, *optional*):
@@ -327,15 +343,15 @@ class SpaceInfo:
This is a "dataclass" like container that just sets on itself any attribute
passed by the server.

Args:
Attributes:
id (`str`, *optional*):
id of space
sha (`str`, *optional*):
repo sha at this particular revision
lastModified (`str`, *optional*):
date of last commit to repo
siblings (`List[Dict]`, *optional*):
list of files that constitute the Space
siblings (`List[RepoFile]`, *optional*):
list of [`huggingface_hub.hf_api.RepoFile`] objects that constitute the Space
private (`bool`, *optional*):
is the repo private
author (`str`, *optional*):
@@ -721,7 +737,7 @@ def list_models(
carbon footprint to filter the resulting models with in grams.
sort (`Literal["lastModified"]` or `str`, *optional*):
The key with which to sort the resulting models. Possible values
are the properties of the `ModelInfo` class.
are the properties of the [`huggingface_hub.hf_api.ModelInfo`] class.
direction (`Literal[-1]` or `int`, *optional*):
Direction in which to sort. The value `-1` sorts by descending
order while all other values sort by ascending order.
@@ -744,6 +760,8 @@
`huggingface_hub` cli. If not logged in, a valid `auth_token`
can be passed in as a string.

Returns: List of [`huggingface_hub.hf_api.ModelInfo`] objects

Example usage with the `filter` argument:

```python
@@ -922,7 +940,7 @@ def list_datasets(
A string that will be contained in the returned datasets.
sort (`Literal["lastModified"]` or `str`, *optional*):
The key with which to sort the resulting datasets. Possible
values are the properties of the `DatasetInfo` class.
values are the properties of the [`huggingface_hub.hf_api.DatasetInfo`] class.
direction (`Literal[-1]` or `int`, *optional*):
Direction in which to sort. The value `-1` sorts by descending
order while all other values sort by ascending order.
@@ -1102,7 +1120,7 @@ def list_spaces(
A string that will be contained in the returned Spaces.
sort (`Literal["lastModified"]` or `str`, *optional*):
The key with which to sort the resulting Spaces. Possible
values are the properties of the `SpaceInfo` class.
values are the properties of the [`huggingface_hub.hf_api.SpaceInfo`] class.
direction (`Literal[-1]` or `int`, *optional*):
Direction in which to sort. The value `-1` sorts by descending
order while all other values sort by ascending order.
@@ -1126,7 +1144,7 @@
can be passed in as a string.

Returns:
`List[SpaceInfo]`: a list of [`SpaceInfo`] objects
`List[SpaceInfo]`: a list of [`huggingface_hub.hf_api.SpaceInfo`] objects
"""
path = f"{self.endpoint}/api/spaces"
if use_auth_token:
@@ -1168,6 +1186,7 @@ def model_info(
token: Optional[str] = None,
timeout: Optional[float] = None,
securityStatus: Optional[bool] = None,
files_metadata: bool = False,
) -> ModelInfo:
"""
Get info on one specific model on huggingface.co
@@ -1188,6 +1207,9 @@
securityStatus (`bool`, *optional*):
Whether to retrieve the security status from the model
repository as well.
files_metadata (`bool`, *optional*):
Whether or not to retrieve metadata for files in the repository
(size, LFS metadata, etc.). Defaults to `False`.

Returns:
[`huggingface_hub.hf_api.ModelInfo`]: The model repository information.
@@ -1213,9 +1235,16 @@
else f"{self.endpoint}/api/models/{repo_id}/revision/{revision}"
)
headers = {"authorization": f"Bearer {token}"} if token is not None else None
status_query_param = {"securityStatus": True} if securityStatus else None
params = {}
if securityStatus:
params["securityStatus"] = True
if files_metadata:
params["blobs"] = True
r = requests.get(
path, headers=headers, timeout=timeout, params=status_query_param
path,
headers=headers,
timeout=timeout,
params=params,
)
_raise_for_status(r)
d = r.json()
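In effect, `files_metadata=True` just adds a `blobs=True` query parameter to the repo-info call; a rough equivalent with plain `requests` (placeholder repo id, no auth token) would be:

```python
import requests

ENDPOINT = "https://huggingface.co"
repo_id = "user/repo"  # placeholder

# Roughly what model_info(repo_id, files_metadata=True) requests under the hood.
r = requests.get(f"{ENDPOINT}/api/models/{repo_id}", params={"blobs": True}, timeout=10)
r.raise_for_status()
siblings = r.json().get("siblings", [])
print(siblings[:2])  # each entry now carries "size", "blobId" and, for LFS files, "lfs"
```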
@@ -1228,6 +1257,7 @@ def dataset_info(
revision: Optional[str] = None,
token: Optional[str] = None,
timeout: Optional[float] = None,
files_metadata: bool = False,
) -> DatasetInfo:
"""
Get info on one specific dataset on huggingface.co.
@@ -1245,9 +1275,12 @@
An authentication token (See https://huggingface.co/settings/token)
timeout (`float`, *optional*):
Whether to set a timeout for the request to the Hub.
files_metadata (`bool`, *optional*):
Whether or not to retrieve metadata for files in the repository
(size, LFS metadata, etc.). Defaults to `False`.

Returns:
[`DatasetInfo`]: The dataset repository information.
[`huggingface_hub.hf_api.DatasetInfo`]: The dataset repository information.

<Tip>

@@ -1270,7 +1303,11 @@ def dataset_info(
else f"{self.endpoint}/api/datasets/{repo_id}/revision/{revision}"
)
headers = {"authorization": f"Bearer {token}"} if token is not None else None
r = requests.get(path, headers=headers, timeout=timeout)
params = {}
if files_metadata:
params["blobs"] = True

r = requests.get(path, headers=headers, timeout=timeout, params=params)
_raise_for_status(r)
d = r.json()
return DatasetInfo(**d)
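The same flag works for datasets; for example, a minimal sketch that sums the per-file sizes to estimate the total repository size (placeholder dataset id):

```python
from huggingface_hub import HfApi

api = HfApi()
info = api.dataset_info("user/dataset", files_metadata=True)  # placeholder repo id

# file.size is only populated when files_metadata=True was passed.
total_bytes = sum(file.size for file in (info.siblings or []) if file.size is not None)
print(f"Approximate repo size: {total_bytes / 1e6:.1f} MB")
```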
@@ -1282,6 +1319,7 @@ def space_info(
revision: Optional[str] = None,
token: Optional[str] = None,
timeout: Optional[float] = None,
files_metadata: bool = False,
) -> SpaceInfo:
"""
Get info on one specific Space on huggingface.co.
@@ -1299,9 +1337,12 @@
An authentication token (See https://huggingface.co/settings/token)
timeout (`float`, *optional*):
Whether to set a timeout for the request to the Hub.
files_metadata (`bool`, *optional*):
Whether or not to retrieve metadata for files in the repository
(size, LFS metadata, etc.). Defaults to `False`.

Returns:
[`SpaceInfo`]: The space repository information.
[`huggingface_hub.hf_api.SpaceInfo`]: The space repository information.

<Tip>

@@ -1324,7 +1365,11 @@ def space_info(
else f"{self.endpoint}/api/spaces/{repo_id}/revision/{revision}"
)
headers = {"authorization": f"Bearer {token}"} if token is not None else None
r = requests.get(path, headers=headers, timeout=timeout)
params = {}
if files_metadata:
params["blobs"] = True

r = requests.get(path, headers=headers, timeout=timeout, params=params)
_raise_for_status(r)
d = r.json()
return SpaceInfo(**d)
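And for Spaces, where the flag can be used, for instance, to list which files are LFS-tracked (placeholder Space id):

```python
from huggingface_hub import HfApi

api = HfApi()
info = api.space_info("user/space", files_metadata=True)  # placeholder repo id

# Files stored with Git LFS expose an "lfs" dict with their sha256 and size.
lfs_files = [file.rfilename for file in (info.siblings or []) if file.lfs is not None]
print("LFS-tracked files:", lfs_files)
```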
@@ -1337,6 +1382,7 @@ def repo_info(
repo_type: Optional[str] = None,
token: Optional[str] = None,
timeout: Optional[float] = None,
files_metadata: bool = False,
) -> Union[ModelInfo, DatasetInfo, SpaceInfo]:
"""
Get the info object for a given repo of a given type.
@@ -1352,10 +1398,14 @@
An authentication token (See https://huggingface.co/settings/token)
timeout (`float`, *optional*):
Whether to set a timeout for the request to the Hub.
files_metadata (`bool`, *optional*):
Whether or not to retrieve metadata for files in the repository
(size, LFS metadata, etc.). Defaults to `False`.

Returns:
`Union[SpaceInfo, DatasetInfo, ModelInfo]`: The repository
information.
`Union[SpaceInfo, DatasetInfo, ModelInfo]`: The repository information, as a
[`huggingface_hub.hf_api.DatasetInfo`], [`huggingface_hub.hf_api.ModelInfo`]
or [`huggingface_hub.hf_api.SpaceInfo`] object.

<Tip>

@@ -1371,15 +1421,27 @@
"""
if repo_type is None or repo_type == "model":
return self.model_info(
repo_id, revision=revision, token=token, timeout=timeout
repo_id,
revision=revision,
token=token,
timeout=timeout,
files_metadata=files_metadata,
)
elif repo_type == "dataset":
return self.dataset_info(
repo_id, revision=revision, token=token, timeout=timeout
repo_id,
revision=revision,
token=token,
timeout=timeout,
files_metadata=files_metadata,
)
elif repo_type == "space":
return self.space_info(
repo_id, revision=revision, token=token, timeout=timeout
repo_id,
revision=revision,
token=token,
timeout=timeout,
files_metadata=files_metadata,
)
else:
raise ValueError("Unsupported repo type.")
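From the caller's side, the dispatch above can be exercised like this (repo ids are placeholders):

```python
from huggingface_hub import HfApi

api = HfApi()

# repo_type=None or "model" -> model_info, "dataset" -> dataset_info, "space" -> space_info.
model = api.repo_info("user/model", files_metadata=True)
dataset = api.repo_info("user/dataset", repo_type="dataset", files_metadata=True)
space = api.repo_info("user/space", repo_type="space", files_metadata=True)

for info in (model, dataset, space):
    print(type(info).__name__, len(info.siblings or []))
```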
50 changes: 50 additions & 0 deletions tests/test_hf_api.py
@@ -70,6 +70,7 @@
DUMMY_DATASET_ID_REVISION_ONE_SPECIFIC_COMMIT,
DUMMY_MODEL_ID,
DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT,
SAMPLE_DATASET_IDENTIFIER,
require_git_lfs,
retry_endpoint,
set_write_permission_and_retry,
@@ -928,6 +929,31 @@ def test_model_info_with_security(self):
{"containsInfected": False},
)

@with_production_testing
def test_model_info_with_file_metadata(self):
_api = HfApi()
model = _api.model_info(
repo_id=DUMMY_MODEL_ID,
revision=DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT,
files_metadata=True,
)
files = model.siblings
assert files is not None
self.assertListEqual(
[isinstance(file.blob_id, str) for file in files], [True] * len(files)
)
self.assertListEqual(
[isinstance(file.size, int) for file in files], [True] * len(files)
)
self.assertTrue(any([file.lfs is not None for file in files]))
self.assertListEqual(
[
file.lfs is None or isinstance(file.lfs, dict) and "sha256" in file.lfs
for file in files
],
[True] * len(files),
)

@with_production_testing
def test_list_repo_files(self):
_api = HfApi()
@@ -1083,6 +1109,30 @@ def test_dataset_info(self):
self.assertIsInstance(dataset, DatasetInfo)
self.assertEqual(dataset.sha, DUMMY_DATASET_ID_REVISION_ONE_SPECIFIC_COMMIT)

@with_production_testing
def test_dataset_info_with_file_metadata(self):
_api = HfApi()
dataset = _api.dataset_info(
repo_id=SAMPLE_DATASET_IDENTIFIER,
files_metadata=True,
)
files = dataset.siblings
assert files is not None
self.assertListEqual(
[isinstance(file.blob_id, str) for file in files], [True] * len(files)
)
self.assertListEqual(
[isinstance(file.size, int) for file in files], [True] * len(files)
)
self.assertTrue(any([file.lfs is not None for file in files]))
self.assertListEqual(
[
file.lfs is None or isinstance(file.lfs, dict) and "sha256" in file.lfs
for file in files
],
[True] * len(files),
)

def test_staging_list_metrics(self):
_api = HfApi(endpoint=ENDPOINT_STAGING)
_ = _api.list_metrics()