Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Add files_metadata option to repo_info #951

Merged
merged 14 commits into from
Aug 9, 2022
8 changes: 8 additions & 0 deletions docs/source/package_reference/hf_api.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@ Using the `HfApi` class directly enables you to set a different endpoint to that

[[autodoc]] HfApi

[[autodoc]] huggingface_hub.hf_api.ModelInfo

[[autodoc]] huggingface_hub.hf_api.DatasetInfo

[[autodoc]] huggingface_hub.hf_api.SpaceInfo

[[autodoc]] huggingface_hub.hf_api.RepoFile

## Hugging Face local storage

`huggingface_hub` stores the authentication information locally so that it may be re-used in subsequent
Expand Down
131 changes: 102 additions & 29 deletions src/huggingface_hub/hf_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@


if sys.version_info >= (3, 8):
from typing import Literal
from typing import Literal, TypedDict
else:
from typing_extensions import Literal
from typing_extensions import Literal, TypedDict


REGEX_DISCUSSION_URL = re.compile(r".*/discussions/(\d+)$")
Expand Down Expand Up @@ -156,20 +156,47 @@ def repo_type_and_id_from_hf_id(
return repo_type, namespace, repo_id


class BlobLfsInfo(TypedDict, total=False):
size: int
sha256: str


class RepoFile:
"""
Data structure that represents a public file inside a repo, accessible from
huggingface.co
Data structure that represents a public file inside a repo, accessible from
huggingface.co
Wauplin marked this conversation as resolved.
Show resolved Hide resolved

Args:
rfilename (str):
file name, relative to the repo root. This is the only attribute
that's guaranteed to be here, but under certain conditions there can
be other attributes as well.
size (`int`, *optional*):
The file's size, in bytes. This attribute is present when `files_metadata` argument
of [`repo_info`] is set to `True`. It's `None` otherwise.
blob_id (`str`, *optional*):
The file's git OID. This attribute is present when `files_metadata` argument
of [`repo_info`] is set to `True`. It's `None` otherwise.
lfs (`BlobLfsInfo`, *optional*):
The file's LFS metadata. This attribute is present when `files_metadata` argument
of [`repo_info`] is set to `True` and the file is stored with Git LFS. It's `None` otherwise.
"""

def __init__(self, rfilename: str, **kwargs):
def __init__(
self,
rfilename: str,
size: Optional[int] = None,
blobId: Optional[str] = None,
lfs: Optional[BlobLfsInfo] = None,
**kwargs,
):
self.rfilename = rfilename # filename relative to the repo root

# Optional file metadata
self.size = size
self.blob_id = blobId
self.lfs = lfs

for k, v in kwargs.items():
setattr(self, k, v)
Wauplin marked this conversation as resolved.
Show resolved Hide resolved

Expand All @@ -182,7 +209,7 @@ class ModelInfo:
"""
Info about a model accessible from huggingface.co

Args:
Attributes:
modelId (`str`, *optional*):
ID of model repository.
sha (`str`, *optional*):
Expand All @@ -193,8 +220,8 @@ class ModelInfo:
List of tags.
pipeline_tag (`str`, *optional*):
Pipeline tag to identify the correct widget.
siblings (`List[Dict]`, *optional*):
list of files that constitute the Space
siblings (`List[RepoFile]`, *optional*):
list of [`huggingface_hub.hf_api.RepoFile`] objects that constitute the model.
private (`bool`, *optional*):
is the repo private
author (`str`, *optional*):
Expand Down Expand Up @@ -250,7 +277,7 @@ class DatasetInfo:
"""
Info about a dataset accessible from huggingface.co

Args:
Attributes:
id (`str`, *optional*):
ID of dataset repository.
sha (`str`, *optional*):
Expand All @@ -259,8 +286,8 @@ class DatasetInfo:
date of last commit to repo
tags (`List[str]`, *optional*):
List of tags.
siblings (`List[Dict]`, *optional*):
list of files that constitute the Space
siblings (`List[RepoFile]`, *optional*):
list of [`huggingface_hub.hf_api.RepoFile`] objects that constitute the dataset.
private (`bool`, *optional*):
is the repo private
author (`str`, *optional*):
Expand Down Expand Up @@ -327,15 +354,15 @@ class SpaceInfo:
This is a "dataclass" like container that just sets on itself any attribute
passed by the server.

Args:
Attributes:
id (`str`, *optional*):
id of space
sha (`str`, *optional*):
repo sha at this particular revision
lastModified (`str`, *optional*):
date of last commit to repo
siblings (`List[Dict]`, *optional*):
list of files that constitute the Space
siblings (`List[RepoFile]`, *optional*):
list of [`huggingface_hub.hf_api.RepoFile`] objects that constitute the Space
private (`bool`, *optional*):
is the repo private
author (`str`, *optional*):
Expand Down Expand Up @@ -721,7 +748,7 @@ def list_models(
carbon footprint to filter the resulting models with in grams.
sort (`Literal["lastModified"]` or `str`, *optional*):
The key with which to sort the resulting models. Possible values
are the properties of the `ModelInfo` class.
are the properties of the [`huggingface_hub.hf_api.ModelInfo`] class.
direction (`Literal[-1]` or `int`, *optional*):
Direction in which to sort. The value `-1` sorts by descending
order while all other values sort by ascending order.
Expand All @@ -744,6 +771,8 @@ def list_models(
`huggingface_hub` cli. If not logged in, a valid `auth_token`
can be passed in as a string.

Returns: List of [`huggingface_hub.hf_api.ModelInfo`] objects

Example usage with the `filter` argument:

```python
Expand Down Expand Up @@ -922,7 +951,7 @@ def list_datasets(
A string that will be contained in the returned datasets.
sort (`Literal["lastModified"]` or `str`, *optional*):
The key with which to sort the resulting datasets. Possible
values are the properties of the `DatasetInfo` class.
values are the properties of the [`huggingface_hub.hf_api.DatasetInfo`] class.
direction (`Literal[-1]` or `int`, *optional*):
Direction in which to sort. The value `-1` sorts by descending
order while all other values sort by ascending order.
Expand Down Expand Up @@ -1102,7 +1131,7 @@ def list_spaces(
A string that will be contained in the returned Spaces.
sort (`Literal["lastModified"]` or `str`, *optional*):
The key with which to sort the resulting Spaces. Possible
values are the properties of the `SpaceInfo` class.
values are the properties of the [`huggingface_hub.hf_api.SpaceInfo`] class.
direction (`Literal[-1]` or `int`, *optional*):
Direction in which to sort. The value `-1` sorts by descending
order while all other values sort by ascending order.
Expand All @@ -1126,7 +1155,7 @@ def list_spaces(
can be passed in as a string.

Returns:
`List[SpaceInfo]`: a list of [`SpaceInfo`] objects
`List[SpaceInfo]`: a list of [`huggingface_hub.hf_api.SpaceInfo`] objects
"""
path = f"{self.endpoint}/api/spaces"
if use_auth_token:
Expand Down Expand Up @@ -1168,6 +1197,7 @@ def model_info(
token: Optional[str] = None,
timeout: Optional[float] = None,
securityStatus: Optional[bool] = None,
files_metadata: bool = False,
) -> ModelInfo:
"""
Get info on one specific model on huggingface.co
Expand All @@ -1188,6 +1218,9 @@ def model_info(
securityStatus (`bool`, *optional*):
Whether to retrieve the security status from the model
repository as well.
files_metadata (`bool`, *optional*):
Whether or not to retrieve metadata for files in the repository
(size, LFS metadata, etc). Defaults to `False`.

Returns:
[`huggingface_hub.hf_api.ModelInfo`]: The model repository information.
Expand All @@ -1213,9 +1246,16 @@ def model_info(
else f"{self.endpoint}/api/models/{repo_id}/revision/{revision}"
)
headers = {"authorization": f"Bearer {token}"} if token is not None else None
status_query_param = {"securityStatus": True} if securityStatus else None
params = {}
if securityStatus:
params["securityStatus"] = True
if files_metadata:
params["blobs"] = True
r = requests.get(
path, headers=headers, timeout=timeout, params=status_query_param
path,
headers=headers,
timeout=timeout,
params=params,
)
_raise_for_status(r)
d = r.json()
Expand All @@ -1228,6 +1268,7 @@ def dataset_info(
revision: Optional[str] = None,
token: Optional[str] = None,
timeout: Optional[float] = None,
files_metadata: bool = False,
) -> DatasetInfo:
"""
Get info on one specific dataset on huggingface.co.
Expand All @@ -1245,9 +1286,12 @@ def dataset_info(
An authentication token (See https://huggingface.co/settings/token)
timeout (`float`, *optional*):
Timeout, in seconds, for the request to the Hub.
files_metadata (`bool`, *optional*):
Whether or not to retrieve metadata for files in the repository
(size, LFS metadata, etc). Defaults to `False`.

Returns:
[`DatasetInfo`]: The dataset repository information.
[`huggingface_hub.hf_api.DatasetInfo`]: The dataset repository information.

<Tip>

Expand All @@ -1270,7 +1314,11 @@ def dataset_info(
else f"{self.endpoint}/api/datasets/{repo_id}/revision/{revision}"
)
headers = {"authorization": f"Bearer {token}"} if token is not None else None
r = requests.get(path, headers=headers, timeout=timeout)
params = {}
if files_metadata:
params["blobs"] = True

r = requests.get(path, headers=headers, timeout=timeout, params=params)
_raise_for_status(r)
d = r.json()
return DatasetInfo(**d)
Expand All @@ -1282,6 +1330,7 @@ def space_info(
revision: Optional[str] = None,
token: Optional[str] = None,
timeout: Optional[float] = None,
files_metadata: bool = False,
) -> SpaceInfo:
"""
Get info on one specific Space on huggingface.co.
Expand All @@ -1299,9 +1348,12 @@ def space_info(
An authentication token (See https://huggingface.co/settings/token)
timeout (`float`, *optional*):
Timeout, in seconds, for the request to the Hub.
files_metadata (`bool`, *optional*):
Whether or not to retrieve metadata for files in the repository
(size, LFS metadata, etc). Defaults to `False`.

Returns:
[`SpaceInfo`]: The space repository information.
[`huggingface_hub.hf_api.SpaceInfo`]: The space repository information.

<Tip>

Expand All @@ -1324,7 +1376,11 @@ def space_info(
else f"{self.endpoint}/api/spaces/{repo_id}/revision/{revision}"
)
headers = {"authorization": f"Bearer {token}"} if token is not None else None
r = requests.get(path, headers=headers, timeout=timeout)
params = {}
if files_metadata:
params["blobs"] = True

r = requests.get(path, headers=headers, timeout=timeout, params=params)
_raise_for_status(r)
d = r.json()
return SpaceInfo(**d)
Expand All @@ -1337,6 +1393,7 @@ def repo_info(
repo_type: Optional[str] = None,
token: Optional[str] = None,
timeout: Optional[float] = None,
files_metadata: bool = False,
) -> Union[ModelInfo, DatasetInfo, SpaceInfo]:
"""
Get the info object for a given repo of a given type.
Expand All @@ -1352,10 +1409,14 @@ def repo_info(
An authentication token (See https://huggingface.co/settings/token)
timeout (`float`, *optional*):
Timeout, in seconds, for the request to the Hub.
files_metadata (`bool`, *optional*):
Whether or not to retrieve metadata for files in the repository
(size, LFS metadata, etc). Defaults to `False`.

Returns:
`Union[SpaceInfo, DatasetInfo, ModelInfo]`: The repository
information.
`Union[SpaceInfo, DatasetInfo, ModelInfo]`: The repository information, as a
[`huggingface_hub.hf_api.DatasetInfo`], [`huggingface_hub.hf_api.ModelInfo`]
or [`huggingface_hub.hf_api.SpaceInfo`] object.
Wauplin marked this conversation as resolved.
Show resolved Hide resolved

<Tip>

Expand All @@ -1371,15 +1432,27 @@ def repo_info(
"""
if repo_type is None or repo_type == "model":
return self.model_info(
repo_id, revision=revision, token=token, timeout=timeout
repo_id,
revision=revision,
token=token,
timeout=timeout,
files_metadata=files_metadata,
)
elif repo_type == "dataset":
return self.dataset_info(
repo_id, revision=revision, token=token, timeout=timeout
repo_id,
revision=revision,
token=token,
timeout=timeout,
files_metadata=files_metadata,
)
elif repo_type == "space":
return self.space_info(
repo_id, revision=revision, token=token, timeout=timeout
repo_id,
revision=revision,
token=token,
timeout=timeout,
files_metadata=files_metadata,
)
else:
raise ValueError("Unsupported repo type.")
Expand Down
Loading