Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add get_hf_file_metadata to fetch metadata from the Hub #1058

Merged
merged 7 commits into from
Sep 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions docs/source/package_reference/file_download.mdx
Original file line number Diff line number Diff line change
@@ -1,16 +1,34 @@
# Downloading files

## Download a single file

### hf_hub_download

[[autodoc]] huggingface_hub.hf_hub_download

[[autodoc]] huggingface_hub.snapshot_download
### hf_hub_url

[[autodoc]] huggingface_hub.hf_hub_url

## Download a snapshot of the repo

[[autodoc]] huggingface_hub.snapshot_download

## Get metadata about a file

### get_hf_file_metadata

[[autodoc]] huggingface_hub.get_hf_file_metadata

### HfFileMetadata

[[autodoc]] huggingface_hub.HfFileMetadata

## Caching

The methods displayed above are designed to work with a caching system that prevents
re-downloading files. The caching system was updated in v0.8.0 to become the central
cache-system shared across libraries that depend on the Hub.

Read the [cache-system guide](../how-to-cache) for a detailed presentation of caching
at HF.
4 changes: 4 additions & 0 deletions src/huggingface_hub/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,9 @@
"push_to_hub_fastai",
],
"file_download": [
"HfFileMetadata",
"cached_download",
"get_hf_file_metadata",
"hf_hub_download",
"hf_hub_url",
"try_to_load_from_cache",
Expand Down Expand Up @@ -298,7 +300,9 @@ def __dir__():
from .fastai_utils import _save_pretrained_fastai # noqa: F401
from .fastai_utils import from_pretrained_fastai # noqa: F401
from .fastai_utils import push_to_hub_fastai # noqa: F401
from .file_download import HfFileMetadata # noqa: F401
from .file_download import cached_download # noqa: F401
from .file_download import get_hf_file_metadata # noqa: F401
from .file_download import hf_hub_download # noqa: F401
from .file_download import hf_hub_url # noqa: F401
from .file_download import try_to_load_from_cache # noqa: F401
Expand Down
139 changes: 116 additions & 23 deletions src/huggingface_hub/file_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import tempfile
import warnings
from contextlib import contextmanager
from dataclasses import dataclass
from functools import partial
from hashlib import sha256
from pathlib import Path
Expand Down Expand Up @@ -175,6 +176,26 @@ def get_jinja_version():
REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")


@dataclass(frozen=True)
class HfFileMetadata:
    """Immutable data structure describing a file versioned on the Hub.

    Returned by [`get_hf_file_metadata`] based on a URL. Instances are frozen
    (read-only) since the metadata reflects a single server response.

    Args:
        commit_hash (`str`, *optional*):
            The commit hash the file belongs to. `None` if the server did not
            send the commit header.
        etag (`str`, *optional*):
            Etag of the file on the server. `None` if the server did not send
            an etag header.
        location (`str`):
            Location where to download the file from. Can be a Hub url or not
            (CDN), depending on whether the HEAD request was redirected.
    """

    commit_hash: Optional[str]
    etag: Optional[str]
    location: str


# Do not validate `repo_id` in `hf_hub_url` for now as the `repo_id="datasets/.../..."`
# pattern is used/advertised in Transformers examples.
# Related: https://github.com/huggingface/huggingface_hub/pull/1029
Expand Down Expand Up @@ -513,7 +534,7 @@ def http_get(
max_retries=0,
):
"""
Donwload a remote file. Do not gobble up errors, and will return errors tailored to the Hugging Face Hub.
Download a remote file. Do not gobble up errors, and will return errors tailored to the Hugging Face Hub.
"""
headers = copy.deepcopy(headers)
if resume_size > 0:
Expand Down Expand Up @@ -817,7 +838,7 @@ def _resumable_file_manager() -> "io.BufferedWriter":
return cache_path


def _normalize_etag(etag: str) -> str:
def _normalize_etag(etag: Optional[str]) -> Optional[str]:
"""Normalize ETag HTTP header, so it can be used to create nice filepaths.

The HTTP spec allows two forms of ETag:
Expand All @@ -827,11 +848,14 @@ def _normalize_etag(etag: str) -> str:
The hf.co hub guarantees to only send the second form.

Args:
etag (`str`): HTTP header
etag (`str`, *optional*): HTTP header

Returns:
`str`: string that can be used as a nice directory name.
`str` or `None`: string that can be used as a nice directory name.
Returns `None` if input is None.
"""
if etag is None:
return None
return etag.strip('"')


Expand Down Expand Up @@ -1112,19 +1136,18 @@ def hf_hub_download(
commit_hash = None
if not local_files_only:
try:
r = _request_wrapper(
method="HEAD",
url=url,
headers=headers,
allow_redirects=False,
follow_relative_redirects=True,
proxies=proxies,
timeout=etag_timeout,
)
try:
hf_raise_for_status(r)
except EntryNotFoundError:
commit_hash = r.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT)
metadata = get_hf_file_metadata(
url=url,
use_auth_token=use_auth_token,
proxies=proxies,
timeout=etag_timeout,
)
except EntryNotFoundError as http_error:
# Cache the non-existence of the file and raise
commit_hash = http_error.response.headers.get(
HUGGINGFACE_HEADER_X_REPO_COMMIT
)
if commit_hash is not None and not legacy_cache_layout:
no_exist_file_path = (
Path(storage_folder)
Expand All @@ -1138,15 +1161,17 @@ def hf_hub_download(
storage_folder, revision, commit_hash
)
raise
commit_hash = r.headers[HUGGINGFACE_HEADER_X_REPO_COMMIT]

# Commit hash must exist
commit_hash = metadata.commit_hash
if commit_hash is None:
raise OSError(
"Distant resource does not seem to be on huggingface.co (missing"
" commit header)."
)
etag = r.headers.get(HUGGINGFACE_HEADER_X_LINKED_ETAG) or r.headers.get(
"ETag"
)

# Etag must exist
etag = metadata.etag
# We favor a custom header indicating the etag of the linked resource, and
# we fallback to the regular etag header.
# If we don't have any of those, raise an error.
Expand All @@ -1155,13 +1180,13 @@ def hf_hub_download(
"Distant resource does not have an ETag, we won't be able to"
" reliably ensure reproducibility."
)
etag = _normalize_etag(etag)

# In case of a redirect, save an extra redirect on the request.get call,
# and ensure we download the exact atomic version even if it changed
# between the HEAD and the GET (unlikely, but hey).
# Useful for lfs blobs that are stored on a CDN.
if 300 <= r.status_code <= 399:
url_to_download = r.headers["Location"]
if metadata.location != url:
url_to_download = metadata.location
if (
"lfs.huggingface.co" in url_to_download
or "lfs-staging.huggingface.co" in url_to_download
Expand Down Expand Up @@ -1377,3 +1402,71 @@ def try_to_load_from_cache(

cached_file = os.path.join(repo_cache, "snapshots", revision, filename)
return cached_file if os.path.isfile(cached_file) else None


def get_hf_file_metadata(
    url: str,
    use_auth_token: Union[bool, str, None] = None,
    proxies: Optional[Dict] = None,
    timeout: Optional[float] = 10,
) -> HfFileMetadata:
    """Fetch metadata of a file versioned on the Hub for a given url.

    Args:
        url (`str`):
            File url, for example returned by [`hf_hub_url`].
        use_auth_token (`str` or `bool`, *optional*):
            A token to be used for the download.
                - If `True`, the token is read from the HuggingFace config
                  folder.
                - If `False` or `None`, no token is provided.
                - If a string, it's used as the authentication token.
        proxies (`dict`, *optional*):
            Dictionary mapping protocol to the URL of the proxy passed to
            `requests.request`.
        timeout (`float`, *optional*, defaults to 10):
            How many seconds to wait for the server to send metadata before
            giving up.

    Returns:
        A [`HfFileMetadata`] object containing metadata such as location, etag and
        commit_hash.
    """
    # TODO: helper to get headers from `use_auth_token` (copy-pasted several times)
    headers = {}
    if isinstance(use_auth_token, str):
        headers["authorization"] = f"Bearer {use_auth_token}"
    elif use_auth_token:
        token = HfFolder.get_token()
        if token is None:
            raise EnvironmentError(
                "You specified use_auth_token=True, but a huggingface token was not"
                " found."
            )
        headers["authorization"] = f"Bearer {token}"

    # Retrieve metadata with a HEAD call. Relative redirects (e.g. renamed
    # repo) are followed; absolute redirects (e.g. CDN) are not, so the
    # `Location` header stays available below.
    r = _request_wrapper(
        method="HEAD",
        url=url,
        headers=headers,
        allow_redirects=False,
        follow_relative_redirects=True,
        proxies=proxies,
        timeout=timeout,
    )
    hf_raise_for_status(r)

    # Return
    return HfFileMetadata(
        commit_hash=r.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT),
        etag=_normalize_etag(
            # We favor a custom header indicating the etag of the linked
            # resource, and we fallback to the regular etag header. For LFS
            # files the regular etag is that of the pointer file, hence the
            # linked etag must take precedence (same order as in
            # `hf_hub_download`).
            r.headers.get(HUGGINGFACE_HEADER_X_LINKED_ETAG)
            or r.headers.get("ETag")
        ),
        # Either from response headers (if redirected) or defaults to request url
        # Do not use directly `url`, as `_request_wrapper` might have followed
        # relative redirects.
        location=r.headers.get("Location") or r.request.url,
    )
39 changes: 36 additions & 3 deletions tests/test_file_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
_CACHED_NO_EXIST,
cached_download,
filename_to_url,
get_hf_file_metadata,
hf_hub_download,
hf_hub_url,
try_to_load_from_cache,
Expand All @@ -45,7 +46,8 @@
DUMMY_MODEL_ID_PINNED_SHA256,
DUMMY_MODEL_ID_REVISION_INVALID,
DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT,
DUMMY_RENAMED_MODEL_ID,
DUMMY_RENAMED_NEW_MODEL_ID,
DUMMY_RENAMED_OLD_MODEL_ID,
SAMPLE_DATASET_IDENTIFIER,
OfflineSimulationMode,
offline,
Expand Down Expand Up @@ -225,7 +227,7 @@ def test_download_from_a_renamed_repo_with_hf_hub_download(self):
"""
with TemporaryDirectory() as tmpdir:
filepath = hf_hub_download(
DUMMY_RENAMED_MODEL_ID, "config.json", cache_dir=tmpdir
DUMMY_RENAMED_OLD_MODEL_ID, "config.json", cache_dir=tmpdir
)
self.assertTrue(os.path.exists(filepath))

Expand All @@ -239,7 +241,7 @@ def test_download_from_a_renamed_repo_with_cached_download(self):
with TemporaryDirectory() as tmpdir:
filepath = cached_download(
hf_hub_url(
DUMMY_RENAMED_MODEL_ID,
DUMMY_RENAMED_OLD_MODEL_ID,
filename="config.json",
),
cache_dir=tmpdir,
Expand Down Expand Up @@ -337,3 +339,34 @@ def test_try_to_load_from_cache_no_exist(self):

# If file non-existence is not cached, returns None
self.assertIsNone(try_to_load_from_cache(DUMMY_MODEL_ID, filename="dummy2"))

def test_get_hf_file_metadata_basic(self) -> None:
    """Check metadata can be fetched for a file pinned to a specific commit."""
    file_url = hf_hub_url(
        DUMMY_MODEL_ID,
        filename=CONFIG_NAME,
        revision=DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT,
    )
    file_metadata = get_hf_file_metadata(file_url)

    # Commit hash matches the pinned revision
    self.assertEqual(
        file_metadata.commit_hash, DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT
    )
    # Server always returns an etag (example: "85c2fc2dcdd86563aaa85ef4911...")
    self.assertIsNotNone(file_metadata.etag)
    # Repo was not renamed: no redirect, location is the request url itself
    self.assertEqual(file_metadata.location, file_url)

def test_get_hf_file_metadata_from_a_renamed_repo(self) -> None:
    """Check metadata resolution follows the redirect of a renamed repo."""
    old_url = hf_hub_url(
        DUMMY_RENAMED_OLD_MODEL_ID,
        filename=CONFIG_NAME,
        subfolder="",  # Subfolder should be processed as `None`
    )
    expected_location = old_url.replace(
        DUMMY_RENAMED_OLD_MODEL_ID, DUMMY_RENAMED_NEW_MODEL_ID
    )

    # Got redirected to renamed repo
    self.assertEqual(get_hf_file_metadata(old_url).location, expected_location)
5 changes: 2 additions & 3 deletions tests/testing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,8 @@
# Sha-256 of pytorch_model.bin on the top of `main`, for checking purposes

# "hf-internal-testing/dummy-will-be-renamed" has been renamed to "hf-internal-testing/dummy-renamed"
DUMMY_RENAMED_MODEL_ID = ( # Regression test #941
"hf-internal-testing/dummy-will-be-renamed"
)
DUMMY_RENAMED_OLD_MODEL_ID = "hf-internal-testing/dummy-will-be-renamed"
DUMMY_RENAMED_NEW_MODEL_ID = "hf-internal-testing/dummy-renamed"

SAMPLE_DATASET_IDENTIFIER = "lhoestq/custom_squad"
# Example dataset ids
Expand Down