From f4967036e1a93d0c6e8c1ec73bf76bc55ce44f1c Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 16 Sep 2022 13:20:40 +0200 Subject: [PATCH 1/7] Add get_hf_file_metadata to fetch metadata from the Hub --- .../package_reference/file_download.mdx | 22 ++- src/huggingface_hub/__init__.py | 4 + src/huggingface_hub/file_download.py | 135 +++++++++++++++--- 3 files changed, 136 insertions(+), 25 deletions(-) diff --git a/docs/source/package_reference/file_download.mdx b/docs/source/package_reference/file_download.mdx index 7102613ae0..d712b812f2 100644 --- a/docs/source/package_reference/file_download.mdx +++ b/docs/source/package_reference/file_download.mdx @@ -1,11 +1,29 @@ # Downloading files -[[autodoc]] huggingface_hub.hf_hub_download +## Download all files [[autodoc]] huggingface_hub.snapshot_download +## Download a single file + +### hf_hub_download + +[[autodoc]] huggingface_hub.hf_hub_download + +### hf_hub_url + [[autodoc]] huggingface_hub.hf_hub_url +## Get metadata about a file + +### get_hf_file_metadata + +[[autodoc]] huggingface_hub.get_hf_file_metadata + +### HfFileMetadata + +[[autodoc]] huggingface_hub.HfFileMetadata + ## Caching The methods displayed above are designed to work with a caching system that prevents @@ -13,4 +31,4 @@ re-downloading files. The caching system was updated in v0.8.0 to become the cen cache-system shared across libraries that depend on the Hub. Read the [cache-system guide](../how-to-cache) for a detailed presentation of caching at -at HF. \ No newline at end of file +HF. 
diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index a5c12f2b82..78c51bd1e9 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -85,7 +85,9 @@ "push_to_hub_fastai", ], "file_download": [ + "HfFileMetadata", "cached_download", + "get_hf_file_metadata", "hf_hub_download", "hf_hub_url", "try_to_load_from_cache", @@ -298,7 +300,9 @@ def __dir__(): from .fastai_utils import _save_pretrained_fastai # noqa: F401 from .fastai_utils import from_pretrained_fastai # noqa: F401 from .fastai_utils import push_to_hub_fastai # noqa: F401 + from .file_download import HfFileMetadata # noqa: F401 from .file_download import cached_download # noqa: F401 + from .file_download import get_hf_file_metadata # noqa: F401 from .file_download import hf_hub_download # noqa: F401 from .file_download import hf_hub_url # noqa: F401 from .file_download import try_to_load_from_cache # noqa: F401 diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index f00d3c4265..0c5a395119 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -8,6 +8,7 @@ import tempfile import warnings from contextlib import contextmanager +from dataclasses import dataclass from functools import partial from hashlib import sha256 from pathlib import Path @@ -175,6 +176,26 @@ def get_jinja_version(): REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$") +@dataclass(frozen=True) +class HfFileMetadata: + """Data structure containing information about a file versioned on the Hub. + + Returned by [`get_hf_file_metadata`] based on a URL. + + Args: + commit_hash (`str`, *optional*): + The commit_hash related to the file. + etag (`str`, *optional*): + Etag of the file on the server. + location (`str`): + Full url of the file on the Hub. 
+ """ + + commit_hash: Optional[str] + etag: Optional[str] + location: str + + # Do not validate `repo_id` in `hf_hub_url` for now as the `repo_id="datasets/.../..."` # pattern is used/advertised in Transformers examples. # Related: https://github.com/huggingface/huggingface_hub/pull/1029 @@ -513,7 +534,7 @@ def http_get( max_retries=0, ): """ - Donwload a remote file. Do not gobble up errors, and will return errors tailored to the Hugging Face Hub. + Download a remote file. Do not gobble up errors, and will return errors tailored to the Hugging Face Hub. """ headers = copy.deepcopy(headers) if resume_size > 0: @@ -817,7 +838,7 @@ def _resumable_file_manager() -> "io.BufferedWriter": return cache_path -def _normalize_etag(etag: str) -> str: +def _normalize_etag(etag: Optional[str]) -> Optional[str]: """Normalize ETag HTTP header, so it can be used to create nice filepaths. The HTTP spec allows two forms of ETag: @@ -827,11 +848,14 @@ def _normalize_etag(etag: str) -> str: The hf.co hub guarantees to only send the second form. Args: - etag (`str`): HTTP header + etag (`str`, *optional*): HTTP header Returns: - `str`: string that can be used as a nice directory name. + `str` or `None`: string that can be used as a nice directory name. + Returns `None` if input is None. 
""" + if etag is None: + return None return etag.strip('"') @@ -1112,19 +1136,18 @@ def hf_hub_download( commit_hash = None if not local_files_only: try: - r = _request_wrapper( - method="HEAD", - url=url, - headers=headers, - allow_redirects=False, - follow_relative_redirects=True, - proxies=proxies, - timeout=etag_timeout, - ) try: - hf_raise_for_status(r) - except EntryNotFoundError: - commit_hash = r.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT) + metadata = get_hf_file_metadata( + url=url, + use_auth_token=use_auth_token, + proxies=proxies, + timeout=etag_timeout, + ) + except EntryNotFoundError as http_error: + # Cache the non-existence of the file and raise + commit_hash = http_error.response.headers.get( + HUGGINGFACE_HEADER_X_REPO_COMMIT + ) if commit_hash is not None and not legacy_cache_layout: no_exist_file_path = ( Path(storage_folder) @@ -1138,15 +1161,17 @@ def hf_hub_download( storage_folder, revision, commit_hash ) raise - commit_hash = r.headers[HUGGINGFACE_HEADER_X_REPO_COMMIT] + + # Commit hash must exist + commit_hash = metadata.commit_hash if commit_hash is None: raise OSError( "Distant resource does not seem to be on huggingface.co (missing" " commit header)." ) - etag = r.headers.get(HUGGINGFACE_HEADER_X_LINKED_ETAG) or r.headers.get( - "ETag" - ) + + # Etag must exist + etag = metadata.etag # We favor a custom header indicating the etag of the linked resource, and # we fallback to the regular etag header. # If we don't have any of those, raise an error. @@ -1155,13 +1180,13 @@ def hf_hub_download( "Distant resource does not have an ETag, we won't be able to" " reliably ensure reproducibility." ) - etag = _normalize_etag(etag) + # In case of a redirect, save an extra redirect on the request.get call, # and ensure we download the exact atomic version even if it changed # between the HEAD and the GET (unlikely, but hey). # Useful for lfs blobs that are stored on a CDN. 
- if 300 <= r.status_code <= 399: - url_to_download = r.headers["Location"] + if metadata.location != url: + url_to_download = metadata.location if ( "lfs.huggingface.co" in url_to_download or "lfs-staging.huggingface.co" in url_to_download @@ -1377,3 +1402,67 @@ def try_to_load_from_cache( cached_file = os.path.join(repo_cache, "snapshots", revision, filename) return cached_file if os.path.isfile(cached_file) else None + + +def get_hf_file_metadata( + url: str, + use_auth_token: Union[bool, str, None] = None, + proxies: Optional[Dict] = None, + timeout: Optional[float] = 10, +) -> HfFileMetadata: + """Fetch metadata of a file versioned on the Hub for a given url. + + Args: + url (`str`): + File url, for example returned by [`hf_hub_url`]. + use_auth_token (`str`, `bool`, *optional*): + A token to be used for the download. + - If `True`, the token is read from the HuggingFace config + folder. + - If `False` or `None`, no token is provided. + - If a string, it's used as the authentication token. + proxies (`dict`, *optional*): + Dictionary mapping protocol to the URL of the proxy passed to + `requests.request`. + etag_timeout (`float`, *optional*, defaults to `10`): + How many seconds to wait for the server to send metadata before giving up. + + Returns: + A [`HfFileMetadata`] object containing metadata such as location, etag and + commit_hash. + """ + headers = {} + if isinstance(use_auth_token, str): + headers["authorization"] = f"Bearer {use_auth_token}" + elif use_auth_token: + token = HfFolder.get_token() + if token is None: + raise EnvironmentError( + "You specified use_auth_token=True, but a huggingface token was not" + " found." 
+ ) + headers["authorization"] = f"Bearer {token}" + + # Retrieve metadata + r = _request_wrapper( + method="HEAD", + url=url, + headers=headers, + allow_redirects=False, + follow_relative_redirects=True, + proxies=proxies, + timeout=timeout, + ) + hf_raise_for_status(r) + + # Return + return HfFileMetadata( + commit_hash=r.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT), + etag=_normalize_etag( + # We favor a custom header indicating the etag of the linked resource, and + # we fallback to the regular etag header. + r.headers.get("ETag") + or r.headers.get(HUGGINGFACE_HEADER_X_LINKED_ETAG) + ), + location=r.headers.get("Location") or url, + ) From 71bf708743ffe2b56a3d5f3762786fb1f434af1d Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 16 Sep 2022 13:23:15 +0200 Subject: [PATCH 2/7] update doc --- docs/source/package_reference/file_download.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/package_reference/file_download.mdx b/docs/source/package_reference/file_download.mdx index d712b812f2..5ebaac6e38 100644 --- a/docs/source/package_reference/file_download.mdx +++ b/docs/source/package_reference/file_download.mdx @@ -1,9 +1,5 @@ # Downloading files -## Download all files - -[[autodoc]] huggingface_hub.snapshot_download - ## Download a single file ### hf_hub_download @@ -14,6 +10,10 @@ [[autodoc]] huggingface_hub.hf_hub_url +## Download a snapshot of the repo + +[[autodoc]] huggingface_hub.snapshot_download + ## Get metadata about a file ### get_hf_file_metadata From 53ea1c02ff44fb6e59d5eb22467c9ed7cfeb5680 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 16 Sep 2022 13:25:16 +0200 Subject: [PATCH 3/7] doc --- src/huggingface_hub/file_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 0c5a395119..5a8b896e15 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ 
-188,7 +188,7 @@ class HfFileMetadata: etag (`str`, *optional*): Etag of the file on the server. location (`str`): - Full url of the file on the Hub. + Location where to download the file. Can be a Hub url or not (CDN). """ commit_hash: Optional[str] From 615e4864d89c91f1b4a1173f1a5f4b93ad2bd2f0 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 16 Sep 2022 14:04:24 +0200 Subject: [PATCH 4/7] test get_file metadata --- src/huggingface_hub/file_download.py | 5 +++- tests/test_file_download.py | 39 +++++++++++++++++++++++++--- tests/testing_utils.py | 5 ++-- 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 5a8b896e15..1584e2b133 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -1464,5 +1464,8 @@ def get_hf_file_metadata( r.headers.get("ETag") or r.headers.get(HUGGINGFACE_HEADER_X_LINKED_ETAG) ), - location=r.headers.get("Location") or url, + # Either from response headers (if redirected) or defaults to request url + # Do not use directly `url`, as `_request_wrapper` might have followed relative + # redirects. 
+ location=r.headers.get("Location") or r.request.url, ) diff --git a/tests/test_file_download.py b/tests/test_file_download.py index 13ae0e9df8..7774dd2780 100644 --- a/tests/test_file_download.py +++ b/tests/test_file_download.py @@ -28,6 +28,7 @@ _CACHED_NO_EXIST, cached_download, filename_to_url, + get_hf_file_metadata, hf_hub_download, hf_hub_url, try_to_load_from_cache, @@ -45,7 +46,8 @@ DUMMY_MODEL_ID_PINNED_SHA256, DUMMY_MODEL_ID_REVISION_INVALID, DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT, - DUMMY_RENAMED_MODEL_ID, + DUMMY_RENAMED_NEW_MODEL_ID, + DUMMY_RENAMED_OLD_MODEL_ID, SAMPLE_DATASET_IDENTIFIER, OfflineSimulationMode, offline, @@ -225,7 +227,7 @@ def test_download_from_a_renamed_repo_with_hf_hub_download(self): """ with TemporaryDirectory() as tmpdir: filepath = hf_hub_download( - DUMMY_RENAMED_MODEL_ID, "config.json", cache_dir=tmpdir + DUMMY_RENAMED_OLD_MODEL_ID, "config.json", cache_dir=tmpdir ) self.assertTrue(os.path.exists(filepath)) @@ -239,7 +241,7 @@ def test_download_from_a_renamed_repo_with_cached_download(self): with TemporaryDirectory() as tmpdir: filepath = cached_download( hf_hub_url( - DUMMY_RENAMED_MODEL_ID, + DUMMY_RENAMED_OLD_MODEL_ID, filename="config.json", ), cache_dir=tmpdir, @@ -337,3 +339,34 @@ def test_try_to_load_from_cache_no_exist(self): # If file non-existence is not cached, returns None self.assertIsNone(try_to_load_from_cache(DUMMY_MODEL_ID, filename="dummy2")) + + def test_get_hf_file_metadata_basic(self) -> None: + """Test getting metadata from a file on the Hub.""" + url = hf_hub_url( + DUMMY_MODEL_ID, + filename=CONFIG_NAME, + revision=DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT, + ) + metadata = get_hf_file_metadata(url) + + # Metadata + self.assertEqual( + metadata.commit_hash, DUMMY_MODEL_ID_REVISION_ONE_SPECIFIC_COMMIT + ) + self.assertIsNotNone(metadata.etag) # example: "85c2fc2dcdd86563aaa85ef4911..." 
+ self.assertEqual(metadata.location, url) # no redirect + + def test_get_hf_file_metadata_from_a_renamed_repo(self) -> None: + """Test getting metadata from a file in a renamed repo on the Hub.""" + url = hf_hub_url( + DUMMY_RENAMED_OLD_MODEL_ID, + filename=CONFIG_NAME, + subfolder="", # Subfolder should be processed as `None` + ) + metadata = get_hf_file_metadata(url) + + # Got redirected to renamed repo + self.assertEqual( + metadata.location, + url.replace(DUMMY_RENAMED_OLD_MODEL_ID, DUMMY_RENAMED_NEW_MODEL_ID), + ) diff --git a/tests/testing_utils.py b/tests/testing_utils.py index 09ed174187..a1fb9ea15c 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -41,9 +41,8 @@ # Sha-256 of pytorch_model.bin on the top of `main`, for checking purposes # "hf-internal-testing/dummy-will-be-renamed" has been renamed to "hf-internal-testing/dummy-renamed" -DUMMY_RENAMED_MODEL_ID = ( # Regression test #941 - "hf-internal-testing/dummy-will-be-renamed" -) +DUMMY_RENAMED_OLD_MODEL_ID = "hf-internal-testing/dummy-will-be-renamed" +DUMMY_RENAMED_NEW_MODEL_ID = "hf-internal-testing/dummy-renamed" SAMPLE_DATASET_IDENTIFIER = "lhoestq/custom_squad" # Example dataset ids From 8ec6b6ee7607d2eff42ce3959950cf4f92c9dece Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Fri, 16 Sep 2022 14:06:59 +0200 Subject: [PATCH 5/7] add todo --- src/huggingface_hub/file_download.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 1584e2b133..999e2f003b 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -1431,6 +1431,7 @@ def get_hf_file_metadata( A [`HfFileMetadata`] object containing metadata such as location, etag and commit_hash. 
""" + # TODO: helper to get headers from `use_auth_token` (copy-pasted several times) headers = {} if isinstance(use_auth_token, str): headers["authorization"] = f"Bearer {use_auth_token}" From 935488ed02cb30d8a19c040c4485768b8b7e13c7 Mon Sep 17 00:00:00 2001 From: Lucain Date: Fri, 16 Sep 2022 15:35:07 +0200 Subject: [PATCH 6/7] Update src/huggingface_hub/file_download.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/huggingface_hub/file_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 999e2f003b..2631129f48 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -1424,7 +1424,7 @@ def get_hf_file_metadata( proxies (`dict`, *optional*): Dictionary mapping protocol to the URL of the proxy passed to `requests.request`. - etag_timeout (`float`, *optional*, defaults to `10`): + etag_timeout (`float`, *optional*, defaults to 10): How many seconds to wait for the server to send metadata before giving up. Returns: From 53e0ed8be9f23d236a56ee0862b7455d6fb49c43 Mon Sep 17 00:00:00 2001 From: Lucain Date: Fri, 16 Sep 2022 15:35:18 +0200 Subject: [PATCH 7/7] Update src/huggingface_hub/file_download.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/huggingface_hub/file_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index 2631129f48..a6a6f70bbe 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -1415,7 +1415,7 @@ def get_hf_file_metadata( Args: url (`str`): File url, for example returned by [`hf_hub_url`]. - use_auth_token (`str`, `bool`, *optional*): + use_auth_token (`str` or `bool`, *optional*): A token to be used for the download. - If `True`, the token is read from the HuggingFace config folder.