From b1a11c2a0aa1a1b0df806ec67a5bebc446b7e670 Mon Sep 17 00:00:00 2001 From: Lucain Date: Tue, 15 Nov 2022 11:01:30 +0100 Subject: [PATCH] Fix get file size on lfs (#1188) * Fix get file size on LFS * fix typos + add description --- .github/ISSUE_TEMPLATE/bug-report.yml | 2 +- src/huggingface_hub/_commit_api.py | 2 +- src/huggingface_hub/constants.py | 1 + src/huggingface_hub/file_download.py | 9 +++++++-- src/huggingface_hub/utils/_runtime.py | 7 +++++++ tests/test_file_download.py | 11 +++++++++++ 6 files changed, 28 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index f8ad14c92b..35efbfc923 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -36,7 +36,7 @@ body: huggingface-cli env ``` - If your are working in a notebook, please run it in a code cell: + If you are working in a notebook, please run it in a code cell: ```py from huggingface_hub import dump_environment_info diff --git a/src/huggingface_hub/_commit_api.py b/src/huggingface_hub/_commit_api.py index 6b466ce47e..8c58a0c79f 100644 --- a/src/huggingface_hub/_commit_api.py +++ b/src/huggingface_hub/_commit_api.py @@ -471,7 +471,7 @@ def fetch_upload_modes( if not path.endswith(".gitkeep"): warnings.warn( f"About to commit an empty file: '{path}'. Are you sure this is" - " intended ?" + " intended?" 
) upload_modes[path] = "regular" diff --git a/src/huggingface_hub/constants.py b/src/huggingface_hub/constants.py index 5679ca249c..c08cf1bbdc 100644 --- a/src/huggingface_hub/constants.py +++ b/src/huggingface_hub/constants.py @@ -46,6 +46,7 @@ def _is_true_or_auto(value: Optional[str]) -> bool: HUGGINGFACE_CO_URL_TEMPLATE = ENDPOINT + "/{repo_id}/resolve/{revision}/{filename}" HUGGINGFACE_HEADER_X_REPO_COMMIT = "X-Repo-Commit" HUGGINGFACE_HEADER_X_LINKED_ETAG = "X-Linked-Etag" +HUGGINGFACE_HEADER_X_LINKED_SIZE = "X-Linked-Size" REPO_ID_SEPARATOR = "--" # ^ this substring is not allowed in repo_ids on hf.co diff --git a/src/huggingface_hub/file_download.py b/src/huggingface_hub/file_download.py index b06e0d429d..e3f415e807 100644 --- a/src/huggingface_hub/file_download.py +++ b/src/huggingface_hub/file_download.py @@ -26,6 +26,7 @@ HF_HUB_DISABLE_SYMLINKS_WARNING, HUGGINGFACE_CO_URL_TEMPLATE, HUGGINGFACE_HEADER_X_LINKED_ETAG, + HUGGINGFACE_HEADER_X_LINKED_SIZE, HUGGINGFACE_HEADER_X_REPO_COMMIT, HUGGINGFACE_HUB_CACHE, REPO_ID_SEPARATOR, @@ -146,7 +147,8 @@ class HfFileMetadata: location (`str`): Location where to download the file. Can be a Hub url or not (CDN). size (`size`): - Size of the file. + Size of the file. In case of an LFS file, contains the size of the actual + LFS file, not the pointer. """ commit_hash: Optional[str] @@ -1384,7 +1386,10 @@ def get_hf_file_metadata( # Do not use directly `url`, as `_request_wrapper` might have followed relative # redirects. 
location=r.headers.get("Location") or r.request.url, # type: ignore - size=_int_or_none(r.headers.get("Content-Length")), + size=_int_or_none( + r.headers.get(HUGGINGFACE_HEADER_X_LINKED_SIZE) + or r.headers.get("Content-Length") + ), ) diff --git a/src/huggingface_hub/utils/_runtime.py b/src/huggingface_hub/utils/_runtime.py index e7b4800b7c..f71bcb3f58 100644 --- a/src/huggingface_hub/utils/_runtime.py +++ b/src/huggingface_hub/utils/_runtime.py @@ -182,6 +182,13 @@ def is_google_colab() -> bool: def dump_environment_info() -> Dict[str, Any]: + """Dump information about the machine to help debug issues. + + Similar helpers exist in: + - `datasets` (https://github.com/huggingface/datasets/blob/main/src/datasets/commands/env.py) + - `diffusers` (https://github.com/huggingface/diffusers/blob/main/src/diffusers/commands/env.py) + - `transformers` (https://github.com/huggingface/transformers/blob/main/src/transformers/commands/env.py) + """ from huggingface_hub import HfFolder, whoami from huggingface_hub.utils import list_credential_helpers diff --git a/tests/test_file_download.py b/tests/test_file_download.py index 03d22992ee..aff772234b 100644 --- a/tests/test_file_download.py +++ b/tests/test_file_download.py @@ -376,6 +376,17 @@ def test_get_hf_file_metadata_from_a_renamed_repo(self) -> None: url.replace(DUMMY_RENAMED_OLD_MODEL_ID, DUMMY_RENAMED_NEW_MODEL_ID), ) + def test_get_hf_file_metadata_from_a_lfs_file(self) -> None: + """Test getting metadata from an LFS file. + + Must get size of the LFS file, not size of the pointer file + """ + url = hf_hub_url("gpt2", filename="tf_model.h5") + metadata = get_hf_file_metadata(url) + + self.assertIn("cdn-lfs", metadata.location) # Redirection + self.assertEqual(metadata.size, 497933648) # Size of LFS file, not pointer + class StagingCachedDownloadTest(unittest.TestCase): def test_download_from_a_gated_repo_with_hf_hub_download(self):