Skip to content

Commit

Permalink
🔀 Merge branch 'main' into use-auth-token
Browse files Browse the repository at this point in the history
  • Loading branch information
SBrandeis committed Sep 9, 2022
2 parents 4bad3fd + 59873e7 commit e878793
Show file tree
Hide file tree
Showing 11 changed files with 864 additions and 37 deletions.
53 changes: 53 additions & 0 deletions docs/source/how-to-cache.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -238,3 +238,56 @@ HFCacheInfo(
],
)
```

## Clean your cache

Scanning your cache is interesting but what you really want to do next is usually to
delete some portions to free up some space on your drive. This is possible using the
[`~HFCacheInfo.delete_revisions`] helper from the [`HFCacheInfo`] object returned when
scanning the cache.

Pass a list of revisions to delete and the tool will define a strategy to free up the
space. It returns a [`DeleteCacheStrategy`] object that describes which files and
folders will be deleted and the expected freed space. Once you agree with the deletion,
you must execute it to make the deletion effective. In order to avoid discrepancies, you
cannot edit a strategy manually.

The strategy to delete revisions is the following:

- the `snapshot` folder containing the revision symlinks is deleted.
- blob files that are targeted only by revisions to be deleted are deleted as well.
- if a revision is linked to 1 or more `refs`, references are deleted.
- if all revisions from a repo are deleted, the entire cached repository is deleted.

Here is a simple usage example. See the reference for details.

```py
>>> from huggingface_hub import scan_cache_dir

>>> delete_strategy = scan_cache_dir().delete_revisions(
... "81fd1d6e7847c99f5862c9fb81387956d99ec7aa",
... "e2983b237dccf3ab4937c97fa717319a9ca1a96d",
... "6c0e6080953db56375760c0471a8c5f2929baf11",
... )
>>> print("Will free " + delete_strategy.expected_freed_size_str)
Will free 8.6G

>>> delete_strategy.execute()
Cache deletion done. Saved 8.6G.
```

<Tip>

Revision hashes are unique across all repositories. This means you don't need to
provide any `repo_id` or `repo_type` when removing revisions.

</Tip>

<Tip warning={true}>

If a revision is not found in the cache, it will be silently ignored. Besides, if a file
or folder cannot be found while trying to delete it, a warning will be logged but no
error is thrown. The deletion continues for other paths contained in the
[`DeleteCacheStrategy`] object.

</Tip>
5 changes: 5 additions & 0 deletions docs/source/package_reference/cache.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ All structures are built and returned by [`scan_cache_dir`] and are immutable.
[[autodoc]] huggingface_hub.CachedFileInfo
- size_on_disk_str

### DeleteCacheStrategy

[[autodoc]] huggingface_hub.DeleteCacheStrategy
- expected_freed_size_str

## Exceptions

### CorruptedCacheException
Expand Down
1 change: 1 addition & 0 deletions src/huggingface_hub/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ def __dir__():
"CachedRepoInfo",
"CachedRevisionInfo",
"CorruptedCacheException",
"DeleteCacheStrategy",
"HFCacheInfo",
"scan_cache_dir",
],
Expand Down
38 changes: 13 additions & 25 deletions src/huggingface_hub/file_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import re
import sys
import tempfile
import time
import warnings
from contextlib import contextmanager
from functools import partial
Expand All @@ -20,6 +19,7 @@
import requests
from filelock import FileLock
from huggingface_hub import constants
from requests.exceptions import ConnectTimeout, ProxyError

from . import __version__
from .constants import (
Expand All @@ -37,6 +37,7 @@
EntryNotFoundError,
LocalEntryNotFoundError,
hf_raise_for_status,
http_backoff,
logging,
tqdm,
validate_hf_hub_args,
Expand Down Expand Up @@ -478,30 +479,17 @@ def _request_wrapper(
return response

# 3. Exponential backoff
tries, success = 0, False
while not success:
tries += 1
try:
response = requests.request(
method=method.upper(), url=url, timeout=timeout, **params
)
success = True
except (
requests.exceptions.ConnectTimeout,
requests.exceptions.ProxyError,
) as err:
if tries > max_retries:
raise err
else:
logger.info(
f"{method} request to {url} timed out, retrying..."
f" [{tries/max_retries}]"
)
sleep_time = min(
max_wait_time, base_wait_time * 2 ** (tries - 1)
) # Exponential backoff
time.sleep(sleep_time)
return response
return http_backoff(
method=method,
url=url,
max_retries=max_retries,
base_wait_time=base_wait_time,
max_wait_time=max_wait_time,
retry_on_exceptions=(ConnectTimeout, ProxyError),
retry_on_status_codes=(),
timeout=timeout,
**params,
)


def _request_with_retry(*args, **kwargs) -> requests.Response:
Expand Down
6 changes: 3 additions & 3 deletions src/huggingface_hub/lfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from huggingface_hub.constants import ENDPOINT, REPO_TYPES_URL_PREFIXES
from requests.auth import HTTPBasicAuth

from .utils import hf_raise_for_status, validate_hf_hub_args
from .utils import hf_raise_for_status, http_backoff, validate_hf_hub_args
from .utils.sha import sha256, sha_fileobj


Expand Down Expand Up @@ -308,7 +308,7 @@ def _upload_single_part(upload_url: str, fileobj: BinaryIO):
Raises: `requests.HTTPError` if the upload resulted in an error
"""
upload_res = requests.put(upload_url, data=fileobj)
upload_res = http_backoff("PUT", upload_url, data=fileobj)
hf_raise_for_status(upload_res)
return upload_res

Expand Down Expand Up @@ -376,7 +376,7 @@ def _upload_multi_part(
seek_from=chunk_size * part_idx,
read_limit=chunk_size,
) as fileobj_slice:
part_upload_res = requests.put(part_upload_url, data=fileobj_slice)
part_upload_res = http_backoff("PUT", part_upload_url, data=fileobj_slice)
hf_raise_for_status(part_upload_res)
etag = part_upload_res.headers.get("etag")
if etag is None or etag == "":
Expand Down
2 changes: 2 additions & 0 deletions src/huggingface_hub/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
CachedRepoInfo,
CachedRevisionInfo,
CorruptedCacheException,
DeleteCacheStrategy,
HFCacheInfo,
scan_cache_dir,
)
Expand All @@ -34,6 +35,7 @@
RevisionNotFoundError,
hf_raise_for_status,
)
from ._http import http_backoff
from ._paths import filter_repo_objects
from ._subprocess import run_subprocess
from ._validators import HFValidationError, validate_hf_hub_args, validate_repo_id
Expand Down
Loading

0 comments on commit e878793

Please sign in to comment.