Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enabling hf_transfer use. #1272

Merged
merged 8 commits into from
Dec 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/source/package_reference/environment_variables.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,15 @@ to disable this warning.

For more details, see [cache limitations](how-to-cache#limitations).

### HF_HUB_ENABLE_HF_TRANSFER

Set to `True` to download files from the Hub using `hf_transfer`, a Rust-based package
that enables faster downloads (up to a 2x speed-up). Be aware that this is still experimental,
so it might cause issues in your workflow. In particular, it does not support features such
as progress bars, resuming downloads, proxies, or error handling.

**Note:** `hf_transfer` has to be installed separately [from PyPI](https://pypi.org/project/hf-transfer/).

## From external tools

Some environment variables are not specific to `huggingface_hub` but still taken into account
Expand Down
29 changes: 20 additions & 9 deletions src/huggingface_hub/_snapshot_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
from tqdm.auto import tqdm as base_tqdm
from tqdm.contrib.concurrent import thread_map

from .constants import DEFAULT_REVISION, HUGGINGFACE_HUB_CACHE, REPO_TYPES
from .constants import (
DEFAULT_REVISION,
HF_HUB_ENABLE_HF_TRANSFER,
HUGGINGFACE_HUB_CACHE,
REPO_TYPES,
)
from .file_download import REGEX_COMMIT_HASH, hf_hub_download, repo_folder_name
from .hf_api import HfApi
from .utils import filter_repo_objects, logging
Expand Down Expand Up @@ -201,13 +206,19 @@ def _inner_hf_hub_download(repo_file: str):
token=token,
)

thread_map(
_inner_hf_hub_download,
filtered_repo_files,
desc=f"Fetching {len(filtered_repo_files)} files",
max_workers=max_workers,
# User can use its own tqdm class or the default one from `huggingface_hub.utils`
tqdm_class=tqdm_class or hf_tqdm,
)
if HF_HUB_ENABLE_HF_TRANSFER:
# When using hf_transfer, we don't want extra parallelism on top of
# what hf_transfer already provides.
for file in filtered_repo_files:
_inner_hf_hub_download(file)
else:
thread_map(
_inner_hf_hub_download,
filtered_repo_files,
desc=f"Fetching {len(filtered_repo_files)} files",
max_workers=max_workers,
# User can use its own tqdm class or the default one from `huggingface_hub.utils`
tqdm_class=tqdm_class or hf_tqdm,
)

return snapshot_folder
6 changes: 6 additions & 0 deletions src/huggingface_hub/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,9 @@ def _is_true_or_auto(value: Optional[str]) -> bool:
HF_HUB_DISABLE_IMPLICIT_TOKEN: bool = _is_true(
os.environ.get("HF_HUB_DISABLE_IMPLICIT_TOKEN")
)

# Enable fast-download using external dependency "hf_transfer"
# See:
# - https://pypi.org/project/hf-transfer/
# - https://github.com/huggingface/hf_transfer (private)
HF_HUB_ENABLE_HF_TRANSFER: bool = _is_true(os.environ.get("HF_HUB_ENABLE_HF_TRANSFER"))
27 changes: 27 additions & 0 deletions src/huggingface_hub/file_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from .constants import (
DEFAULT_REVISION,
HF_HUB_DISABLE_SYMLINKS_WARNING,
HF_HUB_ENABLE_HF_TRANSFER,
HUGGINGFACE_CO_URL_TEMPLATE,
HUGGINGFACE_HEADER_X_LINKED_ETAG,
HUGGINGFACE_HEADER_X_LINKED_SIZE,
Expand Down Expand Up @@ -469,9 +470,35 @@ def http_get(
"""
Download a remote file. Do not gobble up errors, and will return errors tailored to the Hugging Face Hub.
"""
if not resume_size:
if HF_HUB_ENABLE_HF_TRANSFER:
try:
# Download file using an external Rust-based package. Download is faster
# (~2x speed-up) but supports fewer features (no error handling, no retries,
# no progress bars).
from hf_transfer import download

logger.debug(f"Download {url} using HF_TRANSFER.")
max_files = 100
chunk_size = 10 * 1024 * 1024 # 10 MB
download(url, temp_file.name, max_files, chunk_size)
return
except ImportError:
raise ValueError(
"Fast download using 'hf_transfer' is enabled"
" (HF_HUB_ENABLE_HF_TRANSFER=1) but 'hf_transfer' package is not"
" available in your environment. Try `pip install hf_transfer`."
)
except Exception as e:
raise RuntimeError(
"An error occurred while downloading using `hf_transfer`. Consider"
" disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling."
) from e

headers = copy.deepcopy(headers) or {}
if resume_size > 0:
headers["Range"] = "bytes=%d-" % (resume_size,)

r = _request_wrapper(
method="GET",
url=url,
Expand Down
2 changes: 2 additions & 0 deletions src/huggingface_hub/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
get_fastcore_version,
get_graphviz_version,
get_hf_hub_version,
get_hf_transfer_version,
get_jinja_version,
get_pillow_version,
get_pydot_version,
Expand All @@ -68,6 +69,7 @@
is_fastcore_available,
is_google_colab,
is_graphviz_available,
is_hf_transfer_available,
is_jinja_available,
is_notebook,
is_pillow_available,
Expand Down
24 changes: 23 additions & 1 deletion src/huggingface_hub/utils/_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

import packaging.version

from .. import __version__
from .. import __version__, constants


_PY_VERSION: str = sys.version.split()[0].rstrip("+")
Expand Down Expand Up @@ -52,6 +52,7 @@
"fastcore": {"fastcore"},
"jinja": {"Jinja2"},
"pillow": {"Pillow"},
"hf_transfer": {"hf_transfer"},
}

# Check once at runtime
Expand Down Expand Up @@ -110,6 +111,15 @@ def get_graphviz_version() -> str:
return _get_version("graphviz")


# hf_transfer
def is_hf_transfer_available() -> bool:
    """Return whether `hf_transfer` is available, as reported by `_is_available`."""
    available = _is_available("hf_transfer")
    return available


def get_hf_transfer_version() -> str:
    """Return the `hf_transfer` version, as reported by `_get_version`."""
    version = _get_version("hf_transfer")
    return version


# Jinja
def is_jinja_available() -> bool:
return _is_available("jinja")
Expand Down Expand Up @@ -243,6 +253,18 @@ def dump_environment_info() -> Dict[str, Any]:
info["Graphviz"] = get_graphviz_version()
info["Pydot"] = get_pydot_version()
info["Pillow"] = get_pillow_version()
info["hf_transfer"] = get_hf_transfer_version()

# Environment variables
info["ENDPOINT"] = constants.ENDPOINT
info["HUGGINGFACE_HUB_CACHE"] = constants.HUGGINGFACE_HUB_CACHE
info["HUGGINGFACE_ASSETS_CACHE"] = constants.HUGGINGFACE_ASSETS_CACHE
info["HF_HUB_OFFLINE"] = constants.HF_HUB_OFFLINE
info["HF_TOKEN_PATH"] = constants.HF_TOKEN_PATH
info["HF_HUB_DISABLE_PROGRESS_BARS"] = constants.HF_HUB_DISABLE_PROGRESS_BARS
info["HF_HUB_DISABLE_SYMLINKS_WARNING"] = constants.HF_HUB_DISABLE_SYMLINKS_WARNING
info["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = constants.HF_HUB_DISABLE_IMPLICIT_TOKEN
info["HF_HUB_ENABLE_HF_TRANSFER"] = constants.HF_HUB_ENABLE_HF_TRANSFER

print("\nCopy-and-paste the text below in your GitHub issue.\n")
print("\n".join([f"- {prop}: {val}" for prop, val in info.items()]) + "\n")
Expand Down