diff --git a/docs/source/how-to-cache.mdx b/docs/source/how-to-cache.mdx index e0872e7767..b72b5384b4 100644 --- a/docs/source/how-to-cache.mdx +++ b/docs/source/how-to-cache.mdx @@ -128,6 +128,68 @@ When symlinks are not supported, a warning message is displayed to the user to a them they are using a degraded version of the cache-system. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable to true. +## Caching assets + +In addition to caching files from the Hub, downstream libraries often need to cache +other files related to HF but not handled directly by `huggingface_hub` (example: files +downloaded from GitHub, preprocessed data, logs,...). In order to cache those files, +called `assets`, one can use [`cached_assets_path`]. This small helper generates paths +in the HF cache in a unified way based on the name of the library requesting it and +optionally on a namespace and a subfolder name. The goal is to let every downstream +library manage its assets its own way (e.g. no rule on the structure) as long as it +stays in the right assets folder. Those libraries can then leverage tools from +`huggingface_hub` to manage the cache, in particular scanning and deleting parts of the +assets from a CLI command. + +```py +from huggingface_hub import cached_assets_path + +assets_path = cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="download") +something_path = assets_path / "something.json" # Do anything you like in your assets folder ! +``` + + + +[`cached_assets_path`] is the recommended way to store assets but is not mandatory. If +your library already uses its own cache, feel free to use it! 
+ + + +### Assets in practice + +In practice, your assets cache should look like the following tree: + +```text + assets/ + └── datasets/ + │ ├── SQuAD/ + │ │ ├── downloaded/ + │ │ ├── extracted/ + │ │ └── processed/ + │ ├── Helsinki-NLP--tatoeba_mt/ + │ ├── downloaded/ + │ ├── extracted/ + │ └── processed/ + └── transformers/ + ├── default/ + │ ├── something/ + ├── bert-base-cased/ + │ ├── default/ + │ └── training/ + hub/ + └── models--julien-c--EsperBERTo-small/ + ├── blobs/ + │ ├── (...) + │ ├── (...) + ├── refs/ + │ └── (...) + └── [ 128] snapshots/ + ├── 2439f60ef33a0d46d85da5001d52aeda5b00ce9f/ + │ ├── (...) + └── bbc77c8132af1cc5cf678da3f1ddf2de43606d48/ + └── (...) +``` + ## Scan your cache At the moment, cached files are never deleted from your local directory: when you download diff --git a/docs/source/package_reference/cache.mdx b/docs/source/package_reference/cache.mdx index a2da98b878..2fb86bf02c 100644 --- a/docs/source/package_reference/cache.mdx +++ b/docs/source/package_reference/cache.mdx @@ -6,6 +6,10 @@ for a detailed presentation of caching at HF. 
## Helpers +### cached_assets_path + +[[autodoc]] huggingface_hub.cached_assets_path + ### scan_cache_dir [[autodoc]] huggingface_hub.scan_cache_dir diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index f17bf3e359..9c4037182e 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -172,6 +172,7 @@ "DeleteCacheStrategy", "HFCacheInfo", "HfFolder", + "cached_assets_path", "logging", "scan_cache_dir", ], @@ -374,6 +375,7 @@ def __dir__(): from .utils import DeleteCacheStrategy # noqa: F401 from .utils import HFCacheInfo # noqa: F401 from .utils import HfFolder # noqa: F401 + from .utils import cached_assets_path # noqa: F401 from .utils import logging # noqa: F401 from .utils import scan_cache_dir # noqa: F401 from .utils.endpoint_helpers import DatasetFilter # noqa: F401 diff --git a/src/huggingface_hub/constants.py b/src/huggingface_hub/constants.py index 2d88db7a58..1368aca481 100644 --- a/src/huggingface_hub/constants.py +++ b/src/huggingface_hub/constants.py @@ -77,15 +77,20 @@ def _is_true_or_auto(value: Optional[str]) -> bool: os.path.join(os.getenv("XDG_CACHE_HOME", default_home), "huggingface"), ) ) + default_cache_path = os.path.join(hf_cache_home, "hub") +default_assets_cache_path = os.path.join(hf_cache_home, "assets") HUGGINGFACE_HUB_CACHE = os.getenv("HUGGINGFACE_HUB_CACHE", default_cache_path) +HUGGINGFACE_ASSETS_CACHE = os.getenv( + "HUGGINGFACE_ASSETS_CACHE", default_assets_cache_path +) HF_HUB_OFFLINE = _is_true(os.environ.get("HF_HUB_OFFLINE")) # Here, `True` will disable progress bars globally without possibility of enabling it -# programatically. `False` will enable them without possibility of disabling them. +# programmatically. `False` will enable them without possibility of disabling them. # If environment variable is not set (None), then the user is free to enable/disable # them programmatically. 
# TL;DR: env variable has priority over code diff --git a/src/huggingface_hub/utils/__init__.py b/src/huggingface_hub/utils/__init__.py index 144b240206..cb2a122238 100644 --- a/src/huggingface_hub/utils/__init__.py +++ b/src/huggingface_hub/utils/__init__.py @@ -16,6 +16,7 @@ # limitations under the License from . import tqdm as _tqdm # _tqdm is the module +from ._cache_assets import cached_assets_path from ._cache_manager import ( CachedFileInfo, CachedRepoInfo, diff --git a/src/huggingface_hub/utils/_cache_assets.py b/src/huggingface_hub/utils/_cache_assets.py new file mode 100644 index 0000000000..467ddaae54 --- /dev/null +++ b/src/huggingface_hub/utils/_cache_assets.py @@ -0,0 +1,138 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pathlib import Path +from typing import Union + +from ..constants import HUGGINGFACE_ASSETS_CACHE + + +def cached_assets_path( + library_name: str, + namespace: str = "default", + subfolder: str = "default", + *, + assets_dir: Union[str, Path, None] = None, +): + """Return a folder path to cache arbitrary files. + + `huggingface_hub` provides a canonical folder path to store assets. This is the + recommended way to integrate cache in a downstream library as it will benefit from + the builtins tools to scan and delete the cache properly. + + The distinction is made between files cached from the Hub and assets. 
Files from the + Hub are cached in a git-aware manner and entirely managed by `huggingface_hub`. See + [related documentation](https://huggingface.co/docs/huggingface_hub/how-to-cache). + All other files that a downstream library caches are considered to be "assets" + (files downloaded from external sources, extracted from a .tar archive, preprocessed + for training,...). + + Once the folder path is generated, it is guaranteed to exist and to be a directory. + The path is based on 3 levels of depth: the library name, a namespace and a + subfolder. Those 3 levels grant flexibility while allowing `huggingface_hub` to + expect folders when scanning/deleting parts of the assets cache. Within a library, + it is expected that all namespaces share the same subset of subfolder names but this + is not a mandatory rule. The downstream library then has full control over which file + structure to adopt within its cache. Namespace and subfolder are optional (would + default to a `"default/"` subfolder) but library name is mandatory as we want every + downstream library to manage its own cache. + + Expected tree: + ```text + assets/ + └── datasets/ + │ ├── SQuAD/ + │ │ ├── downloaded/ + │ │ ├── extracted/ + │ │ └── processed/ + │ ├── Helsinki-NLP--tatoeba_mt/ + │ ├── downloaded/ + │ ├── extracted/ + │ └── processed/ + └── transformers/ + ├── default/ + │ ├── something/ + ├── bert-base-cased/ + │ ├── default/ + │ └── training/ + hub/ + └── models--julien-c--EsperBERTo-small/ + ├── blobs/ + │ ├── (...) + │ ├── (...) + ├── refs/ + │ └── (...) + └── [ 128] snapshots/ + ├── 2439f60ef33a0d46d85da5001d52aeda5b00ce9f/ + │ ├── (...) + └── bbc77c8132af1cc5cf678da3f1ddf2de43606d48/ + └── (...) + ``` + + + Args: + library_name (`str`): + Name of the library that will manage the cache folder. Example: `"datasets"`. + namespace (`str`, *optional*, defaults to "default"): + Namespace to which the data belongs. Example: `"SQuAD"`. 
+ subfolder (`str`, *optional*, defaults to "default"): + Subfolder in which the data will be stored. Example: `"extracted"`. + assets_dir (`str`, `Path`, *optional*): + Path to the folder where assets are cached. This must not be the same folder + where Hub files are cached. Defaults to `HF_HOME / "assets"` if not provided. + Can also be set with `HUGGINGFACE_ASSETS_CACHE` environment variable. + + Returns: + Path to the cache folder (`Path`). + + Example: + ```py + >>> from huggingface_hub import cached_assets_path + + >>> cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="download") + PosixPath('/home/wauplin/.cache/huggingface/assets/datasets/SQuAD/download') + + >>> cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="extracted") + PosixPath('/home/wauplin/.cache/huggingface/assets/datasets/SQuAD/extracted') + + >>> cached_assets_path(library_name="datasets", namespace="Helsinki-NLP/tatoeba_mt") + PosixPath('/home/wauplin/.cache/huggingface/assets/datasets/Helsinki-NLP--tatoeba_mt/default') + + >>> cached_assets_path(library_name="datasets", assets_dir="/tmp/tmp123456") + PosixPath('/tmp/tmp123456/datasets/default/default') + ``` + """ + # Resolve assets_dir + if assets_dir is None: + assets_dir = HUGGINGFACE_ASSETS_CACHE + assets_dir = Path(assets_dir).expanduser().resolve() + + # Avoid names that could create path issues + for part in (" ", "/", "\\"): + library_name = library_name.replace(part, "--") + namespace = namespace.replace(part, "--") + subfolder = subfolder.replace(part, "--") + + # Path to subfolder is created + path = assets_dir / library_name / namespace / subfolder + try: + path.mkdir(exist_ok=True, parents=True) + except (FileExistsError, NotADirectoryError): + raise ValueError( + "Corrupted assets folder: cannot create directory because of an existing" + f" file ({path})." 
+ ) + + # Return + return path diff --git a/tests/test_utils_assets.py b/tests/test_utils_assets.py new file mode 100644 index 0000000000..1dbae04b72 --- /dev/null +++ b/tests/test_utils_assets.py @@ -0,0 +1,87 @@ +import unittest +from pathlib import Path +from unittest.mock import patch + +import pytest + +from huggingface_hub import cached_assets_path + + +@pytest.mark.usefixtures("fx_cache_dir") +class CacheAssetsTest(unittest.TestCase): + cache_dir: Path + + def test_cached_assets_path_with_namespace_and_subfolder(self) -> None: + expected_path = self.cache_dir / "datasets" / "SQuAD" / "download" + self.assertFalse(expected_path.is_dir()) + + path = cached_assets_path( + library_name="datasets", + namespace="SQuAD", + subfolder="download", + assets_dir=self.cache_dir, + ) + + self.assertEqual(path, expected_path) # Path is generated + self.assertTrue(path.is_dir()) # And dir is created + + def test_cached_assets_path_without_subfolder(self) -> None: + path = cached_assets_path( + library_name="datasets", namespace="SQuAD", assets_dir=self.cache_dir + ) + self.assertEqual(path, self.cache_dir / "datasets" / "SQuAD" / "default") + self.assertTrue(path.is_dir()) + + def test_cached_assets_path_without_namespace(self) -> None: + path = cached_assets_path( + library_name="datasets", subfolder="download", assets_dir=self.cache_dir + ) + self.assertEqual(path, self.cache_dir / "datasets" / "default" / "download") + self.assertTrue(path.is_dir()) + + def test_cached_assets_path_without_namespace_and_subfolder(self) -> None: + path = cached_assets_path(library_name="datasets", assets_dir=self.cache_dir) + self.assertEqual(path, self.cache_dir / "datasets" / "default" / "default") + self.assertTrue(path.is_dir()) + + def test_cached_assets_path_forbidden_symbols(self) -> None: + path = cached_assets_path( + library_name="ReAlLy dumb", + namespace="user/repo_name", + subfolder="this is/not\\clever", + assets_dir=self.cache_dir, + ) + self.assertEqual( + path, + 
self.cache_dir + / "ReAlLy--dumb" + / "user--repo_name" + / "this--is--not--clever", + ) + self.assertTrue(path.is_dir()) + + def test_cached_assets_path_default_assets_dir(self) -> None: + with patch( + "huggingface_hub.utils._cache_assets.HUGGINGFACE_ASSETS_CACHE", + self.cache_dir, + ): # Uses environment variable from HUGGINGFACE_ASSETS_CACHE + self.assertEqual( + cached_assets_path(library_name="datasets"), + self.cache_dir / "datasets" / "default" / "default", + ) + + def test_cached_assets_path_is_a_file(self) -> None: + expected_path = self.cache_dir / "datasets" / "default" / "default" + expected_path.parent.mkdir(parents=True) + expected_path.touch() # this should be the generated folder but is a file ! + + with self.assertRaises(ValueError): + cached_assets_path(library_name="datasets", assets_dir=self.cache_dir) + + def test_cached_assets_path_parent_is_a_file(self) -> None: + expected_path = self.cache_dir / "datasets" / "default" / "default" + expected_path.parent.parent.mkdir(parents=True) + expected_path.parent.touch() # cannot create folder as a parent is a file ! + + with self.assertRaises(ValueError): + cached_assets_path(library_name="datasets", assets_dir=self.cache_dir)