Skip to content

Commit

Permalink
[RFC] Proposal for a way to cache files in downstream libraries (#1088)
Browse files Browse the repository at this point in the history
* first proposal

* add tree to docstring

* docstring

* renamed to cached assets

* add tests

* doc
  • Loading branch information
Wauplin authored Oct 13, 2022
1 parent 4bd9ebf commit 6a4538b
Show file tree
Hide file tree
Showing 7 changed files with 300 additions and 1 deletion.
62 changes: 62 additions & 0 deletions docs/source/how-to-cache.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,68 @@ When symlinks are not supported, a warning message is displayed to the user to a
them they are using a degraded version of the cache-system. This warning can be disabled
by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable to true.

## Caching assets

In addition to caching files from the Hub, downstream libraries often need to cache
other files related to HF but not handled directly by `huggingface_hub` (examples: files
downloaded from GitHub, preprocessed data, logs,...). In order to cache those files,
called `assets`, one can use [`cached_assets_path`]. This small helper generates paths
in the HF cache in a unified way based on the name of the library requesting it and
optionally on a namespace and a subfolder name. The goal is to let every downstream
library manage its assets in its own way (e.g. no rule on the structure) as long as it
stays in the right assets folder. Those libraries can then leverage tools from
`huggingface_hub` to manage the cache, in particular scanning and deleting parts of the
assets from a CLI command.

```py
from huggingface_hub import cached_assets_path

assets_path = cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="download")
something_path = assets_path / "something.json" # Do anything you like in your assets folder !
```

<Tip>

[`cached_assets_path`] is the recommended way to store assets but is not mandatory. If
your library already uses its own cache, feel free to use it!

</Tip>

### Assets in practice

In practice, your assets cache should look like the following tree:

```text
assets/
├── datasets/
│   ├── SQuAD/
│   │   ├── downloaded/
│   │   ├── extracted/
│   │   └── processed/
│   └── Helsinki-NLP--tatoeba_mt/
│       ├── downloaded/
│       ├── extracted/
│       └── processed/
└── transformers/
    ├── default/
    │   └── something/
    └── bert-base-cased/
        ├── default/
        └── training/
hub/
└── models--julien-c--EsperBERTo-small/
    ├── blobs/
    │   ├── (...)
    │   └── (...)
    ├── refs/
    │   └── (...)
    └── snapshots/
        ├── 2439f60ef33a0d46d85da5001d52aeda5b00ce9f/
        │   └── (...)
        └── bbc77c8132af1cc5cf678da3f1ddf2de43606d48/
            └── (...)
```

## Scan your cache

At the moment, cached files are never deleted from your local directory: when you download
Expand Down
4 changes: 4 additions & 0 deletions docs/source/package_reference/cache.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ for a detailed presentation of caching at HF.

## Helpers

### cached_assets_path

[[autodoc]] huggingface_hub.cached_assets_path

### scan_cache_dir

[[autodoc]] huggingface_hub.scan_cache_dir
Expand Down
2 changes: 2 additions & 0 deletions src/huggingface_hub/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@
"DeleteCacheStrategy",
"HFCacheInfo",
"HfFolder",
"cached_assets_path",
"logging",
"scan_cache_dir",
],
Expand Down Expand Up @@ -374,6 +375,7 @@ def __dir__():
from .utils import DeleteCacheStrategy # noqa: F401
from .utils import HFCacheInfo # noqa: F401
from .utils import HfFolder # noqa: F401
from .utils import cached_assets_path # noqa: F401
from .utils import logging # noqa: F401
from .utils import scan_cache_dir # noqa: F401
from .utils.endpoint_helpers import DatasetFilter # noqa: F401
Expand Down
7 changes: 6 additions & 1 deletion src/huggingface_hub/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,15 +77,20 @@ def _is_true_or_auto(value: Optional[str]) -> bool:
os.path.join(os.getenv("XDG_CACHE_HOME", default_home), "huggingface"),
)
)

# Hub files and assets are kept in sibling folders under the HF cache home.
default_cache_path = os.path.join(hf_cache_home, "hub")
default_assets_cache_path = os.path.join(hf_cache_home, "assets")

# Each location can be overridden with the environment variable of the same name;
# otherwise the default computed above is used.
HUGGINGFACE_HUB_CACHE = os.environ.get("HUGGINGFACE_HUB_CACHE", default_cache_path)
HUGGINGFACE_ASSETS_CACHE = os.environ.get(
    "HUGGINGFACE_ASSETS_CACHE", default_assets_cache_path
)

HF_HUB_OFFLINE = _is_true(os.environ.get("HF_HUB_OFFLINE"))


# Here, `True` will disable progress bars globally without possibility of enabling it
# programmatically. `False` will enable them without possibility of disabling them.
# If environment variable is not set (None), then the user is free to enable/disable
# them programmatically.
# TL;DR: env variable has priority over code
Expand Down
1 change: 1 addition & 0 deletions src/huggingface_hub/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# limitations under the License

from . import tqdm as _tqdm # _tqdm is the module
from ._cache_assets import cached_assets_path
from ._cache_manager import (
CachedFileInfo,
CachedRepoInfo,
Expand Down
138 changes: 138 additions & 0 deletions src/huggingface_hub/utils/_cache_assets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import Union

from ..constants import HUGGINGFACE_ASSETS_CACHE


def cached_assets_path(
    library_name: str,
    namespace: str = "default",
    subfolder: str = "default",
    *,
    assets_dir: Union[str, Path, None] = None,
) -> Path:
    """Return a folder path to cache arbitrary files.

    `huggingface_hub` provides a canonical folder path to store assets. This is the
    recommended way to integrate cache in a downstream library as it will benefit from
    the built-in tools to scan and delete the cache properly.

    The distinction is made between files cached from the Hub and assets. Files from the
    Hub are cached in a git-aware manner and entirely managed by `huggingface_hub`. See
    [related documentation](https://huggingface.co/docs/huggingface_hub/how-to-cache).
    All other files that a downstream library caches are considered to be "assets"
    (files downloaded from external sources, extracted from a .tar archive, preprocessed
    for training,...).

    Once the folder path is generated, it is guaranteed to exist and to be a directory.
    The path is based on 3 levels of depth: the library name, a namespace and a
    subfolder. Those 3 levels grant flexibility while allowing `huggingface_hub` to
    expect folders when scanning/deleting parts of the assets cache. Within a library,
    it is expected that all namespaces share the same subset of subfolder names but this
    is not a mandatory rule. The downstream library has then full control on which file
    structure to adopt within its cache. Namespace and subfolder are optional (would
    default to a `"default/"` subfolder) but library name is mandatory as we want every
    downstream library to manage its own cache.

    Spaces, forward slashes and backslashes in `library_name`, `namespace` and
    `subfolder` are each replaced by `"--"` so that every argument maps to exactly one
    folder level.

    Expected tree:
    ```text
        assets/
        ├── datasets/
        │   ├── SQuAD/
        │   │   ├── downloaded/
        │   │   ├── extracted/
        │   │   └── processed/
        │   └── Helsinki-NLP--tatoeba_mt/
        │       ├── downloaded/
        │       ├── extracted/
        │       └── processed/
        └── transformers/
            ├── default/
            │   └── something/
            └── bert-base-cased/
                ├── default/
                └── training/
        hub/
        └── models--julien-c--EsperBERTo-small/
            ├── blobs/
            │   ├── (...)
            │   └── (...)
            ├── refs/
            │   └── (...)
            └── snapshots/
                ├── 2439f60ef33a0d46d85da5001d52aeda5b00ce9f/
                │   └── (...)
                └── bbc77c8132af1cc5cf678da3f1ddf2de43606d48/
                    └── (...)
    ```

    Args:
        library_name (`str`):
            Name of the library that will manage the cache folder. Example: `"datasets"`.
        namespace (`str`, *optional*, defaults to "default"):
            Namespace to which the data belongs. Example: `"SQuAD"`.
        subfolder (`str`, *optional*, defaults to "default"):
            Subfolder in which the data will be stored. Example: `extracted`.
        assets_dir (`str`, `Path`, *optional*):
            Path to the folder where assets are cached. This must not be the same folder
            where Hub files are cached. Defaults to `HF_HOME / "assets"` if not provided.
            Can also be set with `HUGGINGFACE_ASSETS_CACHE` environment variable.

    Returns:
        Path to the cache folder (`Path`).

    Raises:
        `ValueError`:
            If the folder cannot be created because the target path, or one of its
            parents, already exists as a file. The underlying `FileExistsError` or
            `NotADirectoryError` is attached as the cause of the exception.

    Example:
    ```py
    >>> from huggingface_hub import cached_assets_path

    >>> cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="download")
    PosixPath('/home/wauplin/.cache/huggingface/assets/datasets/SQuAD/download')

    >>> cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="extracted")
    PosixPath('/home/wauplin/.cache/huggingface/assets/datasets/SQuAD/extracted')

    >>> cached_assets_path(library_name="datasets", namespace="Helsinki-NLP/tatoeba_mt")
    PosixPath('/home/wauplin/.cache/huggingface/assets/datasets/Helsinki-NLP--tatoeba_mt/default')

    >>> cached_assets_path(library_name="datasets", assets_dir="/tmp/tmp123456")
    PosixPath('/tmp/tmp123456/datasets/default/default')
    ```
    """
    # Resolve assets_dir
    if assets_dir is None:
        assets_dir = HUGGINGFACE_ASSETS_CACHE
    assets_dir = Path(assets_dir).expanduser().resolve()

    # Avoid names that could create path issues: a single translation pass maps every
    # space and path separator to "--" (the replacement contains none of the mapped
    # characters, so this is equivalent to replacing them one after the other).
    forbidden_to_dashes = str.maketrans(dict.fromkeys((" ", "/", "\\"), "--"))
    library_name = library_name.translate(forbidden_to_dashes)
    namespace = namespace.translate(forbidden_to_dashes)
    subfolder = subfolder.translate(forbidden_to_dashes)

    # Path to subfolder is created
    path = assets_dir / library_name / namespace / subfolder
    try:
        path.mkdir(exist_ok=True, parents=True)
    except (FileExistsError, NotADirectoryError) as err:
        # Keep the OS-level error as the explicit cause to ease debugging.
        raise ValueError(
            "Corrupted assets folder: cannot create directory because of an existing"
            f" file ({path})."
        ) from err

    # Return
    return path
87 changes: 87 additions & 0 deletions tests/test_utils_assets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import unittest
from pathlib import Path
from unittest.mock import patch

import pytest

from huggingface_hub import cached_assets_path


@pytest.mark.usefixtures("fx_cache_dir")
class CacheAssetsTest(unittest.TestCase):
    """Checks the folder layout and error handling of `cached_assets_path`."""

    # NOTE(review): presumably populated by the `fx_cache_dir` fixture — confirm in conftest.
    cache_dir: Path

    def test_cached_assets_path_with_namespace_and_subfolder(self) -> None:
        """All three levels are given explicitly."""
        target = self.cache_dir.joinpath("datasets", "SQuAD", "download")
        self.assertFalse(target.is_dir())

        result = cached_assets_path(
            library_name="datasets",
            namespace="SQuAD",
            subfolder="download",
            assets_dir=self.cache_dir,
        )

        # The generated path is the expected one and the folder now exists
        self.assertEqual(result, target)
        self.assertTrue(result.is_dir())

    def test_cached_assets_path_without_subfolder(self) -> None:
        """Subfolder falls back to "default"."""
        result = cached_assets_path(
            library_name="datasets", namespace="SQuAD", assets_dir=self.cache_dir
        )
        target = self.cache_dir.joinpath("datasets", "SQuAD", "default")
        self.assertEqual(result, target)
        self.assertTrue(result.is_dir())

    def test_cached_assets_path_without_namespace(self) -> None:
        """Namespace falls back to "default"."""
        result = cached_assets_path(
            library_name="datasets", subfolder="download", assets_dir=self.cache_dir
        )
        target = self.cache_dir.joinpath("datasets", "default", "download")
        self.assertEqual(result, target)
        self.assertTrue(result.is_dir())

    def test_cached_assets_path_without_namespace_and_subfolder(self) -> None:
        """Both optional levels fall back to "default"."""
        result = cached_assets_path(library_name="datasets", assets_dir=self.cache_dir)
        target = self.cache_dir.joinpath("datasets", "default", "default")
        self.assertEqual(result, target)
        self.assertTrue(result.is_dir())

    def test_cached_assets_path_forbidden_symbols(self) -> None:
        """Spaces, slashes and backslashes are turned into "--"."""
        result = cached_assets_path(
            library_name="ReAlLy dumb",
            namespace="user/repo_name",
            subfolder="this is/not\\clever",
            assets_dir=self.cache_dir,
        )
        target = self.cache_dir.joinpath(
            "ReAlLy--dumb", "user--repo_name", "this--is--not--clever"
        )
        self.assertEqual(result, target)
        self.assertTrue(result.is_dir())

    def test_cached_assets_path_default_assets_dir(self) -> None:
        """Without `assets_dir`, the module-level default location is used."""
        target = self.cache_dir.joinpath("datasets", "default", "default")
        # Swap the module-level default for the temporary test folder
        with patch(
            "huggingface_hub.utils._cache_assets.HUGGINGFACE_ASSETS_CACHE",
            self.cache_dir,
        ):
            result = cached_assets_path(library_name="datasets")
            self.assertEqual(result, target)

    def test_cached_assets_path_is_a_file(self) -> None:
        """A file sitting where the folder should be is reported as ValueError."""
        namespace_dir = self.cache_dir.joinpath("datasets", "default")
        namespace_dir.mkdir(parents=True)
        # A file takes the place of the folder that should be generated
        (namespace_dir / "default").touch()

        self.assertRaises(
            ValueError,
            cached_assets_path,
            library_name="datasets",
            assets_dir=self.cache_dir,
        )

    def test_cached_assets_path_parent_is_a_file(self) -> None:
        """A file sitting where a parent folder should be is reported as ValueError."""
        library_dir = self.cache_dir / "datasets"
        library_dir.mkdir(parents=True)
        # The namespace level is a file, so the subfolder cannot be created below it
        (library_dir / "default").touch()

        self.assertRaises(
            ValueError,
            cached_assets_path,
            library_name="datasets",
            assets_dir=self.cache_dir,
        )

0 comments on commit 6a4538b

Please sign in to comment.