-
Notifications
You must be signed in to change notification settings - Fork 572
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[RFC] Proposal for a way to cache files in downstream libraries (#1088)
* first proposal * add tree to docstring * docstring * renamed to cached assets * add tests * doc
- Loading branch information
Showing
7 changed files
with
300 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
# coding=utf-8 | ||
# Copyright 2019-present, the HuggingFace Inc. team. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
from pathlib import Path | ||
from typing import Union | ||
|
||
from ..constants import HUGGINGFACE_ASSETS_CACHE | ||
|
||
|
||
def cached_assets_path(
    library_name: str,
    namespace: str = "default",
    subfolder: str = "default",
    *,
    assets_dir: Union[str, Path, None] = None,
) -> Path:
    """Return a folder path to cache arbitrary files.

    `huggingface_hub` provides a canonical folder path to store assets. This is the
    recommended way to integrate cache in a downstream library as it will benefit from
    the builtin tools to scan and delete the cache properly.

    The distinction is made between files cached from the Hub and assets. Files from the
    Hub are cached in a git-aware manner and entirely managed by `huggingface_hub`. See
    [related documentation](https://huggingface.co/docs/huggingface_hub/how-to-cache).
    All other files that a downstream library caches are considered to be "assets"
    (files downloaded from external sources, extracted from a .tar archive, preprocessed
    for training,...).

    Once the folder path is generated, it is guaranteed to exist and to be a directory.
    The path is based on 3 levels of depth: the library name, a namespace and a
    subfolder. Those 3 levels grant flexibility while allowing `huggingface_hub` to
    expect folders when scanning/deleting parts of the assets cache. Within a library,
    it is expected that all namespaces share the same subset of subfolder names but this
    is not a mandatory rule. The downstream library has then full control on which file
    structure to adopt within its cache. Namespace and subfolder are optional (would
    default to a `"default/"` subfolder) but library name is mandatory as we want every
    downstream library to manage its own cache.

    Spaces, forward slashes and backslashes found in `library_name`, `namespace` or
    `subfolder` are each replaced by `"--"` so that every argument maps to exactly one
    folder level.

    Expected tree:
    ```text
        assets/
        ├── datasets/
        │   ├── SQuAD/
        │   │   ├── downloaded/
        │   │   ├── extracted/
        │   │   └── processed/
        │   └── Helsinki-NLP--tatoeba_mt/
        │       ├── downloaded/
        │       ├── extracted/
        │       └── processed/
        └── transformers/
            ├── default/
            │   └── something/
            └── bert-base-cased/
                ├── default/
                └── training/
        hub/
        └── models--julien-c--EsperBERTo-small/
            ├── blobs/
            │   ├── (...)
            │   └── (...)
            ├── refs/
            │   └── (...)
            └── snapshots/
                ├── 2439f60ef33a0d46d85da5001d52aeda5b00ce9f/
                │   └── (...)
                └── bbc77c8132af1cc5cf678da3f1ddf2de43606d48/
                    └── (...)
    ```

    Args:
        library_name (`str`):
            Name of the library that will manage the cache folder. Example: `"datasets"`.
        namespace (`str`, *optional*, defaults to "default"):
            Namespace to which the data belongs. Example: `"SQuAD"`.
        subfolder (`str`, *optional*, defaults to "default"):
            Subfolder in which the data will be stored. Example: `"extracted"`.
        assets_dir (`str`, `Path`, *optional*):
            Path to the folder where assets are cached. This must not be the same folder
            where Hub files are cached. Defaults to `HF_HOME / "assets"` if not provided.
            Can also be set with `HUGGINGFACE_ASSETS_CACHE` environment variable.

    Returns:
        Path to the cache folder (`Path`).

    Raises:
        `ValueError`:
            If the folder cannot be created because a regular file already exists at
            the target path or at one of its parent levels. The underlying `OSError`
            is attached as the exception cause.

    Example:
    ```py
    >>> from huggingface_hub import cached_assets_path

    >>> cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="download")
    PosixPath('/home/wauplin/.cache/huggingface/assets/datasets/SQuAD/download')

    >>> cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="extracted")
    PosixPath('/home/wauplin/.cache/huggingface/assets/datasets/SQuAD/extracted')

    >>> cached_assets_path(library_name="datasets", namespace="Helsinki-NLP/tatoeba_mt")
    PosixPath('/home/wauplin/.cache/huggingface/assets/datasets/Helsinki-NLP--tatoeba_mt/default')

    >>> cached_assets_path(library_name="datasets", assets_dir="/tmp/tmp123456")
    PosixPath('/tmp/tmp123456/datasets/default/default')
    ```
    """
    # Fall back to the package-wide default location when the caller gives none.
    if assets_dir is None:
        assets_dir = HUGGINGFACE_ASSETS_CACHE
    assets_dir = Path(assets_dir).expanduser().resolve()

    # Spaces and path separators would either be awkward on disk or silently add
    # extra folder levels, so each one is mapped to "--" in a single pass.
    unsafe_chars = str.maketrans({" ": "--", "/": "--", "\\": "--"})
    path = (
        assets_dir
        / library_name.translate(unsafe_chars)
        / namespace.translate(unsafe_chars)
        / subfolder.translate(unsafe_chars)
    )

    # Create the 3-level folder; a regular file sitting at any level means the
    # assets cache is corrupted and is reported as a ValueError.
    try:
        path.mkdir(exist_ok=True, parents=True)
    except (FileExistsError, NotADirectoryError) as err:
        raise ValueError(
            "Corrupted assets folder: cannot create directory because of an existing"
            f" file ({path})."
        ) from err

    return path
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import unittest | ||
from pathlib import Path | ||
from unittest.mock import patch | ||
|
||
import pytest | ||
|
||
from huggingface_hub import cached_assets_path | ||
|
||
|
||
@pytest.mark.usefixtures("fx_cache_dir")
class CacheAssetsTest(unittest.TestCase):
    """Exercise `cached_assets_path` using `self.cache_dir` as the assets folder."""

    # Set by the "fx_cache_dir" fixture, which is defined outside this file
    # (presumably a fresh temporary directory per test — TODO confirm).
    cache_dir: Path

    def test_cached_assets_path_with_namespace_and_subfolder(self) -> None:
        """Explicit namespace and subfolder both show up in the created folder."""
        target = self.cache_dir.joinpath("datasets", "SQuAD", "download")
        self.assertFalse(target.is_dir())

        result = cached_assets_path(
            library_name="datasets",
            namespace="SQuAD",
            subfolder="download",
            assets_dir=self.cache_dir,
        )

        # The expected path is returned...
        self.assertEqual(result, target)
        # ...and the folder now exists on disk.
        self.assertTrue(result.is_dir())

    def test_cached_assets_path_without_subfolder(self) -> None:
        """An omitted subfolder falls back to "default"."""
        result = cached_assets_path(
            library_name="datasets",
            namespace="SQuAD",
            assets_dir=self.cache_dir,
        )
        target = self.cache_dir.joinpath("datasets", "SQuAD", "default")
        self.assertEqual(result, target)
        self.assertTrue(result.is_dir())

    def test_cached_assets_path_without_namespace(self) -> None:
        """An omitted namespace falls back to "default"."""
        result = cached_assets_path(
            library_name="datasets",
            subfolder="download",
            assets_dir=self.cache_dir,
        )
        target = self.cache_dir.joinpath("datasets", "default", "download")
        self.assertEqual(result, target)
        self.assertTrue(result.is_dir())

    def test_cached_assets_path_without_namespace_and_subfolder(self) -> None:
        """With only a library name, both lower levels are "default"."""
        result = cached_assets_path(
            library_name="datasets",
            assets_dir=self.cache_dir,
        )
        target = self.cache_dir.joinpath("datasets", "default", "default")
        self.assertEqual(result, target)
        self.assertTrue(result.is_dir())

    def test_cached_assets_path_forbidden_symbols(self) -> None:
        """Spaces, slashes and backslashes are each turned into "--"."""
        result = cached_assets_path(
            library_name="ReAlLy dumb",
            namespace="user/repo_name",
            subfolder="this is/not\\clever",
            assets_dir=self.cache_dir,
        )
        target = self.cache_dir.joinpath(
            "ReAlLy--dumb",
            "user--repo_name",
            "this--is--not--clever",
        )
        self.assertEqual(result, target)
        self.assertTrue(result.is_dir())

    def test_cached_assets_path_default_assets_dir(self) -> None:
        """Without `assets_dir`, the module-level default location is used."""
        target = self.cache_dir.joinpath("datasets", "default", "default")
        # Swap the module constant so the default resolves to the test folder.
        with patch(
            "huggingface_hub.utils._cache_assets.HUGGINGFACE_ASSETS_CACHE",
            self.cache_dir,
        ):
            result = cached_assets_path(library_name="datasets")
            self.assertEqual(result, target)

    def test_cached_assets_path_is_a_file(self) -> None:
        """A regular file sitting at the target path is reported as ValueError."""
        namespace_dir = self.cache_dir.joinpath("datasets", "default")
        namespace_dir.mkdir(parents=True)
        # Occupy the spot where the subfolder should be created with a file.
        namespace_dir.joinpath("default").touch()

        self.assertRaises(
            ValueError,
            cached_assets_path,
            library_name="datasets",
            assets_dir=self.cache_dir,
        )

    def test_cached_assets_path_parent_is_a_file(self) -> None:
        """A regular file at the namespace level is reported as ValueError."""
        library_dir = self.cache_dir.joinpath("datasets")
        library_dir.mkdir(parents=True)
        # The namespace level is a file, so the subfolder cannot be created.
        library_dir.joinpath("default").touch()

        self.assertRaises(
            ValueError,
            cached_assets_path,
            library_name="datasets",
            assets_dir=self.cache_dir,
        )