From 6890f29d7bd692d3bc762b82e833e8e606e761b2 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Sun, 2 Apr 2023 15:03:22 +0200 Subject: [PATCH 01/18] Add `hffs` to `huggingface_hub` --- setup.cfg | 1 + setup.py | 1 + src/huggingface_hub/__init__.py | 10 + src/huggingface_hub/hf_file_system.py | 427 ++++++++++++++++++++++++++ tests/test_hf_file_system.py | 215 +++++++++++++ 5 files changed, 654 insertions(+) create mode 100644 src/huggingface_hub/hf_file_system.py create mode 100644 tests/test_hf_file_system.py diff --git a/setup.cfg b/setup.cfg index 5d4938d997..9cc27b091c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,6 +13,7 @@ known_third_party = faiss-cpu fastprogress fire + fsspec fugashi git graphviz diff --git a/setup.py b/setup.py index 60cf5afbeb..6f227a6dc1 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,8 @@ def get_version() -> str: install_requires = [ "filelock", + "fsspec", "requests", "tqdm>=4.42.1", "pyyaml>=5.1", "typing-extensions>=3.7.4.3", # to be able to import TypeAlias diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index eb6dc7d1c1..ec95172d5f 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -162,6 +162,11 @@ "upload_folder", "whoami", ], + "hf_file_system": [ + "HfFile", + "HfFileSystem", + "ResolvedPath", + ], "hub_mixin": [ "ModelHubMixin", "PyTorchModelHubMixin", @@ -421,6 +426,11 @@ def __dir__(): upload_folder, # noqa: F401 whoami, # noqa: F401 ) + from .hf_file_system import ( + HfFile, # noqa: F401 + HfFileSystem, # noqa: F401 + ResolvedPath, # noqa: F401 + ) from .hub_mixin import ( ModelHubMixin, # noqa: F401 PyTorchModelHubMixin, # noqa: F401 diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py new file mode 100644 index 0000000000..e23a32b8f5 --- /dev/null +++ b/src/huggingface_hub/hf_file_system.py @@ -0,0 +1,427 @@ +import itertools +import os +import tempfile +from dataclasses import dataclass 
+from glob import has_magic +from typing import Dict, Optional, Tuple +from urllib.parse import quote, unquote + +import fsspec +import requests + +from ._commit_api import CommitOperationDelete +from .constants import DEFAULT_REVISION, ENDPOINT, REPO_TYPE_MODEL, REPO_TYPES_MAPPING, REPO_TYPES_URL_PREFIXES +from .hf_api import HfApi +from .utils import ( + EntryNotFoundError, + HFValidationError, + RepositoryNotFoundError, + RevisionNotFoundError, + hf_raise_for_status, + http_backoff, + parse_datetime, +) +from .utils._pagination import paginate + + +@dataclass +class ResolvedPath: + """Data structure containing information about a resolved path.""" + + repo_type: str + repo_id: str + revision: str + path_in_repo: str + + def unresolve(self): + path = ( + f"{REPO_TYPES_URL_PREFIXES.get(self.repo_type, '') + self.repo_id}@{self.revision}/{self.path_in_repo}" + .rstrip("/") + ) + return path + + +class HfFileSystem(fsspec.AbstractFileSystem): + """ + Access a remote Hugging Face Hub repository as if it were a local file system. + + Args: + endpoint (`str`, *optional*): + The endpoint to use. If not provided, the default one (https://huggingface.co) is used. + token (`str`, *optional*): + Authentication token, obtained with `HfApi.login` method. Will default to the stored token. + + Usage: + + ```python + >>> from huggingface_hub import HfFileSystem + + >>> fs = HfFileSystem() + + >>> # List files + >>> fs.glob("my-username/my-model/*.bin") + ['my-username/my-model/pytorch_model.bin'] + >>> fs.ls("datasets/my-username/my-dataset", detail=False) + ['datasets/my-username/my-dataset/.gitattributes', 'datasets/my-username/my-dataset/README.md', 'datasets/my-username/my-dataset/data.json'] + + >>> # Read/write files + >>> with fs.open("my-username/my-model/pytorch_model.bin") as f: + ... data = f.read() + >>> with fs.open("my-username/my-model/pytorch_model.bin", "wb") as f: + ... 
f.write(data) + ``` + """ + + root_marker = "" + protocol = "hf" + + def __init__( + self, + *args, + endpoint: Optional[str] = None, + token: Optional[str] = None, + **storage_options, + ): + super().__init__(*args, **storage_options) + self.endpoint = endpoint or ENDPOINT + self.token = token + self._api = HfApi(endpoint=endpoint, token=token) + # Maps (repo_type, repo_id, revision) to a 2-tuple with: + # * the 1st element indicating whether the repository and the revision exist + # * the 2nd element being the exception raised if the repository or revision doesn't exist + self._repo_and_revision_exists_cache: Dict[Tuple[str, str, str], Tuple[bool, Optional[Exception]]] = {} + + def _repo_and_revision_exist( + self, repo_type: str, repo_id: str, revision: Optional[str] + ) -> Tuple[bool, Optional[Exception]]: + if (repo_type, repo_id, revision) not in self._repo_and_revision_exists_cache: + try: + self._api.repo_info(repo_id, revision=revision, repo_type=repo_type) + except (RepositoryNotFoundError, HFValidationError) as e: + self._repo_and_revision_exists_cache[(repo_type, repo_id, revision)] = False, e + self._repo_and_revision_exists_cache[(repo_type, repo_id, None)] = False, e + except RevisionNotFoundError as e: + self._repo_and_revision_exists_cache[(repo_type, repo_id, revision)] = False, e + self._repo_and_revision_exists_cache[(repo_type, repo_id, None)] = True, None + else: + self._repo_and_revision_exists_cache[(repo_type, repo_id, revision)] = True, None + self._repo_and_revision_exists_cache[(repo_type, repo_id, None)] = True, None + return self._repo_and_revision_exists_cache[(repo_type, repo_id, revision)] + + def resolve_path(self, path: str, revision: Optional[str] = None) -> ResolvedPath: + def _align_revision_in_path_with_revision( + revision_in_path: Optional[str], revision: Optional[str] + ) -> Optional[str]: + if revision is not None: + if revision_in_path is not None and revision_in_path != revision: + raise ValueError( + f'Revision specified 
in path ("{revision_in_path}") and in `revision` argument ("{revision}")' + " are not the same." + ) + else: + revision = revision_in_path + return revision + + path = self._strip_protocol(path) + if not path: + # can't list repositories at root + raise NotImplementedError("Access to repositories lists is not implemented.") + elif path.split("/")[0] + "/" in REPO_TYPES_URL_PREFIXES.values(): + if "/" not in path: + # can't list repositories at the repository type level + raise NotImplementedError("Access to repositories lists is not implemented.") + repo_type, path = path.split("/", 1) + repo_type = REPO_TYPES_MAPPING[repo_type] + else: + repo_type = REPO_TYPE_MODEL + if path.count("/") > 0: + if "@" in path: + repo_id, revision_in_path = path.split("@", 1) + if "/" in revision_in_path: + revision_in_path, path_in_repo = revision_in_path.split("/", 1) + else: + path_in_repo = "" + revision_in_path = unquote(revision_in_path) + revision = _align_revision_in_path_with_revision(revision_in_path, revision) + repo_and_revision_exist, err = self._repo_and_revision_exist(repo_type, repo_id, revision) + if not repo_and_revision_exist: + raise FileNotFoundError(path) from err + else: + repo_id_with_namespace = "/".join(path.split("/")[:2]) + path_in_repo_with_namespace = "/".join(path.split("/")[2:]) + repo_id_without_namespace = path.split("/")[0] + path_in_repo_without_namespace = "/".join(path.split("/")[1:]) + repo_id = repo_id_with_namespace + path_in_repo = path_in_repo_with_namespace + repo_and_revision_exist, err = self._repo_and_revision_exist(repo_type, repo_id, revision) + if not repo_and_revision_exist: + if isinstance(err, (RepositoryNotFoundError, HFValidationError)): + repo_id = repo_id_without_namespace + path_in_repo = path_in_repo_without_namespace + repo_and_revision_exist, _ = self._repo_and_revision_exist(repo_type, repo_id, revision) + if not repo_and_revision_exist: + raise FileNotFoundError(path) from err + else: + raise FileNotFoundError(path) from 
err + else: + repo_id = path + path_in_repo = "" + if "@" in path: + repo_id, revision_in_path = path.split("@", 1) + revision_in_path = unquote(revision_in_path) + revision = _align_revision_in_path_with_revision(revision_in_path, revision) + repo_and_revision_exist, _ = self._repo_and_revision_exist(repo_type, repo_id, revision) + if not repo_and_revision_exist: + raise NotImplementedError("Access to repositories lists is not implemented.") + + revision = revision if revision is not None else DEFAULT_REVISION + return ResolvedPath(repo_type, repo_id, revision, path_in_repo) + + def invalidate_cache(self, path=None): + if not path: + self.dircache.clear() + self._repo_and_revision_exists_cache.clear() + else: + path = self._strip_protocol(path) + path = self.resolve_path(path).unresolve() + while path: + self.dircache.pop(path, None) + path = self._parent(path) + + def _open( + self, + path: str, + mode: str = "rb", + revision: Optional[str] = None, + **kwargs, + ): + if mode == "ab": + raise NotImplementedError("Appending to remote files is not yet supported.") + path = self._strip_protocol(path) + return HfFile(self, path, mode=mode, revision=revision, **kwargs) + + def _rm(self, path, revision: Optional[str] = None, **kwargs): + path = self._strip_protocol(path) + resolved_path = self.resolve_path(path, revision=revision) + operations = [CommitOperationDelete(path_in_repo=resolved_path.path_in_repo)] + commit_message = f"Delete {path}" + self._api.create_commit( + repo_id=resolved_path.repo_id, + repo_type=resolved_path.repo_type, + token=self.token, + operations=operations, + revision=resolved_path.revision, + commit_message=kwargs.get("commit_message", commit_message), + commit_description=kwargs.get("commit_description"), + ) + self.invalidate_cache(path=resolved_path.unresolve()) + + def rm(self, path, recursive=False, maxdepth=None, revision: Optional[str] = None, **kwargs): + resolved_path = self.resolve_path(path, revision=revision) + root_path = 
REPO_TYPES_URL_PREFIXES.get(resolved_path.repo_type, "") + resolved_path.repo_id + paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth, revision=resolved_path.revision) + paths_in_repo = [path[len(root_path) + 1 :] for path in paths if not self.isdir(path)] + operations = [CommitOperationDelete(path_in_repo=path_in_repo) for path_in_repo in paths_in_repo] + commit_message = f"Delete {path} " + commit_message += "recursively " if recursive else "" + commit_message += f"up to depth {maxdepth} " if maxdepth is not None else "" + # TODO: use `commit_description` to list all the deleted paths? + self._api.create_commit( + repo_id=resolved_path.repo_id, + repo_type=resolved_path.repo_type, + token=self.token, + operations=operations, + revision=resolved_path.revision, + commit_message=kwargs.get("commit_message", commit_message), + commit_description=kwargs.get("commit_description"), + ) + self.invalidate_cache(path=resolved_path.unresolve()) + + def ls(self, path, detail=True, refresh=False, revision: Optional[str] = None, **kwargs): + path = self._strip_protocol(path) + resolved_path = self.resolve_path(path, revision=revision) + revision_in_path = "@" + quote(resolved_path.revision, "") + has_revision_in_path = revision_in_path in path + path = resolved_path.unresolve() + if path not in self.dircache or refresh: + path_prefix = ( + ResolvedPath(resolved_path.repo_type, resolved_path.repo_id, resolved_path.revision, "").unresolve() + + "/" + ) + tree_iter = self._iter_tree(path, revision=resolved_path.revision) + try: + tree_item = next(tree_iter) + except EntryNotFoundError: + if "/" in resolved_path.path_in_repo: + path = self._parent(path) + tree_iter = self._iter_tree(path) + else: + raise + else: + tree_iter = itertools.chain([tree_item], tree_iter) + child_infos = [] + for tree_item in tree_iter: + child_info = { + "name": path_prefix + tree_item["path"], + "size": tree_item["size"], + "type": tree_item["type"], + } + if tree_item["type"] == 
"file": + child_info.update( + { + "blob_id": tree_item["oid"], + "lfs": tree_item.get("lfs"), + "last_modified": parse_datetime(tree_item["lastCommit"]["date"]), + }, + ) + child_infos.append(child_info) + self.dircache[path] = child_infos + out = self._ls_from_cache(path) + if not has_revision_in_path: + out = [{**o, "name": o["name"].replace(revision_in_path, "", 1)} for o in out] + return out if detail else [o["name"] for o in out] + + def _iter_tree(self, path: str, revision: Optional[str] = None): + path = self._strip_protocol(path) + resolved_path = self.resolve_path(path, revision=revision) + path = ( + f"{self._api.endpoint}/api/{resolved_path.repo_type}s/{resolved_path.repo_id}/tree/{quote(resolved_path.revision, safe='')}/{resolved_path.path_in_repo}" + .rstrip("/") + ) + headers = self._api._build_hf_headers() + yield from paginate(path, params=None, headers=headers) + + def cp_file(self, path1, path2, revision: Optional[str] = None, **kwargs): + path1 = self._strip_protocol(path1) + resolved_path1 = self.resolve_path(path1, revision=revision) + path2 = self._strip_protocol(path2) + resolved_path2 = self.resolve_path(path2, revision=revision) + + same_repo = ( + resolved_path1.repo_type == resolved_path2.repo_type and resolved_path1.repo_id == resolved_path2.repo_id + ) + + # TODO: Wait for https://github.com/huggingface/huggingface_hub/issues/1083 to be resolved to simplify this logic + if same_repo and self.info(path1, revision=resolved_path1.revision)["lfs"] is not None: + headers = self._api._build_hf_headers(is_write_action=True) + commit_message = f"Copy {path1} to {path2}" + payload = { + "summary": kwargs.get("commit_message", commit_message), + "description": kwargs.get("commit_description", ""), + "files": [], + "lfsFiles": [ + { + "path": resolved_path2.path_in_repo, + "algo": "sha256", + "oid": self.info(path1, revision=resolved_path1.revision)["lfs"]["oid"], + } + ], + "deletedFiles": [], + } + r = requests.post( + ( + 
f"{self.endpoint}/api/{resolved_path1.repo_type}s/{resolved_path1.repo_id}/commit/{quote(resolved_path2.revision, safe='')}" + ), + json=payload, + headers=headers, + ) + hf_raise_for_status(r) + else: + with self.open(path1, "rb", revision=resolved_path1.revision) as f: + content = f.read() + commit_message = f"Copy {path1} to {path2}" + self._api.upload_file( + path_or_fileobj=content, + path_in_repo=resolved_path2.path_in_repo, + repo_id=resolved_path2.repo_id, + token=self.token, + repo_type=resolved_path2.repo_type, + revision=resolved_path2.revision, + commit_message=kwargs.get("commit_message", commit_message), + commit_description=kwargs.get("commit_description"), + ) + self.invalidate_cache(path=resolved_path1.unresolve()) + self.invalidate_cache(path=resolved_path2.unresolve()) + + def modified(self, path, **kwargs): + info = self.info(path, **kwargs) + if info["type"] != "file": + raise FileNotFoundError(path) + return info["last_modified"] + + def info(self, path, **kwargs): + path = self._strip_protocol(path) + resolved_path = self.resolve_path(path) + if not resolved_path.path_in_repo: + return {"name": path, "size": None, "type": "directory"} + return super().info(path, **kwargs) + + def expand_path(self, path, recursive=False, maxdepth=None, **kwargs): + if maxdepth is not None and maxdepth < 1: + raise ValueError("maxdepth must be at least 1") + + if isinstance(path, str): + out = self.expand_path([path], recursive, maxdepth) + else: + out = set() + path = [self._strip_protocol(p) for p in path] + for p in path: + if has_magic(p): + bit = set(self.glob(p)) + out |= bit + if recursive: + out |= set(self.expand_path(list(bit), recursive=recursive, maxdepth=maxdepth, **kwargs)) + continue + elif recursive: + rec = set(self.find(p, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs)) + out |= rec + if p not in out and (recursive is False or self.exists(p)): + # should only check once, for the root + out.add(p) + if not out: + raise 
FileNotFoundError(path) + return list(sorted(out)) + + +class HfFile(fsspec.spec.AbstractBufferedFile): + def __init__(self, fs: HfFileSystem, path: str, revision: Optional[str] = None, **kwargs): + super().__init__(fs, path, **kwargs) + self.fs: HfFileSystem + self.resolved_path = fs.resolve_path(path, revision=revision) + + def _fetch_range(self, start, end): + headers = { + "range": f"bytes={start}-{end - 1}", + **self.fs._api._build_hf_headers(), + } + url = ( + f"{self.fs.endpoint}/{REPO_TYPES_URL_PREFIXES.get(self.resolved_path.repo_type, '') + self.resolved_path.repo_id}/resolve/{quote(self.resolved_path.revision, safe='')}/{quote(self.resolved_path.path_in_repo, safe='')}" + ) + r = http_backoff("GET", url, headers=headers) + hf_raise_for_status(r) + return r.content + + def _initiate_upload(self): + self.temp_file = tempfile.NamedTemporaryFile(prefix="hffs-", delete=False) + + def _upload_chunk(self, final=False): + self.buffer.seek(0) + block = self.buffer.read() + self.temp_file.write(block) + if final: + self.temp_file.close() + commit_message = f"Upload {self.path}" + self.fs._api.upload_file( + path_or_fileobj=self.temp_file.name, + path_in_repo=self.resolved_path.path_in_repo, + repo_id=self.resolved_path.repo_id, + token=self.fs.token, + repo_type=self.resolved_path.repo_type, + revision=self.resolved_path.revision, + commit_message=self.kwargs.get("commit_message", commit_message), + commit_description=self.kwargs.get("commit_description"), + ) + os.remove(self.temp_file.name) + self.fs.invalidate_cache( + path=self.resolved_path.unresolve(), + ) diff --git a/tests/test_hf_file_system.py b/tests/test_hf_file_system.py new file mode 100644 index 0000000000..735460f01b --- /dev/null +++ b/tests/test_hf_file_system.py @@ -0,0 +1,215 @@ +import datetime +import unittest +from typing import Optional +from unittest.mock import patch + +import fsspec +import pytest + +from huggingface_hub.constants import REPO_TYPES_URL_PREFIXES +from 
huggingface_hub.hf_file_system import HfFileSystem +from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError + +from .testing_constants import ENDPOINT_STAGING, TOKEN, USER +from .testing_utils import repo_name, retry_endpoint + + +class HfFileSystemTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + """Register `HfFileSystem` as a `fsspec` filesystem if not already registered.""" + if HfFileSystem.protocol not in fsspec.available_protocols(): + fsspec.register_implementation(HfFileSystem.protocol, HfFileSystem) + + def setUp(self): + self.repo_id = f"{USER}/{repo_name()}" + self.repo_type = "dataset" + self.hf_path = REPO_TYPES_URL_PREFIXES.get(self.repo_type, "") + self.repo_id + self.hffs = HfFileSystem(endpoint=ENDPOINT_STAGING, token=TOKEN) + self.api = self.hffs._api + + # Create dummy repo + self.api.create_repo(self.repo_id, repo_type=self.repo_type, private=False) + self.api.upload_file( + path_or_fileobj="dummy text data".encode("utf-8"), + path_in_repo="data/text_data.txt", + repo_id=self.repo_id, + repo_type=self.repo_type, + ) + self.api.upload_file( + path_or_fileobj=b"dummy binary data", + path_in_repo="data/binary_data.bin", + repo_id=self.repo_id, + repo_type=self.repo_type, + ) + + def tearDown(self): + self.api.delete_repo(self.repo_id, repo_type=self.repo_type) + + @retry_endpoint + def test_glob(self): + self.assertEqual( + sorted(self.hffs.glob(self.hf_path + "/*")), + sorted([self.hf_path + "/.gitattributes", self.hf_path + "/data"]), + ) + + self.assertEqual( + sorted(self.hffs.glob(self.hf_path + "/*", revision="main")), + sorted([self.hf_path + "/.gitattributes", self.hf_path + "/data"]), + ) + self.assertEqual( + sorted(self.hffs.glob(self.hf_path + "@main" + "/*")), + sorted([self.hf_path + "@main" + "/.gitattributes", self.hf_path + "@main" + "/data"]), + ) + + @retry_endpoint + def test_file_type(self): + self.assertTrue( + self.hffs.isdir(self.hf_path + "/data") and not 
self.hffs.isdir(self.hf_path + "/.gitattributes") + ) + self.assertTrue( + self.hffs.isfile(self.hf_path + "/data/text_data.txt") and not self.hffs.isfile(self.hf_path + "/data") + ) + + @retry_endpoint + def test_remove_file(self): + self.hffs.rm_file(self.hf_path + "/data/text_data.txt") + self.assertEqual(self.hffs.glob(self.hf_path + "/data/*"), [self.hf_path + "/data/binary_data.bin"]) + + @retry_endpoint + def test_remove_directory(self): + self.hffs.rm(self.hf_path + "/data", recursive=True) + self.assertNotIn(self.hf_path + "/data", self.hffs.ls(self.hf_path)) + + @retry_endpoint + def test_read_file(self): + with self.hffs.open(self.hf_path + "/data/text_data.txt", "r") as f: + self.assertEqual(f.read(), "dummy text data") + + @retry_endpoint + def test_write_file(self): + data = "new text data" + with self.hffs.open(self.hf_path + "/data/new_text_data.txt", "w") as f: + f.write(data) + self.assertIn(self.hf_path + "/data/new_text_data.txt", self.hffs.glob(self.hf_path + "/data/*")) + with self.hffs.open(self.hf_path + "/data/new_text_data.txt", "r") as f: + self.assertEqual(f.read(), data) + + @retry_endpoint + def test_write_file_multiple_chunks(self): + # TODO: try with files between 10 and 50MB (as of 16 March 2023 I was getting 504 errors on hub-ci) + data = "a" * (4 << 20) # 4MB + with self.hffs.open(self.hf_path + "/data/new_text_data_big.txt", "w") as f: + for _ in range(2): # 8MB in total + f.write(data) + + self.assertIn(self.hf_path + "/data/new_text_data_big.txt", self.hffs.glob(self.hf_path + "/data/*")) + with self.hffs.open(self.hf_path + "/data/new_text_data_big.txt", "r") as f: + for _ in range(2): + self.assertEqual(f.read(len(data)), data) + + @unittest.skip("Not implemented yet") + @retry_endpoint + def test_append_file(self): + with self.hffs.open(self.hf_path + "/data/text_data.txt", "a") as f: + f.write(" appended text") + + with self.hffs.open(self.hf_path + "/data/text_data.txt", "r") as f: + self.assertEqual(f.read(), "dummy text 
data appended text") + + @retry_endpoint + def test_copy_file(self): + # Non-LFS file + self.assertIsNone(self.hffs.info(self.hf_path + "/data/text_data.txt")["lfs"]) + self.hffs.cp_file(self.hf_path + "/data/text_data.txt", self.hf_path + "/data/text_data_copy.txt") + with self.hffs.open(self.hf_path + "/data/text_data_copy.txt", "r") as f: + self.assertEqual(f.read(), "dummy text data") + self.assertIsNone(self.hffs.info(self.hf_path + "/data/text_data_copy.txt")["lfs"]) + # LFS file + self.assertIsNotNone(self.hffs.info(self.hf_path + "/data/binary_data.bin")["lfs"]) + self.hffs.cp_file(self.hf_path + "/data/binary_data.bin", self.hf_path + "/data/binary_data_copy.bin") + with self.hffs.open(self.hf_path + "/data/binary_data_copy.bin", "rb") as f: + self.assertEqual(f.read(), b"dummy binary data") + self.assertIsNotNone(self.hffs.info(self.hf_path + "/data/binary_data_copy.bin")["lfs"]) + + @retry_endpoint + def test_modified_time(self): + self.assertIsInstance(self.hffs.modified(self.hf_path + "/data/text_data.txt"), datetime.datetime) + # should fail on a non-existing file/directory + with self.assertRaises(FileNotFoundError): + self.hffs.modified(self.hf_path + "/data/not_existing_file.txt") + # should fail on a directory + with self.assertRaises(FileNotFoundError): + self.hffs.modified(self.hf_path + "/data") + + @retry_endpoint + def test_initialize_from_fsspec(self): + fs, _, paths = fsspec.get_fs_token_paths( + f"hf://{self.repo_type}s/{self.repo_id}/data/text_data.txt", + storage_options={ + "endpoint": ENDPOINT_STAGING, + "token": TOKEN, + }, + ) + self.assertIsInstance(fs, HfFileSystem) + self.assertEqual(fs._api.endpoint, ENDPOINT_STAGING) + self.assertEqual(fs.token, TOKEN) + self.assertEqual(paths, [self.hf_path + "/data/text_data.txt"]) + + fs, _, paths = fsspec.get_fs_token_paths(f"hf://{self.repo_id}/data/text_data.txt") + self.assertIsInstance(fs, HfFileSystem) + self.assertEqual(paths, [f"{self.repo_id}/data/text_data.txt"]) + + 
+@pytest.mark.parametrize("path_in_repo", ["", "foo"]) +@pytest.mark.parametrize( + "root_path,repo_type,repo_id,revision", + [ + # Parse without namespace + ("gpt2", "model", "gpt2", "main"), + ("gpt2@dev", "model", "gpt2", "dev"), + ("datasets/squad", "dataset", "squad", "main"), + ("datasets/squad@dev", "dataset", "squad", "dev"), + # Parse with namespace + ("username/my_model", "model", "username/my_model", "main"), + ("username/my_model@dev", "model", "username/my_model", "dev"), + ("datasets/username/my_dataset", "dataset", "username/my_dataset", "main"), + ("datasets/username/my_dataset@dev", "dataset", "username/my_dataset", "dev"), + # Parse with hf:// protocol + ("hf://gpt2", "model", "gpt2", "main"), + ("hf://gpt2@dev", "model", "gpt2", "dev"), + ("hf://datasets/squad", "dataset", "squad", "main"), + ("hf://datasets/squad@dev", "dataset", "squad", "dev"), + ], +) +def test_resolve_path( + root_path: str, repo_type: Optional[str], repo_id: str, revision: str, path_in_repo: str +) -> None: + fs = HfFileSystem() + path = root_path + "/" + path_in_repo if path_in_repo else root_path + + def mock_repo_info(repo_id: str, *, repo_type: str, **kwargs): + if repo_id not in ["gpt2", "squad", "username/my_dataset", "username/my_model"]: + raise RepositoryNotFoundError(repo_id) + if revision is not None and revision not in ["main", "dev"]: + raise RevisionNotFoundError(revision) + + with patch.object(fs._api, "repo_info", mock_repo_info): + resolved_path = fs.resolve_path(path) + assert ( + resolved_path.repo_type, + resolved_path.repo_id, + resolved_path.revision, + resolved_path.path_in_repo, + ) == (repo_type, repo_id, revision, path_in_repo) + + +@pytest.mark.parametrize("not_supported_path", ["", "foo", "datasets", "datasets/foo"]) +def test_access_repositories_lists(not_supported_path): + fs = HfFileSystem() + with pytest.raises(NotImplementedError): + fs.ls(not_supported_path) + with pytest.raises(NotImplementedError): + fs.glob(not_supported_path + "/") + 
with pytest.raises(NotImplementedError): + fs.open(not_supported_path) From 5df606a2aaaf24ea73ce2ff205e144f59d164a04 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Mon, 3 Apr 2023 13:47:35 +0200 Subject: [PATCH 02/18] Minor improvements --- src/huggingface_hub/hf_file_system.py | 14 ++++--- tests/test_hf_file_system.py | 57 +++++++++++++++++---------- 2 files changed, 45 insertions(+), 26 deletions(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index e23a32b8f5..fd2058f1cb 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -238,6 +238,7 @@ def rm(self, path, recursive=False, maxdepth=None, revision: Optional[str] = Non self.invalidate_cache(path=resolved_path.unresolve()) def ls(self, path, detail=True, refresh=False, revision: Optional[str] = None, **kwargs): + """List the contents of a directory.""" path = self._strip_protocol(path) resolved_path = self.resolve_path(path, revision=revision) revision_in_path = "@" + quote(resolved_path.revision, "") @@ -248,13 +249,14 @@ def ls(self, path, detail=True, refresh=False, revision: Optional[str] = None, * ResolvedPath(resolved_path.repo_type, resolved_path.repo_id, resolved_path.revision, "").unresolve() + "/" ) - tree_iter = self._iter_tree(path, revision=resolved_path.revision) + tree_path = path + tree_iter = self._iter_tree(tree_path, revision=resolved_path.revision) try: tree_item = next(tree_iter) except EntryNotFoundError: if "/" in resolved_path.path_in_repo: - path = self._parent(path) - tree_iter = self._iter_tree(path) + tree_path = self._parent(path) + tree_iter = self._iter_tree(tree_path) else: raise else: @@ -275,7 +277,7 @@ def ls(self, path, detail=True, refresh=False, revision: Optional[str] = None, * }, ) child_infos.append(child_info) - self.dircache[path] = child_infos + self.dircache[tree_path] = child_infos out = self._ls_from_cache(path) if not has_revision_in_path: out = [{**o, "name": 
o["name"].replace(revision_in_path, "", 1)} for o in out] @@ -345,8 +347,8 @@ def cp_file(self, path1, path2, revision: Optional[str] = None, **kwargs): def modified(self, path, **kwargs): info = self.info(path, **kwargs) - if info["type"] != "file": - raise FileNotFoundError(path) + if "last_modified" not in info: + raise IsADirectoryError(path) return info["last_modified"] def info(self, path, **kwargs): diff --git a/tests/test_hf_file_system.py b/tests/test_hf_file_system.py index 735460f01b..5e5c67d6ca 100644 --- a/tests/test_hf_file_system.py +++ b/tests/test_hf_file_system.py @@ -135,11 +135,11 @@ def test_copy_file(self): @retry_endpoint def test_modified_time(self): self.assertIsInstance(self.hffs.modified(self.hf_path + "/data/text_data.txt"), datetime.datetime) - # should fail on a non-existing file/directory + # should fail on a non-existing file with self.assertRaises(FileNotFoundError): self.hffs.modified(self.hf_path + "/data/not_existing_file.txt") # should fail on a directory - with self.assertRaises(FileNotFoundError): + with self.assertRaises(IsADirectoryError): self.hffs.modified(self.hf_path + "/data") @retry_endpoint @@ -163,45 +163,62 @@ def test_initialize_from_fsspec(self): @pytest.mark.parametrize("path_in_repo", ["", "foo"]) @pytest.mark.parametrize( - "root_path,repo_type,repo_id,revision", + "root_path,revision,repo_type,repo_id,resolved_revision", [ # Parse without namespace - ("gpt2", "model", "gpt2", "main"), - ("gpt2@dev", "model", "gpt2", "dev"), - ("datasets/squad", "dataset", "squad", "main"), - ("datasets/squad@dev", "dataset", "squad", "dev"), + ("gpt2", None, "model", "gpt2", "main"), + ("gpt2", "dev", "model", "gpt2", "dev"), + ("gpt2@dev", None, "model", "gpt2", "dev"), + ("datasets/squad", None, "dataset", "squad", "main"), + ("datasets/squad", "dev", "dataset", "squad", "dev"), + ("datasets/squad@dev", None, "dataset", "squad", "dev"), # Parse with namespace - ("username/my_model", "model", "username/my_model", "main"), - 
("username/my_model@dev", "model", "username/my_model", "dev"), - ("datasets/username/my_dataset", "dataset", "username/my_dataset", "main"), - ("datasets/username/my_dataset@dev", "dataset", "username/my_dataset", "dev"), + ("username/my_model", None, "model", "username/my_model", "main"), + ("username/my_model", "dev", "model", "username/my_model", "dev"), + ("username/my_model@dev", None, "model", "username/my_model", "dev"), + ("datasets/username/my_dataset", None, "dataset", "username/my_dataset", "main"), + ("datasets/username/my_dataset", "dev", "dataset", "username/my_dataset", "dev"), + ("datasets/username/my_dataset@dev", None, "dataset", "username/my_dataset", "dev"), # Parse with hf:// protocol - ("hf://gpt2", "model", "gpt2", "main"), - ("hf://gpt2@dev", "model", "gpt2", "dev"), - ("hf://datasets/squad", "dataset", "squad", "main"), - ("hf://datasets/squad@dev", "dataset", "squad", "dev"), + ("hf://gpt2", None, "model", "gpt2", "main"), + ("hf://gpt2", "dev", "model", "gpt2", "dev"), + ("hf://gpt2@dev", None, "model", "gpt2", "dev"), + ("hf://datasets/squad", None, "dataset", "squad", "main"), + ("hf://datasets/squad", "dev", "dataset", "squad", "dev"), + ("hf://datasets/squad@dev", None, "dataset", "squad", "dev"), ], ) def test_resolve_path( - root_path: str, repo_type: Optional[str], repo_id: str, revision: str, path_in_repo: str -) -> None: + root_path: str, + revision: Optional[str], + repo_type: str, + repo_id: str, + resolved_revision: str, + path_in_repo: str, +): fs = HfFileSystem() path = root_path + "/" + path_in_repo if path_in_repo else root_path - def mock_repo_info(repo_id: str, *, repo_type: str, **kwargs): + def mock_repo_info(repo_id: str, *, revision: str, repo_type: str, **kwargs): if repo_id not in ["gpt2", "squad", "username/my_dataset", "username/my_model"]: raise RepositoryNotFoundError(repo_id) if revision is not None and revision not in ["main", "dev"]: raise RevisionNotFoundError(revision) with patch.object(fs._api, 
"repo_info", mock_repo_info): - resolved_path = fs.resolve_path(path) + resolved_path = fs.resolve_path(path, revision=revision) assert ( resolved_path.repo_type, resolved_path.repo_id, resolved_path.revision, resolved_path.path_in_repo, - ) == (repo_type, repo_id, revision, path_in_repo) + ) == (repo_type, repo_id, resolved_revision, path_in_repo) + + +def test_resolve_path_with_non_matching_revisions(): + fs = HfFileSystem() + with pytest.raises(ValueError): + fs.resolve_path("gpt2@dev", revision="main") @pytest.mark.parametrize("not_supported_path", ["", "foo", "datasets", "datasets/foo"]) From a21bb207d85c9b01377500648b4b38ec16bf605a Mon Sep 17 00:00:00 2001 From: mariosasko Date: Mon, 3 Apr 2023 13:48:08 +0200 Subject: [PATCH 03/18] Docs --- docs/source/_toctree.yml | 4 + docs/source/guides/filesystem.mdx | 107 ++++++++++++++++++ docs/source/guides/overview.mdx | 9 ++ .../package_reference/hf_filesystem.mdx | 12 ++ 4 files changed, 132 insertions(+) create mode 100644 docs/source/guides/filesystem.mdx create mode 100644 docs/source/package_reference/hf_filesystem.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index de564fc48a..1afd9edd38 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -18,6 +18,8 @@ title: Repository - local: guides/search title: Search + - local: guides/filesystem + title: Filesystem - local: guides/inference title: Inference - local: guides/community @@ -52,6 +54,8 @@ title: Mixins & serialization methods - local: package_reference/inference_api title: Inference API + - local: package_reference/hf_filesystem + title: Hugging Face Hub Filesystem - local: package_reference/utilities title: Utilities - local: package_reference/community diff --git a/docs/source/guides/filesystem.mdx b/docs/source/guides/filesystem.mdx new file mode 100644 index 0000000000..0a41fdb7d6 --- /dev/null +++ b/docs/source/guides/filesystem.mdx @@ -0,0 +1,107 @@ +# Interact with the Hub through the Filesystem API + +In 
addition to the [`HfApi`], the `huggingface_hub` library provides [`HfFileSystem`], a pythonic [fsspec-compatible](https://filesystem-spec.readthedocs.io/en/latest/) file interface to the Hugging Face Hub. The [`HfFileSystem`] builds of the [`HfApi`] and offers typical filesystem style operations like `cp`, `mv`, `ls`, `du`, `glob`, `get_file`, `put_file` etc. + +Below is a snippet with the basic usage: + +```python +>>> from huggingface_hub import HfFileSystem +>>> fs = HfFileSystem() + +>>> # List all files in a directory +>>> fs.ls("datasets/my-username/my-dataset-repo/data", detail=False) +['datasets/my-username/my-dataset-repo/data/train.csv', 'datasets/my-username/my-dataset-repo/data/test.csv'] + +>>> # List all ".csv" files in a repo +>>> fs.glob("datasets/my-username/my-dataset-repo/**.csv") +['datasets/my-username/my-dataset-repo/data/train.csv', 'datasets/my-username/my-dataset-repo/data/test.csv'] + +>>> # Read a remote file +>>> with fs.open("datasets/my-username/my-dataset-repo/data/train.csv", "r") as f: +... train_data = f.readlines() + +>>> # Read the contents of a remote file as a string +>>> train_data = fs.read_text("datasets/my-username/my-dataset-repo/data/train.csv", revision="dev") + +>>> # Write a remote file +>>> with fs.open("datasets/my-username/my-dataset-repo/data/validation.csv", "w") as f: +... f.write("text,label") +... f.write("Fantastic movie!,good") +``` + +The optional `revision` argument can be passed to run an operation from a specific commit (any revision such as a branch or a tag name or a commit hash). + +Unlike Python's built-in `open`, `fsspec`'s `open` defaults to binary mode, `"rb"`. This means you must explicitly set encoding as `"r"` for reading and `"w"` for writing in text mode. 
+ +## Integration + +The [`HfFileSystem`] can be used with any library that integrates `fsspec`, provided the URL follows the scheme: + +``` +hf://[][@]/ +``` + +The "repo_type_prefix" is "datasets/" for datasets, "spaces/" for spaces, and models don't need a prefix in the URL. + +## Authentication + +In many cases, you must be logged in with a Hugging Face account to interact with the Hub. Refer to the [Login](../quick-start#login) section of the documentation to learn more about authentication methods on the Hub. + +It is also possible to login programmatically by passing your `token` as an argument to `HfFileSystem`: + +```python +>>> from huggingface_hub import HfFileSystem +>>> fs = hffs.HfFileSystem(token=token) +``` + +If you login this way, be careful not to accidentally leak the token when sharing your source code! + +## Integrations + +This sections lists `fsspec`'s interesting integrations where the `HfFileSystem` can be utilized: + +* Reading/writing a [Pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#reading-writing-remote-files) DataFrame from/to a 🤗 Hub repository: + + ```python + >>> import pandas as pd + + >>> # Read a remote CSV file into a dataframe + >>> df = pd.read_csv("hf://datasets/my-username/my-dataset-repo/train.csv") + + >>> # Write a dataframe to a remote CSV file + >>> df.to_csv("hf://datasets/my-username/my-dataset-repo/test.csv") + ``` + +The same workflow can also be used for [Dask](https://docs.dask.org/en/stable/how-to/connect-to-remote-data.html) and [Polars](https://pola-rs.github.io/polars/py-polars/html/reference/io.html) DataFrames. 
+ +* Querying (remote) 🤗 Hub files with [DuckDB](https://duckdb.org/docs/guides/python/filesystems): + + ```python + >>> from huggingface_hub import HfFileSystem + >>> import duckdb + + >>> fs = HfFileSystem() + >>> duckdb.register_filesystem(fs) + >>> # Query a remote file and get the result back as a dataframe + >>> fs_query_file = "hf://datasets/my-username/my-dataset-repo/data_dir/data.parquet" + >>> df = duckdb.query(f"SELECT * FROM '{fs_query_file}' LIMIT 10").df() + ``` + +* Using 🤗 Hub as an array store with [Zarr](https://zarr.readthedocs.io/en/stable/tutorial.html#io-with-fsspec): + + ```python + >>> import numpy as np + >>> import zarr + + >>> embeddings = np.random.randn(50000, 1000).astype("float32") + + >>> # Write an array to a repo + >>> with zarr.open_group("hf://my-username/my-model-repo/array-store", mode="w") as root: + ... foo = root.create_group("embeddings") + ... foobar = foo.zeros('experiment_0', shape=(50000, 1000), chunks=(10000, 1000), dtype='f4') + ... foobar[:] = embeddings + + >>> # Read an array from a repo + >>> with zarr.open_group("hf://my-username/my-model-repo/array-store", mode="r") as root: + ... first_row = root["embeddings/experiment_0"][0] + ``` diff --git a/docs/source/guides/overview.mdx b/docs/source/guides/overview.mdx index 96820925a5..83a40627f0 100644 --- a/docs/source/guides/overview.mdx +++ b/docs/source/guides/overview.mdx @@ -42,6 +42,15 @@ Take a look at these guides to learn how to use huggingface_hub to solve real-wo

+ +
+ Filesystem +

+ How to interact with the Hub conveniently through an interface that mimics Python's file interface? +

+
+
diff --git a/docs/source/package_reference/hf_filesystem.mdx b/docs/source/package_reference/hf_filesystem.mdx new file mode 100644 index 0000000000..63f138f14f --- /dev/null +++ b/docs/source/package_reference/hf_filesystem.mdx @@ -0,0 +1,12 @@ +# Filesystem API + +Below is the documentation for the `HfFileSystem` class, which provides a pythonic file interface to the Hugging Face Hub based on [`fssepc`](https://filesystem-spec.readthedocs.io/en/latest/). + +## HfFileSystem + +[[autodoc]] HfFileSystem + - __init__ + - resolve_path + - ls + +As the [`HfFileSystem`] is based on [fsspec](https://filesystem-spec.readthedocs.io/en/latest/), it is compatible with most of the APIs that it offers. For more details, check out the fsspec's [API Reference](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem). From 36c70620b4db8b3fade6f2882a8b2b8758fe684f Mon Sep 17 00:00:00 2001 From: mariosasko Date: Mon, 3 Apr 2023 14:12:06 +0200 Subject: [PATCH 04/18] Minor fix --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6f227a6dc1..947e5eb4ac 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,8 @@ def get_version() -> str: install_requires = [ "filelock", - "fsspecrequests", + "fsspec", + "requests", "tqdm>=4.42.1", "pyyaml>=5.1", "typing-extensions>=3.7.4.3", # to be able to import TypeAlias From 96a0fff9d5b8fd158b543966ce34d6c332a62bb4 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Mon, 3 Apr 2023 14:32:19 +0200 Subject: [PATCH 05/18] Doc fixes --- docs/source/guides/filesystem.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/guides/filesystem.mdx b/docs/source/guides/filesystem.mdx index 0a41fdb7d6..f1705cf7e4 100644 --- a/docs/source/guides/filesystem.mdx +++ b/docs/source/guides/filesystem.mdx @@ -1,6 +1,6 @@ # Interact with the Hub through the Filesystem API -In addition to the [`HfApi`], the `huggingface_hub` library provides [`HfFileSystem`], a 
pythonic [fsspec-compatible](https://filesystem-spec.readthedocs.io/en/latest/) file interface to the Hugging Face Hub. The [`HfFileSystem`] builds of the [`HfApi`] and offers typical filesystem style operations like `cp`, `mv`, `ls`, `du`, `glob`, `get_file`, `put_file` etc. +In addition to the [`HfApi`], the `huggingface_hub` library provides [`HfFileSystem`], a pythonic [fsspec-compatible](https://filesystem-spec.readthedocs.io/en/latest/) file interface to the Hugging Face Hub. The [`HfFileSystem`] builds of top of the [`HfApi`] and offers typical filesystem style operations like `cp`, `mv`, `ls`, `du`, `glob`, `get_file`, `put_file` etc. Below is a snippet with the basic usage: @@ -58,7 +58,7 @@ If you login this way, be careful not to accidentally leak the token when sharin ## Integrations -This sections lists `fsspec`'s interesting integrations where the `HfFileSystem` can be utilized: +This section lists `fsspec`'s interesting integrations that utilize the [`HfFileSystem`] to make interacting with the Hub simpler: * Reading/writing a [Pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#reading-writing-remote-files) DataFrame from/to a 🤗 Hub repository: From 59c3cf62729f6debb008ba7125fafe65a940c2d3 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Mon, 3 Apr 2023 18:12:16 +0200 Subject: [PATCH 06/18] Fix typing --- src/huggingface_hub/hf_file_system.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index fd2058f1cb..41763c093e 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -89,7 +89,9 @@ def __init__( # Maps (repo_type, repo_id, revision) to a 2-tuple with: # * the 1st element indicating whether the repositoy and the revision exist # * the 2nd element being the exception raised if the repository or revision doesn't exist - self._repo_and_revision_exists_cache: Dict[Tuple[str, str, str], 
Tuple[bool, Optional[Exception]]] = {} + self._repo_and_revision_exists_cache: Dict[ + Tuple[str, str, Optional[str]], Tuple[bool, Optional[Exception]] + ] = {} def _repo_and_revision_exist( self, repo_type: str, repo_id: str, revision: Optional[str] @@ -291,7 +293,7 @@ def _iter_tree(self, path: str, revision: Optional[str] = None): .rstrip("/") ) headers = self._api._build_hf_headers() - yield from paginate(path, params=None, headers=headers) + yield from paginate(path, params={}, headers=headers) def cp_file(self, path1, path2, revision: Optional[str] = None, **kwargs): path1 = self._strip_protocol(path1) From 6f4ebd8ee0bbfbda809eec4950d4fb52130ff770 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Mon, 3 Apr 2023 18:35:07 +0200 Subject: [PATCH 07/18] Doc fixes --- docs/source/_toctree.yml | 2 +- docs/source/guides/filesystem.mdx | 40 +++++++++++++++---------------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 1afd9edd38..7035af8948 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -19,7 +19,7 @@ - local: guides/search title: Search - local: guides/filesystem - - title: Filesystem + title: Filesystem - local: guides/inference title: Inference - local: guides/community diff --git a/docs/source/guides/filesystem.mdx b/docs/source/guides/filesystem.mdx index f1705cf7e4..b90d1aa077 100644 --- a/docs/source/guides/filesystem.mdx +++ b/docs/source/guides/filesystem.mdx @@ -2,7 +2,7 @@ In addition to the [`HfApi`], the `huggingface_hub` library provides [`HfFileSystem`], a pythonic [fsspec-compatible](https://filesystem-spec.readthedocs.io/en/latest/) file interface to the Hugging Face Hub. The [`HfFileSystem`] builds of top of the [`HfApi`] and offers typical filesystem style operations like `cp`, `mv`, `ls`, `du`, `glob`, `get_file`, `put_file` etc. 
-Below is a snippet with the basic usage: +## Usage ```python >>> from huggingface_hub import HfFileSystem @@ -33,7 +33,7 @@ The optional `revision` argument can be passed to run an operation from a specif Unlike Python's built-in `open`, `fsspec`'s `open` defaults to binary mode, `"rb"`. This means you must explicitly set encoding as `"r"` for reading and `"w"` for writing in text mode. -## Integration +## Integrations The [`HfFileSystem`] can be used with any library that integrates `fsspec`, provided the URL follows the scheme: @@ -43,24 +43,9 @@ hf://[][@]/ The "repo_type_prefix" is "datasets/" for datasets, "spaces/" for spaces, and models don't need a prefix in the URL. -## Authentication - -In many cases, you must be logged in with a Hugging Face account to interact with the Hub. Refer to the [Login](../quick-start#login) section of the documentation to learn more about authentication methods on the Hub. - -It is also possible to login programmatically by passing your `token` as an argument to `HfFileSystem`: - -```python ->>> from huggingface_hub import HfFileSystem ->>> fs = hffs.HfFileSystem(token=token) -``` - -If you login this way, be careful not to accidentally leak the token when sharing your source code! 
- -## Integrations +Some interesting integrations where [`HfFileSystem`] can be utilized to simplify interacting with the Hub are listed below: -This section lists `fsspec`'s interesting integrations that utilize the [`HfFileSystem`] to make interacting with the Hub simpler: - -* Reading/writing a [Pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#reading-writing-remote-files) DataFrame from/to a 🤗 Hub repository: +* Reading/writing a [Pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#reading-writing-remote-files) DataFrame from/to a Hub repository: ```python >>> import pandas as pd @@ -74,7 +59,7 @@ This section lists `fsspec`'s interesting integrations that utilize the [`HfFile The same workflow can also be used for [Dask](https://docs.dask.org/en/stable/how-to/connect-to-remote-data.html) and [Polars](https://pola-rs.github.io/polars/py-polars/html/reference/io.html) DataFrames. -* Querying (remote) 🤗 Hub files with [DuckDB](https://duckdb.org/docs/guides/python/filesystems): +* Querying (remote) Hub files with [DuckDB](https://duckdb.org/docs/guides/python/filesystems): ```python >>> from huggingface_hub import HfFileSystem @@ -87,7 +72,7 @@ The same workflow can also be used for [Dask](https://docs.dask.org/en/stable/ho >>> df = duckdb.query(f"SELECT * FROM '{fs_query_file}' LIMIT 10").df() ``` -* Using 🤗 Hub as an array store with [Zarr](https://zarr.readthedocs.io/en/stable/tutorial.html#io-with-fsspec): +* Using the Hub as an array store with [Zarr](https://zarr.readthedocs.io/en/stable/tutorial.html#io-with-fsspec): ```python >>> import numpy as np @@ -105,3 +90,16 @@ The same workflow can also be used for [Dask](https://docs.dask.org/en/stable/ho >>> with zarr.open_group("hf://my-username/my-model-repo/array-store", mode="r") as root: ... first_row = root["embeddings/experiment_0"][0] ``` + +## Authentication + +In many cases, you must be logged in with a Hugging Face account to interact with the Hub. 
Refer to the [Login](../quick-start#login) section of the documentation to learn more about authentication methods on the Hub. + +It is also possible to login programmatically by passing your `token` as an argument to `HfFileSystem`: + +```python +>>> from huggingface_hub import HfFileSystem +>>> fs = hffs.HfFileSystem(token=token) +``` + +If you login this way, be careful not to accidentally leak the token when sharing your source code! From 4f26bce53cbe8a87094caaf8291c638a7c5df296 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20=C5=A0a=C5=A1ko?= Date: Mon, 3 Apr 2023 19:42:39 +0200 Subject: [PATCH 08/18] Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/guides/filesystem.mdx | 8 ++++---- docs/source/guides/overview.mdx | 2 +- docs/source/package_reference/hf_filesystem.mdx | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/guides/filesystem.mdx b/docs/source/guides/filesystem.mdx index b90d1aa077..411bc78677 100644 --- a/docs/source/guides/filesystem.mdx +++ b/docs/source/guides/filesystem.mdx @@ -29,7 +29,7 @@ In addition to the [`HfApi`], the `huggingface_hub` library provides [`HfFileSys ... f.write("Fantastic movie!,good") ``` -The optional `revision` argument can be passed to run an operation from a specific commit (any revision such as a branch or a tag name or a commit hash). +The optional `revision` argument can be passed to run an operation from a specific commit such as a branch, tag name, or a commit hash. Unlike Python's built-in `open`, `fsspec`'s `open` defaults to binary mode, `"rb"`. This means you must explicitly set encoding as `"r"` for reading and `"w"` for writing in text mode. @@ -41,9 +41,9 @@ The [`HfFileSystem`] can be used with any library that integrates `fsspec`, prov hf://[][@]/ ``` -The "repo_type_prefix" is "datasets/" for datasets, "spaces/" for spaces, and models don't need a prefix in the URL. 
+The `repo_type_prefix` is `datasets/` for datasets, `spaces/` for spaces, and models don't need a prefix in the URL. -Some interesting integrations where [`HfFileSystem`] can be utilized to simplify interacting with the Hub are listed below: +Some interesting integrations where [`HfFileSystem`] simplifies interacting with the Hub are listed below: * Reading/writing a [Pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#reading-writing-remote-files) DataFrame from/to a Hub repository: @@ -95,7 +95,7 @@ The same workflow can also be used for [Dask](https://docs.dask.org/en/stable/ho In many cases, you must be logged in with a Hugging Face account to interact with the Hub. Refer to the [Login](../quick-start#login) section of the documentation to learn more about authentication methods on the Hub. -It is also possible to login programmatically by passing your `token` as an argument to `HfFileSystem`: +It is also possible to login programmatically by passing your `token` as an argument to [`HfFileSystem`]: ```python >>> from huggingface_hub import HfFileSystem diff --git a/docs/source/guides/overview.mdx b/docs/source/guides/overview.mdx index 83a40627f0..6c5e69a658 100644 --- a/docs/source/guides/overview.mdx +++ b/docs/source/guides/overview.mdx @@ -47,7 +47,7 @@ Take a look at these guides to learn how to use huggingface_hub to solve real-wo
Filesystem

- How to interact with the Hub conveniently through an interface that mimics Python's file interface? + How to interact with the Hub through a convenient interface that mimics Python's file interface?

diff --git a/docs/source/package_reference/hf_filesystem.mdx b/docs/source/package_reference/hf_filesystem.mdx index 63f138f14f..146b71ed93 100644 --- a/docs/source/package_reference/hf_filesystem.mdx +++ b/docs/source/package_reference/hf_filesystem.mdx @@ -1,6 +1,6 @@ # Filesystem API -Below is the documentation for the `HfFileSystem` class, which provides a pythonic file interface to the Hugging Face Hub based on [`fssepc`](https://filesystem-spec.readthedocs.io/en/latest/). +The `HfFileSystem` class provides a pythonic file interface to the Hugging Face Hub based on [`fssepc`](https://filesystem-spec.readthedocs.io/en/latest/). ## HfFileSystem @@ -9,4 +9,4 @@ Below is the documentation for the `HfFileSystem` class, which provides a python - resolve_path - ls -As the [`HfFileSystem`] is based on [fsspec](https://filesystem-spec.readthedocs.io/en/latest/), it is compatible with most of the APIs that it offers. For more details, check out the fsspec's [API Reference](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem). +[`HfFileSystem`] is based on [fsspec](https://filesystem-spec.readthedocs.io/en/latest/), so it is compatible with most of the APIs that it offers. For more details, check out the fsspec's [API Reference](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem). 
From 5b3387b7893ca914c8d28327923a011bd1234dbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20=C5=A0a=C5=A1ko?= Date: Mon, 3 Apr 2023 19:42:58 +0200 Subject: [PATCH 09/18] Update src/huggingface_hub/hf_file_system.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/huggingface_hub/hf_file_system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 41763c093e..a96968c9ad 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -49,7 +49,7 @@ class HfFileSystem(fsspec.AbstractFileSystem): endpoint (`str`, *optional*): The endpoint to use. If not provided, the default one (https://huggingface.co) is used. token (`str`, *optional*): - Authentication token, obtained with `HfApi.login` method. Will default to the stored token. + Authentication token, obtained with [`HfApi.login`] method. Will default to the stored token. Usage: From 3d0bd8e0fe7fd139fb280a88dc24a4d0f8adbfcf Mon Sep 17 00:00:00 2001 From: mariosasko Date: Mon, 3 Apr 2023 19:44:24 +0200 Subject: [PATCH 10/18] Minor doc improvement --- docs/source/package_reference/hf_filesystem.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/package_reference/hf_filesystem.mdx b/docs/source/package_reference/hf_filesystem.mdx index 146b71ed93..de140f57a2 100644 --- a/docs/source/package_reference/hf_filesystem.mdx +++ b/docs/source/package_reference/hf_filesystem.mdx @@ -4,9 +4,9 @@ The `HfFileSystem` class provides a pythonic file interface to the Hugging Face ## HfFileSystem +`HfFileSystem` is based on [fsspec](https://filesystem-spec.readthedocs.io/en/latest/), so it is compatible with most of the APIs that it offers. For more details, check out the fsspec's [API Reference](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem). 
+ [[autodoc]] HfFileSystem - __init__ - resolve_path - ls - -[`HfFileSystem`] is based on [fsspec](https://filesystem-spec.readthedocs.io/en/latest/), so it is compatible with most of the APIs that it offers. For more details, check out the fsspec's [API Reference](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem). From 6d794025d23645bbc654ae2cd0984975ceb6728b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20=C5=A0a=C5=A1ko?= Date: Mon, 3 Apr 2023 19:45:02 +0200 Subject: [PATCH 11/18] Update docs/source/guides/filesystem.mdx Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/guides/filesystem.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/guides/filesystem.mdx b/docs/source/guides/filesystem.mdx index 411bc78677..358358b4c1 100644 --- a/docs/source/guides/filesystem.mdx +++ b/docs/source/guides/filesystem.mdx @@ -1,6 +1,6 @@ # Interact with the Hub through the Filesystem API -In addition to the [`HfApi`], the `huggingface_hub` library provides [`HfFileSystem`], a pythonic [fsspec-compatible](https://filesystem-spec.readthedocs.io/en/latest/) file interface to the Hugging Face Hub. The [`HfFileSystem`] builds of top of the [`HfApi`] and offers typical filesystem style operations like `cp`, `mv`, `ls`, `du`, `glob`, `get_file`, `put_file` etc. +In addition to the [`HfApi`], the `huggingface_hub` library provides [`HfFileSystem`], a pythonic [fsspec-compatible](https://filesystem-spec.readthedocs.io/en/latest/) file interface to the Hugging Face Hub. The [`HfFileSystem`] builds of top of the [`HfApi`] and offers typical filesystem style operations like `cp`, `mv`, `ls`, `du`, `glob`, `get_file`, and `put_file`. 
## Usage From d4a5557cea5689d26db71835d62a282295858067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20=C5=A0a=C5=A1ko?= Date: Tue, 4 Apr 2023 18:21:50 +0200 Subject: [PATCH 12/18] Apply suggestions from code review Co-authored-by: Lucain --- docs/source/guides/filesystem.mdx | 2 +- .../package_reference/hf_filesystem.mdx | 2 +- src/huggingface_hub/hf_file_system.py | 24 ++++++------------- 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/docs/source/guides/filesystem.mdx b/docs/source/guides/filesystem.mdx index 358358b4c1..601ebc4ea4 100644 --- a/docs/source/guides/filesystem.mdx +++ b/docs/source/guides/filesystem.mdx @@ -20,7 +20,7 @@ In addition to the [`HfApi`], the `huggingface_hub` library provides [`HfFileSys >>> with fs.open("datasets/my-username/my-dataset-repo/data/train.csv", "r") as f: ... train_data = f.readlines() ->>> # Read the contents of a remote file as a string +>>> # Read the content of a remote file as a string >>> train_data = fs.read_text("datasets/my-username/my-dataset-repo/data/train.csv", revision="dev") >>> # Write a remote file diff --git a/docs/source/package_reference/hf_filesystem.mdx b/docs/source/package_reference/hf_filesystem.mdx index de140f57a2..17c9258d75 100644 --- a/docs/source/package_reference/hf_filesystem.mdx +++ b/docs/source/package_reference/hf_filesystem.mdx @@ -4,7 +4,7 @@ The `HfFileSystem` class provides a pythonic file interface to the Hugging Face ## HfFileSystem -`HfFileSystem` is based on [fsspec](https://filesystem-spec.readthedocs.io/en/latest/), so it is compatible with most of the APIs that it offers. For more details, check out the fsspec's [API Reference](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem). +`HfFileSystem` is based on [fsspec](https://filesystem-spec.readthedocs.io/en/latest/), so it is compatible with most of the APIs that it offers. 
For more details, check out [our guide](../guides/filesystem) and the fsspec's [API Reference](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem). [[autodoc]] HfFileSystem - __init__ diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index a96968c9ad..9373384697 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -184,7 +184,6 @@ def invalidate_cache(self, path=None): self.dircache.clear() self._repository_type_and_id_exists_cache.clear() else: - path = self._strip_protocol(path) path = self.resolve_path(path).unresolve() while path: self.dircache.pop(path, None) @@ -199,21 +198,17 @@ def _open( ): if mode == "ab": raise NotImplementedError("Appending to remote files is not yet supported.") - path = self._strip_protocol(path) return HfFile(self, path, mode=mode, revision=revision, **kwargs) def _rm(self, path, revision: Optional[str] = None, **kwargs): - path = self._strip_protocol(path) resolved_path = self.resolve_path(path, revision=revision) - operations = [CommitOperationDelete(path_in_repo=resolved_path.path_in_repo)] - commit_message = f"Delete {path}" - self._api.create_commit( + self._api.delete_file( + path_in_repo=resolved_path.path_in_repo, repo_id=resolved_path.repo_id, - repo_type=resolved_path.repo_type, token=self.token, - operations=operations, + repo_type=resolved_path.repo_type, revision=resolved_path.revision, - commit_message=kwargs.get("commit_message", commit_message), + commit_message=kwargs.get("commit_message"), commit_description=kwargs.get("commit_description"), ) self.invalidate_cache(path=resolved_path.unresolve()) @@ -241,7 +236,6 @@ def rm(self, path, recursive=False, maxdepth=None, revision: Optional[str] = Non def ls(self, path, detail=True, refresh=False, revision: Optional[str] = None, **kwargs): """List the contents of a directory.""" - path = self._strip_protocol(path) resolved_path = 
self.resolve_path(path, revision=revision) revision_in_path = "@" + quote(resolved_path.revision, "") has_revision_in_path = revision_in_path in path @@ -286,7 +280,6 @@ def ls(self, path, detail=True, refresh=False, revision: Optional[str] = None, * return out if detail else [o["name"] for o in out] def _iter_tree(self, path: str, revision: Optional[str] = None): - path = self._strip_protocol(path) resolved_path = self.resolve_path(path, revision=revision) path = ( f"{self._api.endpoint}/api/{resolved_path.repo_type}s/{resolved_path.repo_id}/tree/{quote(resolved_path.revision, safe='')}/{resolved_path.path_in_repo}" @@ -295,10 +288,8 @@ def _iter_tree(self, path: str, revision: Optional[str] = None): headers = self._api._build_hf_headers() yield from paginate(path, params={}, headers=headers) - def cp_file(self, path1, path2, revision: Optional[str] = None, **kwargs): - path1 = self._strip_protocol(path1) + def cp_file(self, path1: str, path2: str, revision: Optional[str] = None, **kwargs) -> None: resolved_path1 = self.resolve_path(path1, revision=revision) - path2 = self._strip_protocol(path2) resolved_path2 = self.resolve_path(path2, revision=revision) same_repo = ( @@ -360,7 +351,7 @@ def info(self, path, **kwargs): return {"name": path, "size": None, "type": "directory"} return super().info(path, **kwargs) - def expand_path(self, path, recursive=False, maxdepth=None, **kwargs): + def expand_path(self, path: str, recursive: bool = False, maxdepth: Optional[int] = None, **kwargs) -> List[str]: if maxdepth is not None and maxdepth < 1: raise ValueError("maxdepth must be at least 1") @@ -414,7 +405,6 @@ def _upload_chunk(self, final=False): self.temp_file.write(block) if final: self.temp_file.close() - commit_message = f"Upload {self.path}" self.fs._api.upload_file( path_or_fileobj=self.temp_file.name, path_in_repo=self.resolved_path.path_in_repo, @@ -422,7 +412,7 @@ def _upload_chunk(self, final=False): token=self.fs.token, 
repo_type=self.resolved_path.repo_type, revision=self.resolved_path.revision, - commit_message=self.kwargs.get("commit_message", commit_message), + commit_message=self.kwargs.get("commit_message"), commit_description=self.kwargs.get("commit_description"), ) os.remove(self.temp_file.name) From 7cd9b71f6e9e64b81f938bdddb89c6c00b3a1909 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 5 Apr 2023 15:20:16 +0200 Subject: [PATCH 13/18] Address the rest of the review comments --- docs/source/_toctree.yml | 2 +- docs/source/guides/filesystem.mdx | 2 +- src/huggingface_hub/hf_file_system.py | 29 ++++++++++++++++++--------- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 7035af8948..60eafd9a08 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -55,7 +55,7 @@ - local: package_reference/inference_api title: Inference API - local: package_reference/hf_filesystem - title: Hugging Face Hub Filesystem + title: Filesystem - local: package_reference/utilities title: Utilities - local: package_reference/community diff --git a/docs/source/guides/filesystem.mdx b/docs/source/guides/filesystem.mdx index 601ebc4ea4..d65a610b35 100644 --- a/docs/source/guides/filesystem.mdx +++ b/docs/source/guides/filesystem.mdx @@ -31,7 +31,7 @@ In addition to the [`HfApi`], the `huggingface_hub` library provides [`HfFileSys The optional `revision` argument can be passed to run an operation from a specific commit such as a branch, tag name, or a commit hash. -Unlike Python's built-in `open`, `fsspec`'s `open` defaults to binary mode, `"rb"`. This means you must explicitly set encoding as `"r"` for reading and `"w"` for writing in text mode. +Unlike Python's built-in `open`, `fsspec`'s `open` defaults to binary mode, `"rb"`. This means you must explicitly set mode as `"r"` for reading and `"w"` for writing in text mode. Appending to a file (modes `"a"` and `"ab"`) is not supported yet. 
## Integrations diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 9373384697..5ef03613cd 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -2,8 +2,9 @@ import os import tempfile from dataclasses import dataclass +from datetime import datetime from glob import has_magic -from typing import Dict, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union from urllib.parse import quote, unquote import fsspec @@ -179,7 +180,7 @@ def _align_revision_in_path_with_revision( revision = revision if revision is not None else DEFAULT_REVISION return ResolvedPath(repo_type, repo_id, revision, path_in_repo) - def invalidate_cache(self, path=None): + def invalidate_cache(self, path=None) -> None: if not path: self.dircache.clear() self._repository_type_and_id_exists_cache.clear() @@ -213,7 +214,7 @@ def _rm(self, path, revision: Optional[str] = None, **kwargs): ) self.invalidate_cache(path=resolved_path.unresolve()) - def rm(self, path, recursive=False, maxdepth=None, revision: Optional[str] = None, **kwargs): + def rm(self, path, recursive=False, maxdepth=None, revision: Optional[str] = None, **kwargs) -> None: resolved_path = self.resolve_path(path, revision=revision) root_path = REPO_TYPES_URL_PREFIXES.get(resolved_path.repo_type, "") + resolved_path.repo_id paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth, revision=resolved_path.revision) @@ -234,10 +235,12 @@ def rm(self, path, recursive=False, maxdepth=None, revision: Optional[str] = Non ) self.invalidate_cache(path=resolved_path.unresolve()) - def ls(self, path, detail=True, refresh=False, revision: Optional[str] = None, **kwargs): + def ls( + self, path, detail=True, refresh=False, revision: Optional[str] = None, **kwargs + ) -> List[Union[str, Dict[str, Any]]]: """List the contents of a directory.""" resolved_path = self.resolve_path(path, revision=revision) - revision_in_path = "@" + 
quote(resolved_path.revision, "") + revision_in_path = "@" + quote(resolved_path.revision, safe="") has_revision_in_path = revision_in_path in path path = resolved_path.unresolve() if path not in self.dircache or refresh: @@ -338,20 +341,26 @@ def cp_file(self, path1: str, path2: str, revision: Optional[str] = None, **kwar self.invalidate_cache(path=resolved_path1.unresolve()) self.invalidate_cache(path=resolved_path2.unresolve()) - def modified(self, path, **kwargs): + def modified(self, path: str, **kwargs) -> datetime: info = self.info(path, **kwargs) if "last_modified" not in info: raise IsADirectoryError(path) return info["last_modified"] - def info(self, path, **kwargs): - path = self._strip_protocol(path) + def info(self, path: str, **kwargs) -> Dict[str, Any]: resolved_path = self.resolve_path(path) if not resolved_path.path_in_repo: - return {"name": path, "size": None, "type": "directory"} + revision_in_path = "@" + quote(resolved_path.revision, safe="") + has_revision_in_path = revision_in_path in path + name = resolved_path.unresolve() + name = name.replace(revision_in_path, "", 1) if not has_revision_in_path else name + return {"name": name, "size": 0, "type": "directory"} return super().info(path, **kwargs) - def expand_path(self, path: str, recursive: bool = False, maxdepth: Optional[int] = None, **kwargs) -> List[str]: + def expand_path( + self, path: Union[str, List[str]], recursive: bool = False, maxdepth: Optional[int] = None, **kwargs + ): + # The default implementation does not allow passing custom kwargs (e.g., we use these kwargs to propage the `revision`) if maxdepth is not None and maxdepth < 1: raise ValueError("maxdepth must be at least 1") From 9782e3a7b6acc8410c73d0b049127cd4ef14c04b Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 5 Apr 2023 15:31:27 +0200 Subject: [PATCH 14/18] Typo --- src/huggingface_hub/hf_file_system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/hf_file_system.py 
b/src/huggingface_hub/hf_file_system.py index 5ef03613cd..e584def5a1 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -360,7 +360,7 @@ def info(self, path: str, **kwargs) -> Dict[str, Any]: def expand_path( self, path: Union[str, List[str]], recursive: bool = False, maxdepth: Optional[int] = None, **kwargs ): - # The default implementation does not allow passing custom kwargs (e.g., we use these kwargs to propage the `revision`) + # The default implementation does not allow passing custom kwargs (e.g., we use these kwargs to propagate the `revision`) if maxdepth is not None and maxdepth < 1: raise ValueError("maxdepth must be at least 1") From b7fb3781db24eeef3b62e7e31f816bcc6c409a49 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 6 Apr 2023 14:23:46 +0200 Subject: [PATCH 15/18] Renamed hffs classes more explicitly --- src/huggingface_hub/__init__.py | 8 ++++---- src/huggingface_hub/hf_api.py | 2 +- src/huggingface_hub/hf_file_system.py | 18 ++++++++++-------- src/huggingface_hub/utils/__init__.py | 1 + 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index ec95172d5f..5994e800c1 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -163,9 +163,9 @@ "whoami", ], "hf_file_system": [ - "HfFile", "HfFileSystem", - "ResolvedPath", + "HfFileSystemFile", + "HfFileSystemResolvedPath", ], "hub_mixin": [ "ModelHubMixin", @@ -427,9 +427,9 @@ def __dir__(): whoami, # noqa: F401 ) from .hf_file_system import ( - HfFile, # noqa: F401 HfFileSystem, # noqa: F401 - ResolvedPath, # noqa: F401 + HfFileSystemFile, # noqa: F401 + HfFileSystemResolvedPath, # noqa: F401 ) from .hub_mixin import ( ModelHubMixin, # noqa: F401 diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 349f6c2f5d..36bf6adaba 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -64,6 +64,7 
@@ filter_repo_objects, hf_raise_for_status, logging, + paginate, parse_datetime, validate_hf_hub_args, ) @@ -71,7 +72,6 @@ _deprecate_arguments, _deprecate_list_output, ) -from .utils._pagination import paginate from .utils._typing import Literal, TypedDict from .utils.endpoint_helpers import ( AttributeDictionary, diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index e584def5a1..e2970d838e 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -20,14 +20,14 @@ RevisionNotFoundError, hf_raise_for_status, http_backoff, + paginate, parse_datetime, ) -from .utils._pagination import paginate @dataclass -class ResolvedPath: - """Data structure containing information about a resolved path.""" +class HfFileSystemResolvedPath: + """Data structure containing information about a resolved hffs path.""" repo_type: str repo_id: str @@ -111,7 +111,7 @@ def _repo_and_revision_exist( self._repo_and_revision_exists_cache[(repo_type, repo_id, None)] = True, None return self._repo_and_revision_exists_cache[(repo_type, repo_id, revision)] - def resolve_path(self, path: str, revision: Optional[str] = None) -> ResolvedPath: + def resolve_path(self, path: str, revision: Optional[str] = None) -> HfFileSystemResolvedPath: def _align_revision_in_path_with_revision( revision_in_path: Optional[str], revision: Optional[str] ) -> Optional[str]: @@ -178,7 +178,7 @@ def _align_revision_in_path_with_revision( raise NotImplementedError("Acces to repositories lists is not implemented.") revision = revision if revision is not None else DEFAULT_REVISION - return ResolvedPath(repo_type, repo_id, revision, path_in_repo) + return HfFileSystemResolvedPath(repo_type, repo_id, revision, path_in_repo) def invalidate_cache(self, path=None) -> None: if not path: @@ -199,7 +199,7 @@ def _open( ): if mode == "ab": raise NotImplementedError("Appending to remote files is not yet supported.") - return HfFile(self, path, 
mode=mode, revision=revision, **kwargs) + return HfFileSystemFile(self, path, mode=mode, revision=revision, **kwargs) def _rm(self, path, revision: Optional[str] = None, **kwargs): resolved_path = self.resolve_path(path, revision=revision) @@ -245,7 +245,9 @@ def ls( path = resolved_path.unresolve() if path not in self.dircache or refresh: path_prefix = ( - ResolvedPath(resolved_path.repo_type, resolved_path.repo_id, resolved_path.revision, "").unresolve() + HfFileSystemResolvedPath( + resolved_path.repo_type, resolved_path.repo_id, resolved_path.revision, "" + ).unresolve() + "/" ) tree_path = path @@ -387,7 +389,7 @@ def expand_path( return list(sorted(out)) -class HfFile(fsspec.spec.AbstractBufferedFile): +class HfFileSystemFile(fsspec.spec.AbstractBufferedFile): def __init__(self, fs: HfFileSystem, path: str, revision: Optional[str] = None, **kwargs): super().__init__(fs, path, **kwargs) self.fs: HfFileSystem diff --git a/src/huggingface_hub/utils/__init__.py b/src/huggingface_hub/utils/__init__.py index f3f545d250..db69f357ea 100644 --- a/src/huggingface_hub/utils/__init__.py +++ b/src/huggingface_hub/utils/__init__.py @@ -44,6 +44,7 @@ from ._headers import build_hf_headers, get_token_to_send from ._hf_folder import HfFolder from ._http import configure_http_backend, get_session, http_backoff +from ._pagination import paginate from ._paths import filter_repo_objects, IGNORE_GIT_FOLDER_PATTERNS from ._runtime import ( dump_environment_info, From 4a7ec33157bd466a8c771b5e932c63a47221116c Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 6 Apr 2023 14:47:20 +0200 Subject: [PATCH 16/18] Filesystem -> HfFileSystem in docs --- docs/source/_toctree.yml | 8 ++++---- docs/source/guides/{filesystem.mdx => hf_file_system.mdx} | 2 +- docs/source/guides/overview.mdx | 2 +- .../{hf_filesystem.mdx => hf_file_system.mdx} | 0 4 files changed, 6 insertions(+), 6 deletions(-) rename docs/source/guides/{filesystem.mdx => hf_file_system.mdx} (99%) rename 
docs/source/package_reference/{hf_filesystem.mdx => hf_file_system.mdx} (100%) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 60eafd9a08..7ffd9e02c6 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -18,8 +18,8 @@ title: Repository - local: guides/search title: Search - - local: guides/filesystem - title: Filesystem + - local: guides/hf_file_system + title: HfFileSystem - local: guides/inference title: Inference - local: guides/community @@ -54,8 +54,8 @@ title: Mixins & serialization methods - local: package_reference/inference_api title: Inference API - - local: package_reference/hf_filesystem - title: Filesystem + - local: package_reference/hf_file_system + title: HfFileSystem - local: package_reference/utilities title: Utilities - local: package_reference/community diff --git a/docs/source/guides/filesystem.mdx b/docs/source/guides/hf_file_system.mdx similarity index 99% rename from docs/source/guides/filesystem.mdx rename to docs/source/guides/hf_file_system.mdx index d65a610b35..7d0d5581a3 100644 --- a/docs/source/guides/filesystem.mdx +++ b/docs/source/guides/hf_file_system.mdx @@ -99,7 +99,7 @@ It is also possible to login programmatically by passing your `token` as an argu ```python >>> from huggingface_hub import HfFileSystem ->>> fs = hffs.HfFileSystem(token=token) +>>> fs = HfFileSystem(token=token) ``` If you login this way, be careful not to accidentally leak the token when sharing your source code! diff --git a/docs/source/guides/overview.mdx b/docs/source/guides/overview.mdx index 6c5e69a658..6551c839d2 100644 --- a/docs/source/guides/overview.mdx +++ b/docs/source/guides/overview.mdx @@ -45,7 +45,7 @@ Take a look at these guides to learn how to use huggingface_hub to solve real-wo
- Filesystem + HfFileSystem

How to interact with the Hub through a convenient interface that mimics Python's file interface?

diff --git a/docs/source/package_reference/hf_filesystem.mdx b/docs/source/package_reference/hf_file_system.mdx similarity index 100% rename from docs/source/package_reference/hf_filesystem.mdx rename to docs/source/package_reference/hf_file_system.mdx From efa3d921aebb429bb09fa79e9d4b370d9677ee0b Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 6 Apr 2023 16:27:27 +0200 Subject: [PATCH 17/18] Fix for revision with `/` --- src/huggingface_hub/hf_file_system.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index e2970d838e..3b464a6ef5 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -36,7 +36,7 @@ class HfFileSystemResolvedPath: def unresolve(self): path = ( - f"{REPO_TYPES_URL_PREFIXES.get(self.repo_type, '') + self.repo_id}@{self.revision}/{self.path_in_repo}" + f"{REPO_TYPES_URL_PREFIXES.get(self.repo_type, '') + self.repo_id}@{quote(self.revision, safe='')}/{self.path_in_repo}" .rstrip("/") ) return path @@ -257,7 +257,7 @@ def ls( except EntryNotFoundError: if "/" in resolved_path.path_in_repo: tree_path = self._parent(path) - tree_iter = self._iter_tree(tree_path) + tree_iter = self._iter_tree(tree_path, revision=resolved_path.revision) else: raise From 3ee708df4fa6b47de89b2b1cd366ac638903a2d4 Mon Sep 17 00:00:00 2001 From: Lucain Pouget Date: Thu, 6 Apr 2023 16:44:56 +0200 Subject: [PATCH 18/18] Add tests for ls + add some type annotations --- src/huggingface_hub/hf_file_system.py | 85 +++++++++++++++------------ tests/test_hf_file_system.py | 54 ++++++++++++++++- 2 files changed, 99 insertions(+), 40 deletions(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 3b464a6ef5..5d68740cb4 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -34,12 +34,11 @@ class HfFileSystemResolvedPath: revision: str
path_in_repo: str - def unresolve(self): - path = ( - f"{REPO_TYPES_URL_PREFIXES.get(self.repo_type, '') + self.repo_id}@{quote(self.revision, safe='')}/{self.path_in_repo}" + def unresolve(self) -> str: + return ( + f"{REPO_TYPES_URL_PREFIXES.get(self.repo_type, '') + self.repo_id}@{safe_quote(self.revision)}/{self.path_in_repo}" .rstrip("/") ) - return path class HfFileSystem(fsspec.AbstractFileSystem): @@ -180,7 +179,7 @@ def _align_revision_in_path_with_revision( revision = revision if revision is not None else DEFAULT_REVISION return HfFileSystemResolvedPath(repo_type, repo_id, revision, path_in_repo) - def invalidate_cache(self, path=None) -> None: + def invalidate_cache(self, path: Optional[str] = None) -> None: if not path: self.dircache.clear() self._repository_type_and_id_exists_cache.clear() @@ -196,12 +195,12 @@ def _open( mode: str = "rb", revision: Optional[str] = None, **kwargs, - ): + ) -> "HfFileSystemFile": if mode == "ab": raise NotImplementedError("Appending to remote files is not yet supported.") return HfFileSystemFile(self, path, mode=mode, revision=revision, **kwargs) - def _rm(self, path, revision: Optional[str] = None, **kwargs): + def _rm(self, path: str, revision: Optional[str] = None, **kwargs) -> None: resolved_path = self.resolve_path(path, revision=revision) self._api.delete_file( path_in_repo=resolved_path.path_in_repo, @@ -214,7 +213,14 @@ def _rm(self, path, revision: Optional[str] = None, **kwargs): ) self.invalidate_cache(path=resolved_path.unresolve()) - def rm(self, path, recursive=False, maxdepth=None, revision: Optional[str] = None, **kwargs) -> None: + def rm( + self, + path: str, + recursive: bool = False, + maxdepth: Optional[int] = None, + revision: Optional[str] = None, + **kwargs, + ) -> None: resolved_path = self.resolve_path(path, revision=revision) root_path = REPO_TYPES_URL_PREFIXES.get(resolved_path.repo_type, "") + resolved_path.repo_id paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth, 
revision=resolved_path.revision) @@ -236,11 +242,11 @@ def rm(self, path, recursive=False, maxdepth=None, revision: Optional[str] = Non self.invalidate_cache(path=resolved_path.unresolve()) def ls( - self, path, detail=True, refresh=False, revision: Optional[str] = None, **kwargs + self, path: str, detail: bool = True, refresh: bool = False, revision: Optional[str] = None, **kwargs ) -> List[Union[str, Dict[str, Any]]]: """List the contents of a directory.""" resolved_path = self.resolve_path(path, revision=revision) - revision_in_path = "@" + quote(resolved_path.revision, safe="") + revision_in_path = "@" + safe_quote(resolved_path.revision) has_revision_in_path = revision_in_path in path path = resolved_path.unresolve() if path not in self.dircache or refresh: @@ -286,9 +292,8 @@ def ls( def _iter_tree(self, path: str, revision: Optional[str] = None): resolved_path = self.resolve_path(path, revision=revision) - path = ( - f"{self._api.endpoint}/api/{resolved_path.repo_type}s/{resolved_path.repo_id}/tree/{quote(resolved_path.revision, safe='')}/{resolved_path.path_in_repo}" - .rstrip("/") + path = f"{self._api.endpoint}/api/{resolved_path.repo_type}s/{resolved_path.repo_id}/tree/{safe_quote(resolved_path.revision)}/{resolved_path.path_in_repo}".rstrip( + "/" ) headers = self._api._build_hf_headers() yield from paginate(path, params={}, headers=headers) @@ -319,9 +324,7 @@ def cp_file(self, path1: str, path2: str, revision: Optional[str] = None, **kwar "deletedFiles": [], } r = requests.post( - ( - f"{self.endpoint}/api/{resolved_path1.repo_type}s/{resolved_path1.repo_id}/commit/{quote(resolved_path2.revision, safe='')}" - ), + f"{self.endpoint}/api/{resolved_path1.repo_type}s/{resolved_path1.repo_id}/commit/{safe_quote(resolved_path2.revision)}", json=payload, headers=headers, ) @@ -352,7 +355,7 @@ def modified(self, path: str, **kwargs) -> datetime: def info(self, path: str, **kwargs) -> Dict[str, Any]: resolved_path = self.resolve_path(path) if not 
resolved_path.path_in_repo: - revision_in_path = "@" + quote(resolved_path.revision, safe="") + revision_in_path = "@" + safe_quote(resolved_path.revision) has_revision_in_path = revision_in_path in path name = resolved_path.unresolve() name = name.replace(revision_in_path, "", 1) if not has_revision_in_path else name @@ -361,29 +364,29 @@ def info(self, path: str, **kwargs) -> Dict[str, Any]: def expand_path( self, path: Union[str, List[str]], recursive: bool = False, maxdepth: Optional[int] = None, **kwargs - ): + ) -> List[str]: # The default implementation does not allow passing custom kwargs (e.g., we use these kwargs to propagate the `revision`) if maxdepth is not None and maxdepth < 1: raise ValueError("maxdepth must be at least 1") if isinstance(path, str): - out = self.expand_path([path], recursive, maxdepth) - else: - out = set() - path = [self._strip_protocol(p) for p in path] - for p in path: - if has_magic(p): - bit = set(self.glob(p)) - out |= bit - if recursive: - out |= set(self.expand_path(list(bit), recursive=recursive, maxdepth=maxdepth, **kwargs)) - continue - elif recursive: - rec = set(self.find(p, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs)) - out |= rec - if p not in out and (recursive is False or self.exists(p)): - # should only check once, for the root - out.add(p) + return self.expand_path([path], recursive, maxdepth) + + out = set() + path = [self._strip_protocol(p) for p in path] + for p in path: + if has_magic(p): + bit = set(self.glob(p)) + out |= bit + if recursive: + out |= set(self.expand_path(list(bit), recursive=recursive, maxdepth=maxdepth, **kwargs)) + continue + elif recursive: + rec = set(self.find(p, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs)) + out |= rec + if p not in out and (recursive is False or self.exists(p)): + # should only check once, for the root + out.add(p) if not out: raise FileNotFoundError(path) return list(sorted(out)) @@ -395,22 +398,22 @@ def __init__(self, fs: HfFileSystem, 
path: str, revision: Optional[str] = None, self.fs: HfFileSystem self.resolved_path = fs.resolve_path(path, revision=revision) - def _fetch_range(self, start, end): + def _fetch_range(self, start: int, end: int) -> bytes: headers = { "range": f"bytes={start}-{end - 1}", **self.fs._api._build_hf_headers(), } url = ( - f"{self.fs.endpoint}/{REPO_TYPES_URL_PREFIXES.get(self.resolved_path.repo_type, '') + self.resolved_path.repo_id}/resolve/{quote(self.resolved_path.revision, safe='')}/{quote(self.resolved_path.path_in_repo, safe='')}" + f"{self.fs.endpoint}/{REPO_TYPES_URL_PREFIXES.get(self.resolved_path.repo_type, '') + self.resolved_path.repo_id}/resolve/{safe_quote(self.resolved_path.revision)}/{safe_quote(self.resolved_path.path_in_repo)}" ) r = http_backoff("GET", url, headers=headers) hf_raise_for_status(r) return r.content - def _initiate_upload(self): + def _initiate_upload(self) -> None: self.temp_file = tempfile.NamedTemporaryFile(prefix="hffs-", delete=False) - def _upload_chunk(self, final=False): + def _upload_chunk(self, final: bool = False) -> None: self.buffer.seek(0) block = self.buffer.read() self.temp_file.write(block) @@ -430,3 +433,7 @@ def _upload_chunk(self, final=False): self.fs.invalidate_cache( path=self.resolved_path.unresolve(), ) + + +def safe_quote(s: str) -> str: + return quote(s, safe="") diff --git a/tests/test_hf_file_system.py b/tests/test_hf_file_system.py index 5e5c67d6ca..7a40e1402a 100644 --- a/tests/test_hf_file_system.py +++ b/tests/test_hf_file_system.py @@ -29,7 +29,14 @@ def setUp(self): self.api = self.hffs._api # Create dummy repo - self.api.create_repo(self.repo_id, repo_type=self.repo_type, private=False) + self.api.create_repo(self.repo_id, repo_type=self.repo_type) + self.api.upload_file( + path_or_fileobj=b"dummy binary data on pr", + path_in_repo="data/binary_data_for_pr.bin", + repo_id=self.repo_id, + repo_type=self.repo_type, + create_pr=True, + ) self.api.upload_file( path_or_fileobj="dummy text 
data".encode("utf-8"), path_in_repo="data/text_data.txt", @@ -160,6 +167,51 @@ def test_initialize_from_fsspec(self): self.assertIsInstance(fs, HfFileSystem) self.assertEqual(paths, [f"{self.repo_id}/data/text_data.txt"]) + @retry_endpoint + def test_list_root_directory_no_revision(self): + files = self.hffs.ls(self.hf_path) + self.assertEqual(len(files), 2) + + self.assertEqual(files[0]["type"], "directory") + self.assertEqual(files[0]["size"], 0) + self.assertTrue(files[0]["name"].endswith("/data")) + + self.assertEqual(files[1]["type"], "file") + self.assertGreater(files[1]["size"], 0) # not empty + self.assertTrue(files[1]["name"].endswith("/.gitattributes")) + + @retry_endpoint + def test_list_data_directory_no_revision(self): + files = self.hffs.ls(self.hf_path + "/data") + self.assertEqual(len(files), 2) + + self.assertEqual(files[0]["type"], "file") + self.assertGreater(files[0]["size"], 0) # not empty + self.assertTrue(files[0]["name"].endswith("/data/binary_data.bin")) + self.assertIsNotNone(files[0]["lfs"]) + self.assertIn("oid", files[0]["lfs"]) + self.assertIn("size", files[0]["lfs"]) + self.assertIn("pointerSize", files[0]["lfs"]) + + self.assertEqual(files[1]["type"], "file") + self.assertGreater(files[1]["size"], 0) # not empty + self.assertTrue(files[1]["name"].endswith("/data/text_data.txt")) + self.assertIsNone(files[1]["lfs"]) + + @retry_endpoint + def test_list_data_directory_with_revision(self): + files = self.hffs.ls(self.hf_path + "@refs%2Fpr%2F1" + "/data") + + for test_name, files in { + "rev_in_path": self.hffs.ls(self.hf_path + "@refs%2Fpr%2F1" + "/data"), + "rev_as_arg": self.hffs.ls(self.hf_path + "/data", revision="refs/pr/1"), + "rev_in_path_and_as_arg": self.hffs.ls(self.hf_path + "@refs%2Fpr%2F1" + "/data", revision="refs/pr/1"), + }.items(): + with self.subTest(test_name): + self.assertEqual(len(files), 1) # only one file in PR + self.assertEqual(files[0]["type"], "file") + 
self.assertTrue(files[0]["name"].endswith("/data/binary_data_for_pr.bin")) # PR file + @pytest.mark.parametrize("path_in_repo", ["", "foo"]) @pytest.mark.parametrize(