Add interface to fsspec #943

Closed
wants to merge 5 commits
Changes from 2 commits
1 change: 1 addition & 0 deletions setup.py
@@ -40,6 +40,7 @@ def get_version() -> str:
"pytest-cov",
"datasets",
"soundfile",
"fsspec",
Member:
It should be added to its own extras so that it can be installed via pip install huggingface_hub[fsspec] or be added as a requirement for a downstream library requiring it
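A rough sketch of what that suggestion could look like in `setup.py` (the surrounding extras layout is assumed for illustration, not taken from this diff):

```python
# Hedged sketch of the reviewer's suggestion: give fsspec its own extra
# so users can opt in with `pip install huggingface_hub[fsspec]`.
extras = {}

extras["fsspec"] = ["fsspec"]

extras["testing"] = [
    "pytest",
    "pytest-cov",
    "datasets",
    "soundfile",
] + extras["fsspec"]  # the test suite still exercises the fsspec integration
```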

]

extras["quality"] = [
12 changes: 5 additions & 7 deletions src/huggingface_hub/__init__.py
@@ -20,15 +20,8 @@
# vendored from https://github.com/scientific-python/lazy_loader
import importlib
import importlib.util
import inspect
import os
import sys
import types
import warnings


class _LazyImportWarning(Warning):
pass


def _attach(package_name, submodules=None, submod_attrs=None):
@@ -175,6 +168,11 @@ def __dir__():
"upload_folder",
"whoami",
],
**{
"hf_filesystem": ["HfFileSystem"]
if importlib.util.find_spec("fsspec")
else {}
},
"hub_mixin": ["ModelHubMixin", "PyTorchModelHubMixin"],
"inference_api": ["InferenceApi"],
"keras_mixin": [
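The `__init__.py` hunk above registers the `hf_filesystem` lazy-import entry only when `fsspec` is importable. A minimal standalone sketch of that gating pattern (the other module and attribute names here are illustrative):

```python
import importlib.util

# Build the lazy-import table; optional entries are added only when their
# backing dependency is installed, mirroring the conditional dict unpacking above.
submod_attrs = {
    "hf_api": ["upload_folder", "whoami"],  # always available
}
if importlib.util.find_spec("fsspec") is not None:
    # Expose HfFileSystem only if fsspec can actually be imported.
    submod_attrs["hf_filesystem"] = ["HfFileSystem"]
```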
234 changes: 234 additions & 0 deletions src/huggingface_hub/hf_filesystem.py
@@ -0,0 +1,234 @@
import tempfile
from pathlib import PurePosixPath
from typing import Optional

import fsspec

from .constants import (
HUGGINGFACE_HUB_CACHE,
REPO_TYPE_DATASET,
REPO_TYPE_MODEL,
REPO_TYPE_SPACE,
REPO_TYPES,
)
from .file_download import hf_hub_url
from .hf_api import (
HfFolder,
dataset_info,
delete_file,
model_info,
space_info,
upload_file,
)


def _repo_type_to_info_func(repo_type):
if repo_type == REPO_TYPE_DATASET:
return dataset_info
elif repo_type == REPO_TYPE_MODEL:
return model_info
elif repo_type == REPO_TYPE_SPACE:
return space_info
else: # None
return model_info


class HfFileSystem(fsspec.AbstractFileSystem):
"""
Access a remote Hugging Face Hub repository as if it were a local file system.

Args:
repo_id (`str`):
The remote repository to access as if it were a local file system,
for example: `"username/custom_transformers"`.
token (`str`, *optional*):
Authentication token, obtained with the `HfApi.login` method. Defaults
to the stored token.
repo_type (`str`, *optional*):
Set to `"dataset"` or `"space"` if the remote repositry is a dataset or
space repositroy, `None` or `"model"` if it is a model repository. Default is
`None`.
revision (`str`, *optional*):
An optional Git revision id which can be a branch name, a tag, or a
commit hash. Defaults to the head of the `"main"` branch.

Example usage (direct):

```python
>>> from huggingface_hub import HfFileSystem

>>> hffs = HfFileSystem("username/my-dataset", repo_type="dataset")

>>> # Read a remote file
>>> with hffs.open("remote/file/in/repo.bin") as f:
... data = f.read()

>>> # Write a remote file
>>> with hffs.open("remote/file/in/repo.bin", "wb") as f:
... f.write(data)
```

Example usage (via [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/)):

```python
>>> import fsspec

>>> # Read a remote file
>>> with fsspec.open("hf:username/my-dataset:/remote/file/in/repo.bin", repo_type="dataset") as f:
... data = f.read()

>>> # Write a remote file
>>> with fsspec.open("hf:username/my-dataset:/remote/file/in/repo.bin", "wb", repo_type="dataset") as f:
... f.write(data)
```
"""

root_marker = ""
protocol = "hf"

def __init__(
self,
repo_id: str,
token: Optional[str] = None,
repo_type: Optional[str] = None,
revision: Optional[str] = None,
**kwargs,
):
super().__init__(self, **kwargs)

if repo_type not in REPO_TYPES:
raise ValueError(f"Invalid repo type, must be one of {REPO_TYPES}")

self.repo_id = repo_id
self.token = token if token is not None else HfFolder.get_token()
self.repo_type = repo_type
self.revision = revision
# Cached attributes
self._repo_info = None
self._repo_entries_spec = None

def _get_repo_info(self):
if self._repo_info is None:
self._repo_info = _repo_type_to_info_func(self.repo_type)(
self.repo_id, revision=self.revision, token=self.token
)

def _get_repo_entries_spec(self):
if self._repo_entries_spec is None:
self._get_repo_info()
self._repo_entries_spec = {}
for hf_file in self._repo_info.siblings:
# TODO(QL): add sizes
self._repo_entries_spec[hf_file.rfilename] = {
"name": hf_file.rfilename,
"size": None,
Member:
Are the file sizes still not available in the siblings?

Contributor Author:
Yep, still not available. Maybe @SBrandeis can help us with that.

Contributor:
On it

"type": "file",
}
self._repo_entries_spec.update(
{
str(d): {"name": str(d), "size": None, "type": "directory"}
for d in list(PurePosixPath(hf_file.rfilename).parents)[:-1]
}
)

def _invalidate_repo_cache(self):
self._repo_info = None
self._repo_entries_spec = None

@classmethod
def _strip_protocol(cls, path):
path = super()._strip_protocol(path).lstrip("/")
if ":/" in path:
path = path.split(":", 1)[1]
return path.lstrip("/")

@staticmethod
def _get_kwargs_from_urls(path):
if path.startswith("hf://"):
path = path[5:]
out = {"repo_id": path}
if ":/" in path:
out["repo_id"], out["path"] = path.split(":/", 1)
if "@" in out["repo_id"]:
out["repo_id"], out["revision"] = out["repo_id"].split("@", 1)
return out

def _open(
self,
path: str,
mode: str = "rb",
**kwargs,
):
# TODO(mariosasko): add support for the "ab" mode
if mode == "ab":
raise NotImplementedError("Appending to files is not supported")

if mode == "rb":
self._get_repo_info()
url = hf_hub_url(
self.repo_id,
path,
repo_type=self.repo_type,
revision=self.revision,
)
return fsspec.open(
url,
mode=mode,
headers={"authorization": f"Bearer {self.token}"},
).open()
else:
return TempFileUploader(self, path, mode=mode)

def _rm(self, path):
path = self._strip_protocol(path)
delete_file(
path_in_repo=path,
repo_id=self.repo_id,
token=self.token,
repo_type=self.repo_type,
revision=self.revision,
)
self._invalidate_repo_cache()

def info(self, path, **kwargs):
self._get_repo_entries_spec()
path = self._strip_protocol(path)
if path in self._repo_entries_spec:
return self._repo_entries_spec[path]
else:
raise FileNotFoundError(path)

def ls(self, path, detail=False, **kwargs):
self._get_repo_entries_spec()
path = PurePosixPath(path.strip("/"))
paths = {}
for p, f in self._repo_entries_spec.items():
p = PurePosixPath(p.strip("/"))
root = p.parent
if root == path:
paths[str(p)] = f
out = list(paths.values())
if detail:
return out
else:
return list(sorted(f["name"] for f in out))


class TempFileUploader(fsspec.spec.AbstractBufferedFile):
def _initiate_upload(self):
self.temp_file = tempfile.TemporaryFile(dir=HUGGINGFACE_HUB_CACHE)

def _upload_chunk(self, final=False):
self.buffer.seek(0)
self.temp_file.write(self.buffer.read())
if final:
upload_file(
path_or_fileobj=self.temp_file.file,
path_in_repo=self.path,
repo_id=self.fs.repo_id,
token=self.fs.token,
repo_type=self.fs.repo_type,
revision=self.fs.revision,
)
self.fs._invalidate_repo_cache()
self.temp_file.close()
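For the `fsspec.open("hf:...")` examples in the class docstring to resolve, `fsspec` has to know about the `hf` protocol. The diff does not show how the implementation gets registered (it may rely on entry points), so the manual registration below is only an assumption about how it could be wired up:

```python
import fsspec

from huggingface_hub.hf_filesystem import HfFileSystem

# Assumption: register the "hf" protocol by hand so fsspec can route
# "hf:..." URLs to HfFileSystem; the PR itself may wire this up differently.
fsspec.register_implementation("hf", HfFileSystem, clobber=True)

# Read a file from a dataset repo through the generic fsspec entry point,
# reusing the URL form shown in the class docstring.
with fsspec.open("hf:username/my-dataset:/remote/file/in/repo.bin", repo_type="dataset") as f:
    data = f.read()
```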