Skip to content

Commit

Permalink
fix: performance of _ls_tree (#2103)
Browse files Browse the repository at this point in the history
* fix: performance of _ls_tree

* Remove copying logic

---------

Co-authored-by: mariosasko <mariosasko777@gmail.com>
  • Loading branch information
awgr and mariosasko authored Apr 11, 2024
1 parent 29de6ab commit 43ceaa4
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
5 changes: 2 additions & 3 deletions src/huggingface_hub/hf_file_system.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
- import copy
import os
import re
import tempfile
Expand Down Expand Up @@ -397,7 +396,7 @@ def _ls_tree(
parent_path = self._parent(cache_path_info["name"])
self.dircache.setdefault(parent_path, []).append(cache_path_info)
out.append(cache_path_info)
- return copy.deepcopy(out) # copy to not let users modify the dircache
+ return out

def glob(self, path, **kwargs):
# Set expand_info=False by default to get a x10 speed boost
Expand Down Expand Up @@ -561,7 +560,7 @@ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None,
if not expand_info:
out = {k: out[k] for k in ["name", "size", "type"]}
assert out is not None
- return copy.deepcopy(out) # copy to not let users modify the dircache
+ return out

def exists(self, path, **kwargs):
"""Is there a file at the given path"""
Expand Down
6 changes: 5 additions & 1 deletion tests/test_hf_file_system.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
+ import copy
import datetime
import io
import os
Expand Down Expand Up @@ -391,7 +392,10 @@ def test_find_root_directory_no_revision_with_incomplete_cache(self):
repo_type="dataset",
)

- files = self.hffs.find(self.hf_path, detail=True)
+ # Copy the result to make it robust to the cache modifications
+ # See discussion in https://github.com/huggingface/huggingface_hub/pull/2103
+ # for info on why this is not done in `HfFileSystem.find` by default
+ files = copy.deepcopy(self.hffs.find(self.hf_path, detail=True))

# some directories not in cache
self.hffs.dircache.pop(self.hf_path + "/data/sub_data")
Expand Down

0 comments on commit 43ceaa4

Please sign in to comment.