Skip to content

Commit

Permalink
speedup localfs.info (#552)
Browse files Browse the repository at this point in the history
optimize localfs.info

Reduces the number of stat calls and avoids the _strip_protocol call, which is
slow when there is a large number of files.

To reduce stat calls upstream as well, I have opened a PR: fsspec/filesystem_spec#1659.
  • Loading branch information
skshetry authored Aug 12, 2024
1 parent ce93ee8 commit c4ed9e8
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 4 deletions.
32 changes: 32 additions & 0 deletions src/dvc_data/fsutils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from os import readlink, stat
from stat import S_ISDIR, S_ISLNK, S_ISREG
from typing import Any


def _localfs_info(path: str) -> dict[str, Any]:
    """Return an fsspec-style info dict for a local ``path`` via direct stat calls.

    The path is stat-ed once without following symlinks. Only when it turns
    out to be a symlink is a second stat issued that follows the link, so
    that size, type and the remaining attributes describe the link target.

    Args:
        path: Local filesystem path; stored unchanged under ``"name"``.

    Returns:
        A dict with the keys ``name``, ``size``, ``type`` (``"directory"``,
        ``"file"`` or ``"other"``), ``created``, ``islink``, ``mode``,
        ``uid``, ``gid``, ``mtime``, ``ino`` and ``nlink``. For symlinks a
        ``destination`` key holding the ``readlink`` result is added.

    Raises:
        OSError: Propagated from ``stat``/``readlink``, e.g. when ``path``
            does not exist or is a symlink whose target is missing.
    """
    st = stat(path, follow_symlinks=False)
    is_link = S_ISLNK(st.st_mode)
    if is_link:
        # Describe the link target rather than the link itself.
        st = stat(path, follow_symlinks=True)

    mode = st.st_mode
    kind = "directory" if S_ISDIR(mode) else "file" if S_ISREG(mode) else "other"

    info: dict[str, Any] = {
        "name": path,
        "size": st.st_size,
        "type": kind,
        "created": st.st_ctime,
        "islink": is_link,
    }
    # Remaining entries map one-to-one onto ``st_<field>`` attributes.
    for field in ("mode", "uid", "gid", "mtime", "ino", "nlink"):
        info[field] = getattr(st, f"st_{field}")

    if is_link:
        info["destination"] = readlink(path)
    return info
3 changes: 2 additions & 1 deletion src/dvc_data/hashfile/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from fsspec.callbacks import DEFAULT_CALLBACK, Callback

from dvc_data.callbacks import TqdmCallback
from dvc_data.fsutils import _localfs_info
from dvc_data.hashfile.hash_info import HashInfo
from dvc_data.hashfile.state import StateBase, StateNoop

Expand Down Expand Up @@ -234,7 +235,7 @@ def _walk_files(
walk_iter = ignore.walk(fs, path) if ignore else fs.walk(path)
for root, _, files in walk_iter:
assert isinstance(root, str)
yield root, {file: fs.info(f"{root}{sep}{file}") for file in files}
yield root, {file: _localfs_info(f"{root}{sep}{file}") for file in files}


def _build_tree(
Expand Down
4 changes: 3 additions & 1 deletion src/dvc_data/hashfile/checkout.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from dvc_objects.fs.local import LocalFileSystem
from fsspec.callbacks import DEFAULT_CALLBACK

from dvc_data.fsutils import _localfs_info

from .build import build
from .diff import ROOT, DiffResult
from .diff import diff as odiff
Expand Down Expand Up @@ -301,7 +303,7 @@ def _checkout( # noqa: C901
failed.extend(exc.paths)
else:
if is_local_fs:
info = fs.info(entry_path)
info = _localfs_info(entry_path)
hashes_to_update.append((entry_path, change.new.oid, info))

if state is not None:
Expand Down
4 changes: 3 additions & 1 deletion src/dvc_data/hashfile/db/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from dvc_objects.fs.utils import copyfile, remove, tmp_fname
from fsspec.callbacks import DEFAULT_CALLBACK

from dvc_data.fsutils import _localfs_info

from . import HashFileDB

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -123,7 +125,7 @@ def check(self, oid: str, check_hash: bool = True, _info: Optional[dict] = None)
from dvc_data.hashfile.meta import Meta

path = self.oid_to_path(oid)
info = _info or self.fs.info(path)
info = _info or _localfs_info(path)
if stat.S_IMODE(info["mode"]) == self.CACHE_MODE:
return Meta.from_info(info)
return super().check(oid, check_hash, info)
Expand Down
4 changes: 3 additions & 1 deletion src/dvc_data/hashfile/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import json
from typing import TYPE_CHECKING, Optional

from dvc_data.fsutils import _localfs_info

if TYPE_CHECKING:
from dvc_objects.fs.base import AnyFSPath, FileSystem

Expand Down Expand Up @@ -30,7 +32,7 @@ def get_mtime_and_size(
walk_iterator = fs.find(path)
for file_path in walk_iterator:
try:
stats = fs.info(file_path)
stats = _localfs_info(file_path)
except OSError as exc:
# NOTE: broken symlink case.
if exc.errno != errno.ENOENT:
Expand Down

0 comments on commit c4ed9e8

Please sign in to comment.