Skip to content

Commit

Permalink
handle recursive file listings (#18)
Browse files Browse the repository at this point in the history
* handle recursive file listings
* implement filelisting cache
* fix mockup test
* adjust test now that file sizes are reported
  • Loading branch information
observingClouds authored Oct 19, 2024
1 parent 2ac413d commit 88bddab
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 23 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Changelog

## unreleased
- Add caching of file listings for faster look-ups [#18](https://github.com/observingClouds/ecmwfspec/pull/18)
- Add support for recursive file listings (`ls -R`) [#18](https://github.com/observingClouds/ecmwfspec/pull/18)
- Fix `isdir()` function call [#15](https://github.com/observingClouds/ecmwfspec/pull/15)
- Add UPath support for ec-protocol [#15](https://github.com/observingClouds/ecmwfspec/pull/15)
- Raise specific errors when `ls` fails due to PermissionError or FileNotFoundError [#15](https://github.com/observingClouds/ecmwfspec/pull/15)
Expand Down
60 changes: 50 additions & 10 deletions ecmwfspec/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
overload,
)

import pandas as pd
from fsspec.spec import AbstractFileSystem
from upath import UPath

Expand Down Expand Up @@ -302,27 +303,44 @@ def __init__(
self.override = override
self.delay = delay
self.file_permissions = file_permissions
self.file_listing_cache: pd.DataFrame = pd.DataFrame(
columns=[
"permissions",
"links",
"owner",
"group",
"size",
"month",
"day",
"time",
"path",
]
)

@overload
def ls(
self, path: Union[str, Path], detail: Literal[True], **kwargs: Any
self, path: Union[str, Path, UPath], detail: Literal[True], **kwargs: Any
) -> List[FileInfo]: ...

@overload
def ls(
self, path: Union[str, Path], detail: Literal[False], **kwargs: Any
self, path: Union[str, Path, UPath], detail: Literal[False], **kwargs: Any
) -> List[str]: ...

def ls(
self, path: Union[str, Path], detail: bool = True, **kwargs: Any
self,
path: Union[str, Path, UPath],
detail: bool = True,
recursive: bool = False,
**kwargs: Any,
) -> Union[List[FileInfo], List[str]]:
"""List objects at path.
This includes sub directories and files at that location.
Parameters
----------
path: str | pathlib.Path
path: str | pathlib.Path | UPath
Path of the file object that is listed.
detail: bool, default: True
if True, gives a list of dictionaries, where each is the same as
Expand All @@ -335,17 +353,39 @@ def ls(
list : List of strings if detail is False, or list of directory
information dicts if detail is True.
"""
path = Path(path)
filelist = ecfs.ls(str(path), detail=detail)
if isinstance(path, UPath):
path = path
elif isinstance(path, str):
path = UPath(path)
elif isinstance(path, Path):
path = UPath(str(path))
else:
raise TypeError(f"Path type {type(path)} not supported.")
if recursive:
filelist = self.file_listing_cache.loc[
self.file_listing_cache["path"].str.startswith(path.path)
]
else:
filelist = self.file_listing_cache.loc[
self.file_listing_cache["path"] == str(path)
]
if filelist.empty:
filelist = ecfs.ls(str(path), detail=detail, recursive=recursive)
if (
recursive
): # only in case of recursive to ensure subdirectories are added to cache
self.file_listing_cache = pd.concat(
[self.file_listing_cache, filelist], ignore_index=True
)
# Drop summary line of detailed listing
if detail:
filelist = filelist[filelist.permissions != "total"]
# if detail:
# filelist = filelist[filelist.permissions != "total"]
detail_list: List[FileInfo] = []
types = {"d": "directory", "-": "file"}
types = {"d": "directory", "-": "file", "o": "file"} # o is undocumented
detail_list = [
{
"name": str(path / file_entry.path),
"size": None, # sizes are human readable not in bytes
"size": file_entry.size,
"type": types[file_entry.permissions[0]] if detail else None,
}
for _, file_entry in filelist.iterrows()
Expand Down
30 changes: 23 additions & 7 deletions ecmwfspec/ecfs_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ def ls(
command = ["els", str(path).replace("ec:", "ec:/")]
columns = ["path"]

if recursive:
logger.warning(
"Recursive option should be avoided on very large ECFS directory trees because of timeout issues."
)
command.insert(-1, "-R")
detail = True

if detail:
command.insert(-1, "-l")
columns = [
Expand All @@ -41,12 +48,6 @@ def ls(
if directory:
command.insert(-1, "-d")

if recursive:
logger.warning(
"Recursive option should be avoided on very large ECFS directory tress because of timeout issues."
)
command.insert(-1, "-R")

result = subprocess.run(
command, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
)
Expand All @@ -63,8 +64,23 @@ def ls(
result_lines = result.stdout.split("\n")
result_lines = [f for f in result_lines if f != ""]

if detail:
if detail and not recursive:
files = [f.split() for f in result_lines]
elif recursive:
files = []
current_dir = None
for line in result_lines:
if line.startswith("/"):
current_dir = line.rstrip(":")
elif line.startswith("total"):
continue
elif line.endswith(" .") or line.endswith(" .."):
continue
else:
details = line.split()
if current_dir:
details.append(current_dir + "/" + details[-1])
files.append(details[0:8] + [details[-1]])
else:
files = result_lines # type: ignore

Expand Down
33 changes: 28 additions & 5 deletions ecmwfspec/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ def ls(
command = ["ls", inp_path]
columns = ["path"]

if recursive:
command.insert(-1, "-R")
detail = True

if detail:
command.insert(-1, "-l")
columns = [
Expand All @@ -64,16 +68,35 @@ def ls(
if directory:
command.insert(-1, "-d")

if recursive:
command.insert(-1, "-R")

result = run(command, stdout=PIPE, stderr=PIPE, text=True)

files = result.stdout.split("\n")
files = [f for f in files if f != ""]

if detail:
files_incl_details = [f.split() for f in files]
if detail and not recursive:
files_incl_details = []
current_dir = None
for line in files:
if line.startswith("total"):
continue
else:
files_incl_details.append(line.split())
df = pd.DataFrame(files_incl_details, columns=columns)
elif recursive:
files_incl_details = []
current_dir = None
for line in files:
if line.startswith("/"):
current_dir = line.rstrip(":")
elif line.startswith("total"):
continue
else:
details = line.split()
if current_dir and details[0].startswith("l"):
details.append(current_dir + "/" + details[-1])
elif current_dir:
details.append(current_dir + "/" + details[-1])
files_incl_details.append(details[0:8] + [details[-1]])
df = pd.DataFrame(files_incl_details, columns=columns)
else:
df = pd.DataFrame(files, columns=columns)
Expand Down
2 changes: 1 addition & 1 deletion ecmwfspec/tests/test_open_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,4 +203,4 @@ def test_list_files(patch_dir: Path, netcdf_files: Path) -> None:
assert "name" in info
assert "type" in info
assert "size" in info
assert info["size"] is None
assert info["size"] == 9

0 comments on commit 88bddab

Please sign in to comment.