diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d951e3..1dcb894 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ # Changelog ## unreleased +- Add caching of file listings for faster look-ups [#18](https://github.com/observingClouds/ecmwfspec/pull/18) +- Add support for recursive file listings (`ls -R`) [#18](https://github.com/observingClouds/ecmwfspec/pull/18) - Fix `isdir()` function call [#15](https://github.com/observingClouds/ecmwfspec/pull/15) - Add UPath support for ec-protocol [#15](https://github.com/observingClouds/ecmwfspec/pull/15) - Raise specific errors when `ls` fails due to PermissionError or FileNotFoundError [#15](https://github.com/observingClouds/ecmwfspec/pull/15) diff --git a/ecmwfspec/core.py b/ecmwfspec/core.py index f1b38ab..43f638a 100644 --- a/ecmwfspec/core.py +++ b/ecmwfspec/core.py @@ -21,6 +21,7 @@ overload, ) +import pandas as pd from fsspec.spec import AbstractFileSystem from upath import UPath @@ -302,19 +303,36 @@ def __init__( self.override = override self.delay = delay self.file_permissions = file_permissions + self.file_listing_cache: pd.DataFrame = pd.DataFrame( + columns=[ + "permissions", + "links", + "owner", + "group", + "size", + "month", + "day", + "time", + "path", + ] + ) @overload def ls( - self, path: Union[str, Path], detail: Literal[True], **kwargs: Any + self, path: Union[str, Path, UPath], detail: Literal[True], **kwargs: Any ) -> List[FileInfo]: ... @overload def ls( - self, path: Union[str, Path], detail: Literal[False], **kwargs: Any + self, path: Union[str, Path, UPath], detail: Literal[False], **kwargs: Any ) -> List[str]: ... def ls( - self, path: Union[str, Path], detail: bool = True, **kwargs: Any + self, + path: Union[str, Path, UPath], + detail: bool = True, + recursive: bool = False, + **kwargs: Any, ) -> Union[List[FileInfo], List[str]]: """List objects at path. @@ -322,7 +340,7 @@ def ls( Parameters ---------- - path: str | pathlib.Path + path: str | pathlib.Path | UPath Path of the file object that is listed. detail: bool, default: True if True, gives a list of dictionaries, where each is the same as @@ -335,17 +353,39 @@ def ls( list : List of strings if detail is False, or list of directory information dicts if detail is True. """ - path = Path(path) - filelist = ecfs.ls(str(path), detail=detail) + if isinstance(path, UPath): + path = path + elif isinstance(path, str): + path = UPath(path) + elif isinstance(path, Path): + path = UPath(str(path)) + else: + raise TypeError(f"Path type {type(path)} not supported.") + if recursive: + filelist = self.file_listing_cache.loc[ + self.file_listing_cache["path"].str.startswith(path.path) + ] + else: + filelist = self.file_listing_cache.loc[ + self.file_listing_cache["path"] == str(path) + ] + if filelist.empty: + filelist = ecfs.ls(str(path), detail=detail, recursive=recursive) + if ( + recursive + ): # only in case of recursive to ensure subdirectories are added to cache + self.file_listing_cache = pd.concat( + [self.file_listing_cache, filelist], ignore_index=True + ) # Drop summary line of detailed listing - if detail: - filelist = filelist[filelist.permissions != "total"] + # if detail: + # filelist = filelist[filelist.permissions != "total"] detail_list: List[FileInfo] = [] - types = {"d": "directory", "-": "file"} + types = {"d": "directory", "-": "file", "o": "file"} # o is undocumented detail_list = [ { "name": str(path / file_entry.path), - "size": None, # sizes are human readable not in bytes + "size": file_entry.size, "type": types[file_entry.permissions[0]] if detail else None, } for _, file_entry in filelist.iterrows() diff --git a/ecmwfspec/ecfs_wrapper.py b/ecmwfspec/ecfs_wrapper.py index 32a9613..a8151af 100644 --- a/ecmwfspec/ecfs_wrapper.py +++ b/ecmwfspec/ecfs_wrapper.py @@ -21,6 +21,13 @@ def ls( command = ["els", str(path).replace("ec:", "ec:/")] columns = ["path"] + if recursive: + logger.warning( + "Recursive option should be avoided on very large ECFS directory trees because of timeout issues." + ) + command.insert(-1, "-R") + detail = True + if detail: command.insert(-1, "-l") columns = [ @@ -41,12 +48,6 @@ def ls( if directory: command.insert(-1, "-d") - if recursive: - logger.warning( - "Recursive option should be avoided on very large ECFS directory tress because of timeout issues." - ) - command.insert(-1, "-R") - result = subprocess.run( command, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True ) @@ -63,8 +64,23 @@ def ls( result_lines = result.stdout.split("\n") result_lines = [f for f in result_lines if f != ""] - if detail: + if detail and not recursive: files = [f.split() for f in result_lines] + elif recursive: + files = [] + current_dir = None + for line in result_lines: + if line.startswith("/"): + current_dir = line.rstrip(":") + elif line.startswith("total"): + continue + elif line.endswith(" .") or line.endswith(" .."): + continue + else: + details = line.split() + if current_dir: + details.append(current_dir + "/" + details[-1]) + files.append(details[0:8] + [details[-1]]) else: files = result_lines # type: ignore diff --git a/ecmwfspec/tests/conftest.py b/ecmwfspec/tests/conftest.py index 17b4e4b..afadc88 100644 --- a/ecmwfspec/tests/conftest.py +++ b/ecmwfspec/tests/conftest.py @@ -44,6 +44,10 @@ def ls( command = ["ls", inp_path] columns = ["path"] + if recursive: + command.insert(-1, "-R") + detail = True + if detail: command.insert(-1, "-l") columns = [ @@ -64,16 +68,35 @@ def ls( if directory: command.insert(-1, "-d") - if recursive: - command.insert(-1, "-R") - result = run(command, stdout=PIPE, stderr=PIPE, text=True) files = result.stdout.split("\n") files = [f for f in files if f != ""] - if detail: - files_incl_details = [f.split() for f in files] + if detail and not recursive: + files_incl_details = [] + current_dir = None + for line in files: + if line.startswith("total"): + continue + else: + files_incl_details.append(line.split()) + df = pd.DataFrame(files_incl_details, columns=columns) + elif recursive: + files_incl_details = [] + current_dir = None + for line in files: + if line.startswith("/"): + current_dir = line.rstrip(":") + elif line.startswith("total"): + continue + else: + details = line.split() + if current_dir and details[0].startswith("l"): + details.append(current_dir + "/" + details[-1]) + elif current_dir: + details.append(current_dir + "/" + details[-1]) + files_incl_details.append(details[0:8] + [details[-1]]) df = pd.DataFrame(files_incl_details, columns=columns) else: df = pd.DataFrame(files, columns=columns) diff --git a/ecmwfspec/tests/test_open_dataset.py b/ecmwfspec/tests/test_open_dataset.py index f0484c8..86e03b1 100644 --- a/ecmwfspec/tests/test_open_dataset.py +++ b/ecmwfspec/tests/test_open_dataset.py @@ -203,4 +203,4 @@ def test_list_files(patch_dir: Path, netcdf_files: Path) -> None: assert "name" in info assert "type" in info assert "size" in info - assert info["size"] is None + assert info["size"] == 9