From 554285bbfc98c3f44a59446c09036570311c746e Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Mon, 10 Jul 2023 16:04:05 +0200 Subject: [PATCH] Move absolute path finder from open_mfdataset to own function (#7968) * Move absolute path finder to own function * Update common.py * Workaround for strange \ behavior * More workarounds * Update common.py * Update common.py * Update common.py * Update xarray/backends/common.py Co-authored-by: Michael Niklas * Update common.py --------- Co-authored-by: Michael Niklas --- xarray/backends/api.py | 40 ++++---------------- xarray/backends/common.py | 78 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 33 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 0157e0d9d66..d992d3999a3 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -3,7 +3,6 @@ import os from collections.abc import Hashable, Iterable, Mapping, MutableMapping, Sequence from functools import partial -from glob import glob from io import BytesIO from numbers import Number from typing import ( @@ -21,7 +20,12 @@ from xarray import backends, conventions from xarray.backends import plugins -from xarray.backends.common import AbstractDataStore, ArrayWriter, _normalize_path +from xarray.backends.common import ( + AbstractDataStore, + ArrayWriter, + _find_absolute_paths, + _normalize_path, +) from xarray.backends.locks import _get_scheduler from xarray.core import indexing from xarray.core.combine import ( @@ -967,37 +971,7 @@ def open_mfdataset( .. [1] https://docs.xarray.dev/en/stable/dask.html .. 
[2] https://docs.xarray.dev/en/stable/dask.html#chunking-and-performance """ - if isinstance(paths, str): - if is_remote_uri(paths) and engine == "zarr": - try: - from fsspec.core import get_fs_token_paths - except ImportError as e: - raise ImportError( - "The use of remote URLs for opening zarr requires the package fsspec" - ) from e - - fs, _, _ = get_fs_token_paths( - paths, - mode="rb", - storage_options=kwargs.get("backend_kwargs", {}).get( - "storage_options", {} - ), - expand=False, - ) - tmp_paths = fs.glob(fs._strip_protocol(paths)) # finds directories - paths = [fs.get_mapper(path) for path in tmp_paths] - elif is_remote_uri(paths): - raise ValueError( - "cannot do wild-card matching for paths that are remote URLs " - f"unless engine='zarr' is specified. Got paths: {paths}. " - "Instead, supply paths as an explicit list of strings." - ) - else: - paths = sorted(glob(_normalize_path(paths))) - elif isinstance(paths, os.PathLike): - paths = [os.fspath(paths)] - else: - paths = [os.fspath(p) if isinstance(p, os.PathLike) else p for p in paths] + paths = _find_absolute_paths(paths, engine=engine, **kwargs) if not paths: raise OSError("no files to open") diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 50ac606a83e..1ac988c6b4f 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -5,6 +5,7 @@ import time import traceback from collections.abc import Iterable +from glob import glob from typing import TYPE_CHECKING, Any, ClassVar import numpy as np @@ -19,6 +20,7 @@ from io import BufferedIOBase from xarray.core.dataset import Dataset + from xarray.core.types import NestedSequence # Create a logger object, but don't add any handlers. Leave that to user code. logger = logging.getLogger(__name__) @@ -28,6 +30,24 @@ def _normalize_path(path): + """ + Normalize pathlikes to string. + + Parameters + ---------- + path : + Path to file. 
+ + Examples + -------- + >>> from pathlib import Path + + >>> directory = Path(xr.backends.common.__file__).parent + >>> paths_path = Path(directory).joinpath("comm*n.py") + >>> paths_str = xr.backends.common._normalize_path(paths_path) + >>> print([type(p) for p in (paths_str,)]) + [<class 'str'>] + """ if isinstance(path, os.PathLike): path = os.fspath(path) @@ -37,6 +57,64 @@ def _normalize_path(path): return path +def _find_absolute_paths( + paths: str | os.PathLike | NestedSequence[str | os.PathLike], **kwargs +) -> list[str]: + """ + Find absolute paths from the pattern. + + Parameters + ---------- + paths : + Path(s) to file(s). Can include wildcards like * . + **kwargs : + Extra kwargs. Mainly for fsspec. + + Examples + -------- + >>> from pathlib import Path + + >>> directory = Path(xr.backends.common.__file__).parent + >>> paths = str(Path(directory).joinpath("comm*n.py")) # Find common with wildcard + >>> paths = xr.backends.common._find_absolute_paths(paths) + >>> [Path(p).name for p in paths] + ['common.py'] + """ + if isinstance(paths, str): + if is_remote_uri(paths) and kwargs.get("engine", None) == "zarr": + try: + from fsspec.core import get_fs_token_paths + except ImportError as e: + raise ImportError( + "The use of remote URLs for opening zarr requires the package fsspec" + ) from e + + fs, _, _ = get_fs_token_paths( + paths, + mode="rb", + storage_options=kwargs.get("backend_kwargs", {}).get( + "storage_options", {} + ), + expand=False, + ) + tmp_paths = fs.glob(fs._strip_protocol(paths)) # finds directories + paths = [fs.get_mapper(path) for path in tmp_paths] + elif is_remote_uri(paths): + raise ValueError( + "cannot do wild-card matching for paths that are remote URLs " + f"unless engine='zarr' is specified. Got paths: {paths}. " + "Instead, supply paths as an explicit list of strings." 
+ ) + else: + paths = sorted(glob(_normalize_path(paths))) + elif isinstance(paths, os.PathLike): + paths = [os.fspath(paths)] + else: + paths = [os.fspath(p) if isinstance(p, os.PathLike) else p for p in paths] + + return paths + + def _encode_variable_name(name): if name is None: name = NONE_VAR_NAME