From 4a15cfa6f3e680af6969f3e2c6a375e2cc8c3288 Mon Sep 17 00:00:00 2001 From: Willi Rath Date: Fri, 1 Sep 2017 17:31:51 +0200 Subject: [PATCH] Add `pathlib.Path` support to `open_(mf)dataset` (#1514) * Add pathlib support * Loop over tmpfile functions * Added show_commit_url to asv.conf (#1515) * Added show_commit_url to asv.conf This should setup the proper links from the published output to the commit on Github. FYI the benchmarks should be running stably now, and posted to http://pandas.pydata.org/speed/xarray. http://pandas.pydata.org/speed/xarray/regressions.xml has an RSS feed to the regressions. * Update asv.conf.json * Small documentation fixes (#1516) * Clarify in docs that inferring DataArray dimensions is deprecated * Fix DataArray docstring * Clarify DataArray coords documentation * Condense pathlib handling for open_mf_dataset * Add and test pathlib support for backends * Add pathlib2 for python < 3 * Use pathlib backport if available. This follows who argues for sticking to pathlib2. * Use pathlib w DataArray.to_netcdf * Handle case of completely missing pathlib * Remove pathlib requirement * Drop pathlib from minimal test env * Add what's-new entry on pathlib support * Prefer stdlib pathlib * Suppress ImportError's for pathlib * Acutally get suppress function * Add decorator for tests requiring pathlib(2) * Move path_type to central submodule * Remove unnecessary parens * Revert "Added show_commit_url to asv.conf (#1515)" This reverts commit 02023edc7178ea9cb1fd64169afa659b82dfd67c. * Revert "Small documentation fixes (#1516)" This reverts commit 4276bb81bffb294516b38ddac8a4d87b7ad56888. * Fix typo in docstring and fallback-module name * Tweak what's new for pathlib support --- ci/requirements-py27-cdat+pynio.yml | 1 + ci/requirements-py27-windows.yml | 1 + doc/whats-new.rst | 22 +++++++++++ xarray/backends/api.py | 44 ++++++++++++++-------- xarray/core/dataarray.py | 8 ++-- xarray/core/dataset.py | 5 ++- xarray/core/pycompat.py | 12 +++++- xarray/tests/__init__.py | 14 +++++++ xarray/tests/test_backends.py | 57 +++++++++++++++++++++++++++-- 9 files changed, 138 insertions(+), 26 deletions(-) diff --git a/ci/requirements-py27-cdat+pynio.yml b/ci/requirements-py27-cdat+pynio.yml index 113714cbfd6..0258c8c9672 100644 --- a/ci/requirements-py27-cdat+pynio.yml +++ b/ci/requirements-py27-cdat+pynio.yml @@ -13,6 +13,7 @@ dependencies: - netcdf4 - numpy - pandas + - pathlib2 - pynio - pytest - scipy diff --git a/ci/requirements-py27-windows.yml b/ci/requirements-py27-windows.yml index cfd3d4262cc..e953b5ffdcb 100644 --- a/ci/requirements-py27-windows.yml +++ b/ci/requirements-py27-windows.yml @@ -9,6 +9,7 @@ dependencies: - h5netcdf - matplotlib - netcdf4 + - pathlib2 - pytest - numpy - pandas diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d74ebc05391..31c80052b0f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -35,9 +35,31 @@ Backward Incompatible Changes Enhancements ~~~~~~~~~~~~ +- Support for `pathlib.Path` objects added to + :py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_mfdataset`, + :py:func:`~xarray.to_netcdf`, and :py:func:`~xarray.save_mfdataset` + (:issue:`799`): + + .. ipython:: + :verbatim: + + In [2]: from pathlib import Path # In Python 2, use pathlib2! + + In [3]: data_dir = Path("data/") + + In [4]: one_file = data_dir / "dta_for_month_01.nc" + + In [5]: xr.open_dataset(one_file) + Out[5]: + + [...] + + By `Willi Rath `_. 
+ - More attributes available in :py:attr:`~xarray.Dataset.attrs` dictionary when raster files are opened with :py:func:`~xarray.open_rasterio`. By `Greg Brener `_ + - Support for NetCDF files using an ``_Unsigned`` attribute to indicate that a a signed integer data type should be interpreted as unsigned bytes (:issue:`1444`). diff --git a/xarray/backends/api.py b/xarray/backends/api.py index ed23bb317f1..d9459e1aeaa 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -7,6 +7,7 @@ from io import BytesIO from numbers import Number + import numpy as np from .. import backends, conventions @@ -14,7 +15,7 @@ from ..core import indexing from ..core.combine import auto_combine from ..core.utils import close_on_error, is_remote_uri -from ..core.pycompat import basestring +from ..core.pycompat import basestring, path_type DATAARRAY_NAME = '__xarray_dataarray_name__' DATAARRAY_VARIABLE = '__xarray_dataarray_variable__' @@ -139,12 +140,12 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True, Parameters ---------- - filename_or_obj : str, file or xarray.backends.*DataStore - Strings are interpreted as a path to a netCDF file or an OpenDAP URL - and opened with python-netCDF4, unless the filename ends with .gz, in - which case the file is gunzipped and opened with scipy.io.netcdf (only - netCDF3 supported). File-like objects are opened with scipy.io.netcdf - (only netCDF3 supported). + filename_or_obj : str, Path, file or xarray.backends.*DataStore + Strings and Path objects are interpreted as a path to a netCDF file + or an OpenDAP URL and opened with python-netCDF4, unless the filename + ends with .gz, in which case the file is gunzipped and opened with + scipy.io.netcdf (only netCDF3 supported). File-like objects are opened + with scipy.io.netcdf (only netCDF3 supported). group : str, optional Path to the netCDF4 group in the given file to open (only works for netCDF4 files). @@ -253,6 +254,9 @@ def maybe_decode_store(store, lock=False): return ds2 + if isinstance(filename_or_obj, path_type): + filename_or_obj = str(filename_or_obj) + if isinstance(filename_or_obj, backends.AbstractDataStore): store = filename_or_obj elif isinstance(filename_or_obj, basestring): @@ -318,12 +322,12 @@ def open_dataarray(*args, **kwargs): Parameters ---------- - filename_or_obj : str, file or xarray.backends.*DataStore - Strings are interpreted as a path to a netCDF file or an OpenDAP URL - and opened with python-netCDF4, unless the filename ends with .gz, in - which case the file is gunzipped and opened with scipy.io.netcdf (only - netCDF3 supported). File-like objects are opened with scipy.io.netcdf - (only netCDF3 supported). + filename_or_obj : str, Path, file or xarray.backends.*DataStore + Strings and Paths are interpreted as a path to a netCDF file or an + OpenDAP URL and opened with python-netCDF4, unless the filename ends + with .gz, in which case the file is gunzipped and opened with + scipy.io.netcdf (only netCDF3 supported). File-like objects are opened + with scipy.io.netcdf (only netCDF3 supported). group : str, optional Path to the netCDF4 group in the given file to open (only works for netCDF4 files). @@ -438,7 +442,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, ---------- paths : str or sequence Either a string glob in the form "path/to/my/files/*.nc" or an explicit - list of files to open. + list of files to open. Paths can be given as strings or as pathlib + Paths. 
chunks : int or dict, optional Dictionary with keys given by dimension names and values given by chunk sizes. In general, these should divide the dimensions of each dataset. @@ -497,6 +502,9 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT, """ if isinstance(paths, basestring): paths = sorted(glob(paths)) + else: + paths = [str(p) if isinstance(p, path_type) else p for p in paths] + if not paths: raise IOError('no files to open') @@ -533,6 +541,8 @@ def to_netcdf(dataset, path_or_file=None, mode='w', format=None, group=None, The ``writer`` argument is only for the private use of save_mfdataset. """ + if isinstance(path_or_file, path_type): + path_or_file = str(path_or_file) if encoding is None: encoding = {} if path_or_file is None: @@ -597,12 +607,14 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None, ---------- datasets : list of xarray.Dataset List of datasets to save. - paths : list of str + paths : list of str or list of Paths List of paths to which to save each corresponding dataset. mode : {'w', 'a'}, optional Write ('w') or append ('a') mode. If mode='w', any existing file at these locations will be overwritten. - format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional + format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', + 'NETCDF3_CLASSIC'}, optional + File format for the resulting netCDF file: * NETCDF4: Data is stored in an HDF5 file, using netCDF4 API diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 3f0fa85ba10..14e53aababf 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1286,7 +1286,7 @@ def to_netcdf(self, *args, **kwargs): Parameters ---------- - path : str, optional + path : str or Path, optional Path to which to save this dataset. If no path is provided, this function returns the resulting netCDF file as a bytes object; in this case, we need to use scipy.io.netcdf, which does not support @@ -1294,7 +1294,8 @@ def to_netcdf(self, *args, **kwargs): mode : {'w', 'a'}, optional Write ('w') or append ('a') mode. If mode='w', any existing file at this location will be overwritten. - format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional + format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', + 'NETCDF3_CLASSIC'}, optional File format for the resulting netCDF file: * NETCDF4: Data is stored in an HDF5 file, using netCDF4 API @@ -1324,7 +1325,8 @@ def to_netcdf(self, *args, **kwargs): encoding : dict, optional Nested dictionary with variable names as keys and dictionaries of variable specific encodings as values, e.g., - ``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}`` + ``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, + 'zlib': True}, ...}`` Notes ----- diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index bffdbf10724..aa49d8a73b0 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -924,7 +924,7 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None, Parameters ---------- - path : str or file-like object, optional + path : str, Path or file-like object, optional Path to which to save this dataset. File-like objects are only supported by the scipy engine. 
If no path is provided, this function returns the resulting netCDF file as bytes; in this case, @@ -963,7 +963,8 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None, encoding : dict, optional Nested dictionary with variable names as keys and dictionaries of variable specific encodings as values, e.g., - ``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}`` + ``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, + 'zlib': True}, ...}`` unlimited_dims : sequence of str, optional Dimension(s) that should be serialized as unlimited dimensions. By default, no dimensions are treated as unlimited dimensions. diff --git a/xarray/core/pycompat.py b/xarray/core/pycompat.py index 894608ef22d..7eaa0ccc450 100644 --- a/xarray/core/pycompat.py +++ b/xarray/core/pycompat.py @@ -59,6 +59,16 @@ def itervalues(d): except ImportError: # pragma: no cover dask_array_type = () +try: + try: + from pathlib import Path + except ImportError as e: + from pathlib2 import Path + path_type = (Path, ) +except ImportError as e: + path_type = () + + try: from contextlib import suppress except ImportError: @@ -188,7 +198,7 @@ def __exit__(self, *exc_details): # We manipulate the exception state so it behaves as though # we were actually nesting multiple with statements frame_exc = sys.exc_info()[1] - + def _fix_exception_context(new_exc, old_exc): # Context may not be correct, so find the end of the chain while 1: diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 424484b438a..7afad6ffe92 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -83,6 +83,17 @@ except ImportError: has_rasterio = False +try: + import pathlib + has_pathlib = True +except ImportError: + try: + import pathlib2 + has_pathlib = True + except ImportError: + has_pathlib = False + + # slighly simpler construction that the full functions. # Generally `pytest.importorskip('package')` inline is even easier requires_matplotlib = pytest.mark.skipif( @@ -105,6 +116,9 @@ not has_bottleneck, reason='requires bottleneck') requires_rasterio = pytest.mark.skipif( not has_rasterio, reason='requires rasterio') +requires_pathlib = pytest.mark.skipif( + not has_pathlib, reason='requires pathlib / pathlib2' +) try: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 744cb7fd4f6..b2903150fb7 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -26,8 +26,9 @@ from . 
import (TestCase, requires_scipy, requires_netCDF4, requires_pydap, requires_scipy_or_netCDF4, requires_dask, requires_h5netcdf, - requires_pynio, has_netCDF4, has_scipy, assert_allclose, - flaky, network, requires_rasterio, assert_identical) + requires_pynio, requires_pathlib, has_netCDF4, has_scipy, + assert_allclose, flaky, network, requires_rasterio, + assert_identical) from .test_dataset import create_test_data try: @@ -40,6 +41,14 @@ except ImportError: pass +try: + from pathlib import Path +except ImportError: + try: + from pathlib2 import Path + except ImportError: + pass + ON_WINDOWS = sys.platform == 'win32' @@ -302,7 +311,8 @@ def test_roundtrip_timedelta_data(self): self.assertDatasetIdentical(expected, actual) def test_roundtrip_float64_data(self): - expected = Dataset({'x': ('y', np.array([1.0, 2.0, np.pi], dtype='float64'))}) + expected = Dataset({'x': ('y', np.array([1.0, 2.0, np.pi], + dtype='float64'))}) with self.roundtrip(expected) as actual: self.assertDatasetIdentical(expected, actual) @@ -738,7 +748,8 @@ def test_mask_and_scale(self): v.scale_factor = 0.1 v[:] = np.array([-1, -1, 0, 1, 2]) - # first make sure netCDF4 reads the masked and scaled data correctly + # first make sure netCDF4 reads the masked and scaled data + # correctly with nc4.Dataset(tmp_file, mode='r') as nc: expected = np.ma.array([-1, -1, 10, 10.1, 10.2], mask=[True, True, False, False, False]) @@ -1305,6 +1316,19 @@ def test_open_mfdataset(self): with self.assertRaisesRegexp(IOError, 'no files to open'): open_mfdataset('foo-bar-baz-*.nc', autoclose=self.autoclose) + @requires_pathlib + def test_open_mfdataset_pathlib(self): + original = Dataset({'foo': ('x', np.random.randn(10))}) + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + tmp1 = Path(tmp1) + tmp2 = Path(tmp2) + original.isel(x=slice(5)).to_netcdf(tmp1) + original.isel(x=slice(5, 10)).to_netcdf(tmp2) + with open_mfdataset([tmp1, tmp2], + autoclose=self.autoclose) as actual: + self.assertDatasetAllClose(original, actual) + def test_attrs_mfdataset(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp1: @@ -1355,6 +1379,20 @@ def test_save_mfdataset_invalid(self): with self.assertRaisesRegexp(ValueError, 'same length'): save_mfdataset([ds, ds], ['only one path']) + @requires_pathlib + def test_save_mfdataset_pathlib_roundtrip(self): + original = Dataset({'foo': ('x', np.random.randn(10))}) + datasets = [original.isel(x=slice(5)), + original.isel(x=slice(5, 10))] + with create_tmp_file() as tmp1: + with create_tmp_file() as tmp2: + tmp1 = Path(tmp1) + tmp2 = Path(tmp2) + save_mfdataset(datasets, [tmp1, tmp2]) + with open_mfdataset([tmp1, tmp2], + autoclose=self.autoclose) as actual: + self.assertDatasetIdentical(actual, original) + def test_open_and_do_math(self): original = Dataset({'foo': ('x', np.random.randn(10))}) with create_tmp_file() as tmp: @@ -1946,3 +1984,14 @@ def test_open_dataarray_options(self): expected = data.drop('y') with open_dataarray(tmp, drop_variables=['y']) as loaded: self.assertDataArrayIdentical(expected, loaded) + + @requires_pathlib + def test_dataarray_to_netcdf_no_name_pathlib(self): + original_da = DataArray(np.arange(12).reshape((3, 4))) + + with create_tmp_file() as tmp: + tmp = Path(tmp) + original_da.to_netcdf(tmp) + + with open_dataarray(tmp) as loaded_da: + self.assertDataArrayIdentical(original_da, loaded_da)
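A minimal usage sketch of the pathlib support added by this patch, assuming a
hypothetical local ``data/`` directory and file names; Python 2 environments
need the ``pathlib2`` backport (added to the CI requirement files above), and
``open_mfdataset`` additionally requires dask::

    import numpy as np
    import xarray as xr

    try:
        from pathlib import Path   # Python 3 standard library
    except ImportError:
        from pathlib2 import Path  # Python 2 backport used by this patch

    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)

    ds = xr.Dataset({"foo": ("x", np.random.randn(10))})

    # to_netcdf and open_dataset now accept Path objects directly.
    one_file = data_dir / "example.nc"
    ds.to_netcdf(one_file)
    with xr.open_dataset(one_file) as loaded:
        assert loaded.identical(ds)

    # save_mfdataset and open_mfdataset accept lists of Paths as well.
    paths = [data_dir / "part_0.nc", data_dir / "part_1.nc"]
    xr.save_mfdataset([ds.isel(x=slice(5)), ds.isel(x=slice(5, 10))], paths)
    with xr.open_mfdataset(paths) as combined:
        assert combined["foo"].shape == (10,)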