Add pathlib.Path support to open_(mf)dataset (#1514)
* Add pathlib support

* Loop over tmpfile functions

* Added show_commit_url to asv.conf (#1515)

* Added show_commit_url to asv.conf

This should set up the proper links from the published output to the commit on GitHub.

FYI, the benchmarks should be running stably now and are posted to http://pandas.pydata.org/speed/xarray. http://pandas.pydata.org/speed/xarray/regressions.xml provides an RSS feed of the regressions.

* Update asv.conf.json

* Small documentation fixes (#1516)

* Clarify in docs that inferring DataArray dimensions is deprecated

* Fix DataArray docstring

* Clarify DataArray coords documentation

* Condense pathlib handling for open_mfdataset

* Add and test pathlib support for backends

* Add pathlib2 for python < 3

* Use pathlib backport if available.

This follows <jazzband/pathlib2#8 (comment)>, which argues for sticking to pathlib2.

* Use pathlib with DataArray.to_netcdf

* Handle case of completely missing pathlib

* Remove pathlib requirement

* Drop pathlib from minimal test env

* Add what's-new entry on pathlib support

* Prefer stdlib pathlib

* Suppress ImportErrors for pathlib

* Actually get suppress function

* Add decorator for tests requiring pathlib(2)

* Move path_type to central submodule

* Remove unnecessary parens

* Revert "Added show_commit_url to asv.conf (#1515)"

This reverts commit 02023ed.

* Revert "Small documentation fixes (#1516)"

This reverts commit 4276bb8.

* Fix typo in docstring and fallback-module name

* Tweak what's new for pathlib support
willirath authored and shoyer committed Sep 1, 2017
1 parent 4571d60 commit 4a15cfa
Showing 9 changed files with 138 additions and 26 deletions.
1 change: 1 addition & 0 deletions ci/requirements-py27-cdat+pynio.yml
@@ -13,6 +13,7 @@ dependencies:
- netcdf4
- numpy
- pandas
- pathlib2
- pynio
- pytest
- scipy
1 change: 1 addition & 0 deletions ci/requirements-py27-windows.yml
@@ -9,6 +9,7 @@ dependencies:
- h5netcdf
- matplotlib
- netcdf4
- pathlib2
- pytest
- numpy
- pandas
22 changes: 22 additions & 0 deletions doc/whats-new.rst
@@ -35,9 +35,31 @@ Backward Incompatible Changes
Enhancements
~~~~~~~~~~~~

- Support for ``pathlib.Path`` objects added to
:py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_mfdataset`,
:py:func:`~xarray.to_netcdf`, and :py:func:`~xarray.save_mfdataset`
(:issue:`799`):

.. ipython::
:verbatim:

In [2]: from pathlib import Path # In Python 2, use pathlib2!

In [3]: data_dir = Path("data/")

In [4]: one_file = data_dir / "dta_for_month_01.nc"

In [5]: xr.open_dataset(one_file)
Out[5]:
<xarray.Dataset>
[...]

By `Willi Rath <https://github.com/willirath>`_.

- More attributes available in the :py:attr:`~xarray.Dataset.attrs` dictionary when
raster files are opened with :py:func:`~xarray.open_rasterio`.
By `Greg Brener <https://github.com/gbrener>`_.

- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that
a signed integer data type should be interpreted as unsigned bytes
(:issue:`1444`).
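For orientation, a minimal round trip of the feature documented above; this is a sketch only, with a made-up file name, and it assumes a netCDF backend (netCDF4 or scipy) is installed:

    # Sketch of the pathlib round trip described in the whats-new entry.
    from pathlib import Path  # on Python 2, use the pathlib2 backport

    import numpy as np
    import xarray as xr

    out = Path("example_month_01.nc")
    ds = xr.Dataset({"t": ("x", np.arange(5.0))})
    ds.to_netcdf(out)  # a Path is accepted directly, no str() needed
    with xr.open_dataset(out) as roundtripped:
        assert roundtripped.identical(ds)
    out.unlink()  # remove the demo file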
44 changes: 28 additions & 16 deletions xarray/backends/api.py
@@ -7,14 +7,15 @@
from io import BytesIO
from numbers import Number


import numpy as np

from .. import backends, conventions
from .common import ArrayWriter, GLOBAL_LOCK
from ..core import indexing
from ..core.combine import auto_combine
from ..core.utils import close_on_error, is_remote_uri
from ..core.pycompat import basestring
from ..core.pycompat import basestring, path_type

DATAARRAY_NAME = '__xarray_dataarray_name__'
DATAARRAY_VARIABLE = '__xarray_dataarray_variable__'
@@ -139,12 +140,12 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
Parameters
----------
filename_or_obj : str, file or xarray.backends.*DataStore
Strings are interpreted as a path to a netCDF file or an OpenDAP URL
and opened with python-netCDF4, unless the filename ends with .gz, in
which case the file is gunzipped and opened with scipy.io.netcdf (only
netCDF3 supported). File-like objects are opened with scipy.io.netcdf
(only netCDF3 supported).
filename_or_obj : str, Path, file or xarray.backends.*DataStore
Strings and Path objects are interpreted as a path to a netCDF file
or an OpenDAP URL and opened with python-netCDF4, unless the filename
ends with .gz, in which case the file is gunzipped and opened with
scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
with scipy.io.netcdf (only netCDF3 supported).
group : str, optional
Path to the netCDF4 group in the given file to open (only works for
netCDF4 files).
@@ -253,6 +254,9 @@ def maybe_decode_store(store, lock=False):

return ds2

if isinstance(filename_or_obj, path_type):
filename_or_obj = str(filename_or_obj)

if isinstance(filename_or_obj, backends.AbstractDataStore):
store = filename_or_obj
elif isinstance(filename_or_obj, basestring):
@@ -318,12 +322,12 @@ def open_dataarray(*args, **kwargs):
Parameters
----------
filename_or_obj : str, file or xarray.backends.*DataStore
Strings are interpreted as a path to a netCDF file or an OpenDAP URL
and opened with python-netCDF4, unless the filename ends with .gz, in
which case the file is gunzipped and opened with scipy.io.netcdf (only
netCDF3 supported). File-like objects are opened with scipy.io.netcdf
(only netCDF3 supported).
filename_or_obj : str, Path, file or xarray.backends.*DataStore
Strings and Paths are interpreted as a path to a netCDF file or an
OpenDAP URL and opened with python-netCDF4, unless the filename ends
with .gz, in which case the file is gunzipped and opened with
scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
with scipy.io.netcdf (only netCDF3 supported).
group : str, optional
Path to the netCDF4 group in the given file to open (only works for
netCDF4 files).
@@ -438,7 +442,8 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
----------
paths : str or sequence
Either a string glob in the form "path/to/my/files/*.nc" or an explicit
list of files to open.
list of files to open. Paths can be given as strings or as pathlib
Paths.
chunks : int or dict, optional
Dictionary with keys given by dimension names and values given by chunk
sizes. In general, these should divide the dimensions of each dataset.
@@ -497,6 +502,9 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
"""
if isinstance(paths, basestring):
paths = sorted(glob(paths))
else:
paths = [str(p) if isinstance(p, path_type) else p for p in paths]

if not paths:
raise IOError('no files to open')

@@ -533,6 +541,8 @@ def to_netcdf(dataset, path_or_file=None, mode='w', format=None, group=None,
The ``writer`` argument is only for the private use of save_mfdataset.
"""
if isinstance(path_or_file, path_type):
path_or_file = str(path_or_file)
if encoding is None:
encoding = {}
if path_or_file is None:
@@ -597,12 +607,14 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
----------
datasets : list of xarray.Dataset
List of datasets to save.
paths : list of str
paths : list of str or list of Paths
List of paths to which to save each corresponding dataset.
mode : {'w', 'a'}, optional
Write ('w') or append ('a') mode. If mode='w', any existing file at
these locations will be overwritten.
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT',
'NETCDF3_CLASSIC'}, optional
File format for the resulting netCDF file:
* NETCDF4: Data is stored in an HDF5 file, using netCDF4 API
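The api.py changes all follow one pattern: Path objects are normalized to strings at the public entry points, so the rest of the stack keeps dealing in plain strings. In user terms, strings and Path objects can then be mixed freely; a hypothetical call (file names made up):

    from pathlib import Path

    import xarray as xr

    # strings, globs and Path objects all work as inputs;
    # each Path is converted with str() before reaching the backends
    ds = xr.open_mfdataset(["jan.nc", Path("feb.nc"), Path("mar.nc")])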
8 changes: 5 additions & 3 deletions xarray/core/dataarray.py
@@ -1286,15 +1286,16 @@ def to_netcdf(self, *args, **kwargs):
Parameters
----------
path : str, optional
path : str or Path, optional
Path to which to save this dataset. If no path is provided, this
function returns the resulting netCDF file as a bytes object; in
this case, we need to use scipy.io.netcdf, which does not support
netCDF version 4 (the default format becomes NETCDF3_64BIT).
mode : {'w', 'a'}, optional
Write ('w') or append ('a') mode. If mode='w', any existing file at
this location will be overwritten.
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT',
'NETCDF3_CLASSIC'}, optional
File format for the resulting netCDF file:
* NETCDF4: Data is stored in an HDF5 file, using netCDF4 API
@@ -1324,7 +1325,8 @@ def to_netcdf(self, *args, **kwargs):
encoding : dict, optional
Nested dictionary with variable names as keys and dictionaries of
variable specific encodings as values, e.g.,
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}``
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,
'zlib': True}, ...}``
Notes
-----
5 changes: 3 additions & 2 deletions xarray/core/dataset.py
@@ -924,7 +924,7 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
Parameters
----------
path : str or file-like object, optional
path : str, Path or file-like object, optional
Path to which to save this dataset. File-like objects are only
supported by the scipy engine. If no path is provided, this
function returns the resulting netCDF file as bytes; in this case,
@@ -963,7 +963,8 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
encoding : dict, optional
Nested dictionary with variable names as keys and dictionaries of
variable specific encodings as values, e.g.,
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}``
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,
'zlib': True}, ...}``
unlimited_dims : sequence of str, optional
Dimension(s) that should be serialized as unlimited dimensions.
By default, no dimensions are treated as unlimited dimensions.
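To make the re-wrapped encoding docstring above concrete, here is an illustrative call combining a Path target with per-variable encoding (variable name and values are made up; zlib compression assumes the netCDF4 engine):

    from pathlib import Path

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"my_variable": ("x", np.random.rand(100))})
    # store the variable as scaled int16 with zlib compression
    ds.to_netcdf(Path("encoded.nc"),
                 encoding={"my_variable": {"dtype": "int16",
                                           "scale_factor": 0.1,
                                           "zlib": True}})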
12 changes: 11 additions & 1 deletion xarray/core/pycompat.py
@@ -59,6 +59,16 @@ def itervalues(d):
except ImportError: # pragma: no cover
dask_array_type = ()

try:
try:
from pathlib import Path
except ImportError as e:
from pathlib2 import Path
path_type = (Path, )
except ImportError as e:
path_type = ()


try:
from contextlib import suppress
except ImportError:
@@ -188,7 +198,7 @@ def __exit__(self, *exc_details):
# We manipulate the exception state so it behaves as though
# we were actually nesting multiple with statements
frame_exc = sys.exc_info()[1]

def _fix_exception_context(new_exc, old_exc):
# Context may not be correct, so find the end of the chain
while 1:
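The empty-tuple fallback is what lets the call sites in api.py stay unconditional: isinstance against an empty tuple of types is always False, so Path handling silently disables itself when neither pathlib nor pathlib2 can be imported:

    # why `path_type = ()` is a safe fallback
    path_type = ()
    assert not isinstance("some/file.nc", path_type)  # always False
    # so `if isinstance(x, path_type): x = str(x)` is a no-op
    # on interpreters without pathlib, and no feature flag is needed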
14 changes: 14 additions & 0 deletions xarray/tests/__init__.py
@@ -83,6 +83,17 @@
except ImportError:
has_rasterio = False

try:
import pathlib
has_pathlib = True
except ImportError:
try:
import pathlib2
has_pathlib = True
except ImportError:
has_pathlib = False


# slightly simpler construction than the full functions.
# Generally `pytest.importorskip('package')` inline is even easier
requires_matplotlib = pytest.mark.skipif(
@@ -105,6 +116,9 @@
not has_bottleneck, reason='requires bottleneck')
requires_rasterio = pytest.mark.skipif(
not has_rasterio, reason='requires rasterio')
requires_pathlib = pytest.mark.skipif(
not has_pathlib, reason='requires pathlib / pathlib2'
)


try:
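The new marker is constructed and used like the existing ones; a self-contained sketch with a hypothetical test:

    import pytest

    try:
        import pathlib  # noqa: F401
        has_pathlib = True
    except ImportError:
        try:
            import pathlib2  # noqa: F401
            has_pathlib = True
        except ImportError:
            has_pathlib = False

    requires_pathlib = pytest.mark.skipif(
        not has_pathlib, reason='requires pathlib / pathlib2')

    @requires_pathlib  # skipped cleanly when the dependency is missing
    def test_path_join():
        try:
            from pathlib import Path
        except ImportError:
            from pathlib2 import Path
        assert Path('a') / 'b' == Path('a', 'b')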
57 changes: 53 additions & 4 deletions xarray/tests/test_backends.py
@@ -26,8 +26,9 @@

from . import (TestCase, requires_scipy, requires_netCDF4, requires_pydap,
requires_scipy_or_netCDF4, requires_dask, requires_h5netcdf,
requires_pynio, has_netCDF4, has_scipy, assert_allclose,
flaky, network, requires_rasterio, assert_identical)
requires_pynio, requires_pathlib, has_netCDF4, has_scipy,
assert_allclose, flaky, network, requires_rasterio,
assert_identical)
from .test_dataset import create_test_data

try:
@@ -40,6 +41,14 @@
except ImportError:
pass

try:
from pathlib import Path
except ImportError:
try:
from pathlib2 import Path
except ImportError:
pass


ON_WINDOWS = sys.platform == 'win32'

@@ -302,7 +311,8 @@ def test_roundtrip_timedelta_data(self):
self.assertDatasetIdentical(expected, actual)

def test_roundtrip_float64_data(self):
expected = Dataset({'x': ('y', np.array([1.0, 2.0, np.pi], dtype='float64'))})
expected = Dataset({'x': ('y', np.array([1.0, 2.0, np.pi],
dtype='float64'))})
with self.roundtrip(expected) as actual:
self.assertDatasetIdentical(expected, actual)

@@ -738,7 +748,8 @@ def test_mask_and_scale(self):
v.scale_factor = 0.1
v[:] = np.array([-1, -1, 0, 1, 2])

# first make sure netCDF4 reads the masked and scaled data correctly
# first make sure netCDF4 reads the masked and scaled data
# correctly
with nc4.Dataset(tmp_file, mode='r') as nc:
expected = np.ma.array([-1, -1, 10, 10.1, 10.2],
mask=[True, True, False, False, False])
@@ -1305,6 +1316,19 @@ def test_open_mfdataset(self):
with self.assertRaisesRegexp(IOError, 'no files to open'):
open_mfdataset('foo-bar-baz-*.nc', autoclose=self.autoclose)

@requires_pathlib
def test_open_mfdataset_pathlib(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
with create_tmp_file() as tmp1:
with create_tmp_file() as tmp2:
tmp1 = Path(tmp1)
tmp2 = Path(tmp2)
original.isel(x=slice(5)).to_netcdf(tmp1)
original.isel(x=slice(5, 10)).to_netcdf(tmp2)
with open_mfdataset([tmp1, tmp2],
autoclose=self.autoclose) as actual:
self.assertDatasetAllClose(original, actual)

def test_attrs_mfdataset(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
with create_tmp_file() as tmp1:
@@ -1355,6 +1379,20 @@ def test_save_mfdataset_invalid(self):
with self.assertRaisesRegexp(ValueError, 'same length'):
save_mfdataset([ds, ds], ['only one path'])

@requires_pathlib
def test_save_mfdataset_pathlib_roundtrip(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
datasets = [original.isel(x=slice(5)),
original.isel(x=slice(5, 10))]
with create_tmp_file() as tmp1:
with create_tmp_file() as tmp2:
tmp1 = Path(tmp1)
tmp2 = Path(tmp2)
save_mfdataset(datasets, [tmp1, tmp2])
with open_mfdataset([tmp1, tmp2],
autoclose=self.autoclose) as actual:
self.assertDatasetIdentical(actual, original)

def test_open_and_do_math(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
with create_tmp_file() as tmp:
@@ -1946,3 +1984,14 @@ def test_open_dataarray_options(self):
expected = data.drop('y')
with open_dataarray(tmp, drop_variables=['y']) as loaded:
self.assertDataArrayIdentical(expected, loaded)

@requires_pathlib
def test_dataarray_to_netcdf_no_name_pathlib(self):
original_da = DataArray(np.arange(12).reshape((3, 4)))

with create_tmp_file() as tmp:
tmp = Path(tmp)
original_da.to_netcdf(tmp)

with open_dataarray(tmp) as loaded_da:
self.assertDataArrayIdentical(original_da, loaded_da)

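The new tests lean on xarray's create_tmp_file helper. For orientation, a minimal sketch of what such a context manager can look like; this is an illustration, not the test suite's actual implementation:

    import contextlib
    import os
    import tempfile

    @contextlib.contextmanager
    def create_tmp_file(suffix='.nc'):
        # yield a fresh temporary path, remove it afterwards
        fd, path = tempfile.mkstemp(suffix=suffix)
        os.close(fd)
        try:
            yield path
        finally:
            try:
                os.remove(path)
            except OSError:
                pass  # file already removed by the test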