Skip to content

Commit 923473c

Browse files
committed
Default autoclose=False for open_mfdataset
This default is chosen to preserve standard xarray performance, rather than to broadly eliminate the OSError (too many open files) that can be encountered when using open_mfdataset.
1 parent cc37747 commit 923473c

File tree

3 files changed

+43
-29
lines changed

3 files changed

+43
-29
lines changed

doc/whats-new.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ v0.9.2 (unreleased)
2222
Enhancements
2323
~~~~~~~~~~~~
2424

25+
- It is now possible to set the ``autoclose=True`` argument to
26+
:py:func:`~xarray.open_mfdataset` to explicitly close opened files when not
27+
in use, preventing the ``OSError`` raised when too many files are open.
28+
Note, the default is ``autoclose=False``, which is consistent with previous
29+
xarray behavior. By `Phillip J. Wolfram <https://github.com/pwolfram>`_.
30+
2531
Bug fixes
2632
~~~~~~~~~
2733

xarray/backends/api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ def _protect_dataset_variables_inplace(dataset, cache):
132132

133133

134134
def open_dataset(filename_or_obj, group=None, decode_cf=True,
135-
mask_and_scale=True, decode_times=True, autoclose=True,
135+
mask_and_scale=True, decode_times=True, autoclose=False,
136136
concat_characters=True, decode_coords=True, engine=None,
137137
chunks=None, lock=None, cache=None, drop_variables=None):
138138
"""Load and decode a dataset from a file or file-like object.

xarray/tests/test_backends.py

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1077,17 +1077,19 @@ def test_open_mfdataset(self):
10771077
with create_tmp_file() as tmp2:
10781078
original.isel(x=slice(5)).to_netcdf(tmp1)
10791079
original.isel(x=slice(5, 10)).to_netcdf(tmp2)
1080-
with open_mfdataset([tmp1, tmp2]) as actual:
1081-
self.assertIsInstance(actual.foo.variable.data, da.Array)
1082-
self.assertEqual(actual.foo.variable.data.chunks,
1083-
((5, 5),))
1084-
self.assertDatasetAllClose(original, actual)
1085-
with open_mfdataset([tmp1, tmp2], chunks={'x': 3}) as actual:
1086-
self.assertEqual(actual.foo.variable.data.chunks,
1087-
((3, 2, 3, 2),))
1080+
for close in [True, False]:
1081+
with open_mfdataset([tmp1, tmp2], autoclose=close) as actual:
1082+
self.assertIsInstance(actual.foo.variable.data, da.Array)
1083+
self.assertEqual(actual.foo.variable.data.chunks,
1084+
((5, 5),))
1085+
self.assertDatasetAllClose(original, actual)
1086+
with open_mfdataset([tmp1, tmp2], chunks={'x': 3}, autoclose=close) as actual:
1087+
self.assertEqual(actual.foo.variable.data.chunks,
1088+
((3, 2, 3, 2),))
10881089

10891090
with self.assertRaisesRegexp(IOError, 'no files to open'):
1090-
open_mfdataset('foo-bar-baz-*.nc')
1091+
for close in [True, False]:
1092+
open_mfdataset('foo-bar-baz-*.nc', autoclose=close)
10911093

10921094
def test_preprocess_mfdataset(self):
10931095
original = Dataset({'foo': ('x', np.random.randn(10))})
@@ -1098,8 +1100,9 @@ def preprocess(ds):
10981100
return ds.assign_coords(z=0)
10991101

11001102
expected = preprocess(original)
1101-
with open_mfdataset(tmp, preprocess=preprocess) as actual:
1102-
self.assertDatasetIdentical(expected, actual)
1103+
for close in [True, False]:
1104+
with open_mfdataset(tmp, preprocess=preprocess, autoclose=close) as actual:
1105+
self.assertDatasetIdentical(expected, actual)
11031106

11041107
def test_save_mfdataset_roundtrip(self):
11051108
original = Dataset({'foo': ('x', np.random.randn(10))})
@@ -1108,8 +1111,9 @@ def test_save_mfdataset_roundtrip(self):
11081111
with create_tmp_file() as tmp1:
11091112
with create_tmp_file() as tmp2:
11101113
save_mfdataset(datasets, [tmp1, tmp2])
1111-
with open_mfdataset([tmp1, tmp2]) as actual:
1112-
self.assertDatasetIdentical(actual, original)
1114+
for close in [True, False]:
1115+
with open_mfdataset([tmp1, tmp2], autoclose=close) as actual:
1116+
self.assertDatasetIdentical(actual, original)
11131117

11141118
def test_save_mfdataset_invalid(self):
11151119
ds = Dataset()
@@ -1122,18 +1126,21 @@ def test_open_and_do_math(self):
11221126
original = Dataset({'foo': ('x', np.random.randn(10))})
11231127
with create_tmp_file() as tmp:
11241128
original.to_netcdf(tmp)
1125-
with open_mfdataset(tmp) as ds:
1126-
actual = 1.0 * ds
1127-
self.assertDatasetAllClose(original, actual)
1129+
for close in [True, False]:
1130+
with open_mfdataset(tmp, autoclose=close) as ds:
1131+
actual = 1.0 * ds
1132+
self.assertDatasetAllClose(original, actual)
11281133

11291134
def test_open_mfdataset_concat_dim_none(self):
11301135
with create_tmp_file() as tmp1:
11311136
with create_tmp_file() as tmp2:
11321137
data = Dataset({'x': 0})
11331138
data.to_netcdf(tmp1)
11341139
Dataset({'x': np.nan}).to_netcdf(tmp2)
1135-
with open_mfdataset([tmp1, tmp2], concat_dim=None) as actual:
1136-
self.assertDatasetIdentical(data, actual)
1140+
for close in [True, False]:
1141+
with open_mfdataset([tmp1, tmp2],
1142+
concat_dim=None, autoclose=close) as actual:
1143+
self.assertDatasetIdentical(data, actual)
11371144

11381145
def test_open_dataset(self):
11391146
original = Dataset({'foo': ('x', np.random.randn(10))})
@@ -1165,16 +1172,17 @@ def test_deterministic_names(self):
11651172
with create_tmp_file() as tmp:
11661173
data = create_test_data()
11671174
data.to_netcdf(tmp)
1168-
with open_mfdataset(tmp) as ds:
1169-
original_names = dict((k, v.data.name)
1170-
for k, v in ds.data_vars.items())
1171-
with open_mfdataset(tmp) as ds:
1172-
repeat_names = dict((k, v.data.name)
1173-
for k, v in ds.data_vars.items())
1174-
for var_name, dask_name in original_names.items():
1175-
self.assertIn(var_name, dask_name)
1176-
self.assertIn(tmp, dask_name)
1177-
self.assertEqual(original_names, repeat_names)
1175+
for close in [True, False]:
1176+
with open_mfdataset(tmp, autoclose=close) as ds:
1177+
original_names = dict((k, v.data.name)
1178+
for k, v in ds.data_vars.items())
1179+
with open_mfdataset(tmp, autoclose=close) as ds:
1180+
repeat_names = dict((k, v.data.name)
1181+
for k, v in ds.data_vars.items())
1182+
for var_name, dask_name in original_names.items():
1183+
self.assertIn(var_name, dask_name)
1184+
self.assertIn(tmp, dask_name)
1185+
self.assertEqual(original_names, repeat_names)
11781186

11791187
def test_dataarray_compute(self):
11801188
# Test DataArray.compute() on dask backend.

0 commit comments

Comments
 (0)