From b2ac76c977c50648036c328f775d75bd8d98846d Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Thu, 28 Dec 2017 15:02:31 -0800 Subject: [PATCH 1/5] dtype.kind in ['S', 'O', 'U'] not == 'S' (issue 1781) Fixed bug in conventions.py:952 which caused read error on netCDF4 files with variable-length unicode strings with _FillValues. decode_cf_variable checks string_encoding, which was previously only defined for dtype.kind == 'S'. Bug fix defines for dtype.kind in ['S', 'O', 'U']. --- xarray/conventions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 5b951ff694b..5c87eab0f47 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -949,7 +949,7 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True, original_dtype = data.dtype - if concat_characters and data.dtype.kind == 'S': + if concat_characters and data.dtype.kind in ['U', 'S', 'O']: if stack_char_dim: dimensions = dimensions[:-1] data = StackedBytesArray(data) From 1c0ef31985fb4028736e3b685b47b65145a02bd3 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Thu, 28 Dec 2017 15:05:20 -0800 Subject: [PATCH 2/5] create entry in whatsnew --- doc/whats-new.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 728e40d4409..714c0f79a54 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -66,6 +66,7 @@ Bug fixes unintentionally loading the datastores data and attributes repeatedly during writes (:issue:`1798`). By `Joe Hamman `_. +- Properly handle ``_FillValue`` for variable-length unicode strings (:issue:`1781`). By `Michael Delgado `_. .. _whats-new.0.10.0: From 521fff5f40fac820a3057f3544e5c619659ba94c Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Thu, 28 Dec 2017 16:19:17 -0800 Subject: [PATCH 3/5] remove error from netCDF4 backend when writing unicode variable-length strings with _FillValue --- xarray/backends/netCDF4_.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index d8aa33f35dc..a71c1ce8701 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -340,15 +340,6 @@ def prepare_variable(self, name, variable, check_encoding=False, fill_value = attrs.pop('_FillValue', None) - if datatype is str and fill_value is not None: - raise NotImplementedError( - 'netCDF4 does not yet support setting a fill value for ' - 'variable-length strings ' - '(https://github.com/Unidata/netcdf4-python/issues/730). ' - "Either remove '_FillValue' from encoding on variable %r " - "or set {'dtype': 'S1'} in encoding to use the fixed width " - 'NC_CHAR type.' % name) - encoding = _extract_nc4_variable_encoding( variable, raise_on_invalid=check_encoding, unlimited_dims=unlimited_dims) From f04bdbee565d03bb7c625c12e59ea768681f4c96 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Thu, 28 Dec 2017 16:21:00 -0800 Subject: [PATCH 4/5] test roundtrip for unicode variable-length strings with _FillValue for netCDF --- xarray/tests/test_backends.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6b0cd59eb9e..6f79c8b7849 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -833,14 +833,12 @@ def test_roundtrip_string_with_fill_value_vlen(self): # https://github.com/Unidata/netcdf4-python/issues/730 # https://github.com/shoyer/h5netcdf/issues/37 original = Dataset({'x': ('t', values, {}, {'_FillValue': u'XXX'})}) - with pytest.raises(NotImplementedError): - with self.roundtrip(original) as actual: - self.assertDatasetIdentical(expected, actual) + with self.roundtrip(original) as actual: + self.assertDatasetIdentical(expected, actual) original = Dataset({'x': ('t', values, {}, {'_FillValue': u''})}) - with pytest.raises(NotImplementedError): - with self.roundtrip(original) as actual: - self.assertDatasetIdentical(expected, actual) + with self.roundtrip(original) as actual: + self.assertDatasetIdentical(expected, actual) def test_roundtrip_character_array(self): with create_tmp_file() as tmp_file: From 99873ca821c510a778eb92431e53205fc73abfa1 Mon Sep 17 00:00:00 2001 From: Michael Delgado Date: Thu, 28 Dec 2017 16:46:00 -0800 Subject: [PATCH 5/5] enforce pytest.raises for h4netcdf, passes for netcdf4. update whatsnew --- doc/whats-new.rst | 2 +- xarray/tests/test_backends.py | 25 +++++++++++++++++++------ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 714c0f79a54..dad88a7cb46 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -66,7 +66,7 @@ Bug fixes unintentionally loading the datastores data and attributes repeatedly during writes (:issue:`1798`). By `Joe Hamman `_. -- Properly handle ``_FillValue`` for variable-length unicode strings (:issue:`1781`). By `Michael Delgado `_. +- Handle ``_FillValue`` for variable-length unicode strings using netCDF4 backend. h5netcdf backend still cannot accept _FillValue for variable-length strings (:issue:`1781`). By `Michael Delgado `_. .. _whats-new.0.10.0: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6f79c8b7849..af447ab4207 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -828,17 +828,30 @@ def test_roundtrip_string_with_fill_value_vlen(self): values = np.array([u'ab', u'cdef', np.nan], dtype=object) expected = Dataset({'x': ('t', values)}) - # netCDF4-based backends don't support an explicit fillvalue + # H5netcdf backends don't support an explicit fillvalue # for variable length strings yet. - # https://github.com/Unidata/netcdf4-python/issues/730 # https://github.com/shoyer/h5netcdf/issues/37 + # The netCDF4-python backend does accept an explicit _FillValue: + # https://github.com/Unidata/netcdf4-python/issues/730 + # This tests both of those states (:issue:`1802`) original = Dataset({'x': ('t', values, {}, {'_FillValue': u'XXX'})}) - with self.roundtrip(original) as actual: - self.assertDatasetIdentical(expected, actual) + if isinstance(self, H5NetCDFDataTest): + with pytest.raises(NotImplementedError): + with self.roundtrip(original) as actual: + self.assertDatasetIdentical(expected, actual) + else: + with self.roundtrip(original) as actual: + self.assertDatasetIdentical(expected, actual) original = Dataset({'x': ('t', values, {}, {'_FillValue': u''})}) - with self.roundtrip(original) as actual: - self.assertDatasetIdentical(expected, actual) + if isinstance(self, H5NetCDFDataTest): + with pytest.raises(NotImplementedError): + with self.roundtrip(original) as actual: + self.assertDatasetIdentical(expected, actual) + else: + with self.roundtrip(original) as actual: + self.assertDatasetIdentical(expected, actual) + def test_roundtrip_character_array(self): with create_tmp_file() as tmp_file: