From 15644ea561d0a3482a3e0410b01cf7a588070316 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sat, 30 Dec 2017 01:57:56 -0500 Subject: [PATCH 1/5] Switch (some) coding/encoding in conventions.py to use xarray.coding. The goal here is to eventually convert everything in xarray.conventions to using the new coding module, which is more modular and supports dask arrays. For now, I have switched over datetime, timedelta, unsigned integer, scaling and mask coding to use new coders. Integrating these into xarray.conventions lets us harness our existing test suite and delete a lot of redundant code. Most of the code/tests is simply reorganized. There should be no changes to public API (to keep this manageable for review). All of the original tests that are still relevant should still be present, though I have reorganized many of them into new locations to match the revised code. --- xarray/coding/times.py | 364 +++++++++++++++++ xarray/coding/variables.py | 101 ++++- xarray/conventions.py | 649 +++--------------------------- xarray/tests/test_backends.py | 2 + xarray/tests/test_coding_times.py | 323 +++++++++++++++ xarray/tests/test_conventions.py | 503 +++-------------------- xarray/tests/test_dataarray.py | 6 +- 7 files changed, 894 insertions(+), 1054 deletions(-) create mode 100644 xarray/coding/times.py create mode 100644 xarray/tests/test_coding_times.py diff --git a/xarray/coding/times.py b/xarray/coding/times.py new file mode 100644 index 00000000000..62f79934f72 --- /dev/null +++ b/xarray/coding/times.py @@ -0,0 +1,364 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import traceback +import warnings +from datetime import datetime +from functools import partial + +import numpy as np + +import pandas as pd +try: + from pandas.errors import OutOfBoundsDatetime +except ImportError: + # pandas < 0.20 + from pandas.tslib import OutOfBoundsDatetime + +from .variables import (SerializationWarning, VariableCoder, + lazy_elemwise_func, pop_to, safe_setitem, + unpack_for_decoding, unpack_for_encoding) +from ..core import indexing +from ..core.formatting import first_n_items, format_timestamp, last_item +from ..core.pycompat import PY3 +from ..core.variable import Variable + + +# standard calendars recognized by netcdftime +_STANDARD_CALENDARS = set(['standard', 'gregorian', 'proleptic_gregorian']) + +_NS_PER_TIME_DELTA = {'us': 1e3, + 'ms': 1e6, + 's': 1e9, + 'm': 1e9 * 60, + 'h': 1e9 * 60 * 60, + 'D': 1e9 * 60 * 60 * 24} + + +def _netcdf_to_numpy_timeunit(units): + units = units.lower() + if not units.endswith('s'): + units = '%ss' % units + return {'microseconds': 'us', 'milliseconds': 'ms', 'seconds': 's', + 'minutes': 'm', 'hours': 'h', 'days': 'D'}[units] + + +def _unpack_netcdf_time_units(units): + # CF datetime units follow the format: "UNIT since DATE" + # this parses out the unit and date allowing for extraneous + # whitespace. 
+ matches = re.match('(.+) since (.+)', units) + if not matches: + raise ValueError('invalid time units: %s' % units) + delta_units, ref_date = [s.strip() for s in matches.groups()] + return delta_units, ref_date + + +def _decode_datetime_with_netcdf4(num_dates, units, calendar): + import netCDF4 as nc4 + + dates = np.asarray(nc4.num2date(num_dates, units, calendar)) + if (dates[np.nanargmin(num_dates)].year < 1678 or + dates[np.nanargmax(num_dates)].year >= 2262): + warnings.warn('Unable to decode time axis into full ' + 'numpy.datetime64 objects, continuing using dummy ' + 'netCDF4.datetime objects instead, reason: dates out' + ' of range', SerializationWarning, stacklevel=3) + else: + try: + dates = nctime_to_nptime(dates) + except ValueError as e: + warnings.warn('Unable to decode time axis into full ' + 'numpy.datetime64 objects, continuing using ' + 'dummy netCDF4.datetime objects instead, reason:' + '{0}'.format(e), SerializationWarning, stacklevel=3) + return dates + + +def _decode_cf_datetime_dtype(data, units, calendar): + # Verify that at least the first and last date can be decoded + # successfully. Otherwise, tracebacks end up swallowed by + # Dataset.__repr__ when users try to view their lazily decoded array. + values = indexing.ImplicitToExplicitIndexingAdapter( + indexing.as_indexable(data)) + example_value = np.concatenate([first_n_items(values, 1) or [0], + last_item(values) or [0]]) + + try: + result = decode_cf_datetime(example_value, units, calendar) + except Exception: + calendar_msg = ('the default calendar' if calendar is None + else 'calendar %r' % calendar) + msg = ('unable to decode time units %r with %s. Try ' + 'opening your dataset with decode_times=False.' + % (units, calendar_msg)) + if not PY3: + msg += ' Full traceback:\n' + traceback.format_exc() + raise ValueError(msg) + else: + dtype = getattr(result, 'dtype', np.dtype('object')) + + return dtype + + +def decode_cf_datetime(num_dates, units, calendar=None): + """Given an array of numeric dates in netCDF format, convert it into a + numpy array of date time objects. + + For standard (Gregorian) calendars, this function uses vectorized + operations, which makes it much faster than netCDF4.num2date. In such a + case, the returned array will be of type np.datetime64. + + Note that time unit in `units` must not be smaller than microseconds and + not larger than days. 
+
+    See also
+    --------
+    netCDF4.num2date
+    """
+    num_dates = np.asarray(num_dates)
+    flat_num_dates = num_dates.ravel()
+    if calendar is None:
+        calendar = 'standard'
+
+    delta, ref_date = _unpack_netcdf_time_units(units)
+
+    try:
+        if calendar not in _STANDARD_CALENDARS:
+            raise OutOfBoundsDatetime
+
+        delta = _netcdf_to_numpy_timeunit(delta)
+        try:
+            ref_date = pd.Timestamp(ref_date)
+        except ValueError:
+            # ValueError is raised by pd.Timestamp for non-ISO timestamp
+            # strings, in which case we fall back to using netCDF4
+            raise OutOfBoundsDatetime
+
+        # fixes: https://github.com/pydata/pandas/issues/14068
+        # these lines check if the lowest or the highest value in dates
+        # causes an OutOfBoundsDatetime (Overflow) error
+        pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
+        pd.to_timedelta(flat_num_dates.max(), delta) + ref_date
+
+        # Cast input dates to integers of nanoseconds because `pd.to_datetime`
+        # works much faster when dealing with integers
+        flat_num_dates_ns_int = (flat_num_dates *
+                                 _NS_PER_TIME_DELTA[delta]).astype(np.int64)
+
+        dates = (pd.to_timedelta(flat_num_dates_ns_int, 'ns') +
+                 ref_date).values
+
+    except (OutOfBoundsDatetime, OverflowError):
+        dates = _decode_datetime_with_netcdf4(flat_num_dates.astype(np.float),
+                                              units,
+                                              calendar)
+
+    return dates.reshape(num_dates.shape)
+
+
+def decode_cf_timedelta(num_timedeltas, units):
+    """Given an array of numeric timedeltas in netCDF format, convert it into a
+    numpy timedelta64[ns] array.
+    """
+    num_timedeltas = np.asarray(num_timedeltas)
+    units = _netcdf_to_numpy_timeunit(units)
+
+    shape = num_timedeltas.shape
+    num_timedeltas = num_timedeltas.ravel()
+
+    result = pd.to_timedelta(num_timedeltas, unit=units, box=False)
+    # NaT is returned unboxed with wrong units; this should be fixed in pandas
+    if result.dtype != 'timedelta64[ns]':
+        result = result.astype('timedelta64[ns]')
+    return result.reshape(shape)
+
+
+TIME_UNITS = frozenset(['days', 'hours', 'minutes', 'seconds',
+                        'milliseconds', 'microseconds'])
+
+
+def _infer_time_units_from_diff(unique_timedeltas):
+    for time_unit, delta in [('days', 86400), ('hours', 3600),
+                             ('minutes', 60), ('seconds', 1)]:
+        unit_delta = np.timedelta64(10 ** 9 * delta, 'ns')
+        diffs = unique_timedeltas / unit_delta
+        if np.all(diffs == diffs.astype(int)):
+            return time_unit
+    return 'seconds'
+
+
+def infer_datetime_units(dates):
+    """Given an array of datetimes, returns a CF compatible time-unit string of
+    the form "{time_unit} since {date[0]}", where `time_unit` is 'days',
+    'hours', 'minutes' or 'seconds' (the first one that can evenly divide all
+    unique time deltas in `dates`)
+    """
+    dates = pd.to_datetime(np.asarray(dates).ravel(), box=False)
+    dates = dates[pd.notnull(dates)]
+    unique_timedeltas = np.unique(np.diff(dates))
+    units = _infer_time_units_from_diff(unique_timedeltas)
+    reference_date = dates[0] if len(dates) > 0 else '1970-01-01'
+    return '%s since %s' % (units, pd.Timestamp(reference_date))
+
+
+def infer_timedelta_units(deltas):
+    """Given an array of timedeltas, returns a CF compatible time-unit from
+    {'days', 'hours', 'minutes', 'seconds'} (the first one that can evenly
+    divide all unique time deltas in `deltas`)
+    """
+    deltas = pd.to_timedelta(np.asarray(deltas).ravel(), box=False)
+    unique_timedeltas = np.unique(deltas[pd.notnull(deltas)])
+    units = _infer_time_units_from_diff(unique_timedeltas)
+    return units
+
+
+def nctime_to_nptime(times):
+    """Given an array of netCDF4.datetime objects, return an array of
+    numpy.datetime64 objects of
the same size""" + times = np.asarray(times) + new = np.empty(times.shape, dtype='M8[ns]') + for i, t in np.ndenumerate(times): + dt = datetime(t.year, t.month, t.day, t.hour, t.minute, t.second) + new[i] = np.datetime64(dt) + return new + + +def _cleanup_netcdf_time_units(units): + delta, ref_date = _unpack_netcdf_time_units(units) + try: + units = '%s since %s' % (delta, format_timestamp(ref_date)) + except OutOfBoundsDatetime: + # don't worry about reifying the units if they're out of bounds + pass + return units + + +def _encode_datetime_with_netcdf4(dates, units, calendar): + """Fallback method for encoding dates using netCDF4-python. + + This method is more flexible than xarray's parsing using datetime64[ns] + arrays but also slower because it loops over each element. + """ + import netCDF4 as nc4 + + if np.issubdtype(dates.dtype, np.datetime64): + # numpy's broken datetime conversion only works for us precision + dates = dates.astype('M8[us]').astype(datetime) + + def encode_datetime(d): + return np.nan if d is None else nc4.date2num(d, units, calendar) + + return np.vectorize(encode_datetime)(dates) + + +def cast_to_int_if_safe(num): + int_num = np.array(num, dtype=np.int64) + if (num == int_num).all(): + num = int_num + return num + + +def encode_cf_datetime(dates, units=None, calendar=None): + """Given an array of datetime objects, returns the tuple `(num, units, + calendar)` suitable for a CF compliant time variable. + + Unlike `date2num`, this function can handle datetime64 arrays. + + See also + -------- + netCDF4.date2num + """ + dates = np.asarray(dates) + + if units is None: + units = infer_datetime_units(dates) + else: + units = _cleanup_netcdf_time_units(units) + + if calendar is None: + calendar = 'proleptic_gregorian' + + delta, ref_date = _unpack_netcdf_time_units(units) + try: + if calendar not in _STANDARD_CALENDARS or dates.dtype.kind == 'O': + # parse with netCDF4 instead + raise OutOfBoundsDatetime + assert dates.dtype == 'datetime64[ns]' + + delta_units = _netcdf_to_numpy_timeunit(delta) + time_delta = np.timedelta64(1, delta_units).astype('timedelta64[ns]') + ref_date = np.datetime64(pd.Timestamp(ref_date)) + num = (dates - ref_date) / time_delta + + except (OutOfBoundsDatetime, OverflowError): + num = _encode_datetime_with_netcdf4(dates, units, calendar) + + num = cast_to_int_if_safe(num) + return (num, units, calendar) + + +def encode_cf_timedelta(timedeltas, units=None): + if units is None: + units = infer_timedelta_units(timedeltas) + + np_unit = _netcdf_to_numpy_timeunit(units) + num = 1.0 * timedeltas / np.timedelta64(1, np_unit) + num = np.where(pd.isnull(timedeltas), np.nan, num) + num = cast_to_int_if_safe(num) + return (num, units) + + +class CFDatetimeCoder(VariableCoder): + + def encode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_encoding(variable) + + if np.issubdtype(data.dtype, np.datetime64): + (data, units, calendar) = encode_cf_datetime( + data, + encoding.pop('units', None), + encoding.pop('calendar', None)) + safe_setitem(attrs, 'units', units, name=name) + safe_setitem(attrs, 'calendar', calendar, name=name) + + return Variable(dims, data, attrs, encoding) + + def decode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_decoding(variable) + + if 'units' in attrs and 'since' in attrs['units']: + units = pop_to(attrs, encoding, 'units') + calendar = pop_to(attrs, encoding, 'calendar') + dtype = _decode_cf_datetime_dtype(data, units, calendar) + transform = partial( + decode_cf_datetime, 
units=units, calendar=calendar) + data = lazy_elemwise_func(data, transform, dtype) + + return Variable(dims, data, attrs, encoding) + + +class CFTimedeltaCoder(VariableCoder): + + def encode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_encoding(variable) + + if np.issubdtype(data.dtype, np.timedelta64): + data, units = encode_cf_timedelta( + data, encoding.pop('units', None)) + safe_setitem(attrs, 'units', units, name=name) + + return Variable(dims, data, attrs, encoding) + + def decode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_decoding(variable) + + if 'units' in attrs and attrs['units'] in TIME_UNITS: + units = pop_to(attrs, encoding, 'units') + transform = partial(decode_cf_timedelta, units=units) + dtype = np.dtype('timedelta64[ns]') + data = lazy_elemwise_func(data, transform, dtype=dtype) + + return Variable(dims, data, attrs, encoding) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 993d93519b0..d7fc60e0319 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -32,6 +32,10 @@ class VariableCoder(object): the identity ``coder.decode(coder.encode(variable)) == variable``. If any options are necessary, they should be implemented as arguments to the __init__ method. + + The optional name argument to encode() and decode() exists solely for the + sake of better error messages, and should correspond to the name of + variables in the underlying store. """ def encode(self, variable, name=None): @@ -68,7 +72,7 @@ def __getitem__(self, key): def __repr__(self): return ("%s(%r, func=%r, dtype=%r)" % - (type(self).__name__, self.array, self._func, self._dtype)) + (type(self).__name__, self.array, self.func, self.dtype)) def lazy_elemwise_func(array, func, dtype): @@ -126,12 +130,14 @@ def pop_to(source, dest, key, name=None): def _apply_mask(data, # type: np.ndarray encoded_fill_values, # type: list - decoded_fill_value # type: Any - ): # type: npndarray + decoded_fill_value, # type: Any + dtype, # type: Any + ): # type: np.ndarray """Mask all matching values in a NumPy arrays.""" condition = False for fv in encoded_fill_values: condition |= data == fv + data = np.asarray(data, dtype=dtype) return np.where(condition, decoded_fill_value, data) @@ -145,11 +151,6 @@ def encode(self, variable, name=None): fill_value = pop_to(encoding, attrs, '_FillValue', name=name) if not pd.isnull(fill_value): data = duck_array_ops.fillna(data, fill_value) - variable = Variable(dims, data, attrs, encoding) - - if ('_FillValue' not in attrs and '_FillValue' not in encoding and - np.issubdtype(data.dtype, np.floating)): - attrs['_FillValue'] = data.dtype.type(np.nan) return Variable(dims, data, attrs, encoding) @@ -188,7 +189,89 @@ def decode(self, variable, name=None): if encoded_fill_values: transform = partial(_apply_mask, encoded_fill_values=encoded_fill_values, - decoded_fill_value=decoded_fill_value) + decoded_fill_value=decoded_fill_value, + dtype=dtype) data = lazy_elemwise_func(data, transform, dtype) return Variable(dims, data, attrs, encoding) + + +def _scale_offset_decoding(data, scale_factor, add_offset, dtype): + data = np.array(data, dtype=dtype, copy=True) + if scale_factor is not None: + data *= scale_factor + if add_offset is not None: + data += add_offset + return data + + +class CFScaleOffsetCoder(VariableCoder): + """Scale and offset variables according to CF conventions. 
+ + Follows the formula: + decode_values = encoded_values * scale_factor + add_offset + """ + + def encode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_encoding(variable) + + if 'scale_factor' in encoding or 'add_offset' in encoding: + data = data.astype(dtype=np.float64, copy=True) + if 'add_offset' in encoding: + data -= pop_to(encoding, attrs, 'add_offset', name=name) + if 'scale_factor' in encoding: + data /= pop_to(encoding, attrs, 'scale_factor', name=name) + + return Variable(dims, data, attrs, encoding) + + def decode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_decoding(variable) + + if 'scale_factor' in attrs or 'add_offset' in attrs: + scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name) + add_offset = pop_to(attrs, encoding, 'add_offset', name=name) + dtype = np.float64 + transform = partial(_scale_offset_decoding, + scale_factor=scale_factor, + add_offset=add_offset, + dtype=dtype) + data = lazy_elemwise_func(data, transform, dtype) + + return Variable(dims, data, attrs, encoding) + + +class UnsignedCoder(VariableCoder): + + def encode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_encoding(variable) + + if encoding.get('_Unsigned', False): + pop_to(encoding, attrs, '_Unsigned') + signed_dtype = np.dtype('i%s' % data.dtype.itemsize) + if '_FillValue' in attrs: + new_fill = signed_dtype.type(attrs['_FillValue']) + attrs['_FillValue'] = new_fill + data = duck_array_ops.around(data).astype(signed_dtype) + + return Variable(dims, data, attrs, encoding) + + def decode(self, variable, name=None): + dims, data, attrs, encoding = unpack_for_decoding(variable) + + if '_Unsigned' in attrs: + unsigned = pop_to(attrs, encoding, '_Unsigned') + + if data.dtype.kind == 'i': + if unsigned: + unsigned_dtype = np.dtype('u%s' % data.dtype.itemsize) + transform = partial(np.asarray, dtype=unsigned_dtype) + data = lazy_elemwise_func(data, transform, unsigned_dtype) + if '_FillValue' in attrs: + new_fill = unsigned_dtype.type(attrs['_FillValue']) + attrs['_FillValue'] = new_fill + else: + warnings.warn("variable %r has _Unsigned attribute but is not " + "of integer type. Ignoring attribute." 
% name, + SerializationWarning, stacklevel=3) + + return Variable(dims, data, attrs, encoding) diff --git a/xarray/conventions.py b/xarray/conventions.py index 5b951ff694b..e29fa358009 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -1,457 +1,25 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from datetime import datetime -import re -import traceback -import warnings -import numpy as np -import pandas as pd +import warnings from collections import defaultdict -try: - from pandas.errors import OutOfBoundsDatetime -except ImportError: - # pandas < 0.20 - from pandas.tslib import OutOfBoundsDatetime - -from .core import duck_array_ops, indexing, ops, utils -from .core.formatting import format_timestamp, first_n_items, last_item -from .core.variable import as_variable, IndexVariable, Variable -from .core.pycompat import iteritems, OrderedDict, PY3, basestring +import numpy as np -# standard calendars recognized by netcdftime -_STANDARD_CALENDARS = set(['standard', 'gregorian', 'proleptic_gregorian']) +import pandas as pd -_NS_PER_TIME_DELTA = {'us': 1e3, - 'ms': 1e6, - 's': 1e9, - 'm': 1e9 * 60, - 'h': 1e9 * 60 * 60, - 'D': 1e9 * 60 * 60 * 24} +from .coding import times +from .coding import variables +from .core import duck_array_ops, indexing +from .core.pycompat import OrderedDict, basestring, iteritems +from .core.variable import IndexVariable, Variable, as_variable class SerializationWarning(RuntimeWarning): """Warnings about encoding/decoding issues in serialization.""" -def mask_and_scale(array, fill_value=None, scale_factor=None, add_offset=None, - dtype=float): - """Scale and mask array values according to CF conventions for packed and - missing values - - First, values equal to the fill_value are replaced by NaN. Then, new values - are given by the formula: - - original_values * scale_factor + add_offset - - Parameters - ---------- - array : array-like - Original array of values to wrap - fill_value : number, optional - All values equal to fill_value in the original array are replaced - by NaN. If an array of multiple values is provided a warning will be - issued and all array elements matching an value in the fill_value array - will be replaced by NaN. - scale_factor : number, optional - Multiply entries in the original array by this number. - add_offset : number, optional - After applying scale_factor, add this number to entries in the - original array. - - Returns - ------- - scaled : np.ndarray - Array of masked and scaled values. 
- - References - ---------- - http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html - """ - # by default, cast to float to ensure NaN is meaningful - values = np.array(array, dtype=dtype, copy=True) - if fill_value is not None and not np.all(pd.isnull(fill_value)): - if getattr(fill_value, 'size', 1) > 1: - fill_values = fill_value # multiple fill values - else: - fill_values = [fill_value] - for f_value in fill_values: - if values.ndim > 0: - values[values == f_value] = np.nan - elif values == f_value: - values = np.array(np.nan) - if scale_factor is not None: - values *= scale_factor - if add_offset is not None: - values += add_offset - return values - - -def _netcdf_to_numpy_timeunit(units): - units = units.lower() - if not units.endswith('s'): - units = '%ss' % units - return {'microseconds': 'us', 'milliseconds': 'ms', 'seconds': 's', - 'minutes': 'm', 'hours': 'h', 'days': 'D'}[units] - - -def _unpack_netcdf_time_units(units): - # CF datetime units follow the format: "UNIT since DATE" - # this parses out the unit and date allowing for extraneous - # whitespace. - matches = re.match('(.+) since (.+)', units) - if not matches: - raise ValueError('invalid time units: %s' % units) - delta_units, ref_date = [s.strip() for s in matches.groups()] - return delta_units, ref_date - - -def _decode_datetime_with_netcdf4(num_dates, units, calendar): - import netCDF4 as nc4 - - dates = np.asarray(nc4.num2date(num_dates, units, calendar)) - if (dates[np.nanargmin(num_dates)].year < 1678 or - dates[np.nanargmax(num_dates)].year >= 2262): - warnings.warn('Unable to decode time axis into full ' - 'numpy.datetime64 objects, continuing using dummy ' - 'netCDF4.datetime objects instead, reason: dates out' - ' of range', SerializationWarning, stacklevel=3) - else: - try: - dates = nctime_to_nptime(dates) - except ValueError as e: - warnings.warn('Unable to decode time axis into full ' - 'numpy.datetime64 objects, continuing using ' - 'dummy netCDF4.datetime objects instead, reason:' - '{0}'.format(e), SerializationWarning, stacklevel=3) - return dates - - -def decode_cf_datetime(num_dates, units, calendar=None): - """Given an array of numeric dates in netCDF format, convert it into a - numpy array of date time objects. - - For standard (Gregorian) calendars, this function uses vectorized - operations, which makes it much faster than netCDF4.num2date. In such a - case, the returned array will be of type np.datetime64. - - Note that time unit in `units` must not be smaller than microseconds and - not larger than days. 
- - See also - -------- - netCDF4.num2date - """ - num_dates = np.asarray(num_dates) - flat_num_dates = num_dates.ravel() - if calendar is None: - calendar = 'standard' - - delta, ref_date = _unpack_netcdf_time_units(units) - - try: - if calendar not in _STANDARD_CALENDARS: - raise OutOfBoundsDatetime - - delta = _netcdf_to_numpy_timeunit(delta) - try: - ref_date = pd.Timestamp(ref_date) - except ValueError: - # ValueError is raised by pd.Timestamp for non-ISO timestamp - # strings, in which case we fall back to using netCDF4 - raise OutOfBoundsDatetime - - # fixes: https://github.com/pydata/pandas/issues/14068 - # these lines check if the the lowest or the highest value in dates - # cause an OutOfBoundsDatetime (Overflow) error - pd.to_timedelta(flat_num_dates.min(), delta) + ref_date - pd.to_timedelta(flat_num_dates.max(), delta) + ref_date - - # Cast input dates to integers of nanoseconds because `pd.to_datetime` - # works much faster when dealing with integers - flat_num_dates_ns_int = (flat_num_dates * - _NS_PER_TIME_DELTA[delta]).astype(np.int64) - - dates = (pd.to_timedelta(flat_num_dates_ns_int, 'ns') + - ref_date).values - - except (OutOfBoundsDatetime, OverflowError): - dates = _decode_datetime_with_netcdf4(flat_num_dates.astype(np.float), - units, - calendar) - - return dates.reshape(num_dates.shape) - - -def decode_cf_timedelta(num_timedeltas, units): - """Given an array of numeric timedeltas in netCDF format, convert it into a - numpy timedelta64[ns] array. - """ - num_timedeltas = np.asarray(num_timedeltas) - units = _netcdf_to_numpy_timeunit(units) - - shape = num_timedeltas.shape - num_timedeltas = num_timedeltas.ravel() - - result = pd.to_timedelta(num_timedeltas, unit=units, box=False) - # NaT is returned unboxed with wrong units; this should be fixed in pandas - if result.dtype != 'timedelta64[ns]': - result = result.astype('timedelta64[ns]') - return result.reshape(shape) - - -TIME_UNITS = frozenset(['days', 'hours', 'minutes', 'seconds', - 'milliseconds', 'microseconds']) - - -def _infer_time_units_from_diff(unique_timedeltas): - for time_unit, delta in [('days', 86400), ('hours', 3600), - ('minutes', 60), ('seconds', 1)]: - unit_delta = np.timedelta64(10 ** 9 * delta, 'ns') - diffs = unique_timedeltas / unit_delta - if np.all(diffs == diffs.astype(int)): - return time_unit - return 'seconds' - - -def infer_datetime_units(dates): - """Given an array of datetimes, returns a CF compatible time-unit string of - the form "{time_unit} since {date[0]}", where `time_unit` is 'days', - 'hours', 'minutes' or 'seconds' (the first one that can evenly divide all - unique time deltas in `dates`) - """ - dates = pd.to_datetime(np.asarray(dates).ravel(), box=False) - dates = dates[pd.notnull(dates)] - unique_timedeltas = np.unique(np.diff(dates)) - units = _infer_time_units_from_diff(unique_timedeltas) - reference_date = dates[0] if len(dates) > 0 else '1970-01-01' - return '%s since %s' % (units, pd.Timestamp(reference_date)) - - -def infer_timedelta_units(deltas): - """Given an array of timedeltas, returns a CF compatible time-unit from - {'days', 'hours', 'minutes' 'seconds'} (the first one that can evenly - divide all unique time deltas in `deltas`) - """ - deltas = pd.to_timedelta(np.asarray(deltas).ravel(), box=False) - unique_timedeltas = np.unique(deltas[pd.notnull(deltas)]) - units = _infer_time_units_from_diff(unique_timedeltas) - return units - - -def nctime_to_nptime(times): - """Given an array of netCDF4.datetime objects, return an array of - numpy.datetime64 objects of 
the same size""" - times = np.asarray(times) - new = np.empty(times.shape, dtype='M8[ns]') - for i, t in np.ndenumerate(times): - dt = datetime(t.year, t.month, t.day, t.hour, t.minute, t.second) - new[i] = np.datetime64(dt) - return new - - -def _cleanup_netcdf_time_units(units): - delta, ref_date = _unpack_netcdf_time_units(units) - try: - units = '%s since %s' % (delta, format_timestamp(ref_date)) - except OutOfBoundsDatetime: - # don't worry about reifying the units if they're out of bounds - pass - return units - - -def _encode_datetime_with_netcdf4(dates, units, calendar): - """Fallback method for encoding dates using netCDF4-python. - - This method is more flexible than xarray's parsing using datetime64[ns] - arrays but also slower because it loops over each element. - """ - import netCDF4 as nc4 - - if np.issubdtype(dates.dtype, np.datetime64): - # numpy's broken datetime conversion only works for us precision - dates = dates.astype('M8[us]').astype(datetime) - - def encode_datetime(d): - return np.nan if d is None else nc4.date2num(d, units, calendar) - - return np.vectorize(encode_datetime)(dates) - - -def cast_to_int_if_safe(num): - int_num = np.array(num, dtype=np.int64) - if (num == int_num).all(): - num = int_num - return num - - -def encode_cf_datetime(dates, units=None, calendar=None): - """Given an array of datetime objects, returns the tuple `(num, units, - calendar)` suitable for a CF compliant time variable. - - Unlike `date2num`, this function can handle datetime64 arrays. - - See also - -------- - netCDF4.date2num - """ - dates = np.asarray(dates) - - if units is None: - units = infer_datetime_units(dates) - else: - units = _cleanup_netcdf_time_units(units) - - if calendar is None: - calendar = 'proleptic_gregorian' - - delta, ref_date = _unpack_netcdf_time_units(units) - try: - if calendar not in _STANDARD_CALENDARS or dates.dtype.kind == 'O': - # parse with netCDF4 instead - raise OutOfBoundsDatetime - assert dates.dtype == 'datetime64[ns]' - - delta_units = _netcdf_to_numpy_timeunit(delta) - time_delta = np.timedelta64(1, delta_units).astype('timedelta64[ns]') - ref_date = np.datetime64(pd.Timestamp(ref_date)) - num = (dates - ref_date) / time_delta - - except (OutOfBoundsDatetime, OverflowError): - num = _encode_datetime_with_netcdf4(dates, units, calendar) - - num = cast_to_int_if_safe(num) - return (num, units, calendar) - - -def encode_cf_timedelta(timedeltas, units=None): - if units is None: - units = infer_timedelta_units(timedeltas) - - np_unit = _netcdf_to_numpy_timeunit(units) - num = 1.0 * timedeltas / np.timedelta64(1, np_unit) - num = np.where(pd.isnull(timedeltas), np.nan, num) - num = cast_to_int_if_safe(num) - return (num, units) - - -class MaskedAndScaledArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Wrapper around array-like objects to create a new indexable object where - values, when accessed, are automatically scaled and masked according to - CF conventions for packed and missing data values. - - New values are given by the formula: - original_values * scale_factor + add_offset - - Values can only be accessed via `__getitem__`: - - >>> x = MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), -99, 0.01, 1) - >>> x - MaskedAndScaledArray(array([-99, -1, 0, 1, 2]), fill_value=-99, - scale_factor=0.01, add_offset=1) - >>> x[:] - array([ nan, 0.99, 1. 
, 1.01, 1.02] - - References - ---------- - http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html - """ - def __init__(self, array, fill_value=None, scale_factor=None, - add_offset=None, dtype=float): - """ - Parameters - ---------- - array : array-like - Original array of values to wrap - fill_value : number, optional - All values equal to fill_value in the original array are replaced - by NaN. - scale_factor : number, optional - Multiply entries in the original array by this number. - add_offset : number, optional - After applying scale_factor, add this number to entries in the - original array. - """ - self.array = indexing.as_indexable(array) - self.fill_value = fill_value - self.scale_factor = scale_factor - self.add_offset = add_offset - self._dtype = dtype - - @property - def dtype(self): - return np.dtype(self._dtype) - - def __getitem__(self, key): - return mask_and_scale(self.array[key], self.fill_value, - self.scale_factor, self.add_offset, self._dtype) - - def __repr__(self): - return ("%s(%r, fill_value=%r, scale_factor=%r, add_offset=%r, " - "dtype=%r)" % - (type(self).__name__, self.array, self.fill_value, - self.scale_factor, self.add_offset, self._dtype)) - - -class DecodedCFDatetimeArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Wrapper around array-like objects to create a new indexable object where - values, when accessed, are automatically converted into datetime objects - using decode_cf_datetime. - """ - def __init__(self, array, units, calendar=None): - self.array = indexing.as_indexable(array) - self.units = units - self.calendar = calendar - - # Verify that at least the first and last date can be decoded - # successfully. Otherwise, tracebacks end up swallowed by - # Dataset.__repr__ when users try to view their lazily decoded array. - values = indexing.ImplicitToExplicitIndexingAdapter(self.array) - example_value = np.concatenate([first_n_items(values, 1) or [0], - last_item(values) or [0]]) - - try: - result = decode_cf_datetime(example_value, units, calendar) - except Exception: - calendar_msg = ('the default calendar' if calendar is None - else 'calendar %r' % calendar) - msg = ('unable to decode time units %r with %s. Try ' - 'opening your dataset with decode_times=False.' - % (units, calendar_msg)) - if not PY3: - msg += ' Full traceback:\n' + traceback.format_exc() - raise ValueError(msg) - else: - self._dtype = getattr(result, 'dtype', np.dtype('object')) - - @property - def dtype(self): - return self._dtype - - def __getitem__(self, key): - return decode_cf_datetime(self.array[key], units=self.units, - calendar=self.calendar) - - -class DecodedCFTimedeltaArray(indexing.ExplicitlyIndexedNDArrayMixin): - """Wrapper around array-like objects to create a new indexable object where - values, when accessed, are automatically converted into timedelta objects - using decode_cf_timedelta. - """ - def __init__(self, array, units): - self.array = indexing.as_indexable(array) - self.units = units - - @property - def dtype(self): - return np.dtype('timedelta64[ns]') - - def __getitem__(self, key): - return decode_cf_timedelta(self.array[key], units=self.units) - - class StackedBytesArray(indexing.ExplicitlyIndexedNDArrayMixin): """Wrapper around array-like objects to create a new indexable object where values, when accessed, are automatically stacked along the last dimension. 
@@ -594,34 +162,6 @@ def __getitem__(self, key):
         return np.asarray(self.array[key], dtype=self.dtype)
 
 
-class UnsignedIntTypeArray(indexing.ExplicitlyIndexedNDArrayMixin):
-    """Decode arrays on the fly from signed integer to unsigned
-    integer. Typically used when _Unsigned is set at as a netCDF
-    attribute on a signed integer variable.
-
-    >>> sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
-
-    >>> sb.dtype
-    dtype('int8')
-
-    >>> UnsignedIntTypeArray(sb).dtype
-    dtype('uint8')
-
-    >>> UnsignedIntTypeArray(sb)[:]
-    array([  0,   1, 127, 128, 255], dtype=uint8)
-    """
-    def __init__(self, array):
-        self.array = indexing.as_indexable(array)
-        self.unsigned_dtype = np.dtype('u%s' % array.dtype.itemsize)
-
-    @property
-    def dtype(self):
-        return self.unsigned_dtype
-
-    def __getitem__(self, key):
-        return np.asarray(self.array[key], dtype=self.dtype)
-
-
 def bytes_to_char(arr):
     """Like netCDF4.stringtochar, but faster and more flexible.
     """
@@ -696,56 +236,17 @@ def _var_as_tuple(var):
     return var.dims, var.data, var.attrs.copy(), var.encoding.copy()
 
 
-def maybe_encode_datetime(var, name=None):
-    if np.issubdtype(var.dtype, np.datetime64):
-        dims, data, attrs, encoding = _var_as_tuple(var)
-        (data, units, calendar) = encode_cf_datetime(
-            data, encoding.pop('units', None), encoding.pop('calendar', None))
-        safe_setitem(attrs, 'units', units, name=name)
-        safe_setitem(attrs, 'calendar', calendar, name=name)
-        var = Variable(dims, data, attrs, encoding)
-    return var
-
-
-def maybe_encode_timedelta(var, name=None):
-    if np.issubdtype(var.dtype, np.timedelta64):
-        dims, data, attrs, encoding = _var_as_tuple(var)
-        data, units = encode_cf_timedelta(
-            data, encoding.pop('units', None))
-        safe_setitem(attrs, 'units', units, name=name)
-        var = Variable(dims, data, attrs, encoding)
-    return var
-
-
-def maybe_encode_offset_and_scale(var, needs_copy=True, name=None):
-    if any(k in var.encoding for k in ['add_offset', 'scale_factor']):
-        dims, data, attrs, encoding = _var_as_tuple(var)
-        data = data.astype(dtype=float, copy=needs_copy)
-        needs_copy = False
-        if 'add_offset' in encoding:
-            data -= pop_to(encoding, attrs, 'add_offset', name=name)
-        if 'scale_factor' in encoding:
-            data /= pop_to(encoding, attrs, 'scale_factor', name=name)
-        var = Variable(dims, data, attrs, encoding)
-    return var, needs_copy
-
-
-def maybe_encode_fill_value(var, needs_copy=True, name=None):
-    # replace NaN with the fill value
-    if var.encoding.get('_FillValue') is not None:
-        dims, data, attrs, encoding = _var_as_tuple(var)
-        fill_value = pop_to(encoding, attrs, '_FillValue', name=name)
-        if not pd.isnull(fill_value):
-            data = ops.fillna(data, fill_value)
-            needs_copy = False
-        var = Variable(dims, data, attrs, encoding)
-    return var, needs_copy
-
-
 def maybe_encode_as_char_array(var, name=None):
     if var.dtype.kind in {'S', 'U'}:
         dims, data, attrs, encoding = _var_as_tuple(var)
         if data.dtype.kind == 'U':
+            if '_FillValue' in attrs:
+                raise NotImplementedError(
+                    'variable {!r} has a _FillValue specified, but '
+                    '_FillValue is not yet supported on unicode strings: '
+                    'https://github.com/pydata/xarray/issues/1647'
+                    .format(name))
+
             string_encoding = encoding.pop('_Encoding', 'utf-8')
             safe_setitem(attrs, '_Encoding', string_encoding, name=name)
             data = encode_string_array(data, string_encoding)
@@ -780,13 +281,6 @@ def maybe_encode_nonstring_dtype(var, name=None):
                                   'any _FillValue to use for NaNs' % name,
                                   SerializationWarning, stacklevel=3)
                 data = duck_array_ops.around(data)[...]
- if encoding.get('_Unsigned', False): - signed_dtype = np.dtype('i%s' % dtype.itemsize) - if '_FillValue' in var.attrs: - new_fill = signed_dtype.type(attrs['_FillValue']) - attrs['_FillValue'] = new_fill - data = data.astype(signed_dtype) - pop_to(encoding, attrs, '_Unsigned') data = data.astype(dtype=dtype) var = Variable(dims, data, attrs, encoding) return var @@ -830,18 +324,20 @@ def _infer_dtype(array, name=None): return dtype +def ensure_not_multiindex(var, name=None): + if (isinstance(var, IndexVariable) and + isinstance(var.to_index(), pd.MultiIndex)): + raise NotImplementedError( + 'variable {!r} is a MultiIndex, which cannot yet be ' + 'serialized to netCDF files ' + '(https://github.com/pydata/xarray/issues/1077). Use ' + 'reset_index() to convert MultiIndex levels into coordinate ' + 'variables instead.'.format(name)) + + def ensure_dtype_not_object(var, name=None): # TODO: move this from conventions to backends? (it's not CF related) if var.dtype.kind == 'O': - if (isinstance(var, IndexVariable) and - isinstance(var.to_index(), pd.MultiIndex)): - raise NotImplementedError( - 'variable {!r} is a MultiIndex, which cannot yet be ' - 'serialized to netCDF files ' - '(https://github.com/pydata/xarray/issues/1077). Use ' - 'reset_index() to convert MultiIndex levels into coordinate ' - 'variables instead.'.format(name)) - dims, data, attrs, encoding = _var_as_tuple(var) missing = pd.isnull(data) if missing.any(): @@ -890,10 +386,16 @@ def encode_cf_variable(var, needs_copy=True, name=None): out : xarray.Variable A variable which has been encoded as described above. """ - var = maybe_encode_datetime(var, name=name) - var = maybe_encode_timedelta(var, name=name) - var, needs_copy = maybe_encode_offset_and_scale(var, needs_copy, name=name) - var, needs_copy = maybe_encode_fill_value(var, needs_copy, name=name) + ensure_not_multiindex(var, name=name) + + for coder in [times.CFDatetimeCoder(), + times.CFTimedeltaCoder(), + variables.CFScaleOffsetCoder(), + variables.CFMaskCoder(), + variables.UnsignedCoder()]: + var = coder.encode(var, name=name) + + # TODO(shoyer): convert all of these to use coders, too: var = maybe_encode_nonstring_dtype(var, name=name) var = maybe_default_fill_value(var) var = maybe_encode_bools(var) @@ -958,73 +460,22 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True, if string_encoding is not None: data = BytesToStringArray(data, string_encoding) - unsigned = pop_to(attributes, encoding, '_Unsigned') - if unsigned and mask_and_scale: - if data.dtype.kind == 'i': - data = UnsignedIntTypeArray(data) - else: - warnings.warn("variable %r has _Unsigned attribute but is not " - "of integer type. Ignoring attribute." % name, - SerializationWarning, stacklevel=3) + # TODO(shoyer): convert everything above to use coders + var = Variable(dimensions, data, attributes, encoding) if mask_and_scale: - if 'missing_value' in attributes: - # missing_value is deprecated, but we still want to support it as - # an alias for _FillValue. - if ('_FillValue' in attributes and - not utils.equivalent(attributes['_FillValue'], - attributes['missing_value'])): - raise ValueError("Conflicting _FillValue and missing_value " - "attributes on a variable {!r}: {} vs. {}\n\n" - "Consider opening the offending dataset " - "using decode_cf=False, correcting the " - "attributes and decoding explicitly using " - "xarray.decode_cf()." 
- .format(name, attributes['_FillValue'], - attributes['missing_value'])) - attributes['_FillValue'] = attributes.pop('missing_value') - - fill_value = pop_to(attributes, encoding, '_FillValue') - if isinstance(fill_value, np.ndarray) and fill_value.size > 1: - warnings.warn("variable {!r} has multiple fill values {}, " - "decoding all values to NaN." - .format(name, fill_value), - SerializationWarning, stacklevel=3) - - scale_factor = pop_to(attributes, encoding, 'scale_factor') - add_offset = pop_to(attributes, encoding, 'add_offset') - has_fill = (fill_value is not None and - not np.any(pd.isnull(fill_value))) - if (has_fill or scale_factor is not None or add_offset is not None): - if has_fill and np.array(fill_value).dtype.kind in ['U', 'S', 'O']: - if string_encoding is not None: - raise NotImplementedError( - 'variable %r has a _FillValue specified, but ' - '_FillValue is yet supported on unicode strings: ' - 'https://github.com/pydata/xarray/issues/1647') - dtype = object - else: - # According to the CF spec, the fill value is of the same - # type as its variable, i.e. its storage format on disk. - # This handles the case where the fill_value also needs to be - # converted to its unsigned value. - if has_fill: - fill_value = data.dtype.type(fill_value) - dtype = float - - data = MaskedAndScaledArray(data, fill_value, scale_factor, - add_offset, dtype) - - if decode_times and 'units' in attributes: - if 'since' in attributes['units']: - # datetime - units = pop_to(attributes, encoding, 'units') - calendar = pop_to(attributes, encoding, 'calendar') - data = DecodedCFDatetimeArray(data, units, calendar) - elif attributes['units'] in TIME_UNITS: - # timedelta - units = pop_to(attributes, encoding, 'units') - data = DecodedCFTimedeltaArray(data, units) + for coder in [variables.UnsignedCoder(), + variables.CFMaskCoder(), + variables.CFScaleOffsetCoder()]: + var = coder.decode(var, name=name) + if decode_times: + for coder in [times.CFTimedeltaCoder(), + times.CFDatetimeCoder()]: + var = coder.decode(var, name=name) + + dimensions, data, attributes, encoding = ( + variables.unpack_for_decoding(var)) + # TODO(shoyer): convert everything below to use coders if decode_endianness and not data.dtype.isnative: # do this last, so it's only done if we didn't already unmask/scale diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6b0cd59eb9e..58e23943b77 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -530,6 +530,7 @@ def test_roundtrip_string_with_fill_value_nchar(self): def test_unsigned_roundtrip_mask_and_scale(self): decoded = create_unsigned_masked_scaled_data() encoded = create_encoded_unsigned_masked_scaled_data() + print('ORIGINAL', encoded, encoded.x.attrs, encoded.x.encoding) with self.roundtrip(decoded) as actual: for k in decoded.variables: self.assertEqual(decoded.variables[k].dtype, @@ -549,6 +550,7 @@ def test_unsigned_roundtrip_mask_and_scale(self): self.assertDatasetAllClose(encoded, actual, decode_bytes=False) # make sure roundtrip encoding didn't change the # original dataset. 
+ print('NOW', encoded, encoded.x.attrs, encoded.x.encoding) self.assertDatasetAllClose( encoded, create_encoded_unsigned_masked_scaled_data()) with self.roundtrip(encoded) as actual: diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py new file mode 100644 index 00000000000..f4c726355a5 --- /dev/null +++ b/xarray/tests/test_coding_times.py @@ -0,0 +1,323 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import warnings + +import numpy as np +import pandas as pd + +from xarray import Variable, coding +from . import TestCase, requires_netCDF4 + + +@np.vectorize +def _ensure_naive_tz(dt): + if hasattr(dt, 'tzinfo'): + return dt.replace(tzinfo=None) + else: + return dt + + +class TestDatetime(TestCase): + @requires_netCDF4 + def test_cf_datetime(self): + import netCDF4 as nc4 + for num_dates, units in [ + (np.arange(10), 'days since 2000-01-01'), + (np.arange(10).reshape(2, 5), 'days since 2000-01-01'), + (12300 + np.arange(5), 'hours since 1680-01-01 00:00:00'), + # here we add a couple minor formatting errors to test + # the robustness of the parsing algorithm. + (12300 + np.arange(5), 'hour since 1680-01-01 00:00:00'), + (12300 + np.arange(5), u'Hour since 1680-01-01 00:00:00'), + (12300 + np.arange(5), ' Hour since 1680-01-01 00:00:00 '), + (10, 'days since 2000-01-01'), + ([10], 'daYs since 2000-01-01'), + ([[10]], 'days since 2000-01-01'), + ([10, 10], 'days since 2000-01-01'), + (np.array(10), 'days since 2000-01-01'), + (0, 'days since 1000-01-01'), + ([0], 'days since 1000-01-01'), + ([[0]], 'days since 1000-01-01'), + (np.arange(2), 'days since 1000-01-01'), + (np.arange(0, 100000, 20000), 'days since 1900-01-01'), + (17093352.0, 'hours since 1-1-1 00:00:0.0'), + ([0.5, 1.5], 'hours since 1900-01-01T00:00:00'), + (0, 'milliseconds since 2000-01-01T00:00:00'), + (0, 'microseconds since 2000-01-01T00:00:00'), + ]: + for calendar in ['standard', 'gregorian', 'proleptic_gregorian']: + expected = _ensure_naive_tz( + nc4.num2date(num_dates, units, calendar)) + print(num_dates, units, calendar) + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', + 'Unable to decode time axis') + actual = coding.times.decode_cf_datetime(num_dates, units, + calendar) + if (isinstance(actual, np.ndarray) and + np.issubdtype(actual.dtype, np.datetime64)): + # self.assertEqual(actual.dtype.kind, 'M') + # For some reason, numpy 1.8 does not compare ns precision + # datetime64 arrays as equal to arrays of datetime objects, + # but it works for us precision. Thus, convert to us + # precision for the actual array equal comparison... 
+ actual_cmp = actual.astype('M8[us]') + else: + actual_cmp = actual + self.assertArrayEqual(expected, actual_cmp) + encoded, _, _ = coding.times.encode_cf_datetime(actual, units, + calendar) + if '1-1-1' not in units: + # pandas parses this date very strangely, so the original + # units/encoding cannot be preserved in this case: + # (Pdb) pd.to_datetime('1-1-1 00:00:0.0') + # Timestamp('2001-01-01 00:00:00') + self.assertArrayEqual(num_dates, np.around(encoded, 1)) + if (hasattr(num_dates, 'ndim') and num_dates.ndim == 1 and + '1000' not in units): + # verify that wrapping with a pandas.Index works + # note that it *does not* currently work to even put + # non-datetime64 compatible dates into a pandas.Index + encoded, _, _ = coding.times.encode_cf_datetime( + pd.Index(actual), units, calendar) + self.assertArrayEqual(num_dates, np.around(encoded, 1)) + + @requires_netCDF4 + def test_decode_cf_datetime_overflow(self): + # checks for + # https://github.com/pydata/pandas/issues/14068 + # https://github.com/pydata/xarray/issues/975 + + from datetime import datetime + units = 'days since 2000-01-01 00:00:00' + + # date after 2262 and before 1678 + days = (-117608, 95795) + expected = (datetime(1677, 12, 31), datetime(2262, 4, 12)) + + for i, day in enumerate(days): + result = coding.times.decode_cf_datetime(day, units) + self.assertEqual(result, expected[i]) + + def test_decode_cf_datetime_non_standard_units(self): + expected = pd.date_range(periods=100, start='1970-01-01', freq='h') + # netCDFs from madis.noaa.gov use this format for their time units + # they cannot be parsed by netcdftime, but pd.Timestamp works + units = 'hours since 1-1-1970' + actual = coding.times.decode_cf_datetime(np.arange(100), units) + self.assertArrayEqual(actual, expected) + + @requires_netCDF4 + def test_decode_cf_datetime_non_iso_strings(self): + # datetime strings that are _almost_ ISO compliant but not quite, + # but which netCDF4.num2date can still parse correctly + expected = pd.date_range(periods=100, start='2000-01-01', freq='h') + cases = [(np.arange(100), 'hours since 2000-01-01 0'), + (np.arange(100), 'hours since 2000-1-1 0'), + (np.arange(100), 'hours since 2000-01-01 0:00')] + for num_dates, units in cases: + actual = coding.times.decode_cf_datetime(num_dates, units) + self.assertArrayEqual(actual, expected) + + @requires_netCDF4 + def test_decode_non_standard_calendar(self): + import netCDF4 as nc4 + + for calendar in ['noleap', '365_day', '360_day', 'julian', 'all_leap', + '366_day']: + units = 'days since 0001-01-01' + times = pd.date_range('2001-04-01-00', end='2001-04-30-23', + freq='H') + noleap_time = nc4.date2num(times.to_pydatetime(), units, + calendar=calendar) + expected = times.values + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', 'Unable to decode time axis') + actual = coding.times.decode_cf_datetime(noleap_time, units, + calendar=calendar) + self.assertEqual(actual.dtype, np.dtype('M8[ns]')) + abs_diff = abs(actual - expected) + # once we no longer support versions of netCDF4 older than 1.1.5, + # we could do this check with near microsecond accuracy: + # https://github.com/Unidata/netcdf4-python/issues/355 + self.assertTrue((abs_diff <= np.timedelta64(1, 's')).all()) + + @requires_netCDF4 + def test_decode_non_standard_calendar_single_element(self): + units = 'days since 0001-01-01' + for calendar in ['noleap', '365_day', '360_day', 'julian', 'all_leap', + '366_day']: + for num_time in [735368, [735368], [[735368]]]: + with warnings.catch_warnings(): + 
warnings.filterwarnings('ignore',
+                                            'Unable to decode time axis')
+                    actual = coding.times.decode_cf_datetime(num_time, units,
+                                                             calendar=calendar)
+                self.assertEqual(actual.dtype, np.dtype('M8[ns]'))
+
+    @requires_netCDF4
+    def test_decode_non_standard_calendar_single_element_fallback(self):
+        import netCDF4 as nc4
+
+        units = 'days since 0001-01-01'
+        dt = nc4.netcdftime.datetime(2001, 2, 29)
+        for calendar in ['360_day', 'all_leap', '366_day']:
+            num_time = nc4.date2num(dt, units, calendar)
+            with self.assertWarns('Unable to decode time axis'):
+                actual = coding.times.decode_cf_datetime(num_time, units,
+                                                         calendar=calendar)
+            expected = np.asarray(nc4.num2date(num_time, units, calendar))
+            print(num_time, calendar, actual, expected)
+            self.assertEqual(actual.dtype, np.dtype('O'))
+            self.assertEqual(expected, actual)
+
+    @requires_netCDF4
+    def test_decode_non_standard_calendar_multidim_time(self):
+        import netCDF4 as nc4
+
+        calendar = 'noleap'
+        units = 'days since 0001-01-01'
+        times1 = pd.date_range('2001-04-01', end='2001-04-05', freq='D')
+        times2 = pd.date_range('2001-05-01', end='2001-05-05', freq='D')
+        noleap_time1 = nc4.date2num(times1.to_pydatetime(), units,
+                                    calendar=calendar)
+        noleap_time2 = nc4.date2num(times2.to_pydatetime(), units,
+                                    calendar=calendar)
+        mdim_time = np.empty((len(noleap_time1), 2), )
+        mdim_time[:, 0] = noleap_time1
+        mdim_time[:, 1] = noleap_time2
+
+        expected1 = times1.values
+        expected2 = times2.values
+        with warnings.catch_warnings():
+            warnings.filterwarnings('ignore', 'Unable to decode time axis')
+            actual = coding.times.decode_cf_datetime(mdim_time, units,
+                                                     calendar=calendar)
+        self.assertEqual(actual.dtype, np.dtype('M8[ns]'))
+        self.assertArrayEqual(actual[:, 0], expected1)
+        self.assertArrayEqual(actual[:, 1], expected2)
+
+    @requires_netCDF4
+    def test_decode_non_standard_calendar_fallback(self):
+        import netCDF4 as nc4
+        # ensure leap year doesn't matter
+        for year in [2010, 2011, 2012, 2013, 2014]:
+            for calendar in ['360_day', '366_day', 'all_leap']:
+                units = 'days since {0}-01-01'.format(year)
+                num_times = np.arange(100)
+                expected = nc4.num2date(num_times, units, calendar)
+
+                with warnings.catch_warnings(record=True) as w:
+                    warnings.simplefilter('always')
+                    actual = coding.times.decode_cf_datetime(num_times, units,
+                                                             calendar=calendar)
+                    self.assertEqual(len(w), 1)
+                    self.assertIn('Unable to decode time axis',
+                                  str(w[0].message))
+
+                self.assertEqual(actual.dtype, np.dtype('O'))
+                self.assertArrayEqual(actual, expected)
+
+    @requires_netCDF4
+    def test_cf_datetime_nan(self):
+        for num_dates, units, expected_list in [
+                ([np.nan], 'days since 2000-01-01', ['NaT']),
+                ([np.nan, 0], 'days since 2000-01-01',
+                 ['NaT', '2000-01-01T00:00:00Z']),
+                ([np.nan, 0, 1], 'days since 2000-01-01',
+                 ['NaT', '2000-01-01T00:00:00Z', '2000-01-02T00:00:00Z']),
+                ]:
+            with warnings.catch_warnings():
+                warnings.filterwarnings('ignore', 'All-NaN')
+                actual = coding.times.decode_cf_datetime(num_dates, units)
+            expected = np.array(expected_list, dtype='datetime64[ns]')
+            self.assertArrayEqual(expected, actual)
+
+    @requires_netCDF4
+    def test_decoded_cf_datetime_array_2d(self):
+        # regression test for GH1229
+        variable = Variable(('x', 'y'), np.array([[0, 1], [2, 3]]),
+                            {'units': 'days since 2000-01-01'})
+        result = coding.times.CFDatetimeCoder().decode(variable)
+        assert result.dtype == 'datetime64[ns]'
+        expected = pd.date_range('2000-01-01', periods=4).values.reshape(2, 2)
+        self.assertArrayEqual(np.asarray(result), expected)
+
+ def test_infer_datetime_units(self): + for dates, expected in [(pd.date_range('1900-01-01', periods=5), + 'days since 1900-01-01 00:00:00'), + (pd.date_range('1900-01-01 12:00:00', freq='H', + periods=2), + 'hours since 1900-01-01 12:00:00'), + (['1900-01-01', '1900-01-02', + '1900-01-02 00:00:01'], + 'seconds since 1900-01-01 00:00:00'), + (pd.to_datetime( + ['1900-01-01', '1900-01-02', 'NaT']), + 'days since 1900-01-01 00:00:00'), + (pd.to_datetime(['1900-01-01', + '1900-01-02T00:00:00.005']), + 'seconds since 1900-01-01 00:00:00'), + (pd.to_datetime(['NaT', '1900-01-01']), + 'days since 1900-01-01 00:00:00'), + (pd.to_datetime(['NaT']), + 'days since 1970-01-01 00:00:00'), + ]: + self.assertEqual( + expected, coding.times.infer_datetime_units(dates)) + + def test_cf_timedelta(self): + examples = [ + ('1D', 'days', np.int64(1)), + (['1D', '2D', '3D'], 'days', np.array([1, 2, 3], 'int64')), + ('1h', 'hours', np.int64(1)), + ('1ms', 'milliseconds', np.int64(1)), + ('1us', 'microseconds', np.int64(1)), + (['NaT', '0s', '1s'], None, [np.nan, 0, 1]), + (['30m', '60m'], 'hours', [0.5, 1.0]), + (np.timedelta64('NaT', 'ns'), 'days', np.nan), + (['NaT', 'NaT'], 'days', [np.nan, np.nan]), + ] + + for timedeltas, units, numbers in examples: + timedeltas = pd.to_timedelta(timedeltas, box=False) + numbers = np.array(numbers) + + expected = numbers + actual, _ = coding.times.encode_cf_timedelta(timedeltas, units) + self.assertArrayEqual(expected, actual) + self.assertEqual(expected.dtype, actual.dtype) + + if units is not None: + expected = timedeltas + actual = coding.times.decode_cf_timedelta(numbers, units) + self.assertArrayEqual(expected, actual) + self.assertEqual(expected.dtype, actual.dtype) + + expected = np.timedelta64('NaT', 'ns') + actual = coding.times.decode_cf_timedelta(np.array(np.nan), 'days') + self.assertArrayEqual(expected, actual) + + def test_cf_timedelta_2d(self): + timedeltas = ['1D', '2D', '3D'] + units = 'days' + numbers = np.atleast_2d([1, 2, 3]) + + timedeltas = np.atleast_2d(pd.to_timedelta(timedeltas, box=False)) + expected = timedeltas + + actual = coding.times.decode_cf_timedelta(numbers, units) + self.assertArrayEqual(expected, actual) + self.assertEqual(expected.dtype, actual.dtype) + + def test_infer_timedelta_units(self): + for deltas, expected in [ + (pd.to_timedelta(['1 day', '2 days']), 'days'), + (pd.to_timedelta(['1h', '1 day 1 hour']), 'hours'), + (pd.to_timedelta(['1m', '2m', np.nan]), 'minutes'), + (pd.to_timedelta(['1m3s', '1m4s']), 'seconds')]: + self.assertEqual( + expected, coding.times.infer_timedelta_units(deltas)) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index ca88ea661c7..0d0d1efd598 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -2,16 +2,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function + import contextlib +import warnings import numpy as np import pandas as pd -import pytest -import warnings - import pytest from xarray import conventions, Variable, Dataset, open_dataset from xarray.core import utils, indexing +from xarray.testing import assert_identical from . 
import TestCase, requires_netCDF4, unittest, raises_regex, IndexerMaker from .test_backends import CFEncodedDataTest from xarray.core.pycompat import iteritems @@ -25,46 +25,6 @@ V = IndexerMaker(indexing.VectorizedIndexer) -class TestMaskedAndScaledArray(TestCase): - def test(self): - x = conventions.MaskedAndScaledArray(np.arange(3), fill_value=0) - self.assertEqual(x.dtype, np.dtype('float')) - self.assertEqual(x.shape, (3,)) - self.assertEqual(x.size, 3) - self.assertEqual(x.ndim, 1) - self.assertEqual(len(x), 3) - self.assertArrayEqual([np.nan, 1, 2], x) - - x = conventions.MaskedAndScaledArray(np.arange(3), add_offset=1) - self.assertArrayEqual(np.arange(3) + 1, x) - - x = conventions.MaskedAndScaledArray(np.arange(3), scale_factor=2) - self.assertArrayEqual(2 * np.arange(3), x) - - x = conventions.MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), - -99, 0.01, 1) - expected = np.array([np.nan, 0.99, 1, 1.01, 1.02]) - self.assertArrayEqual(expected, x) - - def test_0d(self): - x = conventions.MaskedAndScaledArray(np.array(0), fill_value=0) - self.assertTrue(np.isnan(x)) - self.assertTrue(np.isnan(x[B[()]])) - - x = conventions.MaskedAndScaledArray(np.array(0), fill_value=10) - self.assertEqual(0, x[B[()]]) - - def test_multiple_fill_value(self): - x = conventions.MaskedAndScaledArray( - np.arange(4), fill_value=np.array([0, 1])) - self.assertArrayEqual([np.nan, np.nan, 2, 3], x) - - x = conventions.MaskedAndScaledArray( - np.array(0), fill_value=np.array([0, 1])) - self.assertTrue(np.isnan(x)) - self.assertTrue(np.isnan(x[B[()]])) - - class TestStackedBytesArray(TestCase): def test_wrapper_class(self): array = np.array([[b'a', b'b', b'c'], [b'd', b'e', b'f']], dtype='S') @@ -176,15 +136,6 @@ def test_decode_bytes_array(self): np.testing.assert_array_equal(actual, expected) -class TestUnsignedIntTypeArray(TestCase): - def test_unsignedinttype_array(self): - sb = np.asarray([0, 1, 127, -128, -1], dtype='i1') - ub = conventions.UnsignedIntTypeArray(sb) - self.assertEqual(ub.dtype, np.dtype('u1')) - self.assertArrayEqual(ub, np.array([0, 1, 127, 128, 255], - dtype=np.dtype('u1'))) - - class TestBoolTypeArray(TestCase): def test_booltype_array(self): x = np.array([1, 0, 1, 1, 0], dtype='i1') @@ -194,397 +145,6 @@ def test_booltype_array(self): dtype=np.bool)) -@np.vectorize -def _ensure_naive_tz(dt): - if hasattr(dt, 'tzinfo'): - return dt.replace(tzinfo=None) - else: - return dt - - -class TestDatetime(TestCase): - @requires_netCDF4 - def test_cf_datetime(self): - import netCDF4 as nc4 - for num_dates, units in [ - (np.arange(10), 'days since 2000-01-01'), - (np.arange(10).reshape(2, 5), 'days since 2000-01-01'), - (12300 + np.arange(5), 'hours since 1680-01-01 00:00:00'), - # here we add a couple minor formatting errors to test - # the robustness of the parsing algorithm. 
- (12300 + np.arange(5), 'hour since 1680-01-01 00:00:00'), - (12300 + np.arange(5), u'Hour since 1680-01-01 00:00:00'), - (12300 + np.arange(5), ' Hour since 1680-01-01 00:00:00 '), - (10, 'days since 2000-01-01'), - ([10], 'daYs since 2000-01-01'), - ([[10]], 'days since 2000-01-01'), - ([10, 10], 'days since 2000-01-01'), - (np.array(10), 'days since 2000-01-01'), - (0, 'days since 1000-01-01'), - ([0], 'days since 1000-01-01'), - ([[0]], 'days since 1000-01-01'), - (np.arange(2), 'days since 1000-01-01'), - (np.arange(0, 100000, 20000), 'days since 1900-01-01'), - (17093352.0, 'hours since 1-1-1 00:00:0.0'), - ([0.5, 1.5], 'hours since 1900-01-01T00:00:00'), - (0, 'milliseconds since 2000-01-01T00:00:00'), - (0, 'microseconds since 2000-01-01T00:00:00'), - ]: - for calendar in ['standard', 'gregorian', 'proleptic_gregorian']: - expected = _ensure_naive_tz(nc4.num2date(num_dates, units, calendar)) - print(num_dates, units, calendar) - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', - 'Unable to decode time axis') - actual = conventions.decode_cf_datetime(num_dates, units, - calendar) - if (isinstance(actual, np.ndarray) and - np.issubdtype(actual.dtype, np.datetime64)): - # self.assertEqual(actual.dtype.kind, 'M') - # For some reason, numpy 1.8 does not compare ns precision - # datetime64 arrays as equal to arrays of datetime objects, - # but it works for us precision. Thus, convert to us - # precision for the actual array equal comparison... - actual_cmp = actual.astype('M8[us]') - else: - actual_cmp = actual - self.assertArrayEqual(expected, actual_cmp) - encoded, _, _ = conventions.encode_cf_datetime(actual, units, - calendar) - if '1-1-1' not in units: - # pandas parses this date very strangely, so the original - # units/encoding cannot be preserved in this case: - # (Pdb) pd.to_datetime('1-1-1 00:00:0.0') - # Timestamp('2001-01-01 00:00:00') - self.assertArrayEqual(num_dates, np.around(encoded, 1)) - if (hasattr(num_dates, 'ndim') and num_dates.ndim == 1 and - '1000' not in units): - # verify that wrapping with a pandas.Index works - # note that it *does not* currently work to even put - # non-datetime64 compatible dates into a pandas.Index :( - encoded, _, _ = conventions.encode_cf_datetime( - pd.Index(actual), units, calendar) - self.assertArrayEqual(num_dates, np.around(encoded, 1)) - - @requires_netCDF4 - def test_decode_cf_datetime_overflow(self): - # checks for - # https://github.com/pydata/pandas/issues/14068 - # https://github.com/pydata/xarray/issues/975 - - from datetime import datetime - units = 'days since 2000-01-01 00:00:00' - - # date after 2262 and before 1678 - days = (-117608, 95795) - expected = (datetime(1677, 12, 31), datetime(2262, 4, 12)) - - for i, day in enumerate(days): - result = conventions.decode_cf_datetime(day, units) - self.assertEqual(result, expected[i]) - - @requires_netCDF4 - def test_decode_cf_datetime_transition_to_invalid(self): - # manually create dataset with not-decoded date - from datetime import datetime - ds = Dataset(coords={'time': [0, 266 * 365]}) - units = 'days since 2000-01-01 00:00:00' - ds.time.attrs = dict(units=units) - ds_decoded = conventions.decode_cf(ds) - - expected = [datetime(2000, 1, 1, 0, 0), - datetime(2265, 10, 28, 0, 0)] - - self.assertArrayEqual(ds_decoded.time.values, expected) - - def test_decoded_cf_datetime_array(self): - actual = conventions.DecodedCFDatetimeArray( - np.array([0, 1, 2]), 'days since 1900-01-01', 'standard') - expected = pd.date_range('1900-01-01', periods=3).values - 
self.assertEqual(actual.dtype, np.dtype('datetime64[ns]')) - self.assertArrayEqual(actual, expected) - - # default calendar - actual = conventions.DecodedCFDatetimeArray( - np.array([0, 1, 2]), 'days since 1900-01-01') - self.assertEqual(actual.dtype, np.dtype('datetime64[ns]')) - self.assertArrayEqual(actual, expected) - - def test_slice_decoded_cf_datetime_array(self): - actual = conventions.DecodedCFDatetimeArray( - np.array([0, 1, 2]), 'days since 1900-01-01', 'standard') - expected = pd.date_range('1900-01-01', periods=3).values - self.assertEqual(actual.dtype, np.dtype('datetime64[ns]')) - self.assertArrayEqual(actual[B[0:2]], expected[slice(0, 2)]) - - actual = conventions.DecodedCFDatetimeArray( - np.array([0, 1, 2]), 'days since 1900-01-01', 'standard') - expected = pd.date_range('1900-01-01', periods=3).values - self.assertEqual(actual.dtype, np.dtype('datetime64[ns]')) - self.assertArrayEqual(actual[O[np.array([0, 2])]], expected[[0, 2]]) - - def test_decode_cf_datetime_non_standard_units(self): - expected = pd.date_range(periods=100, start='1970-01-01', freq='h') - # netCDFs from madis.noaa.gov use this format for their time units - # they cannot be parsed by netcdftime, but pd.Timestamp works - units = 'hours since 1-1-1970' - actual = conventions.decode_cf_datetime(np.arange(100), units) - self.assertArrayEqual(actual, expected) - - def test_decode_cf_with_conflicting_fill_missing_value(self): - var = Variable(['t'], np.arange(10), - {'units': 'foobar', - 'missing_value': 0, - '_FillValue': 1}) - with raises_regex(ValueError, "_FillValue and missing_value"): - conventions.decode_cf_variable('t', var) - - var = Variable(['t'], np.arange(10), - {'units': 'foobar', - 'missing_value': np.nan, - '_FillValue': np.nan}) - var = conventions.decode_cf_variable('t', var) - self.assertIsNotNone(var) - - var = Variable(['t'], np.arange(10), - {'units': 'foobar', - 'missing_value': np.float32(np.nan), - '_FillValue': np.float32(np.nan)}) - var = conventions.decode_cf_variable('t', var) - self.assertIsNotNone(var) - - @requires_netCDF4 - def test_decode_cf_datetime_non_iso_strings(self): - # datetime strings that are _almost_ ISO compliant but not quite, - # but which netCDF4.num2date can still parse correctly - expected = pd.date_range(periods=100, start='2000-01-01', freq='h') - cases = [(np.arange(100), 'hours since 2000-01-01 0'), - (np.arange(100), 'hours since 2000-1-1 0'), - (np.arange(100), 'hours since 2000-01-01 0:00')] - for num_dates, units in cases: - actual = conventions.decode_cf_datetime(num_dates, units) - self.assertArrayEqual(actual, expected) - - @requires_netCDF4 - def test_decode_non_standard_calendar(self): - import netCDF4 as nc4 - - for calendar in ['noleap', '365_day', '360_day', 'julian', 'all_leap', - '366_day']: - units = 'days since 0001-01-01' - times = pd.date_range('2001-04-01-00', end='2001-04-30-23', - freq='H') - noleap_time = nc4.date2num(times.to_pydatetime(), units, - calendar=calendar) - expected = times.values - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', 'Unable to decode time axis') - actual = conventions.decode_cf_datetime(noleap_time, units, - calendar=calendar) - self.assertEqual(actual.dtype, np.dtype('M8[ns]')) - abs_diff = abs(actual - expected) - # once we no longer support versions of netCDF4 older than 1.1.5, - # we could do this check with near microsecond accuracy: - # https://github.com/Unidata/netcdf4-python/issues/355 - self.assertTrue((abs_diff <= np.timedelta64(1, 's')).all()) - - @requires_netCDF4 - def 
test_decode_non_standard_calendar_single_element(self): - units = 'days since 0001-01-01' - for calendar in ['noleap', '365_day', '360_day', 'julian', 'all_leap', - '366_day']: - for num_time in [735368, [735368], [[735368]]]: - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', - 'Unable to decode time axis') - actual = conventions.decode_cf_datetime(num_time, units, - calendar=calendar) - self.assertEqual(actual.dtype, np.dtype('M8[ns]')) - - @requires_netCDF4 - def test_decode_non_standard_calendar_single_element_fallback(self): - import netCDF4 as nc4 - - units = 'days since 0001-01-01' - dt = nc4.netcdftime.datetime(2001, 2, 29) - for calendar in ['360_day', 'all_leap', '366_day']: - num_time = nc4.date2num(dt, units, calendar) - with self.assertWarns('Unable to decode time axis'): - actual = conventions.decode_cf_datetime(num_time, units, - calendar=calendar) - expected = np.asarray(nc4.num2date(num_time, units, calendar)) - print(num_time, calendar, actual, expected) - self.assertEqual(actual.dtype, np.dtype('O')) - self.assertEqual(expected, actual) - - @requires_netCDF4 - def test_decode_non_standard_calendar_multidim_time(self): - import netCDF4 as nc4 - - calendar = 'noleap' - units = 'days since 0001-01-01' - times1 = pd.date_range('2001-04-01', end='2001-04-05', freq='D') - times2 = pd.date_range('2001-05-01', end='2001-05-05', freq='D') - noleap_time1 = nc4.date2num(times1.to_pydatetime(), units, - calendar=calendar) - noleap_time2 = nc4.date2num(times2.to_pydatetime(), units, - calendar=calendar) - mdim_time = np.empty((len(noleap_time1), 2), ) - mdim_time[:, 0] = noleap_time1 - mdim_time[:, 1] = noleap_time2 - - expected1 = times1.values - expected2 = times2.values - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', 'Unable to decode time axis') - actual = conventions.decode_cf_datetime(mdim_time, units, - calendar=calendar) - self.assertEqual(actual.dtype, np.dtype('M8[ns]')) - self.assertArrayEqual(actual[:, 0], expected1) - self.assertArrayEqual(actual[:, 1], expected2) - - @requires_netCDF4 - def test_decode_non_standard_calendar_fallback(self): - import netCDF4 as nc4 - # ensure leap year doesn't matter - for year in [2010, 2011, 2012, 2013, 2014]: - for calendar in ['360_day', '366_day', 'all_leap']: - calendar = '360_day' - units = 'days since {0}-01-01'.format(year) - num_times = np.arange(100) - expected = nc4.num2date(num_times, units, calendar) - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter('always') - actual = conventions.decode_cf_datetime(num_times, units, - calendar=calendar) - self.assertEqual(len(w), 1) - self.assertIn('Unable to decode time axis', - str(w[0].message)) - - self.assertEqual(actual.dtype, np.dtype('O')) - self.assertArrayEqual(actual, expected) - - @requires_netCDF4 - def test_cf_datetime_nan(self): - for num_dates, units, expected_list in [ - ([np.nan], 'days since 2000-01-01', ['NaT']), - ([np.nan, 0], 'days since 2000-01-01', - ['NaT', '2000-01-01T00:00:00Z']), - ([np.nan, 0, 1], 'days since 2000-01-01', - ['NaT', '2000-01-01T00:00:00Z', '2000-01-02T00:00:00Z']), - ]: - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', 'All-NaN') - actual = conventions.decode_cf_datetime(num_dates, units) - expected = np.array(expected_list, dtype='datetime64[ns]') - self.assertArrayEqual(expected, actual) - - @requires_netCDF4 - def test_decoded_cf_datetime_array_2d(self): - # regression test for GH1229 - array = conventions.DecodedCFDatetimeArray(np.array([[0, 1], [2, 3]]), 
- 'days since 2000-01-01') - assert array.dtype == 'datetime64[ns]' - expected = pd.date_range('2000-01-01', periods=4).values.reshape(2, 2) - self.assertArrayEqual(np.asarray(array), expected) - - def test_infer_datetime_units(self): - for dates, expected in [(pd.date_range('1900-01-01', periods=5), - 'days since 1900-01-01 00:00:00'), - (pd.date_range('1900-01-01 12:00:00', freq='H', - periods=2), - 'hours since 1900-01-01 12:00:00'), - (['1900-01-01', '1900-01-02', - '1900-01-02 00:00:01'], - 'seconds since 1900-01-01 00:00:00'), - (pd.to_datetime(['1900-01-01', '1900-01-02', 'NaT']), - 'days since 1900-01-01 00:00:00'), - (pd.to_datetime(['1900-01-01', - '1900-01-02T00:00:00.005']), - 'seconds since 1900-01-01 00:00:00'), - (pd.to_datetime(['NaT', '1900-01-01']), - 'days since 1900-01-01 00:00:00'), - (pd.to_datetime(['NaT']), - 'days since 1970-01-01 00:00:00'), - ]: - self.assertEqual(expected, conventions.infer_datetime_units(dates)) - - def test_cf_timedelta(self): - examples = [ - ('1D', 'days', np.int64(1)), - (['1D', '2D', '3D'], 'days', np.array([1, 2, 3], 'int64')), - ('1h', 'hours', np.int64(1)), - ('1ms', 'milliseconds', np.int64(1)), - ('1us', 'microseconds', np.int64(1)), - (['NaT', '0s', '1s'], None, [np.nan, 0, 1]), - (['30m', '60m'], 'hours', [0.5, 1.0]), - (np.timedelta64('NaT', 'ns'), 'days', np.nan), - (['NaT', 'NaT'], 'days', [np.nan, np.nan]), - ] - - for timedeltas, units, numbers in examples: - timedeltas = pd.to_timedelta(timedeltas, box=False) - numbers = np.array(numbers) - - expected = numbers - actual, _ = conventions.encode_cf_timedelta(timedeltas, units) - self.assertArrayEqual(expected, actual) - self.assertEqual(expected.dtype, actual.dtype) - - if units is not None: - expected = timedeltas - actual = conventions.decode_cf_timedelta(numbers, units) - self.assertArrayEqual(expected, actual) - self.assertEqual(expected.dtype, actual.dtype) - - expected = np.timedelta64('NaT', 'ns') - actual = conventions.decode_cf_timedelta(np.array(np.nan), 'days') - self.assertArrayEqual(expected, actual) - - def test_cf_timedelta_2d(self): - timedeltas, units, numbers = ['1D', '2D', '3D'], 'days', np.atleast_2d([1, 2, 3]) - - timedeltas = np.atleast_2d(pd.to_timedelta(timedeltas, box=False)) - expected = timedeltas - - actual = conventions.decode_cf_timedelta(numbers, units) - self.assertArrayEqual(expected, actual) - self.assertEqual(expected.dtype, actual.dtype) - - def test_infer_timedelta_units(self): - for deltas, expected in [ - (pd.to_timedelta(['1 day', '2 days']), 'days'), - (pd.to_timedelta(['1h', '1 day 1 hour']), 'hours'), - (pd.to_timedelta(['1m', '2m', np.nan]), 'minutes'), - (pd.to_timedelta(['1m3s', '1m4s']), 'seconds')]: - self.assertEqual(expected, conventions.infer_timedelta_units(deltas)) - - def test_invalid_units_raises_eagerly(self): - ds = Dataset({'time': ('time', [0, 1], {'units': 'foobar since 123'})}) - with raises_regex(ValueError, 'unable to decode time'): - decode_cf(ds) - - @requires_netCDF4 - def test_dataset_repr_with_netcdf4_datetimes(self): - # regression test for #347 - attrs = {'units': 'days since 0001-01-01', 'calendar': 'noleap'} - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', 'unable to decode time') - ds = decode_cf(Dataset({'time': ('time', [0, 1], attrs)})) - self.assertIn('(time) object', repr(ds)) - - attrs = {'units': 'days since 1900-01-01'} - ds = decode_cf(Dataset({'time': ('time', [0, 1], attrs)})) - self.assertIn('(time) datetime64[ns]', repr(ds)) - - # this should not throw a warning (GH1111) - 
with warnings.catch_warnings(): - warnings.filterwarnings('error') - conventions.DecodedCFDatetimeArray(np.asarray([722624]), - "days since 0001-01-01") - - class TestNativeEndiannessArray(TestCase): def test(self): x = np.arange(5, dtype='>i8') @@ -595,6 +155,31 @@ def test(self): self.assertArrayEqual(a, expected) +def test_decode_cf_with_conflicting_fill_missing_value(): + var = Variable(['t'], np.arange(10), + {'units': 'foobar', + 'missing_value': 0, + '_FillValue': 1}) + with raises_regex(ValueError, "_FillValue and missing_value"): + conventions.decode_cf_variable('t', var) + + expected = Variable(['t'], np.arange(10), {'units': 'foobar'}) + + var = Variable(['t'], np.arange(10), + {'units': 'foobar', + 'missing_value': np.nan, + '_FillValue': np.nan}) + actual = conventions.decode_cf_variable('t', var) + assert_identical(actual, expected) + + var = Variable(['t'], np.arange(10), + {'units': 'foobar', + 'missing_value': np.float32(np.nan), + '_FillValue': np.float32(np.nan)}) + actual = conventions.decode_cf_variable('t', var) + assert_identical(actual, expected) + + @requires_netCDF4 class TestEncodeCFVariable(TestCase): def test_incompatible_attributes(self): @@ -675,6 +260,38 @@ def test_decode_cf_with_drop_variables(self): self.assertDatasetIdentical(expected, actual) self.assertDatasetIdentical(expected, actual2) + def test_invalid_time_units_raises_eagerly(self): + ds = Dataset({'time': ('time', [0, 1], {'units': 'foobar since 123'})}) + with raises_regex(ValueError, 'unable to decode time'): + decode_cf(ds) + + @requires_netCDF4 + def test_dataset_repr_with_netcdf4_datetimes(self): + # regression test for #347 + attrs = {'units': 'days since 0001-01-01', 'calendar': 'noleap'} + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', 'unable to decode time') + ds = decode_cf(Dataset({'time': ('time', [0, 1], attrs)})) + self.assertIn('(time) object', repr(ds)) + + attrs = {'units': 'days since 1900-01-01'} + ds = decode_cf(Dataset({'time': ('time', [0, 1], attrs)})) + self.assertIn('(time) datetime64[ns]', repr(ds)) + + @requires_netCDF4 + def test_decode_cf_datetime_transition_to_invalid(self): + # manually create dataset with not-decoded date + from datetime import datetime + ds = Dataset(coords={'time': [0, 266 * 365]}) + units = 'days since 2000-01-01 00:00:00' + ds.time.attrs = dict(units=units) + ds_decoded = conventions.decode_cf(ds) + + expected = [datetime(2000, 1, 1, 0, 0), + datetime(2265, 10, 28, 0, 0)] + + self.assertArrayEqual(ds_decoded.time.values, expected) + class CFEncodedInMemoryStore(WritableCFDataStore, InMemoryDataStore): pass diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index fd4244c4f9f..eb37cbe2b26 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -13,9 +13,9 @@ from xarray import (align, broadcast, Dataset, DataArray, IndexVariable, Variable) +from xarray.coding.times import CFDatetimeCoder from xarray.core.pycompat import iteritems, OrderedDict from xarray.core.common import full_like -from xarray.conventions import maybe_encode_datetime from xarray.tests import ( TestCase, ReturnItem, source_ndarray, unittest, requires_dask, assert_identical, assert_equal, assert_allclose, assert_array_equal, @@ -2891,7 +2891,7 @@ def test_to_and_from_iris(self): original_coord = original.coords[orginal_key] self.assertEqual(coord.var_name, original_coord.name) self.assertArrayEqual(coord.points, - maybe_encode_datetime(original_coord).values) + 
CFDatetimeCoder().encode(original_coord).values)
             self.assertEqual(actual.coord_dims(coord), original.get_axis_num
                              (original.coords[coord.var_name].dims))
 
@@ -2963,7 +2963,7 @@ def test_to_and_from_iris_dask(self):
             original_coord = original.coords[orginal_key]
             self.assertEqual(coord.var_name, original_coord.name)
             self.assertArrayEqual(coord.points,
-                              maybe_encode_datetime(original_coord).values)
+                              CFDatetimeCoder().encode(original_coord).values)
             self.assertEqual(actual.coord_dims(coord), original.get_axis_num
                              (original.coords[coord.var_name].dims))

From 61de0ceabc258f8036bdf387036785f1205740e2 Mon Sep 17 00:00:00 2001
From: Stephan Hoyer 
Date: Sat, 30 Dec 2017 16:26:28 -0800
Subject: [PATCH 2/5] Fix zarr and cdms2 export

---
 xarray/backends/zarr.py       | 15 ++++++++-------
 xarray/convert.py             |  6 +++---
 xarray/tests/test_backends.py |  2 --
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index 779d8d07886..081cf842a11 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 
+from .. import coding
 from .. import Variable
 from ..core import indexing
 from ..core.utils import FrozenOrderedDict, HiddenKeyDict
@@ -259,13 +260,13 @@ def encode_zarr_variable(var, needs_copy=True, name=None):
         raise NotImplementedError("Variable `%s` is an object. Zarr "
                                   "store can't yet encode objects." % name)
 
-    var = conventions.maybe_encode_datetime(var, name=name)
-    var = conventions.maybe_encode_timedelta(var, name=name)
-    var, needs_copy = conventions.maybe_encode_offset_and_scale(var,
-                                                                needs_copy,
-                                                                name=name)
-    var, needs_copy = conventions.maybe_encode_fill_value(var, needs_copy,
-                                                          name=name)
+    for coder in [coding.times.CFDatetimeCoder(),
+                  coding.times.CFTimedeltaCoder(),
+                  coding.variables.CFScaleOffsetCoder(),
+                  coding.variables.CFMaskCoder(),
+                  coding.variables.UnsignedCoder()]:
+        var = coder.encode(var, name=name)
+
     var = conventions.maybe_encode_nonstring_dtype(var, name=name)
     var = conventions.maybe_default_fill_value(var)
     var = conventions.maybe_encode_bools(var)
diff --git a/xarray/convert.py b/xarray/convert.py
index 446bd5a0d35..caf665b421d 100644
--- a/xarray/convert.py
+++ b/xarray/convert.py
@@ -6,11 +6,11 @@
 
 import numpy as np
 
+from .coding.times import CFDatetimeCoder, CFTimedeltaCoder
 from .core.dataarray import DataArray
 from .core.pycompat import OrderedDict, range
 from .core.dtypes import get_fill_value
-from .conventions import (
-    maybe_encode_timedelta, maybe_encode_datetime, decode_cf)
+from .conventions import decode_cf
 
 cdms2_ignored_attrs = {'name', 'tileIndex'}
 iris_forbidden_keys = {'standard_name', 'long_name', 'units', 'bounds', 'axis',
@@ -25,7 +25,7 @@
 
 
 def encode(var):
-    return maybe_encode_timedelta(maybe_encode_datetime(var.variable))
+    return CFTimedeltaCoder().encode(CFDatetimeCoder().encode(var.variable))
 
 
 def _filter_attrs(attrs, ignored_attrs):
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 58e23943b77..6b0cd59eb9e 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -530,7 +530,6 @@ def test_roundtrip_string_with_fill_value_nchar(self):
     def test_unsigned_roundtrip_mask_and_scale(self):
         decoded = create_unsigned_masked_scaled_data()
         encoded = create_encoded_unsigned_masked_scaled_data()
-        print('ORIGINAL', encoded, encoded.x.attrs, encoded.x.encoding)
        with self.roundtrip(decoded) as actual:
             for k in decoded.variables:
                 self.assertEqual(decoded.variables[k].dtype,
@@ -550,7 +549,6 @@ def 
test_unsigned_roundtrip_mask_and_scale(self): self.assertDatasetAllClose(encoded, actual, decode_bytes=False) # make sure roundtrip encoding didn't change the # original dataset. - print('NOW', encoded, encoded.x.attrs, encoded.x.encoding) self.assertDatasetAllClose( encoded, create_encoded_unsigned_masked_scaled_data()) with self.roundtrip(encoded) as actual: From f23dfe4fe9091b07516f2c55c8185684ca0258d8 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 10 Jan 2018 08:30:12 -0800 Subject: [PATCH 3/5] add whats-new and small cleanup --- doc/whats-new.rst | 3 ++- xarray/backends/zarr.py | 2 +- xarray/coding/variables.py | 2 +- xarray/conventions.py | 9 +++------ 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 17ff3ab661c..942625afd22 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -61,7 +61,8 @@ Bug fixes with size one in some dimension can now be plotted, which is good for exploring satellite imagery. (:issue:`1780`) By `Zac Hatfield-Dodds `_. - +- Fixed ``UnboundLocalError`` when opening netCDF file `` (:issue:`1781`). + By `Stephan Hoyer `_. .. _whats-new.0.10.0: diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 081cf842a11..dd794c3b947 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -264,7 +264,7 @@ def encode_zarr_variable(var, needs_copy=True, name=None): coding.times.CFTimedeltaCoder(), coding.variables.CFScaleOffsetCoder(), coding.variables.CFMaskCoder(), - coding.variables.UnsignedCoder()]: + coding.variables.UnsignedIntegerCoder()]: var = coder.encode(var, name=name) var = conventions.maybe_encode_nonstring_dtype(var, name=name) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index d7fc60e0319..bf2ded8b562 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -240,7 +240,7 @@ def decode(self, variable, name=None): return Variable(dims, data, attrs, encoding) -class UnsignedCoder(VariableCoder): +class UnsignedIntegerCoder(VariableCoder): def encode(self, variable, name=None): dims, data, attrs, encoding = unpack_for_encoding(variable) diff --git a/xarray/conventions.py b/xarray/conventions.py index e29fa358009..ef80622b60f 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -11,15 +11,12 @@ from .coding import times from .coding import variables +from .coding.variables import SerializationWarning from .core import duck_array_ops, indexing from .core.pycompat import OrderedDict, basestring, iteritems from .core.variable import IndexVariable, Variable, as_variable -class SerializationWarning(RuntimeWarning): - """Warnings about encoding/decoding issues in serialization.""" - - class StackedBytesArray(indexing.ExplicitlyIndexedNDArrayMixin): """Wrapper around array-like objects to create a new indexable object where values, when accessed, are automatically stacked along the last dimension. 
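
Note for reviewers: each VariableCoder is intended to be an invertible
encode/decode pair. Serialization applies encode in the listed order on
write and undoes it with decode in the reverse order on read, as the
encode_cf_variable and decode_cf_variable hunks below show for the
scale/offset, mask and unsigned coders. The following is a minimal
standalone sketch of that contract, with plain NumPy arrays standing in
for xarray.Variable; every name in it is illustrative, not the real
xarray API:

    import numpy as np

    class ScaleOffsetCoder(object):
        """Pack floats into raw units on encode; unpack on decode."""

        def __init__(self, scale_factor, add_offset):
            self.scale_factor = scale_factor
            self.add_offset = add_offset

        def encode(self, data):
            return (data - self.add_offset) / self.scale_factor

        def decode(self, data):
            return data * self.scale_factor + self.add_offset

    class MaskCoder(object):
        """Replace NaN with a fill value on encode; restore NaN on decode."""

        def __init__(self, fill_value):
            self.fill_value = fill_value

        def encode(self, data):
            return np.where(np.isnan(data), self.fill_value, data)

        def decode(self, data):
            return np.where(data == self.fill_value, np.nan, data)

    def encode_pipeline(data, coders):
        for coder in coders:  # apply in declaration order on write
            data = coder.encode(data)
        return data

    def decode_pipeline(data, coders):
        for coder in reversed(coders):  # invert in reverse order on read
            data = coder.decode(data)
        return data

    coders = [ScaleOffsetCoder(scale_factor=0.1, add_offset=5.0),
              MaskCoder(fill_value=-1.0)]
    original = np.array([5.0, 5.1, np.nan])
    roundtripped = decode_pipeline(encode_pipeline(original, coders), coders)
    np.testing.assert_allclose(original, roundtripped)

The reverse order on decode is what keeps the pipeline consistent: the
fill value must be stripped before undoing scale/offset, or missing data
would be rescaled into plausible-looking numbers.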
@@ -392,7 +389,7 @@ def encode_cf_variable(var, needs_copy=True, name=None): times.CFTimedeltaCoder(), variables.CFScaleOffsetCoder(), variables.CFMaskCoder(), - variables.UnsignedCoder()]: + variables.UnsignedIntegerCoder()]: var = coder.encode(var, name=name) # TODO(shoyer): convert all of these to use coders, too: @@ -464,7 +461,7 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True, var = Variable(dimensions, data, attributes, encoding) if mask_and_scale: - for coder in [variables.UnsignedCoder(), + for coder in [variables.UnsignedIntegerCoder(), variables.CFMaskCoder(), variables.CFScaleOffsetCoder()]: var = coder.decode(var, name=name) From 1b24ef7b1b7d21282b8b2b7769218c7d14a8ebd0 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 10 Jan 2018 19:51:24 -0800 Subject: [PATCH 4/5] Move constant to top of module --- xarray/coding/times.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 62f79934f72..ef73ba8abf3 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -36,6 +36,9 @@ 'h': 1e9 * 60 * 60, 'D': 1e9 * 60 * 60 * 24} +TIME_UNITS = frozenset(['days', 'hours', 'minutes', 'seconds', + 'milliseconds', 'microseconds']) + def _netcdf_to_numpy_timeunit(units): units = units.lower() @@ -176,10 +179,6 @@ def decode_cf_timedelta(num_timedeltas, units): return result.reshape(shape) -TIME_UNITS = frozenset(['days', 'hours', 'minutes', 'seconds', - 'milliseconds', 'microseconds']) - - def _infer_time_units_from_diff(unique_timedeltas): for time_unit, delta in [('days', 86400), ('hours', 3600), ('minutes', 60), ('seconds', 1)]: From 67f86fc16bc62cd7df25c9334dbb167bc7367cda Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 10 Jan 2018 19:56:56 -0800 Subject: [PATCH 5/5] use _NS_PER_TIME_DELTA --- xarray/coding/times.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index ef73ba8abf3..e00769af884 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -29,12 +29,12 @@ # standard calendars recognized by netcdftime _STANDARD_CALENDARS = set(['standard', 'gregorian', 'proleptic_gregorian']) -_NS_PER_TIME_DELTA = {'us': 1e3, - 'ms': 1e6, - 's': 1e9, - 'm': 1e9 * 60, - 'h': 1e9 * 60 * 60, - 'D': 1e9 * 60 * 60 * 24} +_NS_PER_TIME_DELTA = {'us': int(1e3), + 'ms': int(1e6), + 's': int(1e9), + 'm': int(1e9) * 60, + 'h': int(1e9) * 60 * 60, + 'D': int(1e9) * 60 * 60 * 24} TIME_UNITS = frozenset(['days', 'hours', 'minutes', 'seconds', 'milliseconds', 'microseconds']) @@ -180,9 +180,9 @@ def decode_cf_timedelta(num_timedeltas, units): def _infer_time_units_from_diff(unique_timedeltas): - for time_unit, delta in [('days', 86400), ('hours', 3600), - ('minutes', 60), ('seconds', 1)]: - unit_delta = np.timedelta64(10 ** 9 * delta, 'ns') + for time_unit in ['days', 'hours', 'minutes', 'seconds']: + delta_ns = _NS_PER_TIME_DELTA[_netcdf_to_numpy_timeunit(time_unit)] + unit_delta = np.timedelta64(delta_ns, 'ns') diffs = unique_timedeltas / unit_delta if np.all(diffs == diffs.astype(int)): return time_unit
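
Note for reviewers: with the last two patches applied,
_infer_time_units_from_diff reduces to a single greedy scan, trying units
from coarsest (days) to finest (seconds) and returning the first whose
length in nanoseconds divides every timedelta exactly. Here is a rough
standalone sketch of that behavior; the nanosecond values mirror
_NS_PER_TIME_DELTA above, while the function name and the trailing
'seconds' fallback are illustrative assumptions:

    import numpy as np

    # nanoseconds per numpy time unit, as in _NS_PER_TIME_DELTA
    NS_PER_UNIT = {'D': 24 * 60 * 60 * int(1e9),
                   'h': 60 * 60 * int(1e9),
                   'm': 60 * int(1e9),
                   's': int(1e9)}
    # numpy unit -> netCDF unit name (inverse of _netcdf_to_numpy_timeunit)
    NETCDF_NAMES = {'D': 'days', 'h': 'hours', 'm': 'minutes', 's': 'seconds'}

    def infer_time_units(unique_timedeltas):
        # Pick the coarsest netCDF unit dividing all timedeltas exactly.
        deltas = np.asarray(unique_timedeltas).astype('timedelta64[ns]')
        for unit in ['D', 'h', 'm', 's']:
            unit_delta = np.timedelta64(NS_PER_UNIT[unit], 'ns')
            diffs = deltas / unit_delta  # float ratios
            if np.all(diffs == diffs.astype(int)):
                return NETCDF_NAMES[unit]
        return 'seconds'  # assumed fallback for sub-second deltas

    # 90 and 150 minutes are not whole numbers of hours, so minutes win:
    print(infer_time_units(np.array([90, 150], dtype='timedelta64[m]')))
    # -> 'minutes'

This matches the expectations in the tests above, e.g.
pd.to_timedelta(['1m3s', '1m4s']) infers 'seconds' because 63 and 64
seconds do not divide evenly into minutes.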