From 6349bdad7d12649ec7b9e5b00534c3d3ac6967e5 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 28 May 2021 16:37:58 -0400 Subject: [PATCH 01/14] skeletal API --- xarray/__init__.py | 4 +- xarray/core/computation.py | 76 +++++++++++++++++++++++++++++++++++ xarray/core/dataarray.py | 77 ++++++++++++++++++++++++++++++++++++ xarray/core/dataset.py | 81 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 236 insertions(+), 2 deletions(-) diff --git a/xarray/__init__.py b/xarray/__init__.py index 3886edc60e6..6dd0c76cf79 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -18,7 +18,7 @@ from .core.alignment import align, broadcast from .core.combine import combine_by_coords, combine_nested from .core.common import ALL_DIMS, full_like, ones_like, zeros_like -from .core.computation import apply_ufunc, corr, cov, dot, polyval, where +from .core.computation import apply_ufunc, corr, cov, dot, hist, polyval, where from .core.concat import concat from .core.dataarray import DataArray from .core.dataset import Dataset @@ -57,7 +57,7 @@ "cov", "corr", "full_like", - "infer_freq", + "hist" "infer_freq", "load_dataarray", "load_dataset", "map_blocks", diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 07b7bca27c5..fa75ba12d41 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1724,3 +1724,79 @@ def _calc_idxminmax( res.attrs = indx.attrs return res + + +def hist(*datarrays, bins=None, dim=None, weights=None, density=False): + """ + Histogram applied along specified dimensions. + + If the supplied arguments are chunked dask arrays it will use + `dask.array.blockwise` internally to parallelize over all chunks. + + Parameters + ---------- + datarrays : xarray.DataArray objects + Input data. The number of input arguments determines the dimensionality of + the histogram. For example, two arguments produce a 2D histogram. + dim : tuple of strings, optional + Dimensions over which which the histogram is computed. The default is to + compute the histogram of the flattened array. i.e. over all dimensions. + bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional + If a list, there should be one entry for each item in ``args``. + The bin specification: + + * If int, the number of bins for all arguments in ``args``. + * If array_like, the bin edges for all arguments in ``args``. + * If a list of ints, the number of bins for every argument in ``args``. + * If a list arrays, the bin edges for each argument in ``args`` + (required format for Dask inputs). + * A combination [int, array] or [array, int], where int + is the number of bins and array is the bin edges. + * If a list of DataArrays, the bins for each argument in ``args`` + The DataArrays can be multidimensional, but must not have any + dimensions shared with the `dim` argument. + + When bin edges are specified, all but the last (righthand-most) bin include + the left edge and exclude the right edge. The last bin includes both edges. + + A ``TypeError`` will be raised if ``args`` contains dask arrays and + ``bins`` are not specified explicitly as a list of arrays. + weights : array_like, optional + An array of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). If `density` is True, the weights are + normalized, so that the integral of the density over the range + remains 1. NaNs in the weights input will fill the entire bin with + NaNs. 
If there are NaNs in the weights input call ``.fillna(0.)`` + before running ``hist()``. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unit + width are chosen; it is not a probability *mass* function. + + Returns + ------- + hist : xarray.DataArray + A single dataarray which contains the values of the histogram. See + `density` and `weights` for a description of the possible semantics. + + The returned dataarray will have one additional coordinate for each + dataarray supplied, named as `var_bins`, which contains the positions + of the centres of each bin. + + Examples + -------- + + See Also + -------- + DataArray.hist + Dataset.hist + numpy.histogramdd + dask.array.blockwise + """ + + # TODO xhistogram code goes here + raise NotImplementedError diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 2ab60b894e1..71c285731d0 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4599,6 +4599,83 @@ def drop_duplicates( indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)} return self.isel(indexes) + def hist(self, dim=None, bins=None, weights=None, density=False): + """ + Histogram applied along specified dimensions. + + If the supplied arguments are chunked dask arrays it will use + `dask.array.blockwise` internally to parallelize over all chunks. + + dim : tuple of strings, optional + Dimensions over which which the histogram is computed. The default is to + compute the histogram of the flattened array. i.e. over all dimensions. + bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional + If a list, there should be one entry for each item in ``args``. + The bin specification: + + * If int, the number of bins for all arguments in ``args``. + * If array_like, the bin edges for all arguments in ``args``. + * If a list of ints, the number of bins for every argument in ``args``. + * If a list arrays, the bin edges for each argument in ``args`` + (required format for Dask inputs). + * A combination [int, array] or [array, int], where int + is the number of bins and array is the bin edges. + * If a list of DataArrays, the bins for each argument in ``args`` + The DataArrays can be multidimensional, but must not have any + dimensions shared with the `dim` argument. + + When bin edges are specified, all but the last (righthand-most) bin include + the left edge and exclude the right edge. The last bin includes both edges. + + A ``TypeError`` will be raised if ``args`` contains dask arrays and + ``bins`` are not specified explicitly as a list of arrays. + weights : array_like, optional + An array of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). If `density` is True, the weights are + normalized, so that the integral of the density over the range + remains 1. NaNs in the weights input will fill the entire bin with + NaNs. If there are NaNs in the weights input call ``.fillna(0.)`` + before running ``hist()``. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. 
Note that the sum of the + histogram values will not be equal to 1 unless bins of unit + width are chosen; it is not a probability *mass* function. + + Returns + ------- + hist : xarray.DataArray + A single dataarray which contains the values of the histogram. See + `density` and `weights` for a description of the possible semantics. + + The returned dataarray will have one additional coordinate for each + dataarray supplied, named as `var_bins`, which contains the positions + of the centres of each bin. + + Examples + -------- + + See Also + -------- + xarray.hist + DataArray.hist + numpy.histogramdd + dask.array.blockwise + """ + + from .computation import hist + + return hist( + [self], + dim=dim, + bins=bins, + weights=weights, + density=density, + ) + # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names str = utils.UncachedAccessor(StringAccessor) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1697a7c67aa..1e0854c8c84 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7627,3 +7627,84 @@ def _wrapper(Y, *coords_, **kwargs): result.attrs = self.attrs.copy() return result + + def hist(self, vars=None, dim=None, bins=None, weights=None, density=False): + """ + Histogram applied along specified dimensions. + + If the supplied arguments are chunked dask arrays it will use + `dask.array.blockwise` internally to parallelize over all chunks. + + vars : list of str + Variables on the Dataset to use as input data. The number of variables + determines the dimensionality of the histogram. For example, two + arguments produce a 2D histogram. + dim : tuple of strings, optional + Dimensions over which which the histogram is computed. The default is to + compute the histogram of the flattened array. i.e. over all dimensions. + bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional + If a list, there should be one entry for each item in ``args``. + The bin specification: + + * If int, the number of bins for all arguments in ``args``. + * If array_like, the bin edges for all arguments in ``args``. + * If a list of ints, the number of bins for every argument in ``args``. + * If a list arrays, the bin edges for each argument in ``args`` + (required format for Dask inputs). + * A combination [int, array] or [array, int], where int + is the number of bins and array is the bin edges. + * If a list of DataArrays, the bins for each argument in ``args`` + The DataArrays can be multidimensional, but must not have any + dimensions shared with the `dim` argument. + + When bin edges are specified, all but the last (righthand-most) bin include + the left edge and exclude the right edge. The last bin includes both edges. + + A ``TypeError`` will be raised if ``args`` contains dask arrays and + ``bins`` are not specified explicitly as a list of arrays. + weights : array_like, optional + An array of weights, of the same shape as `a`. Each value in + `a` only contributes its associated weight towards the bin count + (instead of 1). If `density` is True, the weights are + normalized, so that the integral of the density over the range + remains 1. NaNs in the weights input will fill the entire bin with + NaNs. If there are NaNs in the weights input call ``.fillna(0.)`` + before running ``hist()``. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. 
If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unit + width are chosen; it is not a probability *mass* function. + + Returns + ------- + hist : xarray.DataArray + A single dataarray which contains the values of the histogram. See + `density` and `weights` for a description of the possible semantics. + + The returned dataarray will have one additional coordinate for each + dataarray supplied, named as `var_bins`, which contains the positions + of the centres of each bin. + + Examples + -------- + + See Also + -------- + xarray.hist + DataArray.hist + numpy.histogramdd + dask.array.blockwise + """ + + from .computation import hist + + return hist( + [self[var] for var in vars], + dim=dim, + bins=bins, + weights=weights, + density=density, + ) From c8dff54f582d8be461056faa0a026d0ff3dbbafb Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 28 May 2021 16:38:07 -0400 Subject: [PATCH 02/14] redirected plt.hist to go via new hist function --- xarray/plot/plot.py | 56 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/xarray/plot/plot.py b/xarray/plot/plot.py index f530427562a..b13ad21db13 100644 --- a/xarray/plot/plot.py +++ b/xarray/plot/plot.py @@ -9,9 +9,8 @@ import functools import numpy as np -import pandas as pd -from .facetgrid import _easy_facetgrid +from .facetgrid import FacetGrid, _easy_facetgrid from .utils import ( _add_colorbar, _assert_valid_xy, @@ -379,6 +378,9 @@ def step(darray, *args, where="pre", drawstyle=None, ds=None, **kwargs): def hist( darray, + row=None, + col=None, + col_wrap=None, figsize=None, size=None, aspect=None, @@ -391,6 +393,9 @@ def hist( yticks=None, xlim=None, ylim=None, + bins=None, + weights=None, + density=False, **kwargs, ): """ @@ -398,12 +403,15 @@ def hist( Wraps :py:func:`matplotlib:matplotlib.pyplot.hist`. - Plots *N*-dimensional arrays by first flattening the array. + Plots *N*-dimensional arrays by first flattening the array and calculating + the histogram via :py:func:`DataArray.hist`. Parameters ---------- darray : DataArray - Can have any number of dimensions. + Can have any number of dimensions, but will be reduced to 1 dimension + by the histogram calculation before plotting (faceted plots will retain + more dimensions). figsize : tuple, optional A tuple (width, height) of the figure in inches. Mutually exclusive with ``size`` and ``ax``. @@ -416,16 +424,50 @@ def hist( ax : matplotlib axes object, optional Axes on which to plot. By default, use the current axes. Mutually exclusive with ``size`` and ``figsize``. + bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional + See :py:func:DataArray.hist + weights : array_like, optional + See :py:func:DataArray.hist + density : bool, optional + See :py:func:DataArray.hist **kwargs : optional Additional keyword arguments to :py:func:`matplotlib:matplotlib.pyplot.hist`. 
""" + + # compute the dims to count over + reduce_dims = set(darray.dims) - set([row, col]) + + # Handle facetgrids first + if row or col: + allargs = locals().copy() + allargs.update(allargs.pop("kwargs")) + allargs.pop("darray") + + g = FacetGrid( + data=darray, + col=col, + row=row, + col_wrap=col_wrap, + sharex=False, + sharey=False, + figsize=figsize, + aspect=aspect, + size=size, + subplot_kws=kwargs, + ) + + return g.map(hist, **kwargs) + ax = get_axis(figsize, size, aspect, ax) - no_nan = np.ravel(darray.values) - no_nan = no_nan[pd.notnull(no_nan)] + h = darray.hist(bins=bins, dim=reduce_dims, weights=weights, density=density) + counts = h.values + bins = h.coords[f"{darray.name}_bins"].values - primitive = ax.hist(no_nan, **kwargs) + # Use the weights kwarg to avoid recomputing histogram in matplotlib + # (see matplotlib.pyplot.hist docstring) + primitive = ax.hist(bins[:-1], weights=counts, **kwargs) ax.set_title("Histogram") ax.set_xlabel(label_from_attrs(darray)) From a595fadc34d682a6474bb3063c0f951a63c2ab56 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 29 May 2021 10:54:30 -0400 Subject: [PATCH 03/14] fixed mistake in init --- xarray/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/__init__.py b/xarray/__init__.py index 6dd0c76cf79..54e2f4ec45d 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -57,7 +57,8 @@ "cov", "corr", "full_like", - "hist" "infer_freq", + "hist", + "infer_freq", "load_dataarray", "load_dataset", "map_blocks", From 8e0b6eb27e6545a0d5238f079298a6ad0935fb22 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 29 May 2021 10:57:49 -0400 Subject: [PATCH 04/14] added hist functions to auto-generated API doc page --- doc/api.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index fbd0a75bb41..e7d1cd495ca 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -32,6 +32,7 @@ Top-level functions cov corr dot + hist polyval map_blocks show_versions @@ -180,6 +181,7 @@ Computation Dataset.map_blocks Dataset.polyfit Dataset.curvefit + Dataset.hist **Aggregation**: :py:attr:`~Dataset.all` @@ -378,6 +380,7 @@ Computation DataArray.polyfit DataArray.map_blocks DataArray.curvefit + DataArray.hist **Aggregation**: :py:attr:`~DataArray.all` From f023a2c49034aa333a8f28b4f66742498063ec6f Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 29 May 2021 12:40:00 -0400 Subject: [PATCH 05/14] added method to Weighted class --- xarray/core/computation.py | 2 +- xarray/core/dataset.py | 11 +++---- xarray/core/weighted.py | 66 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index fa75ba12d41..e08da39f95f 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1726,7 +1726,7 @@ def _calc_idxminmax( return res -def hist(*datarrays, bins=None, dim=None, weights=None, density=False): +def hist(*datarrays, dim=None, bins=None, weights=None, density=False): """ Histogram applied along specified dimensions. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1e0854c8c84..26fe013f869 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7628,17 +7628,14 @@ def _wrapper(Y, *coords_, **kwargs): return result - def hist(self, vars=None, dim=None, bins=None, weights=None, density=False): + def hist(self, dim=None, bins=None, weights=None, density=False): """ - Histogram applied along specified dimensions. 
+ Histogram applied along specified dimensions. Will create a N-dimensional + histogram, where N is the number of variables in the dataset. If the supplied arguments are chunked dask arrays it will use `dask.array.blockwise` internally to parallelize over all chunks. - vars : list of str - Variables on the Dataset to use as input data. The number of variables - determines the dimensionality of the histogram. For example, two - arguments produce a 2D histogram. dim : tuple of strings, optional Dimensions over which which the histogram is computed. The default is to compute the histogram of the flattened array. i.e. over all dimensions. @@ -7702,7 +7699,7 @@ def hist(self, vars=None, dim=None, bins=None, weights=None, density=False): from .computation import hist return hist( - [self[var] for var in vars], + [self.data_vars.values()], dim=dim, bins=bins, weights=weights, diff --git a/xarray/core/weighted.py b/xarray/core/weighted.py index e8838b07157..26645471837 100644 --- a/xarray/core/weighted.py +++ b/xarray/core/weighted.py @@ -58,6 +58,67 @@ New {cls} object with the sum of the weights over the given dimension. """ +_WEIGHTED_HIST_DOCSTRING = """ + Weighted histogram applied along specified dimensions. + + If the supplied arguments are chunked dask arrays it will use + `dask.array.blockwise` internally to parallelize over all chunks. + + Parameters + ---------- + dim : tuple of strings, optional + Dimensions over which which the histogram is computed. The default is to + compute the histogram of the flattened {cls}. i.e. over all dimensions. + bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional + If a list, there should be one entry for each item in ``args``. + The bin specification: + + * If int, the number of bins for all arguments in ``args``. + * If array_like, the bin edges for all arguments in ``args``. + * If a list of ints, the number of bins for every argument in ``args``. + * If a list arrays, the bin edges for each argument in ``args`` + (required format for Dask inputs). + * A combination [int, array] or [array, int], where int + is the number of bins and array is the bin edges. + * If a list of DataArrays, the bins for each argument in ``args`` + The DataArrays can be multidimensional, but must not have any + dimensions shared with the `dim` argument. + + When bin edges are specified, all but the last (righthand-most) bin include + the left edge and exclude the right edge. The last bin includes both edges. + + A ``TypeError`` will be raised if ``args`` contains dask arrays and + ``bins`` are not specified explicitly as a list of arrays. + density : bool, optional + If ``False``, the result will contain the number of samples in + each bin. If ``True``, the result is the value of the + probability *density* function at the bin, normalized such that + the *integral* over the range is 1. Note that the sum of the + histogram values will not be equal to 1 unless bins of unit + width are chosen; it is not a probability *mass* function. + + Returns + ------- + hist : xarray.DataArray + A xarray.DataArray object which contains the values of the histogram. See + `density` and `weights` for a description of the possible semantics. + + The returned dataarray will have one additional coordinate for each + variable supplied, named as `var_bins`, which contains the positions + of the centres of each bin. 
+ + Examples + -------- + + See Also + -------- + xarray.hist + DataArray.hist + Dataset.hist + numpy.histogramdd + dask.array.blockwise + """ + class Weighted(Generic[T_DataWithCoords]): """An object that implements weighted operations. @@ -238,6 +299,9 @@ def mean( self._weighted_mean, dim=dim, skipna=skipna, keep_attrs=keep_attrs ) + def hist(self, dim=None, bins=None, density=False): + return self.obj.hist(dim=dim, bins=bins, weights=self.weights, density=density) + def __repr__(self): """provide a nice str repr of our Weighted object""" @@ -268,6 +332,8 @@ def _inject_docstring(cls, cls_name): cls.sum_of_weights.__doc__ = _SUM_OF_WEIGHTS_DOCSTRING.format(cls=cls_name) + cls.hist.__doc__ = _WEIGHTED_HIST_DOCSTRING.format(cls=cls_name) + cls.sum.__doc__ = _WEIGHTED_REDUCE_DOCSTRING_TEMPLATE.format( cls=cls_name, fcn="sum", on_zero="0" ) From 60ed9c1ee95102740057c163abfdd5664baab0ab Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sat, 29 May 2021 13:04:05 -0400 Subject: [PATCH 06/14] keep_attrs --- xarray/core/computation.py | 9 ++++++++- xarray/core/dataarray.py | 7 ++++++- xarray/core/dataset.py | 7 ++++++- xarray/core/weighted.py | 16 +++++++++++++--- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index e08da39f95f..8c8d3085ef1 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1726,7 +1726,7 @@ def _calc_idxminmax( return res -def hist(*datarrays, dim=None, bins=None, weights=None, density=False): +def hist(*datarrays, dim=None, bins=None, weights=None, density=False, keep_attrs=None): """ Histogram applied along specified dimensions. @@ -1776,6 +1776,10 @@ def hist(*datarrays, dim=None, bins=None, weights=None, density=False): the *integral* over the range is 1. Note that the sum of the histogram values will not be equal to 1 unless bins of unit width are chosen; it is not a probability *mass* function. + keep_attrs : bool, optional + If True, the attributes (``attrs``) will be copied from the original + object to the new one. If False (default), the new object will be + returned without attributes. Returns ------- @@ -1798,5 +1802,8 @@ def hist(*datarrays, dim=None, bins=None, weights=None, density=False): dask.array.blockwise """ + if keep_attrs is None: + keep_attrs = _get_keep_attrs(default=False) + # TODO xhistogram code goes here raise NotImplementedError diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 71c285731d0..8a7df05d2fd 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4599,7 +4599,7 @@ def drop_duplicates( indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)} return self.isel(indexes) - def hist(self, dim=None, bins=None, weights=None, density=False): + def hist(self, dim=None, bins=None, weights=None, density=False, keep_attrs=None): """ Histogram applied along specified dimensions. @@ -4644,6 +4644,10 @@ def hist(self, dim=None, bins=None, weights=None, density=False): the *integral* over the range is 1. Note that the sum of the histogram values will not be equal to 1 unless bins of unit width are chosen; it is not a probability *mass* function. + keep_attrs : bool, optional + If True, the attributes (``attrs``) will be copied from the original + object to the new one. If False (default), the new object will be + returned without attributes. 
Returns ------- @@ -4674,6 +4678,7 @@ def hist(self, dim=None, bins=None, weights=None, density=False): bins=bins, weights=weights, density=density, + keep_attrs=keep_attrs, ) # this needs to be at the end, or mypy will confuse with `str` diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 26fe013f869..899a298dc4f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7628,7 +7628,7 @@ def _wrapper(Y, *coords_, **kwargs): return result - def hist(self, dim=None, bins=None, weights=None, density=False): + def hist(self, dim=None, bins=None, weights=None, density=False, keep_attrs=None): """ Histogram applied along specified dimensions. Will create a N-dimensional histogram, where N is the number of variables in the dataset. @@ -7674,6 +7674,10 @@ def hist(self, dim=None, bins=None, weights=None, density=False): the *integral* over the range is 1. Note that the sum of the histogram values will not be equal to 1 unless bins of unit width are chosen; it is not a probability *mass* function. + keep_attrs : bool, optional + If True, the attributes (``attrs``) will be copied from the original + object to the new one. If False (default), the new object will be + returned without attributes. Returns ------- @@ -7704,4 +7708,5 @@ def hist(self, dim=None, bins=None, weights=None, density=False): bins=bins, weights=weights, density=density, + keep_attrs=keep_attrs, ) diff --git a/xarray/core/weighted.py b/xarray/core/weighted.py index 26645471837..025cd2606aa 100644 --- a/xarray/core/weighted.py +++ b/xarray/core/weighted.py @@ -58,7 +58,7 @@ New {cls} object with the sum of the weights over the given dimension. """ -_WEIGHTED_HIST_DOCSTRING = """ +_WEIGHTED_HIST_DOCSTRING = """ Weighted histogram applied along specified dimensions. If the supplied arguments are chunked dask arrays it will use @@ -96,6 +96,10 @@ the *integral* over the range is 1. Note that the sum of the histogram values will not be equal to 1 unless bins of unit width are chosen; it is not a probability *mass* function. + keep_attrs : bool, optional + If True, the attributes (``attrs``) will be copied from the original + object to the new one. If False (default), the new object will be + returned without attributes. Returns ------- @@ -299,8 +303,14 @@ def mean( self._weighted_mean, dim=dim, skipna=skipna, keep_attrs=keep_attrs ) - def hist(self, dim=None, bins=None, density=False): - return self.obj.hist(dim=dim, bins=bins, weights=self.weights, density=density) + def hist(self, dim=None, bins=None, density=False, keep_attrs=None): + return self.obj.hist( + dim=dim, + bins=bins, + weights=self.weights, + density=density, + keep_attrs=keep_attrs, + ) def __repr__(self): """provide a nice str repr of our Weighted object""" From 57b2f57c446cf835b0cb070b89d8b2c3bfe8a495 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Sun, 30 May 2021 12:07:30 -0400 Subject: [PATCH 07/14] hypothesis tests for chunking patterns --- properties/test_chunking_patterns.py | 116 +++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 properties/test_chunking_patterns.py diff --git a/properties/test_chunking_patterns.py b/properties/test_chunking_patterns.py new file mode 100644 index 00000000000..eb99736f277 --- /dev/null +++ b/properties/test_chunking_patterns.py @@ -0,0 +1,116 @@ +""" +Property-based tests that dask-powered algorithms will succeed regardless of the +pattern of chunks on the input arrays. 
+""" + +import uuid + +import pytest + +pytest.importorskip("hypothesis") + +import hypothesis.strategies as st +import numpy as np +from hypothesis import given + +import xarray as xr +from xarray.tests import requires_dask + + +def example_dataarray(shape=(5, 20)): + data = np.random.randn(*shape) + dims = [f"dim_{i}" for i in range(len(shape))] + da = xr.DataArray(data, dims=dims, name="T") + return da + + +def example_dataset(n_dim=2, n_vars=2): + """Random dataset with every variable having the same shape""" + + shape = tuple(range(8, 8 + n_dim)) + dims = [f"dim_{i}" for i in range(len(shape))] + var_names = [uuid.uuid4().hex for _ in range(n_vars)] + ds = xr.Dataset() + for i in range(n_vars): + name = var_names[i] + data = np.random.randn(*shape) + da = xr.DataArray(data, dims=dims, name=name) + ds[name] = da + return ds + + +@st.composite +def chunk_shapes(draw, n_dim=3, max_arr_len=10): + """Generate different chunking patterns for an N-D array of data.""" + chunks = [] + for n in range(n_dim): + shape = draw(st.integers(min_value=1, max_value=max_arr_len)) + chunks.append(shape) + return tuple(chunks) + + +@requires_dask +class TestChunkingHypotheses: + @given(chunk_shapes(n_dim=1, max_arr_len=20)) + def test_all_chunking_patterns_1d(self, chunks): + + data = example_dataarray(shape=(20,)).chunk(chunks) + + nbins_a = 8 + bins = np.linspace(-4, 4, nbins_a + 1) + + h = xr.hist(data, bins=[bins]) + + assert h.shape == (nbins_a,) + + hist, _ = np.histogram( + data.values, + bins=bins, + ) + + np.testing.assert_allclose(hist, h) + + @pytest.mark.slow + @given(chunk_shapes(n_dim=2, max_arr_len=8)) + def test_all_chunking_patterns_2d(self, chunks): + + data_a = example_dataarray(shape=(5, 20)).chunk(chunks) + data_b = example_dataarray(shape=(5, 20)).chunk(chunks) + + nbins_a = 8 + nbins_b = 9 + bins_a = np.linspace(-4, 4, nbins_a + 1) + bins_b = np.linspace(-4, 4, nbins_b + 1) + + h = xr.hist(data_a, data_b, bins=[bins_a, bins_b]) + + assert h.shape == (nbins_a, nbins_b) + + hist, _, _ = np.histogram2d( + data_a.values.ravel(), + data_b.values.ravel(), + bins=[bins_a, bins_b], + ) + + np.testing.assert_allclose(hist, h.values) + + @pytest.mark.slow + @pytest.mark.parametrize("n_vars", [1, 2, 3, 4]) + @given(chunk_shapes(n_dim=2, max_arr_len=7)) + def test_all_chunking_patterns_dd_hist(self, n_vars, chunk_shapes): + ds = example_dataset(n_dim=2, n_vars=n_vars) + ds = ds.chunk({d: c for d, c in zip(ds.dims.keys(), chunk_shapes)}) + + n_bins = (7, 8, 9, 10)[:n_vars] + bins = [np.linspace(-4, 4, n + 1) for n in n_bins] + + h = xr.hist(*[da for name, da in ds.data_vars.items()], bins=bins) + + assert h.shape == n_bins + + input_data = np.stack( + [da.values.ravel() for name, da in ds.data_vars.items()], axis=-1 + ) + hist, _ = np.histogramdd(input_data, bins=bins) + + np.testing.assert_allclose(hist, h.values) From 2ac4f9bdcb2cc5930daa35514158f9c276e93307 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 3 Jun 2021 17:14:42 -0400 Subject: [PATCH 08/14] input checking and organising wrapper around xhistogram.core.histogram [skip-ci] --- xarray/core/computation.py | 160 ++++++++++++++++++++++++++++++++++--- 1 file changed, 147 insertions(+), 13 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 8c8d3085ef1..f64c00b0958 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -26,11 +26,11 @@ import numpy as np from . 
import dtypes, duck_array_ops, utils -from .alignment import align, deep_align +from .alignment import align, deep_align, broadcast from .merge import merge_attrs, merge_coordinates_without_align from .options import OPTIONS, _get_keep_attrs from .pycompat import is_duck_dask_array -from .utils import is_dict_like +from .utils import is_dict_like, OrderedSet from .variable import Variable if TYPE_CHECKING: @@ -1726,7 +1726,7 @@ def _calc_idxminmax( return res -def hist(*datarrays, dim=None, bins=None, weights=None, density=False, keep_attrs=None): +def hist(*dataarrays, dim=None, bins=None, weights=None, density=False, keep_attrs=None): """ Histogram applied along specified dimensions. @@ -1735,10 +1735,10 @@ def hist(*datarrays, dim=None, bins=None, weights=None, density=False, keep_attr Parameters ---------- - datarrays : xarray.DataArray objects + dataarrays : xarray.DataArray objects Input data. The number of input arguments determines the dimensionality of the histogram. For example, two arguments produce a 2D histogram. - dim : tuple of strings, optional + dim : str or tuple of strings, optional Dimensions over which which the histogram is computed. The default is to compute the histogram of the flattened array. i.e. over all dimensions. bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional @@ -1750,11 +1750,10 @@ def hist(*datarrays, dim=None, bins=None, weights=None, density=False, keep_attr * If a list of ints, the number of bins for every argument in ``args``. * If a list arrays, the bin edges for each argument in ``args`` (required format for Dask inputs). - * A combination [int, array] or [array, int], where int - is the number of bins and array is the bin edges. * If a list of DataArrays, the bins for each argument in ``args`` - The DataArrays can be multidimensional, but must not have any - dimensions shared with the `dim` argument. + The DataArrays can be multidimensional, but must contain X + and must not have any dimensions shared with the `dim` argument. + If supplied these will be present as coordinates on the output. When bin edges are specified, all but the last (righthand-most) bin include the left edge and exclude the right edge. The last bin includes both edges. @@ -1777,8 +1776,8 @@ def hist(*datarrays, dim=None, bins=None, weights=None, density=False, keep_attr histogram values will not be equal to 1 unless bins of unit width are chosen; it is not a probability *mass* function. keep_attrs : bool, optional - If True, the attributes (``attrs``) will be copied from the original - object to the new one. If False (default), the new object will be + If True, the attributes (``attrs``) will be copied from the first original + object passed to the new one. If False (default), the new object will be returned without attributes. Returns @@ -1788,9 +1787,12 @@ def hist(*datarrays, dim=None, bins=None, weights=None, density=False, keep_attr `density` and `weights` for a description of the possible semantics. The returned dataarray will have one additional coordinate for each - dataarray supplied, named as `var_bins`, which contains the positions + dataarray supplied, named as `[var]_bins`, which contains the positions of the centres of each bin. + All other coordinates will be retained, unless they depend on a dimension + which has been reduced along, in which case they will be dropped. 
+ Examples -------- @@ -1802,8 +1804,140 @@ def hist(*datarrays, dim=None, bins=None, weights=None, density=False, keep_attr dask.array.blockwise """ + # TODO range argument + + from .dataarray import DataArray + from .dataarray import Variable + + # Check inputs + if any(not isinstance(arr, DataArray) for arr in dataarrays): + raise TypeError( + "Only xr.DataArray is supported as input data, but given " + f"{[type(arr) for arr in dataarrays]}." + ) + if weights is not None: + if not isinstance(weights, (Variable, DataArray)): + raise TypeError( + "Only xr.DataArray and xr.Variable are supported as weights, " + "but given {type(weights)}." + ) + + if len(dataarrays) == 0: + raise TypeError("At least one input dataarray must be given.") + + for da in dataarrays: + if da.name is None: + raise ValueError("All input dataarrays must have a name.") + + if isinstance(dim, str): + dim = (dim,) + reduce_dims = dim + + all_input_dims = ordered_set_union([da.dims for da in dataarrays]) + broadcast_dims = OrderedSet(all_input_dims) - OrderedSet(reduce_dims) + + # create output dims + new_bin_dims = [da.name + '_bins' for da in dataarrays] + output_dims = [broadcast_dims] + new_bin_dims + + # TODO deal with arrays or lists of arrays + # TODO just check if array-like after explicitly checking if other options + # TODO might need to check if numpy arrays explicitly + # Check validity of given bins, or create using np.histogram_bin_edges + def _create_bin_coords(dataarrays, bins, range, new_bin_dims): + return [DataArray(np.histogram_bin_edges(a, b, r), dims=d, name=d, attrs=a.attrs) + for a, b, r, d in zip(dataarrays, bins, range, new_bin_dims)] + if isinstance(bins, int): + bins = _create_bin_coords(dataarrays, bins, range, new_bin_dims) + elif bins is None: + bins = _create_bin_coords(dataarrays, ['auto']*len(dataarrays), range, new_bin_dims) + elif isinstance(bins, Iterable): + if len(bins) != len(dataarrays): + raise TypeError("If bins is an Iterable then it must have same length " + "as number of input dataarrays passed, but instead has " + f"length {len(bins)}") + if all(isinstance(obj, DataArray) for obj in bins): + for da, new_bin_dim in zip(bins, new_bin_dims): + if not set(da.dims).issubset(broadcast_dims): + raise ValueError("A bins dataarray has dimensions present that " + "will not be broadcast on the output: " + f"{da.dims} vs {broadcast_dims}") + if new_bin_dim not in da.dims: + raise ValueError("A bins dataarray does not contain the " + "corresponding output bins dimension - " + f"has dims {da.dims} but not {new_bin_dim}.") + elif all(isinstance(b, (int, str)) for b in bins): + bins = _create_bin_coords(dataarrays, bins, range, new_bin_dims) + else: + TypeError("One or more of the elements in bins is not a valid argument." + f"Instead found types {[type(b) for b in bins]}") + else: + raise TypeError(f"Type {type(bins)} is not a valid argument to bins") + + # Align / broadcast all inputs (including weights and bins) + arrs = broadcast(dataarrays) + weights = weights.broadcast_like(arrs[0]) + aligned_bins = [b.broadcast_like(arrs[0]) for b in bins] + # TODO bins now already has the output dims included, is that correct? 
+ + # Compute histogram results + reduce_axes = tuple(list(all_input_dims).index(d) for d in reduce_dims) + h_data = _calc_histogram([arr.values for arr in arrs], + axis=reduce_axes, + bins=[b.values for b in aligned_bins], + weights=weights.values, + density=density) + + # Reconstruct output dataarray + output_name = "_".join(["histogram"] + [a.name for a in dataarrays]) + + # Adjust bin coords to return positions of bin centres rather than bin edges + def _find_centers(da, dim): + return 0.5 * (da.isel(dim=slice(None, None, -1)) + + da.isel(dim=slice(1, None))) + bin_centers = [_find_centers(bin, new_bin_dim) + for bin, new_bin_dim in zip(bins, new_bin_dims)] + + a0 = dataarrays[0] + all_coords = {} + new_coords = { + name: ((name,), bin_center, a.attrs) + for name, bin_center, a in zip(new_bin_dims, bin_centers, dataarrays) + } + all_coords.update(new_coords) + + # Keep all old coords from any input that have not been reduced along + old_coords = {coord.name: coord for da in dataarrays for coord in da.coords + if not any(coord.dims in reduce_dims)} + all_coords.update(old_coords) + if keep_attrs is None: keep_attrs = _get_keep_attrs(default=False) + attrs = dataarrays[0].attrs if True else None + + return DataArray(h_data, dims=output_dims, coords=all_coords, + name=output_name, attrs=attrs) + + +def _calc_histogram(*arrs, axis, bins, weights, density): + """ + Internal axis-aware histogram calculation. + Will map over dask arrays using blockwise if any are present. + Expects all numpy/dask arrays, already aligned and broadcast against one another. + """ # TODO xhistogram code goes here - raise NotImplementedError + # Use xhistogram.core.histogram + + # TODO how does xarray normally test for dask arrays? + + return ... + + +def _bincount(): + """ + Axis-aware bincounting function. + Acts on a single numpy array/dask chunk. + Works by reshaping input + """ + return ... \ No newline at end of file From 248b1b63c247dcfa47414c99aed349a74aa663cd Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Fri, 4 Jun 2021 11:35:47 -0400 Subject: [PATCH 09/14] Some tests of input argument handling --- xarray/core/computation.py | 122 ++++++++++++++++++------------- xarray/tests/test_computation.py | 39 ++++++++++ 2 files changed, 109 insertions(+), 52 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index f64c00b0958..627a11fd4d8 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -26,11 +26,11 @@ import numpy as np from . import dtypes, duck_array_ops, utils -from .alignment import align, deep_align, broadcast +from .alignment import align, broadcast, deep_align from .merge import merge_attrs, merge_coordinates_without_align from .options import OPTIONS, _get_keep_attrs from .pycompat import is_duck_dask_array -from .utils import is_dict_like, OrderedSet +from .utils import OrderedSet, is_dict_like from .variable import Variable if TYPE_CHECKING: @@ -1726,7 +1726,9 @@ def _calc_idxminmax( return res -def hist(*dataarrays, dim=None, bins=None, weights=None, density=False, keep_attrs=None): +def hist( + *dataarrays, dim=None, bins=None, weights=None, density=False, keep_attrs=None +): """ Histogram applied along specified dimensions. 
@@ -1806,8 +1808,7 @@ def hist(*dataarrays, dim=None, bins=None, weights=None, density=False, keep_att # TODO range argument - from .dataarray import DataArray - from .dataarray import Variable + from .dataarray import DataArray, Variable # Check inputs if any(not isinstance(arr, DataArray) for arr in dataarrays): @@ -1822,7 +1823,8 @@ def hist(*dataarrays, dim=None, bins=None, weights=None, density=False, keep_att "but given {type(weights)}." ) - if len(dataarrays) == 0: + n_args = len(dataarrays) + if n_args == 0: raise TypeError("At least one input dataarray must be given.") for da in dataarrays: @@ -1837,7 +1839,7 @@ def hist(*dataarrays, dim=None, bins=None, weights=None, density=False, keep_att broadcast_dims = OrderedSet(all_input_dims) - OrderedSet(reduce_dims) # create output dims - new_bin_dims = [da.name + '_bins' for da in dataarrays] + new_bin_dims = [da.name + "_bins" for da in dataarrays] output_dims = [broadcast_dims] + new_bin_dims # TODO deal with arrays or lists of arrays @@ -1845,32 +1847,44 @@ def hist(*dataarrays, dim=None, bins=None, weights=None, density=False, keep_att # TODO might need to check if numpy arrays explicitly # Check validity of given bins, or create using np.histogram_bin_edges def _create_bin_coords(dataarrays, bins, range, new_bin_dims): - return [DataArray(np.histogram_bin_edges(a, b, r), dims=d, name=d, attrs=a.attrs) - for a, b, r, d in zip(dataarrays, bins, range, new_bin_dims)] + return [ + DataArray(np.histogram_bin_edges(a, b, r), dims=d, name=d, attrs=a.attrs) + for a, b, r, d in zip(dataarrays, bins, range, new_bin_dims) + ] + + # TODO change to allow mixed types of bins arguments if isinstance(bins, int): - bins = _create_bin_coords(dataarrays, bins, range, new_bin_dims) + bins = _create_bin_coords(dataarrays, [bins] * n_args, range, new_bin_dims) elif bins is None: - bins = _create_bin_coords(dataarrays, ['auto']*len(dataarrays), range, new_bin_dims) + bins = _create_bin_coords(dataarrays, ["auto"] * n_args, range, new_bin_dims) elif isinstance(bins, Iterable): - if len(bins) != len(dataarrays): - raise TypeError("If bins is an Iterable then it must have same length " - "as number of input dataarrays passed, but instead has " - f"length {len(bins)}") + if len(bins) != n_args: + raise TypeError( + "If bins is an Iterable then it must have same length " + "as number of input dataarrays passed, but instead has " + f"length {len(bins)}" + ) if all(isinstance(obj, DataArray) for obj in bins): for da, new_bin_dim in zip(bins, new_bin_dims): if not set(da.dims).issubset(broadcast_dims): - raise ValueError("A bins dataarray has dimensions present that " - "will not be broadcast on the output: " - f"{da.dims} vs {broadcast_dims}") + raise ValueError( + "A bins dataarray has dimensions present that " + "will not be broadcast on the output: " + f"{da.dims} vs {tuple(*broadcast_dims)}" + ) if new_bin_dim not in da.dims: - raise ValueError("A bins dataarray does not contain the " - "corresponding output bins dimension - " - f"has dims {da.dims} but not {new_bin_dim}.") + raise ValueError( + "A bins dataarray does not contain the " + "corresponding output bins dimension - " + f"has dims {da.dims} but not {new_bin_dim}." + ) elif all(isinstance(b, (int, str)) for b in bins): bins = _create_bin_coords(dataarrays, bins, range, new_bin_dims) else: - TypeError("One or more of the elements in bins is not a valid argument." - f"Instead found types {[type(b) for b in bins]}") + TypeError( + "One or more of the elements in bins is not a valid argument." 
+ f"Instead found types {[type(b) for b in bins]}" + ) else: raise TypeError(f"Type {type(bins)} is not a valid argument to bins") @@ -1882,41 +1896,44 @@ def _create_bin_coords(dataarrays, bins, range, new_bin_dims): # Compute histogram results reduce_axes = tuple(list(all_input_dims).index(d) for d in reduce_dims) - h_data = _calc_histogram([arr.values for arr in arrs], - axis=reduce_axes, - bins=[b.values for b in aligned_bins], - weights=weights.values, - density=density) + h_data = _calc_histogram( + [arr.values for arr in arrs], + axis=reduce_axes, + bins=[b.values for b in aligned_bins], + weights=weights.values, + density=density, + ) # Reconstruct output dataarray - output_name = "_".join(["histogram"] + [a.name for a in dataarrays]) + output_name = "_".join(["histogram"] + [da.name for da in dataarrays]) # Adjust bin coords to return positions of bin centres rather than bin edges def _find_centers(da, dim): - return 0.5 * (da.isel(dim=slice(None, None, -1)) - + da.isel(dim=slice(1, None))) - bin_centers = [_find_centers(bin, new_bin_dim) - for bin, new_bin_dim in zip(bins, new_bin_dims)] - - a0 = dataarrays[0] - all_coords = {} - new_coords = { - name: ((name,), bin_center, a.attrs) - for name, bin_center, a in zip(new_bin_dims, bin_centers, dataarrays) - } - all_coords.update(new_coords) + return 0.5 * (da.isel(dim=slice(None, None, -1)) + da.isel(dim=slice(1, None))) + + bin_centers = [ + _find_centers(bin, new_bin_dim) for bin, new_bin_dim in zip(bins, new_bin_dims) + ] # Keep all old coords from any input that have not been reduced along - old_coords = {coord.name: coord for da in dataarrays for coord in da.coords - if not any(coord.dims in reduce_dims)} - all_coords.update(old_coords) + old_coords = { + coord.name: coord + for da in dataarrays + for coord in da.coords + if not any(coord.dims in reduce_dims) + } if keep_attrs is None: keep_attrs = _get_keep_attrs(default=False) - attrs = dataarrays[0].attrs if True else None - - return DataArray(h_data, dims=output_dims, coords=all_coords, - name=output_name, attrs=attrs) + attrs = dataarrays[0].attrs if keep_attrs else None + + return DataArray( + h_data, + dims=output_dims, + coords={**old_coords, **bin_centers}, + name=output_name, + attrs=attrs, + ) def _calc_histogram(*arrs, axis, bins, weights, density): @@ -1931,13 +1948,14 @@ def _calc_histogram(*arrs, axis, bins, weights, density): # TODO how does xarray normally test for dask arrays? - return ... + return [] def _bincount(): """ Axis-aware bincounting function. - Acts on a single numpy array/dask chunk. - Works by reshaping input + Acts on a single numpy array (/ dask chunk). + Works by reshaping input to combine all broadcast dims along one axis, + and all reduce dims along one axis, then reshaping back. """ - return ... \ No newline at end of file + return ... 
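
For orientation before the tests that follow, here is a minimal sketch of how the `xr.hist` signature added above is intended to be called, based on its own docstring. The `temperature` and `salinity` names, shapes, and bin edges are hypothetical, and the histogram body is still a stub at this point in the series, so this only illustrates the documented interface rather than a working implementation.

import numpy as np
import xarray as xr

# Two named inputs produce a 2D histogram; hist() requires every input
# DataArray to have a name (hypothetical example data).
temperature = xr.DataArray(
    np.random.randn(100, 30), dims=["time", "x"], name="temperature"
)
salinity = xr.DataArray(
    np.random.randn(100, 30), dims=["time", "x"], name="salinity"
)

# Explicit 1D bin edges, one entry per input, as the docstring recommends
# (and requires for dask-backed inputs).
temp_bins = np.linspace(-4, 4, 21)
salt_bins = np.linspace(-4, 4, 31)

# Count along "time" only; "x" should survive as a broadcast dimension, so per
# the docstring the result is expected to have dims
# ("x", "temperature_bins", "salinity_bins"), with the new coordinates holding
# the bin centres.
result = xr.hist(
    temperature,
    salinity,
    dim="time",
    bins=[temp_bins, salt_bins],
    density=False,
)
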
diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index b7ae1ca9828..3d9420f4f85 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -15,6 +15,7 @@ apply_ufunc, broadcast_compat_data, collect_dict_values, + hist, join_dict_keys, ordered_set_intersection, ordered_set_union, @@ -1927,3 +1928,41 @@ def test_polyval(use_dask, use_datetime): da_pv = xr.polyval(da.x, coeffs) xr.testing.assert_allclose(da, da_pv.T) + + +class TestHistInputTypes: + def test_invalid_data(self): + with pytest.raises(TypeError, match="Only xr.DataArray is supported"): + hist("string") + + def test_invalid_weights(self): + with pytest.raises(TypeError, match="supported as weights"): + hist(xr.DataArray([], name="a"), weights="string") + + def test_no_data(self): + with pytest.raises(TypeError, match="At least one"): + hist() + + def test_nameless_data(self): + with pytest.raises(ValueError, match="must have a name"): + hist(xr.DataArray([])) + + def test_wrong_number_of_bins(self): + with pytest.raises(TypeError, match="must have same length"): + hist(xr.DataArray([], name="a"), bins=[2, 3]) + + def test_invalid_bins(self): + with pytest.raises(TypeError, match="not a valid argument"): + hist(xr.DataArray([], name="a"), bins=2.7) + + def test_bin_dataarrays_with_extra_dims(self): + data = xr.DataArray([0], dims=["x"], name="a") + bins = xr.DataArray([[1]], dims=["bad", "a_bins"]) + with pytest.raises(ValueError, match="will not be broadcast"): + hist(data, dim="x", bins=[bins]) + + def test_bin_dataarrays_without_reduce_dim(self): + data = xr.DataArray([0], dims=["x"], name="a") + bins = xr.DataArray(1) + with pytest.raises(ValueError, match="does not contain"): + hist(data, dim="x", bins=[bins]) From f5024b29ff46c805788dc63a08eccbed20ed83b3 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Tue, 8 Jun 2021 17:59:01 -0400 Subject: [PATCH 10/14] rigorous checking of bins inputs --- xarray/core/computation.py | 133 ++++++++++++++++++------------- xarray/tests/test_computation.py | 11 ++- 2 files changed, 89 insertions(+), 55 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 627a11fd4d8..138459472f6 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -42,6 +42,21 @@ _JOINS_WITHOUT_FILL_VALUES = frozenset({"inner", "exact"}) +try: + import dask.array as dsa + + has_dask = True +except ImportError: + has_dask = False + + +def _any_dask_array(*args): + if not has_dask: + return False + else: + return any(isinstance(a, dsa.core.Array) for a in args) + + def _first_of_type(args, kind): """Return either first object of type 'kind' or raise if not found.""" for arg in args: @@ -1743,25 +1758,30 @@ def hist( dim : str or tuple of strings, optional Dimensions over which which the histogram is computed. The default is to compute the histogram of the flattened array. i.e. over all dimensions. - bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional + bins : int, str, numpy array or DataArray, or a list of ints, strs, arrays and/or DataArrays, optional If a list, there should be one entry for each item in ``args``. - The bin specification: - - * If int, the number of bins for all arguments in ``args``. - * If array_like, the bin edges for all arguments in ``args``. - * If a list of ints, the number of bins for every argument in ``args``. - * If a list arrays, the bin edges for each argument in ``args`` - (required format for Dask inputs). 
- * If a list of DataArrays, the bins for each argument in ``args`` - The DataArrays can be multidimensional, but must contain X - and must not have any dimensions shared with the `dim` argument. - If supplied these will be present as coordinates on the output. + The bin specifications for each entry: + + * If int, the number of bins. + * If str; the method used to automatically calculate the optimal bin + width, as defined by `np.histogram_bin_edges`. + * If a numpy array, the bin edges. Must be 1D. + * If a DataArray, the bin edges. The DataArray can be multidimensional, + but must contain the output bins dimension (named as `[var]_bins` + for a given input variable named `var`), and must not have any + dimensions shared with the `dim` argument. If supplied this DataArray + will be present as a coordinate on the output. + * If a list of ints, strs, arrays and/or DataArrays; the bin specification + as above for every argument in ``args``. + * If not supplied (or any elements of the list are `None`) then bins + will be automatically calculated by `np.histogram_bin_edges`. When bin edges are specified, all but the last (righthand-most) bin include the left edge and exclude the right edge. The last bin includes both edges. - A ``TypeError`` will be raised if ``args`` contains dask arrays and - ``bins`` are not specified explicitly as a list of arrays. + A ``TypeError`` will also be raised if ``args`` contains dask arrays and + ``bins`` are not specified explicitly via arrays or DataArrays, because + other bin specifications trigger loading of the entire input data. weights : array_like, optional An array of weights, of the same shape as `a`. Each value in `a` only contributes its associated weight towards the bin count @@ -1790,7 +1810,7 @@ def hist( The returned dataarray will have one additional coordinate for each dataarray supplied, named as `[var]_bins`, which contains the positions - of the centres of each bin. + of the centres of each bin, varing along a new dimension of the same name. All other coordinates will be retained, unless they depend on a dimension which has been reduced along, in which case they will be dropped. 
@@ -1842,55 +1862,60 @@ def hist( new_bin_dims = [da.name + "_bins" for da in dataarrays] output_dims = [broadcast_dims] + new_bin_dims - # TODO deal with arrays or lists of arrays - # TODO just check if array-like after explicitly checking if other options - # TODO might need to check if numpy arrays explicitly - # Check validity of given bins, or create using np.histogram_bin_edges - def _create_bin_coords(dataarrays, bins, range, new_bin_dims): - return [ - DataArray(np.histogram_bin_edges(a, b, r), dims=d, name=d, attrs=a.attrs) - for a, b, r, d in zip(dataarrays, bins, range, new_bin_dims) - ] + def _check_and_format_bins_into_coords(b, da, r, bin_dim): + # Check validity of given bins, or create using np.histogram_bin_edges + # Package into a coordinate DataArray before returning + _edges = np.histogram_bin_edges + if isinstance(b, (int, str)) or b is None: + if is_duck_dask_array(da.data): + raise TypeError(f"Choice of bins as {b} would trigger loading " + f"of entire input array to histogram") + b = "auto" if b is None else b + return DataArray(_edges(da.values, b, r), dims=bin_dim, name=bin_dim, + attrs=da.attrs) + elif isinstance(b, np.ndarray): + if b.ndim > 1: + raise ValueError("bins specified as numpy arrays can only be 1-dimensional") + return DataArray(b, dims=bin_dim, name=bin_dim, attrs=da.attrs) + elif isinstance(b, DataArray): + print(b.dims) + print(broadcast_dims) + if bin_dim not in b.dims: + raise ValueError( + "A bins dataarray does not contain the " + "corresponding output bins dimension - " + f"has dims {b.dims} but not {bin_dim}." + ) + if not (set(b.dims) - set([bin_dim])).issubset(broadcast_dims): + raise ValueError( + "A bins dataarray has dimensions present that " + "will not be broadcast on the output: " + f"{da.dims} vs {tuple(*broadcast_dims)}" + ) + return b + else: + raise TypeError(f"Type {type(b)} is not a valid argument to bins") - # TODO change to allow mixed types of bins arguments - if isinstance(bins, int): - bins = _create_bin_coords(dataarrays, [bins] * n_args, range, new_bin_dims) - elif bins is None: - bins = _create_bin_coords(dataarrays, ["auto"] * n_args, range, new_bin_dims) - elif isinstance(bins, Iterable): + # TODO check ranges are of valid type + ranges = [None] * n_args + if isinstance(bins, list): if len(bins) != n_args: raise TypeError( - "If bins is an Iterable then it must have same length " + "If `bins` is a list then it must have same length " "as number of input dataarrays passed, but instead has " - f"length {len(bins)}" - ) - if all(isinstance(obj, DataArray) for obj in bins): - for da, new_bin_dim in zip(bins, new_bin_dims): - if not set(da.dims).issubset(broadcast_dims): - raise ValueError( - "A bins dataarray has dimensions present that " - "will not be broadcast on the output: " - f"{da.dims} vs {tuple(*broadcast_dims)}" - ) - if new_bin_dim not in da.dims: - raise ValueError( - "A bins dataarray does not contain the " - "corresponding output bins dimension - " - f"has dims {da.dims} but not {new_bin_dim}." - ) - elif all(isinstance(b, (int, str)) for b in bins): - bins = _create_bin_coords(dataarrays, bins, range, new_bin_dims) - else: - TypeError( - "One or more of the elements in bins is not a valid argument." - f"Instead found types {[type(b) for b in bins]}" + f"length {len(bins)}. To manually specify bin edges for " + "a single input pass them as a numpy array instead." 
) else: - raise TypeError(f"Type {type(bins)} is not a valid argument to bins") + bins = [bins] * n_args + bins = [_check_and_format_bins_into_coords(b, da, r, d) for b, da, r, d in + zip(bins, dataarrays, ranges, new_bin_dims)] # Align / broadcast all inputs (including weights and bins) + # TODO if this was merely alignment could blockwise handle all the broadcasting? arrs = broadcast(dataarrays) weights = weights.broadcast_like(arrs[0]) + # TODO surround with try except? aligned_bins = [b.broadcast_like(arrs[0]) for b in bins] # TODO bins now already has the output dims included, is that correct? diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 3d9420f4f85..209af8f4234 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -1930,7 +1930,7 @@ def test_polyval(use_dask, use_datetime): xr.testing.assert_allclose(da, da_pv.T) -class TestHistInputTypes: +class TestHistInputTypeChecks: def test_invalid_data(self): with pytest.raises(TypeError, match="Only xr.DataArray is supported"): hist("string") @@ -1951,6 +1951,10 @@ def test_wrong_number_of_bins(self): with pytest.raises(TypeError, match="must have same length"): hist(xr.DataArray([], name="a"), bins=[2, 3]) + def test_non_1d_numpy_array(self): + with pytest.raises(ValueError, match="can only be 1-d"): + hist(xr.DataArray([], name="a"), bins=np.array([[2, 3]])) + def test_invalid_bins(self): with pytest.raises(TypeError, match="not a valid argument"): hist(xr.DataArray([], name="a"), bins=2.7) @@ -1966,3 +1970,8 @@ def test_bin_dataarrays_without_reduce_dim(self): bins = xr.DataArray(1) with pytest.raises(ValueError, match="does not contain"): hist(data, dim="x", bins=[bins]) + + def test_prevent_trigger_loading_dask(self): + with pytest.raises(TypeError, match="would trigger loading"): + hist(xr.DataArray([0], name="a").chunk(1), bins=2) + From 80e665a2de5f3f824ba209b689a85c226d665905 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Wed, 9 Jun 2021 14:37:26 -0400 Subject: [PATCH 11/14] removed unnecessary method of checking for dask arrays (+ rogue prints) --- xarray/core/computation.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 138459472f6..7c9c1644a09 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -42,21 +42,6 @@ _JOINS_WITHOUT_FILL_VALUES = frozenset({"inner", "exact"}) -try: - import dask.array as dsa - - has_dask = True -except ImportError: - has_dask = False - - -def _any_dask_array(*args): - if not has_dask: - return False - else: - return any(isinstance(a, dsa.core.Array) for a in args) - - def _first_of_type(args, kind): """Return either first object of type 'kind' or raise if not found.""" for arg in args: @@ -1878,8 +1863,6 @@ def _check_and_format_bins_into_coords(b, da, r, bin_dim): raise ValueError("bins specified as numpy arrays can only be 1-dimensional") return DataArray(b, dims=bin_dim, name=bin_dim, attrs=da.attrs) elif isinstance(b, DataArray): - print(b.dims) - print(broadcast_dims) if bin_dim not in b.dims: raise ValueError( "A bins dataarray does not contain the " From 5f826bcf7451efef690d0d2732cf1cc5bd7dfef3 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 10 Jun 2021 13:38:17 -0400 Subject: [PATCH 12/14] range argument and docstring corrections --- xarray/core/computation.py | 81 ++++++++++++++++++++++++++++++++------ xarray/core/dataarray.py | 53 +++++++++++++++---------- xarray/core/dataset.py | 59 
++++++++++++++++++--------- 3 files changed, 140 insertions(+), 53 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 7c9c1644a09..3cab54cc321 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -1727,7 +1727,13 @@ def _calc_idxminmax( def hist( - *dataarrays, dim=None, bins=None, weights=None, density=False, keep_attrs=None + *dataarrays, + dim=None, + bins=None, + range=None, + weights=None, + density=False, + keep_attrs=None, ): """ Histogram applied along specified dimensions. @@ -1767,6 +1773,21 @@ def hist( A ``TypeError`` will also be raised if ``args`` contains dask arrays and ``bins`` are not specified explicitly via arrays or DataArrays, because other bin specifications trigger loading of the entire input data. + range : (float, float) or a list of (float, float), optional + If a list, there should be one entry for each item in ``args``. + The range specifications are as follows: + + * If (float, float); the lower and upper range(s) of the bins for all + arguments in ``args``. Values outside the range are ignored. The first + element of the range must be less than or equal to the second. `range` + affects the automatic bin computation as well. In this case, while bin + width is computed to be optimal based on the actual data within `range`, + the bin count will fill the entire range including portions containing + no data. + * If a list of (float, float); the ranges as above for every argument in + ``args``. + * If not provided, range is simply ``(arg.min(), arg.max())`` for each + arg. weights : array_like, optional An array of weights, of the same shape as `a`. Each value in `a` only contributes its associated weight towards the bin count @@ -1808,11 +1829,10 @@ def hist( DataArray.hist Dataset.hist numpy.histogramdd + numpy.histogram_bin_edges dask.array.blockwise """ - # TODO range argument - from .dataarray import DataArray, Variable # Check inputs @@ -1840,27 +1860,62 @@ def hist( dim = (dim,) reduce_dims = dim + # Will broadcast over all dimensions not counted along by histogram all_input_dims = ordered_set_union([da.dims for da in dataarrays]) broadcast_dims = OrderedSet(all_input_dims) - OrderedSet(reduce_dims) - # create output dims + # Create output dims new_bin_dims = [da.name + "_bins" for da in dataarrays] output_dims = [broadcast_dims] + new_bin_dims + # Check range kwarg + def _check_or_find_range(r, da): + if r is None: + lower, upper = da.min(), da.max() + else: + lower, upper = r + if not all(isinstance(bound, float) for bound in (lower, upper)): + raise TypeError( + "ranges must be provided in form (float, float), " + f"but found ({type(lower)}, {type(upper)})." + ) + if lower > upper: + raise ValueError( + "lower bound of range must be smaller than upper bound" + ) + return lower, upper + + if isinstance(range, list): + if len(range) != n_args: + raise TypeError( + "If `range` is a list then it must have same length " + "as number of input dataarrays passed, but instead has " + f"length {len(range)}." 
+ ) + else: + range = [range] * n_args + ranges = [_check_or_find_range(r, da) for r, da in zip(range, dataarrays)] + + # Check bins kwarg def _check_and_format_bins_into_coords(b, da, r, bin_dim): # Check validity of given bins, or create using np.histogram_bin_edges # Package into a coordinate DataArray before returning _edges = np.histogram_bin_edges if isinstance(b, (int, str)) or b is None: if is_duck_dask_array(da.data): - raise TypeError(f"Choice of bins as {b} would trigger loading " - f"of entire input array to histogram") + raise TypeError( + f"Choice of bins as {b} would trigger loading " + f"of entire input array to histogram" + ) b = "auto" if b is None else b - return DataArray(_edges(da.values, b, r), dims=bin_dim, name=bin_dim, - attrs=da.attrs) + return DataArray( + _edges(da.values, b, r), dims=bin_dim, name=bin_dim, attrs=da.attrs + ) elif isinstance(b, np.ndarray): if b.ndim > 1: - raise ValueError("bins specified as numpy arrays can only be 1-dimensional") + raise ValueError( + "bins specified as numpy arrays can only be 1-dimensional" + ) return DataArray(b, dims=bin_dim, name=bin_dim, attrs=da.attrs) elif isinstance(b, DataArray): if bin_dim not in b.dims: @@ -1879,8 +1934,6 @@ def _check_and_format_bins_into_coords(b, da, r, bin_dim): else: raise TypeError(f"Type {type(b)} is not a valid argument to bins") - # TODO check ranges are of valid type - ranges = [None] * n_args if isinstance(bins, list): if len(bins) != n_args: raise TypeError( @@ -1891,8 +1944,10 @@ def _check_and_format_bins_into_coords(b, da, r, bin_dim): ) else: bins = [bins] * n_args - bins = [_check_and_format_bins_into_coords(b, da, r, d) for b, da, r, d in - zip(bins, dataarrays, ranges, new_bin_dims)] + bins = [ + _check_and_format_bins_into_coords(b, da, r, d) + for b, da, r, d in zip(bins, dataarrays, ranges, new_bin_dims) + ] # Align / broadcast all inputs (including weights and bins) # TODO if this was merely alignment could blockwise handle all the broadcasting? diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 8a7df05d2fd..4ce7601d485 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4599,36 +4599,46 @@ def drop_duplicates( indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)} return self.isel(indexes) - def hist(self, dim=None, bins=None, weights=None, density=False, keep_attrs=None): + def hist(self, dim=None, bins=None, range=None, weights=None, density=False, keep_attrs=None): """ Histogram applied along specified dimensions. - If the supplied arguments are chunked dask arrays it will use + If the supplied dataarray contains a chunked dask array it will use `dask.array.blockwise` internally to parallelize over all chunks. dim : tuple of strings, optional Dimensions over which which the histogram is computed. The default is to compute the histogram of the flattened array. i.e. over all dimensions. - bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional - If a list, there should be one entry for each item in ``args``. - The bin specification: - - * If int, the number of bins for all arguments in ``args``. - * If array_like, the bin edges for all arguments in ``args``. - * If a list of ints, the number of bins for every argument in ``args``. - * If a list arrays, the bin edges for each argument in ``args`` - (required format for Dask inputs). - * A combination [int, array] or [array, int], where int - is the number of bins and array is the bin edges. 
- * If a list of DataArrays, the bins for each argument in ``args`` - The DataArrays can be multidimensional, but must not have any - dimensions shared with the `dim` argument. + bins : int, str, numpy array or DataArray, optional + The bin specifications: + + * If int, the number of bins. + * If str; the method used to automatically calculate the optimal bin + width, as defined by `np.histogram_bin_edges`. + * If a numpy array, the bin edges. Must be 1D. + * If a DataArray, the bin edges. The DataArray can be multidimensional, + but must contain the output bins dimension (named as `[da.name]_bins`), + and must not have any dimensions shared with the `dim` argument. + If supplied this DataArray will be present as a coordinate on the output. + * If not supplied then bins will be automatically calculated by + `np.histogram_bin_edges`. When bin edges are specified, all but the last (righthand-most) bin include the left edge and exclude the right edge. The last bin includes both edges. - A ``TypeError`` will be raised if ``args`` contains dask arrays and - ``bins`` are not specified explicitly as a list of arrays. + A ``TypeError`` will also be raised if the dataarray contains a dask array and + ``bins`` are not specified explicitly via an array or DataArray, because + other bin specifications trigger loading of the entire input data. + range : (float, float), optional + The range specifications are as follows: + + * If (float, float); the lower and upper range(s) of the bins. Values + outside the range are ignored. The first element of the range must be + less than or equal to the second. `range` affects the automatic bin + computation as well. In this case, while bin width is computed to be + optimal based on the actual data within `range`, the bin count will + fill the entire range including portions containing no data. + * If not provided, range is simply ``(da.min(), da.max())``. weights : array_like, optional An array of weights, of the same shape as `a`. Each value in `a` only contributes its associated weight towards the bin count @@ -4655,9 +4665,8 @@ def hist(self, dim=None, bins=None, weights=None, density=False, keep_attrs=None A single dataarray which contains the values of the histogram. See `density` and `weights` for a description of the possible semantics. - The returned dataarray will have one additional coordinate for each - dataarray supplied, named as `var_bins`, which contains the positions - of the centres of each bin. + The returned dataarray will have one additional coordinate, named as + `[da.name]_bins`, which contains the positions of the centres of each bin. Examples -------- @@ -4667,6 +4676,7 @@ def hist(self, dim=None, bins=None, weights=None, density=False, keep_attrs=None xarray.hist DataArray.hist numpy.histogramdd + numpy.histogram_bin_edges dask.array.blockwise """ @@ -4676,6 +4686,7 @@ def hist(self, dim=None, bins=None, weights=None, density=False, keep_attrs=None [self], dim=dim, bins=bins, + range=range, weights=weights, density=density, keep_attrs=keep_attrs, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 899a298dc4f..cebcdd7091f 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7628,7 +7628,7 @@ def _wrapper(Y, *coords_, **kwargs): return result - def hist(self, dim=None, bins=None, weights=None, density=False, keep_attrs=None): + def hist(self, dim=None, bins=None, range=None, weights=None, density=False, keep_attrs=None): """ Histogram applied along specified dimensions. 
Will create a N-dimensional histogram, where N is the number of variables in the dataset. @@ -7639,26 +7639,45 @@ def hist(self, dim=None, bins=None, weights=None, density=False, keep_attrs=None dim : tuple of strings, optional Dimensions over which which the histogram is computed. The default is to compute the histogram of the flattened array. i.e. over all dimensions. - bins : int or array_like or a list of ints or arrays, or list of DataArrays, optional - If a list, there should be one entry for each item in ``args``. - The bin specification: - - * If int, the number of bins for all arguments in ``args``. - * If array_like, the bin edges for all arguments in ``args``. - * If a list of ints, the number of bins for every argument in ``args``. - * If a list arrays, the bin edges for each argument in ``args`` - (required format for Dask inputs). - * A combination [int, array] or [array, int], where int - is the number of bins and array is the bin edges. - * If a list of DataArrays, the bins for each argument in ``args`` - The DataArrays can be multidimensional, but must not have any - dimensions shared with the `dim` argument. + bins : int, str, numpy array or DataArray, or a list of ints, strs, arrays and/or DataArrays, optional + If a list, there should be one entry for each variable in the dataset. + The bin specifications for each entry: + + * If int, the number of bins. + * If str; the method used to automatically calculate the optimal bin + width, as defined by `np.histogram_bin_edges`. + * If a numpy array, the bin edges. Must be 1D. + * If a DataArray, the bin edges. The DataArray can be multidimensional, + but must contain the output bins dimension (named as `[var]_bins` + for a given input variable named `var`), and must not have any + dimensions shared with the `dim` argument. If supplied this DataArray + will be present as a coordinate on the output. + * If a list of ints, strs, arrays and/or DataArrays; the bin specification + as above for every variable in the dataset. + * If not supplied (or any elements of the list are `None`) then bins + will be automatically calculated by `np.histogram_bin_edges`. When bin edges are specified, all but the last (righthand-most) bin include the left edge and exclude the right edge. The last bin includes both edges. - A ``TypeError`` will be raised if ``args`` contains dask arrays and - ``bins`` are not specified explicitly as a list of arrays. + A ``TypeError`` will also be raised if the variables contain dask arrays and + ``bins`` are not specified explicitly via arrays or DataArrays, because + other bin specifications trigger loading of the entire input variable. + range : (float, float) or a list of (float, float), optional + If a list, there should be one entry for each variable in the dataset. + The range specifications are as follows: + + * If (float, float); the lower and upper range(s) of the bins for all + variables in the dataset. Values outside the range are ignored. The first + element of the range must be less than or equal to the second. `range` + affects the automatic bin computation as well. In this case, while bin + width is computed to be optimal based on the actual data within `range`, + the bin count will fill the entire range including portions containing + no data. + * If a list of (float, float); the ranges as above for every variable in + the dataset. + * If not provided, the ranges are simply ``(ds[var].min(), ds[var].max())`` + for each variable. 
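A minimal NumPy sketch of the range semantics described above, for reference only (the array `data` is invented for illustration and is not part of the patch): without `range` the edges span the data extent, while an explicit `range` makes the edges cover the whole requested interval, including portions that contain no samples.

import numpy as np

data = np.array([0.5, 1.0, 1.5, 9.0])

# Default: edges span (data.min(), data.max())
np.histogram_bin_edges(data, bins=4)
# array([0.5  , 2.625, 4.75 , 6.875, 9.   ])

# Explicit range: edges fill (0.0, 2.0) even though 9.0 lies outside it
np.histogram_bin_edges(data, bins=4, range=(0.0, 2.0))
# array([0. , 0.5, 1. , 1.5, 2. ])
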
weights : array_like, optional An array of weights, of the same shape as `a`. Each value in `a` only contributes its associated weight towards the bin count @@ -7686,8 +7705,8 @@ def hist(self, dim=None, bins=None, weights=None, density=False, keep_attrs=None `density` and `weights` for a description of the possible semantics. The returned dataarray will have one additional coordinate for each - dataarray supplied, named as `var_bins`, which contains the positions - of the centres of each bin. + variable in the input dataset, named as `var_bins`, which contains + the positions of the centres of each bin. Examples -------- @@ -7697,6 +7716,7 @@ def hist(self, dim=None, bins=None, weights=None, density=False, keep_attrs=None xarray.hist DataArray.hist numpy.histogramdd + numpy.histogram_bin_edges dask.array.blockwise """ @@ -7706,6 +7726,7 @@ def hist(self, dim=None, bins=None, weights=None, density=False, keep_attrs=None [self.data_vars.values()], dim=dim, bins=bins, + range=range, weights=weights, density=density, keep_attrs=keep_attrs, From e20b69bc8e8f42bc9f1263ec76e1744d6f5b74f3 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 10 Jun 2021 13:38:32 -0400 Subject: [PATCH 13/14] tests for validation of range kwarg --- xarray/tests/test_computation.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 209af8f4234..70eb1750101 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -1930,14 +1930,14 @@ def test_polyval(use_dask, use_datetime): xr.testing.assert_allclose(da, da_pv.T) -class TestHistInputTypeChecks: +class TestHistInputChecks: def test_invalid_data(self): with pytest.raises(TypeError, match="Only xr.DataArray is supported"): hist("string") def test_invalid_weights(self): with pytest.raises(TypeError, match="supported as weights"): - hist(xr.DataArray([], name="a"), weights="string") + hist(xr.DataArray([0], name="a"), weights="string") def test_no_data(self): with pytest.raises(TypeError, match="At least one"): @@ -1945,19 +1945,31 @@ def test_no_data(self): def test_nameless_data(self): with pytest.raises(ValueError, match="must have a name"): - hist(xr.DataArray([])) + hist(xr.DataArray([0])) def test_wrong_number_of_bins(self): with pytest.raises(TypeError, match="must have same length"): - hist(xr.DataArray([], name="a"), bins=[2, 3]) + hist(xr.DataArray([0], name="a"), bins=[2, 3]) def test_non_1d_numpy_array(self): with pytest.raises(ValueError, match="can only be 1-d"): - hist(xr.DataArray([], name="a"), bins=np.array([[2, 3]])) + hist(xr.DataArray([0], name="a"), bins=np.array([[2, 3]])) def test_invalid_bins(self): with pytest.raises(TypeError, match="not a valid argument"): - hist(xr.DataArray([], name="a"), bins=2.7) + hist(xr.DataArray([0], name="a"), bins=2.7) + + def test_wrong_number_of_ranges(self): + with pytest.raises(TypeError, match="must have same length"): + hist(xr.DataArray([0], name="a"), range=[(1.0, 2.0), (2.0, 3.0)]) + + def test_unsorted_range(self): + with pytest.raises(ValueError, match="must be smaller"): + hist(xr.DataArray([0], name="a"), range=(6.7, 2.7)) + + def test_invalid_range(self): + with pytest.raises(TypeError, match="float, float"): + hist(xr.DataArray([0], name="a"), range=(2.7, [4.5, 6.7])) def test_bin_dataarrays_with_extra_dims(self): data = xr.DataArray([0], dims=["x"], name="a") @@ -1975,3 +1987,4 @@ def test_prevent_trigger_loading_dask(self): with 
pytest.raises(TypeError, match="would trigger loading"): hist(xr.DataArray([0], name="a").chunk(1), bins=2) + From d276838c8120856b678b1afbbe2169230576ad63 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 10 Jun 2021 17:28:35 -0400 Subject: [PATCH 14/14] attempt at type hinting --- xarray/core/computation.py | 36 ++++++++++++++++++++---------------- xarray/core/dataarray.py | 24 +++++++++++++++++------- xarray/core/dataset.py | 23 ++++++++++++++++------- xarray/core/weighted.py | 13 +++++++++++-- 4 files changed, 64 insertions(+), 32 deletions(-) diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 3cab54cc321..d0d6834f280 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -36,6 +36,7 @@ if TYPE_CHECKING: from .coordinates import Coordinates # noqa from .dataset import Dataset + from .dataarray import DataArray _NO_FILL_VALUE = utils.ReprObject("") _DEFAULT_NAME = utils.ReprObject("") @@ -1726,15 +1727,18 @@ def _calc_idxminmax( return res +_ALLOWED_BINS_TYPES = Union[int, str, np.ndarray, DataArray, None] + + def hist( - *dataarrays, - dim=None, - bins=None, - range=None, - weights=None, - density=False, - keep_attrs=None, -): + *dataarrays : DataArray, + dim : Union[Hashable, Iterable[Hashable]] = None, + bins : Union[_ALLOWED_BINS_TYPES, List[_ALLOWED_BINS_TYPES]] = None, + range : Union[Tuple[float, float], List[Tuple[float, float]]] = None, + weights : DataArray = None, + density : bool = False, + keep_attrs : bool = None, +) -> DataArray: """ Histogram applied along specified dimensions. @@ -1788,11 +1792,11 @@ def hist( ``args``. * If not provided, range is simply ``(arg.min(), arg.max())`` for each arg. - weights : array_like, optional - An array of weights, of the same shape as `a`. Each value in - `a` only contributes its associated weight towards the bin count - (instead of 1). If `density` is True, the weights are - normalized, so that the integral of the density over the range + weights : xarray.DataArray, optional + An array of weights, able to be broadcast to match the input data. + If supplied each value in the input data only contributes its associated + weight towards the bin count (instead of 1). If `density` is True, the + weights are normalized, so that the integral of the density over the range remains 1. NaNs in the weights input will fill the entire bin with NaNs. If there are NaNs in the weights input call ``.fillna(0.)`` before running ``hist()``. @@ -1862,7 +1866,7 @@ def hist( # Will broadcast over all dimensions not counted along by histogram all_input_dims = ordered_set_union([da.dims for da in dataarrays]) - broadcast_dims = OrderedSet(all_input_dims) - OrderedSet(reduce_dims) + broadcast_dims : Iterable[Hashable] = OrderedSet(*all_input_dims) - OrderedSet(*reduce_dims) # Create output dims new_bin_dims = [da.name + "_bins" for da in dataarrays] @@ -1944,7 +1948,7 @@ def _check_and_format_bins_into_coords(b, da, r, bin_dim): ) else: bins = [bins] * n_args - bins = [ + bins_as_coords : List[DataArray] = [ _check_and_format_bins_into_coords(b, da, r, d) for b, da, r, d in zip(bins, dataarrays, ranges, new_bin_dims) ] @@ -1954,7 +1958,7 @@ def _check_and_format_bins_into_coords(b, da, r, bin_dim): arrs = broadcast(dataarrays) weights = weights.broadcast_like(arrs[0]) # TODO surround with try except? - aligned_bins = [b.broadcast_like(arrs[0]) for b in bins] + aligned_bins = [b.broadcast_like(arrs[0]) for b in bins_as_coords] # TODO bins now already has the output dims included, is that correct? 
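For orientation before the compute step below: in the eager case where every dimension is reduced over, the intended result matches a joint NumPy histogram of the flattened, broadcast inputs. This is only a sketch of those semantics (the names `values`, `edges` and `wts` are illustrative), not the blockwise implementation the patch series is building towards.

import numpy as np

values = [np.random.rand(3, 4), np.random.rand(3, 4)]  # two broadcast inputs
edges = [np.linspace(0, 1, 6), np.linspace(0, 1, 11)]  # 1D bin edges per input
wts = np.ones((3, 4))                                  # weights, same shape as inputs

# np.histogramdd expects an (n_samples, n_vars) array of samples
sample = np.stack([v.ravel() for v in values], axis=-1)
counts, _ = np.histogramdd(sample, bins=edges, weights=wts.ravel())

assert counts.shape == (5, 10)  # one output axis per input variable
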
# Compute histogram results diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4ce7601d485..5ee95f3ca37 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -73,6 +73,8 @@ T_DataArray = TypeVar("T_DataArray", bound="DataArray") T_DSorDA = TypeVar("T_DSorDA", "DataArray", Dataset) if TYPE_CHECKING: + from .computation import _ALLOWED_BINS_TYPES + try: from dask.delayed import Delayed except ImportError: @@ -4599,7 +4601,15 @@ def drop_duplicates( indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)} return self.isel(indexes) - def hist(self, dim=None, bins=None, range=None, weights=None, density=False, keep_attrs=None): + def hist( + self, + dim : Union[Hashable, Iterable[Hashable]] = None, + bins : Union[_ALLOWED_BINS_TYPES, List[_ALLOWED_BINS_TYPES]] = None, + range : Tuple[float, float] = None, + weights : T_DataArray = None, + density : bool = False, + keep_attrs : bool = None, + ) -> DataArray: """ Histogram applied along specified dimensions. @@ -4639,11 +4649,11 @@ def hist(self, dim=None, bins=None, range=None, weights=None, density=False, kee optimal based on the actual data within `range`, the bin count will fill the entire range including portions containing no data. * If not provided, range is simply ``(da.min(), da.max())``. - weights : array_like, optional - An array of weights, of the same shape as `a`. Each value in - `a` only contributes its associated weight towards the bin count - (instead of 1). If `density` is True, the weights are - normalized, so that the integral of the density over the range + weights : xarray.DataArray, optional + An array of weights, able to be broadcast to match the input data. + If supplied each value in the input data only contributes its associated + weight towards the bin count (instead of 1). If `density` is True, the + weights are normalized, so that the integral of the density over the range remains 1. NaNs in the weights input will fill the entire bin with NaNs. If there are NaNs in the weights input call ``.fillna(0.)`` before running ``hist()``. @@ -4683,7 +4693,7 @@ def hist(self, dim=None, bins=None, range=None, weights=None, density=False, kee from .computation import hist return hist( - [self], + self, dim=dim, bins=bins, range=range, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index cebcdd7091f..dfe1b2075f4 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -106,6 +106,7 @@ from ..backends import AbstractDataStore, ZarrStore from .dataarray import DataArray from .merge import CoercibleMapping + from .computation import _ALLOWED_BINS_TYPES T_DSorDA = TypeVar("T_DSorDA", DataArray, "Dataset") @@ -7628,7 +7629,15 @@ def _wrapper(Y, *coords_, **kwargs): return result - def hist(self, dim=None, bins=None, range=None, weights=None, density=False, keep_attrs=None): + def hist( + self, + dim : Union[Hashable, Iterable[Hashable]] = None, + bins : Union[_ALLOWED_BINS_TYPES, List[_ALLOWED_BINS_TYPES]] = None, + range : Union[Tuple[float, float], List[Tuple[float, float]]] = None, + weights : DataArray = None, + density : bool = False, + keep_attrs : bool = None, + ) -> DataArray: """ Histogram applied along specified dimensions. Will create a N-dimensional histogram, where N is the number of variables in the dataset. @@ -7678,11 +7687,11 @@ def hist(self, dim=None, bins=None, range=None, weights=None, density=False, kee the dataset. * If not provided, the ranges are simply ``(ds[var].min(), ds[var].max())`` for each variable. 
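To make the DataArray form of `bins` concrete for the dataset case, a hypothetical construction for a two-variable dataset is sketched below; the variable and object names are invented for illustration, and the final call is left as a comment so the snippet runs standalone as written.

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {
        "temperature": (("station", "time"), np.random.randn(3, 100)),
        "pressure": (("station", "time"), np.random.rand(3, 100)),
    }
)

# Each bins DataArray must carry the output bins dimension ("<var>_bins") and
# may otherwise only use dimensions that are broadcast (here "station"),
# never a dimension being reduced over (here "time").
temp_bins = xr.DataArray(
    np.tile(np.linspace(-3, 3, 11), (3, 1)),
    dims=("station", "temperature_bins"),
)
pres_bins = xr.DataArray(np.linspace(0, 1, 6), dims="pressure_bins")

# Intended call, with one bins entry per data variable:
# ds.hist(dim="time", bins=[temp_bins, pres_bins])
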
- weights : array_like, optional - An array of weights, of the same shape as `a`. Each value in - `a` only contributes its associated weight towards the bin count - (instead of 1). If `density` is True, the weights are - normalized, so that the integral of the density over the range + weights : xarray.DataArray, optional + An array of weights, able to be broadcast to match the input data. + If supplied each value in the input data only contributes its associated + weight towards the bin count (instead of 1). If `density` is True, the + weights are normalized, so that the integral of the density over the range remains 1. NaNs in the weights input will fill the entire bin with NaNs. If there are NaNs in the weights input call ``.fillna(0.)`` before running ``hist()``. @@ -7723,7 +7732,7 @@ def hist(self, dim=None, bins=None, range=None, weights=None, density=False, kee from .computation import hist return hist( - [self.data_vars.values()], + *[self[var] for var in self], dim=dim, bins=bins, range=range, diff --git a/xarray/core/weighted.py b/xarray/core/weighted.py index 025cd2606aa..86232af8d5c 100644 --- a/xarray/core/weighted.py +++ b/xarray/core/weighted.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Generic, Hashable, Iterable, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Generic, Hashable, Iterable, Tuple, List, Optional, TypeVar, Union from . import duck_array_ops from .computation import dot @@ -7,6 +7,7 @@ if TYPE_CHECKING: from .common import DataWithCoords # noqa: F401 from .dataarray import DataArray, Dataset + from .computation import _ALLOWED_BINS_TYPES T_DataWithCoords = TypeVar("T_DataWithCoords", bound="DataWithCoords") @@ -303,10 +304,18 @@ def mean( self._weighted_mean, dim=dim, skipna=skipna, keep_attrs=keep_attrs ) - def hist(self, dim=None, bins=None, density=False, keep_attrs=None): + def hist( + self, + dim : Union[Hashable, Iterable[Hashable]] = None, + bins : Union[_ALLOWED_BINS_TYPES, List[_ALLOWED_BINS_TYPES]] = None, + range : Union[Tuple[float, float], List[Tuple[float, float]]] = None, + density : bool = False, + keep_attrs : bool = None, + ) -> T_DataWithCoords: return self.obj.hist( dim=dim, bins=bins, + range=range, weights=self.weights, density=density, keep_attrs=keep_attrs,
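Finally, the accessor method added to xarray/core/weighted.py above is meant to make the weighted spelling interchangeable with passing `weights` directly to `hist`. A sketch of the intended usage (the two calls are left as comments, as a usage illustration only):

import numpy as np
import xarray as xr

da = xr.DataArray(np.random.randn(4, 100), dims=("x", "time"), name="a")
weights = xr.DataArray(np.cos(np.deg2rad([-60.0, -20.0, 20.0, 60.0])), dims="x")

# Equivalent spellings of a weighted histogram over the "time" dimension:
# da.weighted(weights).hist(dim="time", bins=10)
# da.hist(dim="time", bins=10, weights=weights)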