xgcm · TomNicholas · Apr 29, 2021 · Apr 30, 2021 · Apr 30, 2021 · Apr 30, 2021
diff --git a/versioneer.py b/versioneer.py
diff --git a/xhistogram/core.py b/xhistogram/core.py
@@ -7,18 +7,33 @@
 import numpy as np
 from functools import reduce
 from collections.abc import Iterable
-from .duck_array_ops import (
+from numpy import (
     digitize,
     bincount,
     reshape,
     ravel_multi_index,
     concatenate,
     broadcast_arrays,
+    broadcast_to,
 )
 
 # range is a keyword so save the builtin so they can use it.
 _range = range
 
+try:
+    import dask.array as dsa
+
+    has_dask = True
+except ImportError:
+    has_dask = False
+
+
+def _any_dask_array(*args):
+    if not has_dask:
+        return False
+    else:
+        return any(isinstance(a, dsa.core.Array) for a in args)
+
 
 def _ensure_correctly_formatted_bins(bins, N_expected):
     # TODO: This could be done better / more robustly
@@ -120,8 +135,8 @@ def _dispatch_bincount(bin_indices, weights, N, hist_shapes, block_size=None):
         return _bincount_loop(bin_indices, weights, N, hist_shapes, block_chunks)
 
 
-def _histogram_2d_vectorized(
-    *args, bins=None, weights=None, density=False, right=False, block_size=None
+def _bincount_2d_vectorized(
+    *args, bins=None, weights=None, block_size=None
 ):
     """Calculate the histogram independently on each row of a 2D array"""
 
@@ -131,14 +146,17 @@ def _histogram_2d_vectorized(
     # consistency checks for inputa
     for a, b in zip(args, bins):
         assert a.ndim == 2
-        assert b.ndim == 1
+        #assert b.ndim == 1
         assert a.shape == a0.shape
     if weights is not None:
         assert weights.shape == a0.shape
 
     nrows, ncols = a0.shape
-    nbins = [len(b) for b in bins]
-    hist_shapes = [nb + 1 for nb in nbins]
+
+    #bins = [np.expand_dims(b, axis=0) if b.ndim == 1 else b for b in bins]
+
+    # TODO assuming all bins have same form here
+    b0 = bins[0]
 
     # a marginally faster implementation would be to use searchsorted,
     # like numpy histogram itself does
@@ -150,14 +168,38 @@ def _histogram_2d_vectorized(
     # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/calibration.py#L592
     # but a better approach would be to use something like _search_sorted_inclusive() in
     # numpy histogram. This is an additional motivation for moving to searchsorted
-    bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins]
+    # TODO wouldn't need ifs if we just promoted all bins to 2D
+
+    if b0.ndim == 1:
+        bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins]
+    elif b0.ndim == 2:
+        bins = [np.concatenate((b[:, :-1], b[:, -1:] + 1e-8), axis=1) for b in bins]
 
     # the maximum possible value of of digitize is nbins
     # for right=False:
     #   - 0 corresponds to a < b[0]
     #   - i corresponds to bins[i-1] <= a < b[i]
     #   - nbins corresponds to a a >= b[1]
-    each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)]
+
+    if b0.ndim == 1:
+        nbins = [len(b) for b in bins]
+        hist_shapes = [nb + 1 for nb in nbins]
+
+        each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)]
+    elif b0.ndim == 2:
+        nbins = [b.shape[1] for b in bins]
+        hist_shapes = [nb + 1 for nb in nbins]
+
+        # Apply digitize separately to each row with different bins
+        each_bin_indices = []
+        for a, b in zip(args, bins):
+            each_bin_indices_single_var = np.stack([digitize(a[row, :], b[row, :])
+                                                   for row in np.arange(a.shape[0])])
+            each_bin_indices.append(each_bin_indices_single_var)
+
+    # TODO check if this array is correct!
+    print(each_bin_indices)
+
     # product of the bins gives the joint distribution
     if N_inputs > 1:
         bin_indices = ravel_multi_index(each_bin_indices, hist_shapes)
@@ -178,6 +220,72 @@ def _histogram_2d_vectorized(
     return bin_counts
 
 
+def _bincount(*all_arrays, weights=False, axis=None, density=False):
+
+    all_arrays = list(all_arrays)
+
+    weights_array = all_arrays.pop()
+
+    # TODO a more robust way to pass the bins together with the arrays
+    n_args = len(all_arrays) // 2
+    arrays = all_arrays[:n_args]
+    bins = all_arrays[n_args:]
+
+    # is this necessary? (it is necessary for the weights to match the data)
+    all_arrays_broadcast = broadcast_arrays(*arrays)
+
+    a0 = all_arrays_broadcast[0]
+
+    do_full_array = (axis is None) or (set(axis) == set(_range(a0.ndim)))
+
+    if do_full_array:
+        kept_axes_shape = (1,) * a0.ndim
+    else:
+        kept_axes_shape = tuple(
+            [a0.shape[i] if i not in axis else 1 for i in _range(a0.ndim)]
+        )
+
+    def reshape_input(a):
+        if do_full_array:
+            d = a.ravel()[None, :]
+        else:
+            # reshape the array to 2D
+            # axis 0: preserved axis after histogram
+            # axis 1: calculate histogram along this axis
+            new_pos = tuple(_range(-len(axis), 0))
+            c = np.moveaxis(a, axis, new_pos)
+            split_idx = c.ndim - len(axis)
+            dims_0 = c.shape[:split_idx]
+            # assert dims_0 == kept_axes_shape
+            dims_1 = c.shape[split_idx:]
+            new_dim_0 = np.prod(dims_0)
+            new_dim_1 = np.prod(dims_1)
+            # TODO integer vs float logic here is not robust
+            d = reshape(c, (new_dim_0, new_dim_1))
+        return d
+
+    all_arrays_reshaped = [reshape_input(a) for a in all_arrays_broadcast]
+
+    if any(b.ndim > 1 for b in bins):
+        bins_reshaped = [reshape_input(b) for b in bins]
+    else:
+        bins_reshaped = bins
+    if weights:
+        weights_broadcast = broadcast_to(weights_array, a0.shape)
+        weights_reshaped = reshape_input(weights_broadcast)
+    else:
+        weights_reshaped = None
+
+    bin_counts = _bincount_2d_vectorized(
+        *all_arrays_reshaped, bins=bins_reshaped, weights=weights_reshaped
+    )
+
+    final_shape = kept_axes_shape + bin_counts.shape[1:]
+    bin_counts = reshape(bin_counts, final_shape)
+
+    return bin_counts
+
+
 def histogram(
     *args,
     bins=None,
@@ -280,69 +388,110 @@ def histogram(
                 ax_positive = ndim + ax
             assert ax_positive < ndim, "axis must be less than ndim"
             axis_normed.append(ax_positive)
-        axis = np.atleast_1d(axis_normed)
+        axis = [int(i) for i in axis_normed]
 
-    do_full_array = (axis is None) or (set(axis) == set(_range(a0.ndim)))
-    if do_full_array:
-        kept_axes_shape = None
-    else:
-        kept_axes_shape = tuple([a0.shape[i] for i in _range(a0.ndim) if i not in axis])
+    all_arrays = list(args)
+    n_inputs = len(all_arrays)
 
-    all_args = list(args)
+    # TODO make feeding weights in less janky
     if weights is not None:
-        all_args += [weights]
-    all_args_broadcast = broadcast_arrays(*all_args)
-
-    def reshape_input(a):
-        if do_full_array:
-            d = a.ravel()[None, :]
-        else:
-            # reshape the array to 2D
-            # axis 0: preserved axis after histogram
-            # axis 1: calculate histogram along this axis
-            new_pos = tuple(_range(-len(axis), 0))
-            c = np.moveaxis(a, axis, new_pos)
-            split_idx = c.ndim - len(axis)
-            dims_0 = c.shape[:split_idx]
-            assert dims_0 == kept_axes_shape
-            dims_1 = c.shape[split_idx:]
-            new_dim_0 = np.prod(dims_0)
-            new_dim_1 = np.prod(dims_1)
-            d = reshape(c, (new_dim_0, new_dim_1))
-        return d
+        has_weights = True
+    else:
+        has_weights = False
 
-    all_args_reshaped = [reshape_input(a) for a in all_args_broadcast]
+    dtype = "i8" if not has_weights else weights.dtype
 
-    if weights is not None:
-        weights_reshaped = all_args_reshaped.pop()
-    else:
-        weights_reshaped = None
+    # here I am assuming all the arrays have the same shape
+    # probably needs to be generalized
+    input_indexes = [tuple(_range(a.ndim)) for a in all_arrays]
+    input_index = input_indexes[0]
+    assert all([ii == input_index for ii in input_indexes])
 
     # Some sanity checks and format bins and range correctly
-    bins = _ensure_correctly_formatted_bins(bins, n_inputs)
+    formatted_bins = _ensure_correctly_formatted_bins(bins, n_inputs)
     range = _ensure_correctly_formatted_range(range, n_inputs)
 
     # histogram_bin_edges trigges computation on dask arrays. It would be possible
     # to write a version of this that doesn't trigger when `range` is provided, but
     # for now let's just use np.histogram_bin_edges
     if is_dask_array:
-        if not all([isinstance(b, np.ndarray) for b in bins]):
+        if not all([isinstance(b, np.ndarray) for b in formatted_bins]):
             raise TypeError(
                 "When using dask arrays, bins must be provided as numpy array(s) of edges"
             )
+        bins = formatted_bins
     else:
-        bins = [
-            np.histogram_bin_edges(a, b, r, weights_reshaped)
-            for a, b, r in zip(all_args_reshaped, bins, range)
-        ]
-
-    bin_counts = _histogram_2d_vectorized(
-        *all_args_reshaped,
-        bins=bins,
-        weights=weights_reshaped,
-        density=density,
-        block_size=block_size,
-    )
+        bins = []
+        for a, b, r in zip(all_arrays, formatted_bins, range):
+            if isinstance(b, np.ndarray):
+                # account for possibility that bins is a >1d numpy array
+                pass
+            else:
+                b = np.histogram_bin_edges(a, b, r)
+            bins.append(b)
+    bincount_kwargs = dict(weights=has_weights, axis=axis, density=density)
+
+    # broadcast bins to match reduced data
+    # TODO assumes all bins arguments have the same shape
+    b0 = bins[0]
+    broadcast_bins_shape = [ii for ii in a0.shape if ii not in axis] + list(b0.shape)
+    bins = [np.broadcast_to(b, broadcast_bins_shape) for b in bins]
+
+    # keep these axes in the inputs
+    if axis is not None:
+        drop_axes = tuple([ii for ii in input_index if ii in axis])
+    else:
+        drop_axes = input_index
+
+    if _any_dask_array(weights, *all_arrays):
+        # We should be able to just apply the bin_count function to every
+        # block and then sum over all blocks to get the total bin count.
+        # The main challenge is to figure out the chunk shape that will come
+        # out of _bincount. We might also need to add dummy dimensions to sum
+        # over in the _bincount function
+        import dask.array as dsa
+
+        # Important note from blockwise docs
+        # > Any index, like i missing from the output index is interpreted as a contraction...
+        # > In the case of a contraction the passed function should expect an iterable of blocks
+        # > on any array that holds that index.
+        # This means that we need to have all the input indexes present in the output index
+        # However, they will be reduced to singleton (len 1) dimensions
+
+        adjust_chunks = {i: (lambda x: 1) for i in drop_axes}
+
+        new_axes = {
+            max(input_index) + 1 + i: axis_len
+            for i, axis_len in enumerate([len(bin) - 1 for bin in bins])
+        }
+        out_index = input_index + tuple(new_axes)
+
+        blockwise_args = []
+        for arg in all_arrays:
+            blockwise_args.append(arg)
+            blockwise_args.append(input_index)
+
+        # Bins arrays do not contain axes which will get reduced along
+        # TODO incorrect for >1D bins - how do we know what broadcast axes are on bins here?
+        bins_input_index = tuple(new_axes.keys())
+        for b in bins:
+            blockwise_args.append(b)
+            blockwise_args.append(bins_input_index)
+
+        bin_counts = dsa.blockwise(
+            _bincount,
+            out_index,
+            *blockwise_args,
+            new_axes=new_axes,
+            adjust_chunks=adjust_chunks,
+            meta=np.array((), dtype),
+            **bincount_kwargs,
+        )
+        # sum over the block dims
+        bin_counts = bin_counts.sum(drop_axes)
+    else:
+        # drop the extra axis used for summing over blocks
+        bin_counts = _bincount(*(all_arrays + bins + [weights]), **bincount_kwargs).squeeze(drop_axes)
 
     if density:
         # Normalise by dividing by bin counts and areas such that all the
@@ -360,11 +509,4 @@ def reshape_input(a):
     else:
         h = bin_counts
 
-    if h.shape[0] == 1:
-        assert do_full_array
-        h = h.squeeze()
-    else:
-        final_shape = kept_axes_shape + h.shape[1:]
-        h = reshape(h, final_shape)
-
     return h, bins
diff --git a/xhistogram/duck_array_ops.py b/xhistogram/duck_array_ops.py