Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Multidimensional bins #59

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
267 changes: 165 additions & 102 deletions versioneer.py

Large diffs are not rendered by default.

264 changes: 203 additions & 61 deletions xhistogram/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,33 @@
import numpy as np
from functools import reduce
from collections.abc import Iterable
from .duck_array_ops import (
from numpy import (
digitize,
bincount,
reshape,
ravel_multi_index,
concatenate,
broadcast_arrays,
broadcast_to,
)

# range is a keyword so save the builtin so they can use it.
_range = range

try:
import dask.array as dsa

has_dask = True
except ImportError:
has_dask = False


def _any_dask_array(*args):
if not has_dask:
return False
else:
return any(isinstance(a, dsa.core.Array) for a in args)


def _ensure_correctly_formatted_bins(bins, N_expected):
# TODO: This could be done better / more robustly
Expand Down Expand Up @@ -120,8 +135,8 @@ def _dispatch_bincount(bin_indices, weights, N, hist_shapes, block_size=None):
return _bincount_loop(bin_indices, weights, N, hist_shapes, block_chunks)


def _histogram_2d_vectorized(
*args, bins=None, weights=None, density=False, right=False, block_size=None
def _bincount_2d_vectorized(
*args, bins=None, weights=None, block_size=None
):
"""Calculate the histogram independently on each row of a 2D array"""

Expand All @@ -131,14 +146,17 @@ def _histogram_2d_vectorized(
# consistency checks for inputa
for a, b in zip(args, bins):
assert a.ndim == 2
assert b.ndim == 1
#assert b.ndim == 1
assert a.shape == a0.shape
if weights is not None:
assert weights.shape == a0.shape

nrows, ncols = a0.shape
nbins = [len(b) for b in bins]
hist_shapes = [nb + 1 for nb in nbins]

#bins = [np.expand_dims(b, axis=0) if b.ndim == 1 else b for b in bins]

# TODO assuming all bins have same form here
b0 = bins[0]

# a marginally faster implementation would be to use searchsorted,
# like numpy histogram itself does
Expand All @@ -150,14 +168,38 @@ def _histogram_2d_vectorized(
# https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/calibration.py#L592
# but a better approach would be to use something like _search_sorted_inclusive() in
# numpy histogram. This is an additional motivation for moving to searchsorted
bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins]
# TODO wouldn't need ifs if we just promoted all bins to 2D

if b0.ndim == 1:
bins = [np.concatenate((b[:-1], b[-1:] + 1e-8)) for b in bins]
elif b0.ndim == 2:
bins = [np.concatenate((b[:, :-1], b[:, -1:] + 1e-8), axis=1) for b in bins]

# the maximum possible value of of digitize is nbins
# for right=False:
# - 0 corresponds to a < b[0]
# - i corresponds to bins[i-1] <= a < b[i]
# - nbins corresponds to a a >= b[1]
each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)]

if b0.ndim == 1:
nbins = [len(b) for b in bins]
hist_shapes = [nb + 1 for nb in nbins]

each_bin_indices = [digitize(a, b) for a, b in zip(args, bins)]
elif b0.ndim == 2:
nbins = [b.shape[1] for b in bins]
hist_shapes = [nb + 1 for nb in nbins]

# Apply digitize separately to each row with different bins
each_bin_indices = []
for a, b in zip(args, bins):
each_bin_indices_single_var = np.stack([digitize(a[row, :], b[row, :])
for row in np.arange(a.shape[0])])
each_bin_indices.append(each_bin_indices_single_var)

# TODO check if this array is correct!
print(each_bin_indices)

# product of the bins gives the joint distribution
if N_inputs > 1:
bin_indices = ravel_multi_index(each_bin_indices, hist_shapes)
Expand All @@ -178,6 +220,72 @@ def _histogram_2d_vectorized(
return bin_counts


def _bincount(*all_arrays, weights=False, axis=None, density=False):

all_arrays = list(all_arrays)

weights_array = all_arrays.pop()

# TODO a more robust way to pass the bins together with the arrays
n_args = len(all_arrays) // 2
arrays = all_arrays[:n_args]
bins = all_arrays[n_args:]

# is this necessary? (it is necessary for the weights to match the data)
all_arrays_broadcast = broadcast_arrays(*arrays)

a0 = all_arrays_broadcast[0]

do_full_array = (axis is None) or (set(axis) == set(_range(a0.ndim)))

if do_full_array:
kept_axes_shape = (1,) * a0.ndim
else:
kept_axes_shape = tuple(
[a0.shape[i] if i not in axis else 1 for i in _range(a0.ndim)]
)

def reshape_input(a):
if do_full_array:
d = a.ravel()[None, :]
else:
# reshape the array to 2D
# axis 0: preserved axis after histogram
# axis 1: calculate histogram along this axis
new_pos = tuple(_range(-len(axis), 0))
c = np.moveaxis(a, axis, new_pos)
split_idx = c.ndim - len(axis)
dims_0 = c.shape[:split_idx]
# assert dims_0 == kept_axes_shape
dims_1 = c.shape[split_idx:]
new_dim_0 = np.prod(dims_0)
new_dim_1 = np.prod(dims_1)
# TODO integer vs float logic here is not robust
d = reshape(c, (new_dim_0, new_dim_1))
return d

all_arrays_reshaped = [reshape_input(a) for a in all_arrays_broadcast]

if any(b.ndim > 1 for b in bins):
bins_reshaped = [reshape_input(b) for b in bins]
else:
bins_reshaped = bins
if weights:
weights_broadcast = broadcast_to(weights_array, a0.shape)
weights_reshaped = reshape_input(weights_broadcast)
else:
weights_reshaped = None

bin_counts = _bincount_2d_vectorized(
*all_arrays_reshaped, bins=bins_reshaped, weights=weights_reshaped
)

final_shape = kept_axes_shape + bin_counts.shape[1:]
bin_counts = reshape(bin_counts, final_shape)

return bin_counts


def histogram(
*args,
bins=None,
Expand Down Expand Up @@ -280,69 +388,110 @@ def histogram(
ax_positive = ndim + ax
assert ax_positive < ndim, "axis must be less than ndim"
axis_normed.append(ax_positive)
axis = np.atleast_1d(axis_normed)
axis = [int(i) for i in axis_normed]

do_full_array = (axis is None) or (set(axis) == set(_range(a0.ndim)))
if do_full_array:
kept_axes_shape = None
else:
kept_axes_shape = tuple([a0.shape[i] for i in _range(a0.ndim) if i not in axis])
all_arrays = list(args)
n_inputs = len(all_arrays)

all_args = list(args)
# TODO make feeding weights in less janky
if weights is not None:
all_args += [weights]
all_args_broadcast = broadcast_arrays(*all_args)

def reshape_input(a):
if do_full_array:
d = a.ravel()[None, :]
else:
# reshape the array to 2D
# axis 0: preserved axis after histogram
# axis 1: calculate histogram along this axis
new_pos = tuple(_range(-len(axis), 0))
c = np.moveaxis(a, axis, new_pos)
split_idx = c.ndim - len(axis)
dims_0 = c.shape[:split_idx]
assert dims_0 == kept_axes_shape
dims_1 = c.shape[split_idx:]
new_dim_0 = np.prod(dims_0)
new_dim_1 = np.prod(dims_1)
d = reshape(c, (new_dim_0, new_dim_1))
return d
has_weights = True
else:
has_weights = False

all_args_reshaped = [reshape_input(a) for a in all_args_broadcast]
dtype = "i8" if not has_weights else weights.dtype

if weights is not None:
weights_reshaped = all_args_reshaped.pop()
else:
weights_reshaped = None
# here I am assuming all the arrays have the same shape
# probably needs to be generalized
input_indexes = [tuple(_range(a.ndim)) for a in all_arrays]
input_index = input_indexes[0]
assert all([ii == input_index for ii in input_indexes])

# Some sanity checks and format bins and range correctly
bins = _ensure_correctly_formatted_bins(bins, n_inputs)
formatted_bins = _ensure_correctly_formatted_bins(bins, n_inputs)
range = _ensure_correctly_formatted_range(range, n_inputs)

# histogram_bin_edges trigges computation on dask arrays. It would be possible
# to write a version of this that doesn't trigger when `range` is provided, but
# for now let's just use np.histogram_bin_edges
if is_dask_array:
if not all([isinstance(b, np.ndarray) for b in bins]):
if not all([isinstance(b, np.ndarray) for b in formatted_bins]):
raise TypeError(
"When using dask arrays, bins must be provided as numpy array(s) of edges"
)
bins = formatted_bins
else:
bins = [
np.histogram_bin_edges(a, b, r, weights_reshaped)
for a, b, r in zip(all_args_reshaped, bins, range)
]

bin_counts = _histogram_2d_vectorized(
*all_args_reshaped,
bins=bins,
weights=weights_reshaped,
density=density,
block_size=block_size,
)
bins = []
for a, b, r in zip(all_arrays, formatted_bins, range):
if isinstance(b, np.ndarray):
# account for possibility that bins is a >1d numpy array
pass
else:
b = np.histogram_bin_edges(a, b, r)
bins.append(b)
bincount_kwargs = dict(weights=has_weights, axis=axis, density=density)

# broadcast bins to match reduced data
# TODO assumes all bins arguments have the same shape
b0 = bins[0]
broadcast_bins_shape = [ii for ii in a0.shape if ii not in axis] + list(b0.shape)
bins = [np.broadcast_to(b, broadcast_bins_shape) for b in bins]

# keep these axes in the inputs
if axis is not None:
drop_axes = tuple([ii for ii in input_index if ii in axis])
else:
drop_axes = input_index

if _any_dask_array(weights, *all_arrays):
# We should be able to just apply the bin_count function to every
# block and then sum over all blocks to get the total bin count.
# The main challenge is to figure out the chunk shape that will come
# out of _bincount. We might also need to add dummy dimensions to sum
# over in the _bincount function
import dask.array as dsa

# Important note from blockwise docs
# > Any index, like i missing from the output index is interpreted as a contraction...
# > In the case of a contraction the passed function should expect an iterable of blocks
# > on any array that holds that index.
# This means that we need to have all the input indexes present in the output index
# However, they will be reduced to singleton (len 1) dimensions

adjust_chunks = {i: (lambda x: 1) for i in drop_axes}

new_axes = {
max(input_index) + 1 + i: axis_len
for i, axis_len in enumerate([len(bin) - 1 for bin in bins])
}
out_index = input_index + tuple(new_axes)

blockwise_args = []
for arg in all_arrays:
blockwise_args.append(arg)
blockwise_args.append(input_index)

# Bins arrays do not contain axes which will get reduced along
# TODO incorrect for >1D bins - how do we know what broadcast axes are on bins here?
bins_input_index = tuple(new_axes.keys())
for b in bins:
blockwise_args.append(b)
blockwise_args.append(bins_input_index)

bin_counts = dsa.blockwise(
_bincount,
out_index,
*blockwise_args,
new_axes=new_axes,
adjust_chunks=adjust_chunks,
meta=np.array((), dtype),
**bincount_kwargs,
)
# sum over the block dims
bin_counts = bin_counts.sum(drop_axes)
else:
# drop the extra axis used for summing over blocks
bin_counts = _bincount(*(all_arrays + bins + [weights]), **bincount_kwargs).squeeze(drop_axes)

if density:
# Normalise by dividing by bin counts and areas such that all the
Expand All @@ -360,11 +509,4 @@ def reshape_input(a):
else:
h = bin_counts

if h.shape[0] == 1:
assert do_full_array
h = h.squeeze()
else:
final_shape = kept_axes_shape + h.shape[1:]
h = reshape(h, final_shape)

return h, bins
39 changes: 0 additions & 39 deletions xhistogram/duck_array_ops.py

This file was deleted.

Loading