Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dask: Data.percentile, Data.median #313

Merged
merged 17 commits into from
Feb 9, 2022
126 changes: 126 additions & 0 deletions cf/data/dask_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
instance, as would be passed to `dask.array.map_blocks`.

"""
from functools import reduce
from operator import mul

import dask.array as da
import numpy as np
Expand Down Expand Up @@ -184,6 +186,130 @@ def cf_harden_mask(a):
return a


def cf_percentile(a, q, axis, interpolation, keepdims=False, mtol=1):
    """Compute percentiles of the data along the specified axes.

    See `cf.Data.percentile` for further details.

    .. note:: If the input array has masked elements then the result
              is computed with `numpy.nanpercentile` on a NaN-filled
              copy, and a masked array is returned whenever at least
              one output element has to be masked.

    .. versionadded:: TODODASK

    .. seealso:: `cf.Data.percentile`

    :Parameters:

        a: `numpy.ndarray`
            Input array.

        q: `numpy.ndarray`
            Percentile or sequence of percentiles to compute, which
            must be between 0 and 100 inclusive.

        axis: `tuple` of `int`
            Axes along which the percentiles are computed. Negative
            axis positions are allowed.

        interpolation: `str`
            Specifies the interpolation method to use when the desired
            percentile lies between two data points ``i < j``. Must be
            one of ``'linear'``, ``'lower'``, ``'higher'``,
            ``'midpoint'``, or ``'nearest'``.

        keepdims: `bool`, optional
            If this is set to True, the axes which are reduced are
            left in the result as dimensions with size one. With this
            option, the result will broadcast correctly against the
            original array *a*.

        mtol: number, optional
            Set an upper limit of the amount input data values which
            are allowed to be missing data when contributing to
            individual output percentile values. It is defined as a
            fraction (between 0 and 1 inclusive) of the contributing
            input data values. The default is 1, meaning that a
            missing datum in the output array only occurs when all of
            its contributing input array elements are missing data. A
            value of 0 means that a missing datum in the output array
            occurs whenever any of its contributing input array
            elements are missing data.

    :Returns:

        `numpy.ndarray` or `None`
            The percentiles. If *q* has one or more dimensions then
            the leading axis of the result indexes the percentiles.
            `None` is returned if ``len(a)`` is zero.

    """
    # Local import: ``warnings`` is only needed by this function.
    import warnings

    if not len(a):
        return None

    if not np.ma.is_masked(a):
        # ------------------------------------------------------------
        # Input array is not masked
        # ------------------------------------------------------------
        return _percentile_with_method(
            np.percentile,
            a,
            q,
            axis,
            interpolation,
            keepdims,
            overwrite_input=False,
        )

    # ----------------------------------------------------------------
    # Input array is masked: Replace missing values with NaNs and
    # remask later.
    # ----------------------------------------------------------------
    if a.dtype != float:
        # Can't assign NaNs to integer arrays
        a = a.astype(float, copy=True)

    mask = None
    if mtol < 1:
        # Count the number of missing values that contribute to each
        # output percentile value and make a corresponding mask.
        #
        # Index the shape directly (rather than testing positions for
        # membership of 'axis') so that negative axes are counted.
        full_size = reduce(mul, (a.shape[i] for i in axis), 1)
        n_missing = full_size - np.ma.count(a, axis=axis, keepdims=keepdims)
        if n_missing.any():
            # 'mtol' is the largest *allowed* missing fraction, so
            # only mask when it is strictly exceeded. In particular,
            # mtol=0 must not mask outputs that have no missing
            # contributions.
            mask = np.asarray(n_missing > mtol * full_size)
            if q.ndim:
                # The percentiles occupy a new leading output axis
                mask = np.expand_dims(mask, 0)

    # 'a' has masked elements here, so this is a new array that is
    # safe to overwrite in the percentile calculation.
    a = np.ma.filled(a, np.nan)

    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message=".*All-NaN slice encountered",
            category=RuntimeWarning,
        )
        p = _percentile_with_method(
            np.nanpercentile,
            a,
            q,
            axis,
            interpolation,
            keepdims,
            overwrite_input=True,
        )

    # Update the mask for NaN points
    nan_mask = np.isnan(p)
    if nan_mask.any():
        mask = nan_mask if mask is None else (nan_mask | mask)

    # Mask any NaNs and elements that exceed the mtol threshold
    if mask is not None:
        p = np.ma.where(mask, np.ma.masked, p)

    return p


def _percentile_with_method(
    func, a, q, axis, interpolation, keepdims, overwrite_input
):
    """Call a numpy percentile function with the right method keyword.

    *func* is `numpy.percentile` or `numpy.nanpercentile`. The
    ``method`` keyword is tried first, falling back to the older
    ``interpolation`` keyword if *func* rejects the call with a
    `TypeError`.

    """
    kwargs = {
        "axis": axis,
        "keepdims": keepdims,
        "overwrite_input": overwrite_input,
    }
    try:
        return func(a, q, method=interpolation, **kwargs)
    except TypeError:
        # Presumably a numpy version that predates the 'method'
        # keyword. Any unrelated TypeError is raised again here.
        return func(a, q, interpolation=interpolation, **kwargs)


def cf_soften_mask(a):
"""Soften the mask of a masked `numpy` array.

Expand Down
Loading