Skip to content

Allow passing masked arrays with missing values to pm.Data() #6645

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ repos:
- id: black
- id: black-jupyter
- repo: https://github.com/PyCQA/pylint
rev: v2.17.0
rev: v2.17.1
hooks:
- id: pylint
args: [--rcfile=.pylintrc]
Expand Down
12 changes: 11 additions & 1 deletion pymc/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

import pymc as pm

from pymc.pytensorf import convert_observed_data
from pymc.pytensorf import convert_observed_data, unmask_masked_data

__all__ = [
"get_data",
Expand Down Expand Up @@ -419,10 +419,20 @@ def Data(
)
name = model.name_for(name)

if isinstance(value, np.ma.MaskedArray):
warnings.warn(
"If possible, masked arrays will be converted to standard numpy arrays with np.nan values for compatibility with PyTensor."
)

# `convert_observed_data` takes care of parameter `value` and
# transforms it to something digestible for PyTensor.
arr = convert_observed_data(value)

# because converted_observed_data() is also used outside pyTensor, we need an extra step to ensure that any masked arrays
# produced by it are converted back to np.ndarray() with np.nan value.
# This is not very efficient and will not be necessary once pyTensor implements MaskedArray support
arr = unmask_masked_data(arr)

if mutable is None:
warnings.warn(
"The `mutable` kwarg was not specified. Before v4.1.0 it defaulted to `pm.Data(mutable=True)`,"
Expand Down
30 changes: 27 additions & 3 deletions pymc/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
hessian,
inputvars,
replace_rvs_by_values,
unmask_masked_data,
)
from pymc.util import (
UNSET,
Expand Down Expand Up @@ -1184,7 +1185,19 @@ def set_data(

if isinstance(values, list):
values = np.array(values)

if isinstance(values, np.ma.MaskedArray):
warnings.warn(
"If possible, masked arrays will be converted to standard numpy arrays with np.nan values for compatibility with PyTensor."
)

values = convert_observed_data(values)

# because converted_observed_data() is also used outside pyTensor, we need an extra step to ensure that any masked arrays
# produced by it are converted back to np.ndarray() with np.nan value.
# This is not very efficient and will not be necessary once pyTensor implements MaskedArray support
values = unmask_masked_data(values)

dims = self.named_vars_to_dims.get(name, None) or ()
coords = coords or {}

Expand Down Expand Up @@ -1397,7 +1410,18 @@ def make_obs_var(
else:
rv_var.tag.test_value = data

mask = getattr(data, "mask", None)
# In case observed is a pyTensor variable, we need to retrieve the underlying array
# and convert it to a masked one to enable automatic imputation
if isinstance(data, SharedVariable):
values = data.get_value()
mask = np.isnan(values) if np.any(np.isnan(values)) else None
elif isinstance(data, TensorConstant):
values = data.data
mask = np.isnan(values) if np.any(np.isnan(values)) else None
else:
mask = getattr(data, "mask", None)
values = data

if mask is not None:
impute_message = (
f"Data in {rv_var} contains missing values and"
Expand Down Expand Up @@ -1443,7 +1467,7 @@ def make_obs_var(
# values, and another for the non-missing values.

antimask_idx = (~mask).nonzero()
nonmissing_data = pt.as_tensor_variable(data[antimask_idx])
nonmissing_data = pt.as_tensor_variable(values[antimask_idx])
unmasked_rv_var = rv_var[antimask_idx]
unmasked_rv_var = unmasked_rv_var.owner.clone().default_output()

Expand All @@ -1466,7 +1490,7 @@ def make_obs_var(

# Create deterministic that combines observed and missing
# Note: This can widely increase memory consumption during sampling for large datasets
rv_var = pt.empty(data.shape, dtype=observed_rv_var.type.dtype)
rv_var = pt.empty(values.shape, dtype=observed_rv_var.type.dtype)
rv_var = pt.set_subtensor(rv_var[mask.nonzero()], missing_rv_var)
rv_var = pt.set_subtensor(rv_var[antimask_idx], observed_rv_var)
rv_var = Deterministic(name, rv_var, self, dims)
Expand Down
22 changes: 22 additions & 0 deletions pymc/pytensorf.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,33 @@
"make_shared_replacements",
"generator",
"convert_observed_data",
"unmask_masked_data",
"compile_pymc",
"constant_fold",
]


def unmask_masked_data(data):
"""Unmask masked numpy arrays for usage within PyTensor"""

# PyTensor currently does not support masked arrays
# If a masked array is passed, we convert it to a standard numpy array with np.nans for float type arrays
# In case of integer type arrays, we throw an error as np.nan is a float concept.

if isinstance(data, np.ma.MaskedArray):
if "int" in str(data.dtype):
raise TypeError(
"Masked integer arrays (integer type datasets with missing values) are not supported by pm.Data() / pm.Model.set_data() at this time. \n"
"Consider if using a float type fits your use case. \n"
"Alternatively, if you want to benefit from automatic imputation in pyMC, pass a masked array directly to `observed=` parameter when defining a distribution."
)
else:
ret = data.filled(fill_value=np.nan)
else:
ret = data
return ret


def convert_observed_data(data):
"""Convert user provided dataset to accepted formats."""

Expand Down
27 changes: 27 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,33 @@ def test_get_data():
assert type(data) == io.BytesIO


def test_masked_data_mutable():
with pm.Model():
data = np.ma.MaskedArray([1.0, 2.0, 3], [0, 0, 1])
expected = np.array([1, 2, np.nan])
with pytest.warns(UserWarning, match="masked arrays"):
result = pm.MutableData("test", data).get_value()
np.testing.assert_array_equal(result, expected)


def test_masked_data_constant():
with pm.Model():
data = np.ma.MaskedArray([1.0, 2.0, 3], [0, 0, 1])
expected = np.array([1, 2, np.nan])
with pytest.warns(UserWarning, match="masked arrays"):
result = pm.ConstantData("test", data).data
np.testing.assert_array_equal(result, expected)


def test_masked_integer_data():
with pm.Model():
data = np.ma.MaskedArray([1, 2, 3], [0, 0, 1])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Integers should be fine, otherwise we can't input discrete variables?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, we can't - you cannot have an integer NumPy array with nan values, ie. this throws an error: np.array([1,2,3, np.nan[. dtype=int). That's because nan is strictly a float concept. So yes, we would not be able to allow users pass an integer masked array into pm.Data(). If they want to benefit from automatic imputation, they can today (and will be after this PR) pass a masked integer array directly into observed parameter of an RV. I have an error message that explains the options in the code:

28d15f8#diff-823b37f218229d363550b4cc387cfffa180c5c6e0e5ad0e174f2f0be7aa4692aR102

if isinstance(data, np.ma.MaskedArray):
        if "int" in str(data.dtype):
            raise TypeError(
                "Masked integer arrays (integer type datasets with missing values) are not supported by pm.Data() / pm.Model.set_data() at this time. \n"
                "Consider if using a float type fits your use case. \n"
                "Alternatively, if you want to benefit from automatic imputation in pyMC, pass a masked array directly to `observed=` parameter when defining a distribution."
            )
        else:
            ret = data.filled(fill_value=np.nan)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't clear. Is any error raised if user pass floats observed to discrete variables? I think it works just fine.

with pytest.raises(TypeError, match="Masked integer"):
pm.ConstantData("test", data)
with pytest.raises(TypeError, match="Masked integer"):
pm.MutableData("test", data)


class _DataSampler:
"""
Not for users
Expand Down
Loading