- Float masked arrays are filled with nan when

kamicollo · kamicollo · commit 28d15f809ddf · 2023-04-01T22:48:12.000-04:00
passed to pm.Data() and pm.Model().set_data()
- Integer masked arrays trigger an error message and
provide suggested alternatives
diff --git a/pymc/data.py b/pymc/data.py
@@ -36,7 +36,7 @@
 
 import pymc as pm
 
-from pymc.pytensorf import convert_observed_data
+from pymc.pytensorf import convert_observed_data, unmask_masked_data
 
 __all__ = [
     "get_data",
@@ -419,10 +419,20 @@ def Data(
         )
     name = model.name_for(name)
 
+    if isinstance(value, np.ma.MaskedArray):
+        warnings.warn(
+            "If possible, masked arrays will be converted to standard numpy arrays with np.nan values for compatibility with PyTensor."
+        )
+
     # `convert_observed_data` takes care of parameter `value` and
     # transforms it to something digestible for PyTensor.
     arr = convert_observed_data(value)
 
+    # Because `convert_observed_data()` is also used outside PyTensor, we need an extra step to ensure that any masked arrays
+    # it produces are converted back to plain `np.ndarray`s with the masked entries set to `np.nan`.
+    # This is not very efficient and will no longer be necessary once PyTensor supports masked arrays.
+    arr = unmask_masked_data(arr)
+
     if mutable is None:
         warnings.warn(
             "The `mutable` kwarg was not specified. Before v4.1.0 it defaulted to `pm.Data(mutable=True)`,"
diff --git a/pymc/model.py b/pymc/model.py
@@ -75,6 +75,7 @@
     hessian,
     inputvars,
     replace_rvs_by_values,
+    unmask_masked_data,
 )
 from pymc.util import (
     UNSET,
@@ -1184,7 +1185,19 @@ def set_data(
 
         if isinstance(values, list):
             values = np.array(values)
+
+        if isinstance(values, np.ma.MaskedArray):
+            warnings.warn(
+                "If possible, masked arrays will be converted to standard numpy arrays with np.nan values for compatibility with PyTensor."
+            )
+
         values = convert_observed_data(values)
+
+        # Because `convert_observed_data()` is also used outside PyTensor, we need an extra step to ensure that any masked arrays
+        # it produces are converted back to plain `np.ndarray`s with the masked entries set to `np.nan`.
+        # This is not very efficient and will no longer be necessary once PyTensor supports masked arrays.
+        values = unmask_masked_data(values)
+
         dims = self.named_vars_to_dims.get(name, None) or ()
         coords = coords or {}
 
diff --git a/pymc/pytensorf.py b/pymc/pytensorf.py
@@ -84,11 +84,33 @@
     "make_shared_replacements",
     "generator",
     "convert_observed_data",
+    "unmask_masked_data",
     "compile_pymc",
     "constant_fold",
 ]
 
 
+def unmask_masked_data(data):
+    """Replace masked entries with NaN so the data can be used within PyTensor.
+
+    PyTensor currently does not support masked arrays. Input that is not a
+    ``np.ma.MaskedArray`` is returned unchanged; a non-integer masked array is
+    returned via ``data.filled(fill_value=np.nan)``. A masked integer array
+    raises ``TypeError``, because ``np.nan`` is a float concept.
+    """
+    if not isinstance(data, np.ma.MaskedArray):
+        return data
+    # Test the dtype itself rather than substring-matching its string form, which
+    # also matches structured dtypes whose field names merely contain "int".
+    if np.issubdtype(data.dtype, np.integer):
+        raise TypeError(
+            "Masked integer arrays (integer type datasets with missing values) are not supported by pm.Data() / pm.Model.set_data() at this time. \n"
+            "Consider if using a float type fits your use case. \n"
+            "Alternatively, if you want to benefit from automatic imputation in pyMC, pass a masked array directly to `observed=` parameter when defining a distribution."
+        )
+    return data.filled(fill_value=np.nan)
+
+
 def convert_observed_data(data):
     """Convert user provided dataset to accepted formats."""
 
diff --git a/tests/test_data.py b/tests/test_data.py
@@ -454,6 +454,33 @@ def test_get_data():
     assert type(data) == io.BytesIO
 
 
+def test_masked_data_mutable():
+    with pm.Model():
+        data = np.ma.MaskedArray([1.0, 2.0, 3], [0, 0, 1])
+        expected = np.array([1, 2, np.nan])
+        with pytest.warns(UserWarning, match="masked arrays"):
+            result = pm.MutableData("test", data).get_value()
+        np.testing.assert_array_equal(result, expected)
+
+
+def test_masked_data_constant():
+    with pm.Model():
+        data = np.ma.MaskedArray([1.0, 2.0, 3], [0, 0, 1])
+        expected = np.array([1, 2, np.nan])
+        with pytest.warns(UserWarning, match="masked arrays"):
+            result = pm.ConstantData("test", data).data
+        np.testing.assert_array_equal(result, expected)
+
+
+def test_masked_integer_data():
+    with pm.Model():
+        data = np.ma.MaskedArray([1, 2, 3], [0, 0, 1])
+        with pytest.raises(TypeError, match="Masked integer"):
+            pm.ConstantData("test", data)
+        with pytest.raises(TypeError, match="Masked integer"):
+            pm.MutableData("test", data)
+
+
 class _DataSampler:
     """
     Not for users
diff --git a/tests/test_model.py b/tests/test_model.py
@@ -967,6 +967,27 @@ def test_set_data_constant_shape_error():
         pmodel.set_data("y", np.arange(10))
 
 
+def test_set_data_masked_array():
+    data = np.ma.MaskedArray([1.0, 2.0, 3], [0, 0, 1])
+
+    with pm.Model() as pmodel:
+        D = pm.MutableData("test", np.zeros(4))
+
+    with pytest.warns(UserWarning, match="masked arrays"):
+        pmodel.set_data("test", data)
+    result = D.get_value()
+    expected = np.array([1.0, 2.0, np.nan])
+    np.testing.assert_array_equal(result, expected)
+
+
+def test_set_data_masked_integer_array():
+    with pm.Model() as pmodel:
+        D = pm.MutableData("test", np.zeros(4))
+    with pytest.warns(UserWarning, match="masked arrays"):
+        with pytest.raises(TypeError, match="Masked integer"):
+            pmodel.set_data("test", np.ma.MaskedArray([1, 2, 3], [0, 0, 1]))
+
+
 def test_model_deprecation_warning():
     with pm.Model() as m:
         x = pm.Normal("x", 0, 1, size=2)
diff --git a/tests/test_pytensorf.py b/tests/test_pytensorf.py
@@ -49,6 +49,7 @@
     replace_rvs_by_values,
     reseed_rngs,
     rvs_to_value_vars,
+    unmask_masked_data,
     walk_model,
 )
 from pymc.testing import assert_no_rvs
@@ -269,6 +270,25 @@ def test_convert_observed_data(input_dtype):
     assert isinstance(wrapped, TensorVariable)
 
 
+def test_unmask_masked_data():
+    # test with non-masked data
+    data = np.array([1, 2, 3])
+    result = unmask_masked_data(data)
+    expected = np.array([1, 2, 3])
+    np.testing.assert_array_equal(result, expected)
+
+    # test with masked float data
+    data = np.ma.MaskedArray([1.0, 2.0, 3.0], [0, 0, 1])
+    result = unmask_masked_data(data)
+    expected = np.array([1.0, 2.0, np.nan])
+    np.testing.assert_array_equal(result, expected)
+
+    # test with integer masked data
+    data = np.ma.MaskedArray([1, 2, 3], [0, 0, 1])
+    with pytest.raises(TypeError, match="Masked integer"):
+        unmask_masked_data(data)
+
+
 def test_pandas_to_array_pandas_index():
     data = pd.Index([1, 2, 3])
     result = convert_observed_data(data)