pymc-devs · kamicollo · Apr 2, 2023 · Apr 2, 2023 · Apr 2, 2023 · Apr 3, 2023
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -39,7 +39,7 @@ repos:
     - id: black
     - id: black-jupyter
 - repo: https://github.com/PyCQA/pylint
-  rev: v2.17.0
+  rev: v2.17.1
   hooks:
     - id: pylint
       args: [--rcfile=.pylintrc]

diff --git a/pymc/data.py b/pymc/data.py
@@ -36,7 +36,7 @@
 
 import pymc as pm
 
-from pymc.pytensorf import convert_observed_data
+from pymc.pytensorf import convert_observed_data, unmask_masked_data
 
 __all__ = [
     "get_data",
@@ -419,10 +419,20 @@ def Data(
         )
     name = model.name_for(name)
 
+    if isinstance(value, np.ma.MaskedArray):
+        warnings.warn(
+            "If possible, masked arrays will be converted to standard numpy arrays with np.nan values for compatibility with PyTensor."
+        )
+
     # `convert_observed_data` takes care of parameter `value` and
     # transforms it to something digestible for PyTensor.
     arr = convert_observed_data(value)
 
+    # because converted_observed_data() is also used outside pyTensor, we need an extra step to ensure that any masked arrays
+    # produced by it are converted back to np.ndarray() with np.nan value.
+    # This is not very efficient and will not be necessary once pyTensor implements MaskedArray support
+    arr = unmask_masked_data(arr)
+
     if mutable is None:
         warnings.warn(
             "The `mutable` kwarg was not specified. Before v4.1.0 it defaulted to `pm.Data(mutable=True)`,"

diff --git a/pymc/model.py b/pymc/model.py
@@ -75,6 +75,7 @@
     hessian,
     inputvars,
     replace_rvs_by_values,
+    unmask_masked_data,
 )
 from pymc.util import (
     UNSET,
@@ -1184,7 +1185,19 @@ def set_data(
 
         if isinstance(values, list):
             values = np.array(values)
+
+        if isinstance(values, np.ma.MaskedArray):
+            warnings.warn(
+                "If possible, masked arrays will be converted to standard numpy arrays with np.nan values for compatibility with PyTensor."
+            )
+
         values = convert_observed_data(values)
+
+        # because converted_observed_data() is also used outside pyTensor, we need an extra step to ensure that any masked arrays
+        # produced by it are converted back to np.ndarray() with np.nan value.
+        # This is not very efficient and will not be necessary once pyTensor implements MaskedArray support
+        values = unmask_masked_data(values)
+
         dims = self.named_vars_to_dims.get(name, None) or ()
         coords = coords or {}
 
@@ -1397,7 +1410,18 @@ def make_obs_var(
             else:
                 rv_var.tag.test_value = data
 
-        mask = getattr(data, "mask", None)
+        # In case observed is a pyTensor variable, we need to retrieve the underlying array
+        # and convert it to a masked one to enable automatic imputation
+        if isinstance(data, SharedVariable):
+            values = data.get_value()
+            mask = np.isnan(values) if np.any(np.isnan(values)) else None
+        elif isinstance(data, TensorConstant):
+            values = data.data
+            mask = np.isnan(values) if np.any(np.isnan(values)) else None
+        else:
+            mask = getattr(data, "mask", None)
+            values = data
+
         if mask is not None:
             impute_message = (
                 f"Data in {rv_var} contains missing values and"
@@ -1443,7 +1467,7 @@ def make_obs_var(
             # values, and another for the non-missing values.
 
             antimask_idx = (~mask).nonzero()
-            nonmissing_data = pt.as_tensor_variable(data[antimask_idx])
+            nonmissing_data = pt.as_tensor_variable(values[antimask_idx])
             unmasked_rv_var = rv_var[antimask_idx]
             unmasked_rv_var = unmasked_rv_var.owner.clone().default_output()
 
@@ -1466,7 +1490,7 @@ def make_obs_var(
 
             # Create deterministic that combines observed and missing
             # Note: This can widely increase memory consumption during sampling for large datasets
-            rv_var = pt.empty(data.shape, dtype=observed_rv_var.type.dtype)
+            rv_var = pt.empty(values.shape, dtype=observed_rv_var.type.dtype)
             rv_var = pt.set_subtensor(rv_var[mask.nonzero()], missing_rv_var)
             rv_var = pt.set_subtensor(rv_var[antimask_idx], observed_rv_var)
             rv_var = Deterministic(name, rv_var, self, dims)

diff --git a/pymc/pytensorf.py b/pymc/pytensorf.py
@@ -84,11 +84,33 @@
     "make_shared_replacements",
     "generator",
     "convert_observed_data",
+    "unmask_masked_data",
     "compile_pymc",
     "constant_fold",
 ]
 
 
+def unmask_masked_data(data):
+    """Unmask masked numpy arrays for usage within PyTensor"""
+
+    # PyTensor currently does not support masked arrays
+    # If a masked array is passed, we convert it to a standard numpy array with np.nans for float type arrays
+    # In case of integer type arrays, we throw an error as np.nan is a float concept.
+
+    if isinstance(data, np.ma.MaskedArray):
+        if "int" in str(data.dtype):
+            raise TypeError(
+                "Masked integer arrays (integer type datasets with missing values) are not supported by pm.Data() / pm.Model.set_data() at this time. \n"
+                "Consider if using a float type fits your use case. \n"
+                "Alternatively, if you want to benefit from automatic imputation in pyMC, pass a masked array directly to `observed=` parameter when defining a distribution."
+            )
+        else:
+            ret = data.filled(fill_value=np.nan)
+    else:
+        ret = data
+    return ret
+
+
 def convert_observed_data(data):
     """Convert user provided dataset to accepted formats."""
 

diff --git a/tests/test_data.py b/tests/test_data.py
@@ -454,6 +454,33 @@ def test_get_data():
     assert type(data) == io.BytesIO
 
 
+def test_masked_data_mutable():
+    with pm.Model():
+        data = np.ma.MaskedArray([1.0, 2.0, 3], [0, 0, 1])
+        expected = np.array([1, 2, np.nan])
+        with pytest.warns(UserWarning, match="masked arrays"):
+            result = pm.MutableData("test", data).get_value()
+        np.testing.assert_array_equal(result, expected)
+
+
+def test_masked_data_constant():
+    with pm.Model():
+        data = np.ma.MaskedArray([1.0, 2.0, 3], [0, 0, 1])
+        expected = np.array([1, 2, np.nan])
+        with pytest.warns(UserWarning, match="masked arrays"):
+            result = pm.ConstantData("test", data).data
+        np.testing.assert_array_equal(result, expected)
+
+
+def test_masked_integer_data():
+    with pm.Model():
+        data = np.ma.MaskedArray([1, 2, 3], [0, 0, 1])
+        with pytest.raises(TypeError, match="Masked integer"):
+            pm.ConstantData("test", data)
+        with pytest.raises(TypeError, match="Masked integer"):
+            pm.MutableData("test", data)
+
+
 class _DataSampler:
     """
     Not for users