From 653efafcbfeb1ef621cdcaa241441ad92b7d85d6 Mon Sep 17 00:00:00 2001 From: Michael Osthege Date: Sun, 24 Jan 2021 23:09:45 +0100 Subject: [PATCH] Support imputations with ndarray data closes #4437 --- RELEASE-NOTES.md | 3 ++- pymc3/model.py | 33 ++++++++++++++++++++++--------- pymc3/tests/test_model_helpers.py | 11 +++++++---- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 65cde7cc6ed..99e35d659b3 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -1,10 +1,11 @@ # Release Notes -## PyMC3 vNext (on deck) +## PyMC3 vNext (3.11.1) ### Breaking Changes ### New Features ++ Automatic imputations now also work with `ndarray` data, not just `pd.Series` or `pd.DataFrame` (see [#4439](https://github.com/pymc-devs/pymc3/pull/4439)). ### Maintenance - `math.log1mexp_numpy` no longer raises RuntimeWarning when given very small inputs. These were commonly observed during NUTS sampling (see [#4428](https://github.com/pymc-devs/pymc3/pull/4428)). diff --git a/pymc3/model.py b/pymc3/model.py index 393c4d2f6a2..0794116809c 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -1695,16 +1695,31 @@ def pandas_to_array(data): XXX: When `data` is a generator, this will return a Theano tensor! 
""" - if hasattr(data, "values"): # pandas - if data.isnull().any().any(): # missing values - ret = np.ma.MaskedArray(data.values, data.isnull().values) + if hasattr(data, "to_numpy"): + # typically, but not limited to pandas objects + vals = data.to_numpy() + mask = np.isnan(vals) + if mask.any(): + # there are missing values + ret = np.ma.MaskedArray(vals, mask) else: - ret = data.values - elif hasattr(data, "mask"): - if data.mask.any(): - ret = data - else: # empty mask - ret = data.filled() + ret = vals + elif isinstance(data, np.ndarray): + if isinstance(data, np.ma.MaskedArray): + if not data.mask.any(): + # empty mask + ret = data.filled() + else: + # already masked and rightly so + ret = data + else: + # already a ndarray, but not masked + mask = np.isnan(data) + if np.any(mask): + ret = np.ma.MaskedArray(data, mask) + else: + # no masking required + ret = data elif isinstance(data, theano.graph.basic.Variable): ret = data elif sps.issparse(data): diff --git a/pymc3/tests/test_model_helpers.py b/pymc3/tests/test_model_helpers.py index 7b049bac707..b0d8efc9e34 100644 --- a/pymc3/tests/test_model_helpers.py +++ b/pymc3/tests/test_model_helpers.py @@ -41,9 +41,12 @@ def test_pandas_to_array(self, input_dtype): pandas_input = pd.DataFrame(dense_input) # All the even numbers are replaced with NaN - missing_pandas_input = pd.DataFrame( - np.array([[np.nan, 1, np.nan], [3, np.nan, 5], [np.nan, 7, np.nan]]) - ) + missing_numpy_input = np.array([ + [np.nan, 1, np.nan], + [3, np.nan, 5], + [np.nan, 7, np.nan] + ]) + missing_pandas_input = pd.DataFrame(missing_numpy_input) masked_array_input = ma.array(dense_input, mask=(np.mod(dense_input, 2) == 0)) # Create a generator object. 
Apparently the generator object needs to @@ -72,7 +75,7 @@ def test_pandas_to_array(self, input_dtype): # Check function behavior when using masked array inputs and pandas # objects with missing data - for input_value in [masked_array_input, missing_pandas_input]: + for input_value in [missing_numpy_input, masked_array_input, missing_pandas_input]: func_output = func(input_value) assert isinstance(func_output, ma.core.MaskedArray) assert func_output.shape == input_value.shape