Allow specification of dims instead of shape (#3551)

* Allow specification of dims instead of shape * Add pm.TidyData * Create coords for pm.Data(ndarray) * empty commit to trigger CI * Apply suggestions from code review Co-authored-by: Alexandre ANDORRA <andorra.alexandre@gmail.com> * apply black formatting * address review comments & formatting * Add demonstration of named coordinates/dims * don't require dim names to be identifiers * sort imports * raise ShapeError instead of ValueError * formatting * robustify Dtype and ShapeError * Removed TidyData and refined dims and coords implementation * Changed name of kwarg export_dims and improved docstrings * Add link to ArviZ in docstrings * Removed TidyData from __all__ * Polished Data container NB * Fixed line break in data.py * Fix inference of coords for dataframes * Refined Data container NB * Updated getting started NB with new dims and coords features * Reran getting started NB * Blackified NBs * rerun with ArviZ branch * use np.shape to be compatible with tuples/lists * add tests for named coordinate handling * Extended tests for data container Co-authored-by: Michael Osthege <m.osthege@fz-juelich.de> Co-authored-by: Michael Osthege <michael.osthege@outlook.com> Co-authored-by: Alexandre ANDORRA <andorra.alexandre@gmail.com>
pymc-devs · Jun 10, 2020 · 76f3a7b · 76f3a7b
1 parent 8a8beab
commit 76f3a7b
Show file tree

Hide file tree

Showing 9 changed files with 2,375 additions and 728 deletions.
diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
@@ -17,6 +17,7 @@
 - `pm.LKJCholeskyCov` now automatically computes and returns the unpacked Cholesky decomposition, the correlations and the standard deviations of the covariance matrix (see [#3881](https://github.com/pymc-devs/pymc3/pull/3881)).
 - `pm.Data` container can now be used for index variables, i.e with integer data and not only floats (issue [#3813](https://github.com/pymc-devs/pymc3/issues/3813), fixed by [#3925](https://github.com/pymc-devs/pymc3/pull/3925)).
 - `pm.Data` container can now be used as input for other random variables (issue [#3842](https://github.com/pymc-devs/pymc3/issues/3842), fixed by [#3925](https://github.com/pymc-devs/pymc3/pull/3925)).
+- Allow users to specify coordinates and dimension names instead of numerical shapes when specifying a model. This makes interoperability with ArviZ easier. ([see #3551](https://github.com/pymc-devs/pymc3/pull/3551))
 - Plots and Stats API sections now link to ArviZ documentation [#3927](https://github.com/pymc-devs/pymc3/pull/3927)
 - Add `SamplerReport` with properties `n_draws`, `t_sampling` and `n_tune` to SMC. `n_tune` is always 0 [#3931](https://github.com/pymc-devs/pymc3/issues/3931).
 - SMC-ABC: add option to define summary statistics, allow to sample from more complex models, remove redundant distances [#3940](https://github.com/pymc-devs/pymc3/issues/3940)

diff --git a/docs/source/notebooks/data_container.ipynb b/docs/source/notebooks/data_container.ipynb
diff --git a/docs/source/notebooks/getting_started.ipynb b/docs/source/notebooks/getting_started.ipynb
diff --git a/pymc3/data.py b/pymc3/data.py
@@ -12,23 +12,25 @@
 #   See the License for the specific language governing permissions and
 #   limitations under the License.
 
-from typing import Dict, List, Any
+import collections
 from copy import copy
 import io
 import os
 import pkgutil
-import collections
+from typing import Dict, List, Any
+
 import numpy as np
+import pandas as pd
 import pymc3 as pm
 import theano.tensor as tt
 import theano
 
 __all__ = [
-    'get_data',
-    'GeneratorAdapter',
-    'Minibatch',
-    'align_minibatches',
-    'Data',
+    "get_data",
+    "GeneratorAdapter",
+    "Minibatch",
+    "align_minibatches",
+    "Data",
 ]
 
 
@@ -44,8 +46,8 @@ def get_data(filename):
     -------
     BytesIO of the data
     """
-    data_pkg = 'pymc3.examples'
-    return io.BytesIO(pkgutil.get_data(data_pkg, os.path.join('data', filename)))
+    data_pkg = "pymc3.examples"
+    return io.BytesIO(pkgutil.get_data(data_pkg, os.path.join("data", filename)))
 
 
 class GenTensorVariable(tt.TensorVariable):
@@ -78,14 +80,14 @@ def make_variable(self, gop, name=None):
 
     def __init__(self, generator):
         if not pm.vartypes.isgenerator(generator):
-            raise TypeError('Object should be generator like')
+            raise TypeError("Object should be generator like")
         self.test_value = pm.smartfloatX(copy(next(generator)))
         # make pickling potentially possible
         self._yielded_test_value = False
         self.gen = generator
         self.tensortype = tt.TensorType(
-            self.test_value.dtype,
-            ((False, ) * self.test_value.ndim))
+            self.test_value.dtype, ((False,) * self.test_value.ndim)
+        )
 
     # python3 generator
     def __next__(self):
@@ -283,28 +285,37 @@ class Minibatch(tt.TensorVariable):
         >>> assert x.eval().shape == (2, 20, 20, 40, 10)
     """
 
-    RNG = collections.defaultdict(list) # type: Dict[str, List[Any]]
-
-    @theano.configparser.change_flags(compute_test_value='raise')
-    def __init__(self, data, batch_size=128, dtype=None, broadcastable=None, name='Minibatch',
-                 random_seed=42, update_shared_f=None, in_memory_size=None):
+    RNG = collections.defaultdict(list)  # type: Dict[str, List[Any]]
+
+    @theano.configparser.change_flags(compute_test_value="raise")
+    def __init__(
+        self,
+        data,
+        batch_size=128,
+        dtype=None,
+        broadcastable=None,
+        name="Minibatch",
+        random_seed=42,
+        update_shared_f=None,
+        in_memory_size=None,
+    ):
         if dtype is None:
             data = pm.smartfloatX(np.asarray(data))
         else:
             data = np.asarray(data, dtype)
         in_memory_slc = self.make_static_slices(in_memory_size)
         self.shared = theano.shared(data[in_memory_slc])
         self.update_shared_f = update_shared_f
-        self.random_slc = self.make_random_slices(self.shared.shape, batch_size, random_seed)
+        self.random_slc = self.make_random_slices(
+            self.shared.shape, batch_size, random_seed
+        )
         minibatch = self.shared[self.random_slc]
         if broadcastable is None:
-            broadcastable = (False, ) * minibatch.ndim
+            broadcastable = (False,) * minibatch.ndim
         minibatch = tt.patternbroadcast(minibatch, broadcastable)
         self.minibatch = minibatch
         super().__init__(self.minibatch.type, None, None, name=name)
-        theano.Apply(
-            theano.compile.view_op,
-            inputs=[self.minibatch], outputs=[self])
+        theano.Apply(theano.compile.view_op, inputs=[self.minibatch], outputs=[self])
         self.tag.test_value = copy(self.minibatch.tag.test_value)
 
     def rslice(self, total, size, seed):
@@ -313,11 +324,11 @@ def rslice(self, total, size, seed):
         elif isinstance(size, int):
             rng = pm.tt_rng(seed)
             Minibatch.RNG[id(self)].append(rng)
-            return (rng
-                    .uniform(size=(size, ), low=0.0, high=pm.floatX(total) - 1e-16)
-                    .astype('int64'))
+            return rng.uniform(
+                size=(size,), low=0.0, high=pm.floatX(total) - 1e-16
+            ).astype("int64")
         else:
-            raise TypeError('Unrecognized size type, %r' % size)
+            raise TypeError("Unrecognized size type, %r" % size)
 
     def __del__(self):
         del Minibatch.RNG[id(self)]
@@ -340,17 +351,18 @@ def make_static_slices(user_size):
                 elif isinstance(i, slice):
                     slc.append(i)
                 else:
-                    raise TypeError('Unrecognized size type, %r' % user_size)
+                    raise TypeError("Unrecognized size type, %r" % user_size)
             return slc
         else:
-            raise TypeError('Unrecognized size type, %r' % user_size)
+            raise TypeError("Unrecognized size type, %r" % user_size)
 
     def make_random_slices(self, in_memory_shape, batch_size, default_random_seed):
         if batch_size is None:
             return [Ellipsis]
         elif isinstance(batch_size, int):
             slc = [self.rslice(in_memory_shape[0], batch_size, default_random_seed)]
         elif isinstance(batch_size, (list, tuple)):
+
             def check(t):
                 if t is Ellipsis or t is None:
                     return True
@@ -364,12 +376,14 @@ def check(t):
                         return True
                     else:
                         return False
+
             # end check definition
             if not all(check(t) for t in batch_size):
-                raise TypeError('Unrecognized `batch_size` type, expected '
-                                'int or List[int|tuple(size, random_seed)] where '
-                                'size and random seed are both ints, got %r' %
-                                batch_size)
+                raise TypeError(
+                    "Unrecognized `batch_size` type, expected "
+                    "int or List[int|tuple(size, random_seed)] where "
+                    "size and random seed are both ints, got %r" % batch_size
+                )
             batch_size = [
                 (i, default_random_seed) if isinstance(i, int) else i
                 for i in batch_size
@@ -378,12 +392,14 @@ def check(t):
             if Ellipsis in batch_size:
                 sep = batch_size.index(Ellipsis)
                 begin = batch_size[:sep]
-                end = batch_size[sep + 1:]
+                end = batch_size[sep + 1 :]
                 if Ellipsis in end:
-                    raise ValueError('Double Ellipsis in `batch_size` is restricted, got %r' %
-                                     batch_size)
+                    raise ValueError(
+                        "Double Ellipsis in `batch_size` is restricted, got %r"
+                        % batch_size
+                    )
                 if len(end) > 0:
-                    shp_mid = shape[sep:-len(end)]
+                    shp_mid = shape[sep : -len(end)]
                     mid = [tt.arange(s) for s in shp_mid]
                 else:
                     mid = []
@@ -392,23 +408,30 @@ def check(t):
                 end = []
                 mid = []
             if (len(begin) + len(end)) > len(in_memory_shape.eval()):
-                raise ValueError('Length of `batch_size` is too big, '
-                                 'number of ints is bigger that ndim, got %r'
-                                 % batch_size)
+                raise ValueError(
+                    "Length of `batch_size` is too big, "
+                    "number of ints is bigger that ndim, got %r" % batch_size
+                )
             if len(end) > 0:
-                shp_end = shape[-len(end):]
+                shp_end = shape[-len(end) :]
             else:
                 shp_end = np.asarray([])
-            shp_begin = shape[:len(begin)]
-            slc_begin = [self.rslice(shp_begin[i], t[0], t[1])
-                         if t is not None else tt.arange(shp_begin[i])
-                         for i, t in enumerate(begin)]
-            slc_end = [self.rslice(shp_end[i], t[0], t[1])
-                       if t is not None else tt.arange(shp_end[i])
-                       for i, t in enumerate(end)]
+            shp_begin = shape[: len(begin)]
+            slc_begin = [
+                self.rslice(shp_begin[i], t[0], t[1])
+                if t is not None
+                else tt.arange(shp_begin[i])
+                for i, t in enumerate(begin)
+            ]
+            slc_end = [
+                self.rslice(shp_end[i], t[0], t[1])
+                if t is not None
+                else tt.arange(shp_end[i])
+                for i, t in enumerate(end)
+            ]
             slc = slc_begin + mid + slc_end
         else:
-            raise TypeError('Unrecognized size type, %r' % batch_size)
+            raise TypeError("Unrecognized size type, %r" % batch_size)
         return pm.theanof.ix_(*slc)
 
     def update_shared(self):
@@ -434,7 +457,7 @@ def align_minibatches(batches=None):
     else:
         for b in batches:
             if not isinstance(b, Minibatch):
-                raise TypeError('{b} is not a Minibatch')
+                raise TypeError("{b} is not a Minibatch")
             for rng in Minibatch.RNG[id(b)]:
                 rng.seed()
 
@@ -447,8 +470,17 @@ class Data:
     ----------
     name: str
         The name for this variable
-    value
+    value: {List, np.ndarray, pd.Series, pd.Dataframe}
         A value to associate with this variable
+    dims: {str, tuple of str}, optional, default=None
+        Dimension names of the random variables (as opposed to the shapes of these
+        random variables). Use this when `value` is a Pandas Series or DataFrame. The
+        `dims` will then be the name of the Series / DataFrame's columns. See ArviZ
+        documentation for more information about dimensions and coordinates:
+        https://arviz-devs.github.io/arviz/notebooks/Introduction.html
+    export_index_as_coords: bool, optional, default=False
+        If True, the `Data` container will try to infer what the coordinates should be
+        if there is an index in `value`.
 
     Examples
     --------
@@ -479,7 +511,7 @@ class Data:
     https://docs.pymc.io/notebooks/data_container.html
     """
 
-    def __new__(self, name, value):
+    def __new__(self, name, value, *, dims=None, export_index_as_coords=False):
         if isinstance(value, list):
             value = np.array(value)
 
@@ -497,10 +529,68 @@ def __new__(self, name, value):
         # transforms it to something digestible for pymc3
         shared_object = theano.shared(pm.model.pandas_to_array(value), name)
 
+        if isinstance(dims, str):
+            dims = (dims,)
+        if not (dims is None or len(dims) == shared_object.ndim):
+            raise pm.exceptions.ShapeError(
+                "Length of `dims` must match the dimensions of the dataset.",
+                actual=len(dims), expected=shared_object.ndim
+            )
+
+        coords = self.set_coords(model, value, dims)
+
+        if export_index_as_coords:
+            model.add_coords(coords)
+
         # To draw the node for this variable in the graphviz Digraph we need
         # its shape.
         shared_object.dshape = tuple(shared_object.shape.eval())
+        if dims is not None:
+            shape_dims = model.shape_from_dims(dims)
+            if shared_object.dshape != shape_dims:
+                raise pm.exceptions.ShapeError(
+                    "Data shape does not match with specified `dims`.",
+                    actual=shared_object.dshape, expected=shape_dims
+                )
 
-        model.add_random_variable(shared_object)
+        model.add_random_variable(shared_object, dims=dims)
 
         return shared_object
+
+    @staticmethod
+    def set_coords(model, value, dims=None):
+        coords = {}
+
+        # If value is a df or a series, we interpret the index as coords:
+        if isinstance(value, (pd.Series, pd.DataFrame)):
+            dim_name = None
+            if dims is not None:
+                dim_name = dims[0]
+            if dim_name is None and value.index.name is not None:
+                dim_name = value.index.name
+            if dim_name is not None:
+                coords[dim_name] = value.index
+
+        # If value is a df, we also interpret the columns as coords:
+        if isinstance(value, pd.DataFrame):
+            dim_name = None
+            if dims is not None:
+                dim_name = dims[1]
+            if dim_name is None and value.columns.name is not None:
+                dim_name = value.columns.name
+            if dim_name is not None:
+                coords[dim_name] = value.columns
+
+        if isinstance(value, np.ndarray) and dims is not None:
+            if len(dims) != value.ndim:
+                raise pm.exceptions.ShapeError(
+                    "Invalid data shape. The rank of the dataset must match the "
+                    "length of `dims`.",
+                    actual=value.shape, expected=value.ndim
+                )
+            for size, dim in zip(value.shape, dims):
+                coord = model.coords.get(dim, None)
+                if coord is None:
+                    coords[dim] = pd.RangeIndex(size, name=dim)
+
+        return coords