Skip to content

Commit

Permalink
Allow specification of dims instead of shape (#3551)
Browse files Browse the repository at this point in the history
* Allow specification of dims instead of shape

* Add pm.TidyData

* Create coords for pm.Data(ndarray)

* empty commit to trigger CI

* Apply suggestions from code review

Co-authored-by: Alexandre ANDORRA <andorra.alexandre@gmail.com>

* apply black formatting

* address review comments & formatting

* Add demonstration of named coordinates/dims

* don't require dim names to be identifiers

* sort imports

* raise ShapeError instead of ValueError

* formatting

* robustify Dtype and ShapeError

* Removed TidyData and refined dims and coords implementation

* Changed name of kwarg export_dims and improved docstrings

* Add link to ArviZ in docstrings

* Removed TidyData from __all__

* Polished Data container NB

* Fixed line break in data.py

* Fix inference of coords for dataframes

* Refined Data container NB

* Updated getting started NB with new dims and coords features

* Reran getting started NB

* Blackified NBs

* rerun with ArviZ branch

* use np.shape to be compatible with tuples/lists

* add tests for named coordinate handling

* Extended tests for data container

Co-authored-by: Michael Osthege <m.osthege@fz-juelich.de>
Co-authored-by: Michael Osthege <michael.osthege@outlook.com>
Co-authored-by: Alexandre ANDORRA <andorra.alexandre@gmail.com>
  • Loading branch information
4 people authored Jun 10, 2020
1 parent 8a8beab commit 76f3a7b
Show file tree
Hide file tree
Showing 9 changed files with 2,375 additions and 728 deletions.
1 change: 1 addition & 0 deletions RELEASE-NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
- `pm.LKJCholeskyCov` now automatically computes and returns the unpacked Cholesky decomposition, the correlations and the standard deviations of the covariance matrix (see [#3881](https://github.com/pymc-devs/pymc3/pull/3881)).
- `pm.Data` container can now be used for index variables, i.e. with integer data and not only floats (issue [#3813](https://github.com/pymc-devs/pymc3/issues/3813), fixed by [#3925](https://github.com/pymc-devs/pymc3/pull/3925)).
- `pm.Data` container can now be used as input for other random variables (issue [#3842](https://github.com/pymc-devs/pymc3/issues/3842), fixed by [#3925](https://github.com/pymc-devs/pymc3/pull/3925)).
- Allow users to specify coordinates and dimension names instead of numerical shapes when specifying a model. This makes interoperability with ArviZ easier. ([see #3551](https://github.com/pymc-devs/pymc3/pull/3551))
- Plots and Stats API sections now link to ArviZ documentation [#3927](https://github.com/pymc-devs/pymc3/pull/3927)
- Add `SamplerReport` with properties `n_draws`, `t_sampling` and `n_tune` to SMC. `n_tune` is always 0 [#3931](https://github.com/pymc-devs/pymc3/issues/3931).
- SMC-ABC: add option to define summary statistics, allow to sample from more complex models, remove redundant distances [#3940](https://github.com/pymc-devs/pymc3/issues/3940)
Expand Down
1,271 changes: 1,064 additions & 207 deletions docs/source/notebooks/data_container.ipynb

Large diffs are not rendered by default.

900 changes: 669 additions & 231 deletions docs/source/notebooks/getting_started.ipynb

Large diffs are not rendered by default.

194 changes: 142 additions & 52 deletions pymc3/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Any
import collections
from copy import copy
import io
import os
import pkgutil
import collections
from typing import Dict, List, Any

import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt
import theano

__all__ = [
'get_data',
'GeneratorAdapter',
'Minibatch',
'align_minibatches',
'Data',
"get_data",
"GeneratorAdapter",
"Minibatch",
"align_minibatches",
"Data",
]


Expand All @@ -44,8 +46,8 @@ def get_data(filename):
-------
BytesIO of the data
"""
data_pkg = 'pymc3.examples'
return io.BytesIO(pkgutil.get_data(data_pkg, os.path.join('data', filename)))
data_pkg = "pymc3.examples"
return io.BytesIO(pkgutil.get_data(data_pkg, os.path.join("data", filename)))


class GenTensorVariable(tt.TensorVariable):
Expand Down Expand Up @@ -78,14 +80,14 @@ def make_variable(self, gop, name=None):

def __init__(self, generator):
if not pm.vartypes.isgenerator(generator):
raise TypeError('Object should be generator like')
raise TypeError("Object should be generator like")
self.test_value = pm.smartfloatX(copy(next(generator)))
# make pickling potentially possible
self._yielded_test_value = False
self.gen = generator
self.tensortype = tt.TensorType(
self.test_value.dtype,
((False, ) * self.test_value.ndim))
self.test_value.dtype, ((False,) * self.test_value.ndim)
)

# python3 generator
def __next__(self):
Expand Down Expand Up @@ -283,28 +285,37 @@ class Minibatch(tt.TensorVariable):
>>> assert x.eval().shape == (2, 20, 20, 40, 10)
"""

RNG = collections.defaultdict(list) # type: Dict[str, List[Any]]

@theano.configparser.change_flags(compute_test_value='raise')
def __init__(self, data, batch_size=128, dtype=None, broadcastable=None, name='Minibatch',
random_seed=42, update_shared_f=None, in_memory_size=None):
RNG = collections.defaultdict(list) # type: Dict[str, List[Any]]

@theano.configparser.change_flags(compute_test_value="raise")
def __init__(
self,
data,
batch_size=128,
dtype=None,
broadcastable=None,
name="Minibatch",
random_seed=42,
update_shared_f=None,
in_memory_size=None,
):
if dtype is None:
data = pm.smartfloatX(np.asarray(data))
else:
data = np.asarray(data, dtype)
in_memory_slc = self.make_static_slices(in_memory_size)
self.shared = theano.shared(data[in_memory_slc])
self.update_shared_f = update_shared_f
self.random_slc = self.make_random_slices(self.shared.shape, batch_size, random_seed)
self.random_slc = self.make_random_slices(
self.shared.shape, batch_size, random_seed
)
minibatch = self.shared[self.random_slc]
if broadcastable is None:
broadcastable = (False, ) * minibatch.ndim
broadcastable = (False,) * minibatch.ndim
minibatch = tt.patternbroadcast(minibatch, broadcastable)
self.minibatch = minibatch
super().__init__(self.minibatch.type, None, None, name=name)
theano.Apply(
theano.compile.view_op,
inputs=[self.minibatch], outputs=[self])
theano.Apply(theano.compile.view_op, inputs=[self.minibatch], outputs=[self])
self.tag.test_value = copy(self.minibatch.tag.test_value)

def rslice(self, total, size, seed):
Expand All @@ -313,11 +324,11 @@ def rslice(self, total, size, seed):
elif isinstance(size, int):
rng = pm.tt_rng(seed)
Minibatch.RNG[id(self)].append(rng)
return (rng
.uniform(size=(size, ), low=0.0, high=pm.floatX(total) - 1e-16)
.astype('int64'))
return rng.uniform(
size=(size,), low=0.0, high=pm.floatX(total) - 1e-16
).astype("int64")
else:
raise TypeError('Unrecognized size type, %r' % size)
raise TypeError("Unrecognized size type, %r" % size)

def __del__(self):
    """Drop the random generators registered for this instance in ``Minibatch.RNG``."""
    # `RNG` is a defaultdict keyed by `id(self)` and only gains an entry when
    # `rslice` registers a generator.  Instances that never did so (for example
    # when `batch_size` is None) have no entry, and a plain `del` would raise
    # KeyError during finalization; `pop` with a default is a silent no-op.
    Minibatch.RNG.pop(id(self), None)
Expand All @@ -340,17 +351,18 @@ def make_static_slices(user_size):
elif isinstance(i, slice):
slc.append(i)
else:
raise TypeError('Unrecognized size type, %r' % user_size)
raise TypeError("Unrecognized size type, %r" % user_size)
return slc
else:
raise TypeError('Unrecognized size type, %r' % user_size)
raise TypeError("Unrecognized size type, %r" % user_size)

def make_random_slices(self, in_memory_shape, batch_size, default_random_seed):
if batch_size is None:
return [Ellipsis]
elif isinstance(batch_size, int):
slc = [self.rslice(in_memory_shape[0], batch_size, default_random_seed)]
elif isinstance(batch_size, (list, tuple)):

def check(t):
if t is Ellipsis or t is None:
return True
Expand All @@ -364,12 +376,14 @@ def check(t):
return True
else:
return False

# end check definition
if not all(check(t) for t in batch_size):
raise TypeError('Unrecognized `batch_size` type, expected '
'int or List[int|tuple(size, random_seed)] where '
'size and random seed are both ints, got %r' %
batch_size)
raise TypeError(
"Unrecognized `batch_size` type, expected "
"int or List[int|tuple(size, random_seed)] where "
"size and random seed are both ints, got %r" % batch_size
)
batch_size = [
(i, default_random_seed) if isinstance(i, int) else i
for i in batch_size
Expand All @@ -378,12 +392,14 @@ def check(t):
if Ellipsis in batch_size:
sep = batch_size.index(Ellipsis)
begin = batch_size[:sep]
end = batch_size[sep + 1:]
end = batch_size[sep + 1 :]
if Ellipsis in end:
raise ValueError('Double Ellipsis in `batch_size` is restricted, got %r' %
batch_size)
raise ValueError(
"Double Ellipsis in `batch_size` is restricted, got %r"
% batch_size
)
if len(end) > 0:
shp_mid = shape[sep:-len(end)]
shp_mid = shape[sep : -len(end)]
mid = [tt.arange(s) for s in shp_mid]
else:
mid = []
Expand All @@ -392,23 +408,30 @@ def check(t):
end = []
mid = []
if (len(begin) + len(end)) > len(in_memory_shape.eval()):
raise ValueError('Length of `batch_size` is too big, '
'number of ints is bigger that ndim, got %r'
% batch_size)
raise ValueError(
"Length of `batch_size` is too big, "
"number of ints is bigger that ndim, got %r" % batch_size
)
if len(end) > 0:
shp_end = shape[-len(end):]
shp_end = shape[-len(end) :]
else:
shp_end = np.asarray([])
shp_begin = shape[:len(begin)]
slc_begin = [self.rslice(shp_begin[i], t[0], t[1])
if t is not None else tt.arange(shp_begin[i])
for i, t in enumerate(begin)]
slc_end = [self.rslice(shp_end[i], t[0], t[1])
if t is not None else tt.arange(shp_end[i])
for i, t in enumerate(end)]
shp_begin = shape[: len(begin)]
slc_begin = [
self.rslice(shp_begin[i], t[0], t[1])
if t is not None
else tt.arange(shp_begin[i])
for i, t in enumerate(begin)
]
slc_end = [
self.rslice(shp_end[i], t[0], t[1])
if t is not None
else tt.arange(shp_end[i])
for i, t in enumerate(end)
]
slc = slc_begin + mid + slc_end
else:
raise TypeError('Unrecognized size type, %r' % batch_size)
raise TypeError("Unrecognized size type, %r" % batch_size)
return pm.theanof.ix_(*slc)

def update_shared(self):
Expand All @@ -434,7 +457,7 @@ def align_minibatches(batches=None):
else:
for b in batches:
if not isinstance(b, Minibatch):
raise TypeError('{b} is not a Minibatch')
raise TypeError("{b} is not a Minibatch")
for rng in Minibatch.RNG[id(b)]:
rng.seed()

Expand All @@ -447,8 +470,17 @@ class Data:
----------
name: str
The name for this variable
value
value: {List, np.ndarray, pd.Series, pd.DataFrame}
A value to associate with this variable
dims: {str, tuple of str}, optional, default=None
Dimension names of the random variables (as opposed to the shapes of these
random variables). Use this when `value` is a Pandas Series or DataFrame. The
`dims` will then be the name of the Series / DataFrame's columns. See ArviZ
documentation for more information about dimensions and coordinates:
https://arviz-devs.github.io/arviz/notebooks/Introduction.html
export_index_as_coords: bool, optional, default=False
If True, the `Data` container will try to infer what the coordinates should be
if there is an index in `value`.
Examples
--------
Expand Down Expand Up @@ -479,7 +511,7 @@ class Data:
https://docs.pymc.io/notebooks/data_container.html
"""

def __new__(self, name, value):
def __new__(self, name, value, *, dims=None, export_index_as_coords=False):
if isinstance(value, list):
value = np.array(value)

Expand All @@ -497,10 +529,68 @@ def __new__(self, name, value):
# transforms it to something digestible for pymc3
shared_object = theano.shared(pm.model.pandas_to_array(value), name)

if isinstance(dims, str):
dims = (dims,)
if not (dims is None or len(dims) == shared_object.ndim):
raise pm.exceptions.ShapeError(
"Length of `dims` must match the dimensions of the dataset.",
actual=len(dims), expected=shared_object.ndim
)

coords = self.set_coords(model, value, dims)

if export_index_as_coords:
model.add_coords(coords)

# To draw the node for this variable in the graphviz Digraph we need
# its shape.
shared_object.dshape = tuple(shared_object.shape.eval())
if dims is not None:
shape_dims = model.shape_from_dims(dims)
if shared_object.dshape != shape_dims:
raise pm.exceptions.ShapeError(
"Data shape does not match with specified `dims`.",
actual=shared_object.dshape, expected=shape_dims
)

model.add_random_variable(shared_object)
model.add_random_variable(shared_object, dims=dims)

return shared_object

@staticmethod
def set_coords(model, value, dims=None):
    """Build a mapping from dimension names to coordinate values for ``value``.

    Parameters
    ----------
    model
        Object exposing a ``coords`` mapping. It is only consulted when
        ``value`` is a NumPy array, so that dimensions which already have
        coordinates there are not given new ones.
    value: {np.ndarray, pd.Series, pd.DataFrame}
        The data to derive coordinates from. Other types yield an empty dict.
    dims: sequence of str, optional, default=None
        Dimension names, one per axis of ``value``. An entry may be ``None``
        for pandas inputs, in which case the name of the index / columns is
        used when it is set.

    Returns
    -------
    dict
        Maps each resolved dimension name to the pandas index (for Series /
        DataFrame inputs) or to a ``pd.RangeIndex`` of the axis length (for
        NumPy arrays).

    Raises
    ------
    pm.exceptions.ShapeError
        If ``value`` is a NumPy array and ``len(dims)`` differs from its rank.
    """
    coords = {}

    # If value is a df or a series, we interpret the index as coords:
    if isinstance(value, (pd.Series, pd.DataFrame)):
        dim_name = dims[0] if dims is not None else None
        if dim_name is None and value.index.name is not None:
            dim_name = value.index.name
        if dim_name is not None:
            coords[dim_name] = value.index

    # If value is a df, we also interpret the columns as coords:
    if isinstance(value, pd.DataFrame):
        dim_name = dims[1] if dims is not None else None
        if dim_name is None and value.columns.name is not None:
            dim_name = value.columns.name
        if dim_name is not None:
            coords[dim_name] = value.columns

    if isinstance(value, np.ndarray) and dims is not None:
        if len(dims) != value.ndim:
            # Report the two quantities that are actually being compared
            # (number of dim names vs. rank of the data) so the error reads
            # consistently with its message.
            raise pm.exceptions.ShapeError(
                "Invalid data shape. The rank of the dataset must match the "
                "length of `dims`.",
                actual=len(dims),
                expected=value.ndim,
            )
        for size, dim in zip(value.shape, dims):
            # Only create default integer coordinates for dimensions the
            # model does not already have coordinates for.
            if model.coords.get(dim) is None:
                coords[dim] = pd.RangeIndex(size, name=dim)

    return coords
Loading

0 comments on commit 76f3a7b

Please sign in to comment.