Skip to content

Commit

Permalink
Allow unnamed (None) dims and undefined (None) coord values
Browse files Browse the repository at this point in the history
Also refactor into properties to add docstrings and type annotations.
And no longer allow InferenceData conversion without a Model on stack.

Co-authored-by: Oriol Abril Pla <oriol.abril.pla@gmail.com>
  • Loading branch information
michaelosthege and OriolAbril committed Apr 18, 2021
1 parent 45cb4eb commit 9afc5ef
Show file tree
Hide file tree
Showing 7 changed files with 200 additions and 43 deletions.
2 changes: 2 additions & 0 deletions RELEASE-NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
### Maintenance
- Remove float128 dtype support (see [#4514](https://github.com/pymc-devs/pymc3/pull/4514)).
- Logp method of `Uniform` and `DiscreteUniform` no longer depends on `pymc3.distributions.dist_math.bound` for proper evaluation (see [#4541](https://github.com/pymc-devs/pymc3/pull/4541)).
- `Model.RV_dims` and `Model.coords` are now read-only properties. To modify the `coords` dictionary use `Model.add_coord`. Also `dims` or coordinate values that are `None` will be auto-completed (see [#4625](https://github.com/pymc-devs/pymc3/pull/4625)).
- The length of `dims` in the model is now tracked symbolically through `Model.dim_lengths` (see [#4625](https://github.com/pymc-devs/pymc3/pull/4625)).
- ...

## PyMC3 3.11.2 (14 March 2021)
Expand Down
11 changes: 6 additions & 5 deletions pymc3/backends/arviz.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,7 @@ def __init__(
self.trace = trace

# this permits us to get the model from command-line argument or from with model:
try:
self.model = modelcontext(model)
except TypeError:
self.model = None
self.model = modelcontext(model)

self.attrs = None
if trace is not None:
Expand Down Expand Up @@ -223,10 +220,14 @@ def arbitrary_element(dct: Dict[Any, np.ndarray]) -> np.ndarray:
self.coords = {} if coords is None else coords
if hasattr(self.model, "coords"):
self.coords = {**self.model.coords, **self.coords}
self.coords = {key: value for key, value in self.coords.items() if value is not None}

self.dims = {} if dims is None else dims
if hasattr(self.model, "RV_dims"):
model_dims = {k: list(v) for k, v in self.model.RV_dims.items()}
model_dims = {
var_name: [dim for dim in dims if dim is not None]
for var_name, dims in self.model.RV_dims.items()
}
self.dims = {**model_dims, **self.dims}

self.density_dist_obs = density_dist_obs
Expand Down
11 changes: 8 additions & 3 deletions pymc3/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import urllib.request

from copy import copy
from typing import Any, Dict, List
from typing import Any, Dict, List, Sequence

import aesara
import aesara.tensor as at
Expand Down Expand Up @@ -502,7 +502,7 @@ class Data:
>>> for data_vals in observed_data:
... with model:
... # Switch out the observed dataset
... pm.set_data({'data': data_vals})
... model.set_data('data', data_vals)
... traces.append(pm.sample())
To set the value of the data container variable, check out
Expand Down Expand Up @@ -543,6 +543,11 @@ def __new__(self, name, value, *, dims=None, export_index_as_coords=False):

if export_index_as_coords:
model.add_coords(coords)
elif dims:
# Register new dimension lengths
for d, dname in enumerate(dims):
if not dname in model.dim_lengths:
model.add_coord(dname, values=None, length=shared_object.shape[d])

# To draw the node for this variable in the graphviz Digraph we need
# its shape.
Expand All @@ -562,7 +567,7 @@ def __new__(self, name, value, *, dims=None, export_index_as_coords=False):
return shared_object

@staticmethod
def set_coords(model, value, dims=None):
def set_coords(model, value, dims=None) -> Dict[str, Sequence]:
coords = {}

# If value is a df or a series, we interpret the index as coords:
Expand Down
200 changes: 170 additions & 30 deletions pymc3/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,20 @@
import warnings

from sys import modules
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, TypeVar, Union
from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
Optional,
Sequence,
Tuple,
Type,
TypeVar,
Union,
)

import aesara
import aesara.graph.basic
import aesara.sparse as sparse
import aesara.tensor as at
import numpy as np
Expand All @@ -32,6 +42,7 @@
from aesara.graph.basic import Constant, Variable, graph_inputs
from aesara.graph.fg import FunctionGraph, MissingInputError
from aesara.tensor.random.opt import local_subtensor_rv_lift
from aesara.tensor.sharedvar import ScalarSharedVariable
from aesara.tensor.var import TensorVariable
from pandas import Series

Expand All @@ -46,7 +57,7 @@
from pymc3.blocking import DictToArrayBijection, RaveledVars
from pymc3.data import GenTensorVariable, Minibatch
from pymc3.distributions import logp_transform, logpt, logpt_sum
from pymc3.exceptions import ImputationWarning, SamplingError
from pymc3.exceptions import ImputationWarning, SamplingError, ShapeError
from pymc3.math import flatten_list
from pymc3.util import UNSET, WithMemoization, get_var_name, treedict, treelist
from pymc3.vartypes import continuous_types, discrete_types, typefilter
Expand Down Expand Up @@ -606,8 +617,9 @@ def __new__(cls, *args, **kwargs):

def __init__(self, name="", model=None, aesara_config=None, coords=None, check_bounds=True):
self.name = name
self.coords = {}
self.RV_dims = {}
self._coords = {}
self._RV_dims = {}
self._dim_lengths = {}
self.add_coords(coords)
self.check_bounds = check_bounds

Expand Down Expand Up @@ -826,6 +838,27 @@ def basic_RVs(self):
"""
return self.free_RVs + self.observed_RVs

@property
def RV_dims(self) -> Dict[str, Tuple[Union[str, None]]]:
"""Tuples of dimension names for specific model variables.
Entries in the tuples may be ``None``, if the RV dimension was not given a name.
"""
return self._RV_dims

@property
def coords(self) -> Dict[str, Union[Sequence, None]]:
"""Coordinate values for model dimensions."""
return self._coords

@property
def dim_lengths(self) -> Dict[str, Tuple[Variable]]:
"""The symbolic lengths of dimensions in the model.
The values are typically instances of ``TensorVariable`` or ``ScalarSharedVariable``.
"""
return self._dim_lengths

@property
def unobserved_RVs(self):
"""List of all random variables, including deterministic ones.
Expand Down Expand Up @@ -913,20 +946,138 @@ def shape_from_dims(self, dims):
shape.extend(np.shape(self.coords[dim]))
return tuple(shape)

def add_coords(self, coords):
def add_coord(
self,
name: str,
values: Optional[Sequence] = None,
*,
length: Optional[Variable] = None,
):
"""Registers a dimension coordinate with the model.
Parameters
----------
name : str
Name of the dimension.
Forbidden: {"chain", "draw"}
values : optional, array-like
Coordinate values or ``None`` (for auto-numbering).
If ``None`` is passed, a ``length`` must be specified.
length : optional, scalar
A symbolic scalar of the dimensions length.
Defaults to ``aesara.shared(len(values))``.
"""
if name in {"draw", "chain"}:
raise ValueError(
"Dimensions can not be named `draw` or `chain`, as they are reserved for the sampler's outputs."
)
if values is None and length is None:
raise ValueError(
f"Either `values` or `length` must be specified for the '{name}' dimension."
)
if length is not None and not isinstance(length, Variable):
raise ValueError(
f"The `length` passed for the '{name}' coord must be an Aesara Variable or None."
)
if name in self.coords:
if not values.equals(self.coords[name]):
raise ValueError("Duplicate and incompatiple coordinate: %s." % name)
else:
self._coords[name] = values
self._dim_lengths[name] = length or aesara.shared(len(values))

def add_coords(
self,
coords: Dict[str, Optional[Sequence]],
*,
lengths: Optional[Dict[str, Union[Variable, None]]] = None,
):
"""Vectorized version of ``Model.add_coord``."""
if coords is None:
return
lengths = lengths or {}

for name in coords:
if name in {"draw", "chain"}:
raise ValueError(
"Dimensions can not be named `draw` or `chain`, as they are reserved for the sampler's outputs."
for name, values in coords.items():
self.add_coord(name, values, length=lengths.get(name, None))

def set_data(
self,
name: str,
values: Dict[str, Optional[Sequence]],
coords: Optional[Dict[str, Sequence]] = None,
):
"""Changes the values of a data variable in the model.
In contrast to pm.Data().set_value, this method can also
update the corresponding coordinates.
Parameters
----------
name : str
Name of a shared variable in the model.
values : array-like
New values for the shared variable.
coords : optional, dict
New coordinate values for dimensions of the shared variable.
Must be provided for all named dimensions that change in length.
"""
shared_object = self[name]
if not isinstance(shared_object, SharedVariable):
raise TypeError(
f"The variable `{name}` must be defined as `pymc3.Data` inside the model to allow updating. "
f"The current type is: {type(shared_object)}"
)
values = pandas_to_array(values)
dims = self.RV_dims.get(name, None) or ()
coords = coords or {}

if values.ndim != shared_object.ndim:
raise ValueError(
f"New values for '{name}' must have {shared_object.ndim} dimensions, just like the original."
)

for d, dname in enumerate(dims):
length_tensor = self.dim_lengths[dname]
old_length = length_tensor.eval()
new_length = values.shape[d]
original_coords = self.coords.get(dname, None)
new_coords = coords.get(dname, None)

length_changed = new_length != old_length

# Reject resizing if we already know that it would create shape problems.
# NOTE: If there are multiple pm.Data containers sharing this dim, but the user only
# changes the values for one of them, they will run into shape problems nonetheless.
if not isinstance(length_tensor, ScalarSharedVariable) and length_changed:
raise ShapeError(
f"Resizing dimension {dname} with values of length {new_length} would lead to incompatibilities, "
f"because the dimension was not initialized from a shared variable. "
f"Check if the dimension was defined implicitly before the shared variable '{name}' was created, "
f"for example by a model variable.",
actual=new_length,
expected=old_length,
)
if name in self.coords:
if not coords[name].equals(self.coords[name]):
raise ValueError("Duplicate and incompatiple coordinate: %s." % name)
else:
self.coords[name] = coords[name]
if original_coords is not None and length_changed:
if length_changed and new_coords is None:
raise ValueError(
f"The '{name}' variable already had {len(original_coords)} coord values defined for"
f"its {dname} dimension. With the new values this dimension changes to length "
f"{new_length}, so new coord values for the {dname} dimension are required."
)
if new_coords is not None:
# Update the registered coord values (also if they were None)
if len(new_coords) != new_length:
raise ShapeError(
f"Length of new coordinate values for dimension '{dname}' does not match the provided values.",
actual=len(new_coords),
expected=new_length,
)
self._coords[dname] = new_coords
if isinstance(length_tensor, ScalarSharedVariable) and new_length != old_length:
# Updating the shared variable resizes dependent nodes that use this dimension for their `size`.
length_tensor.set_value(new_length)

shared_object.set_value(values)

def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, transform=UNSET):
"""Register an (un)observed random variable with the model.
Expand Down Expand Up @@ -1132,16 +1283,16 @@ def create_value_var(self, rv_var: TensorVariable, transform: Any) -> TensorVari

return value_var

def add_random_variable(self, var, dims=None):
def add_random_variable(self, var, dims: Optional[Tuple[Union[str, None]]] = None):
"""Add a random variable to the named variables of the model."""
if self.named_vars.tree_contains(var.name):
raise ValueError(f"Variable name {var.name} already exists.")

if dims is not None:
if isinstance(dims, str):
dims = (dims,)
assert all(dim in self.coords for dim in dims)
self.RV_dims[var.name] = dims
assert all(dim in self.coords or dim is None for dim in dims)
self._RV_dims[var.name] = dims

self.named_vars[var.name] = var
if not hasattr(self, self.name_of(var.name)):
Expand Down Expand Up @@ -1500,18 +1651,7 @@ def set_data(new_data, model=None):
model = modelcontext(model)

for variable_name, new_value in new_data.items():
if isinstance(model[variable_name], SharedVariable):
if isinstance(new_value, list):
new_value = np.array(new_value)
model[variable_name].set_value(pandas_to_array(new_value))
else:
message = (
"The variable `{}` must be defined as `pymc3."
"Data` inside the model to allow updating. The "
"current type is: "
"{}.".format(variable_name, type(model[variable_name]))
)
raise TypeError(message)
model.set_data(variable_name, new_value)


def fn(outs, mode=None, model=None, *args, **kwargs):
Expand Down
6 changes: 4 additions & 2 deletions pymc3/tests/sampler_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,13 +155,15 @@ def setup_class(cls):

def test_neff(self):
if hasattr(self, "min_n_eff"):
idata = to_inference_data(self.trace[self.burn :])
with self.model:
idata = to_inference_data(self.trace[self.burn :])
n_eff = az.ess(idata)
for var in n_eff:
npt.assert_array_less(self.min_n_eff, n_eff[var])

def test_Rhat(self):
idata = to_inference_data(self.trace[self.burn :])
with self.model:
idata = to_inference_data(self.trace[self.burn :])
rhat = az.rhat(idata)
for var in rhat:
npt.assert_allclose(rhat[var], 1, rtol=0.01)
Expand Down
8 changes: 8 additions & 0 deletions pymc3/tests/test_data_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import pytest

from aesara import shared
from aesara.tensor.sharedvar import ScalarSharedVariable
from aesara.tensor.var import TensorVariable

import pymc3 as pm

Expand Down Expand Up @@ -272,9 +274,15 @@ def test_explicit_coords(self):

assert "rows" in pmodel.coords
assert pmodel.coords["rows"] == ["R1", "R2", "R3", "R4", "R5"]
assert "rows" in pmodel.dim_lengths
assert isinstance(pmodel.dim_lengths["rows"], ScalarSharedVariable)
assert pmodel.dim_lengths["rows"].eval() == 5
assert "columns" in pmodel.coords
assert pmodel.coords["columns"] == ["C1", "C2", "C3", "C4", "C5", "C6", "C7"]
assert pmodel.RV_dims == {"observations": ("rows", "columns")}
assert "columns" in pmodel.dim_lengths
assert isinstance(pmodel.dim_lengths["columns"], ScalarSharedVariable)
assert pmodel.dim_lengths["columns"].eval() == 7

def test_implicit_coords_series(self):
ser_sales = pd.Series(
Expand Down
Loading

0 comments on commit 9afc5ef

Please sign in to comment.