diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
index 7a73c65b72e..962ba3bda9c 100644
--- a/RELEASE-NOTES.md
+++ b/RELEASE-NOTES.md
@@ -13,6 +13,8 @@
 ### Maintenance
 - Remove float128 dtype support (see [#4514](https://github.com/pymc-devs/pymc3/pull/4514)).
 - Logp method of `Uniform` and `DiscreteUniform` no longer depends on `pymc3.distributions.dist_math.bound` for proper evaluation (see [#4541](https://github.com/pymc-devs/pymc3/pull/4541)).
+- `Model.RV_dims` and `Model.coords` are now read-only properties. To modify the `coords` dictionary use `Model.add_coord`. Also `dims` or coordinate values that are `None` will be auto-completed (see [#4625](https://github.com/pymc-devs/pymc3/pull/4625)).
+- The length of `dims` in the model is now tracked symbolically through `Model.dim_lengths` (see [#4625](https://github.com/pymc-devs/pymc3/pull/4625)).
 - ...
 
 ## PyMC3 3.11.2 (14 March 2021)
diff --git a/pymc3/backends/arviz.py b/pymc3/backends/arviz.py
index 97a0c136137..8a3f7b46cc7 100644
--- a/pymc3/backends/arviz.py
+++ b/pymc3/backends/arviz.py
@@ -162,10 +162,7 @@ def __init__(
         self.trace = trace
 
         # this permits us to get the model from command-line argument or from with model:
-        try:
-            self.model = modelcontext(model)
-        except TypeError:
-            self.model = None
+        self.model = modelcontext(model)
 
         self.attrs = None
         if trace is not None:
@@ -223,10 +220,14 @@ def arbitrary_element(dct: Dict[Any, np.ndarray]) -> np.ndarray:
         self.coords = {} if coords is None else coords
         if hasattr(self.model, "coords"):
             self.coords = {**self.model.coords, **self.coords}
+        self.coords = {key: value for key, value in self.coords.items() if value is not None}
 
         self.dims = {} if dims is None else dims
         if hasattr(self.model, "RV_dims"):
-            model_dims = {k: list(v) for k, v in self.model.RV_dims.items()}
+            model_dims = {
+                var_name: [dim for dim in dims if dim is not None]
+                for var_name, dims in self.model.RV_dims.items()
+            }
             self.dims = {**model_dims, **self.dims}
 
         self.density_dist_obs = density_dist_obs
diff --git a/pymc3/data.py b/pymc3/data.py
index 846e8272b71..06dfb2766bc 100644
--- a/pymc3/data.py
+++ b/pymc3/data.py
@@ -19,7 +19,7 @@
 import urllib.request
 
 from copy import copy
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Sequence
 
 import aesara
 import aesara.tensor as at
@@ -502,7 +502,7 @@ class Data:
     >>> for data_vals in observed_data:
     ...     with model:
     ...         # Switch out the observed dataset
-    ...         pm.set_data({'data': data_vals})
+    ...         model.set_data('data', data_vals)
     ...         traces.append(pm.sample())
 
     To set the value of the data container variable, check out
@@ -543,6 +543,11 @@ def __new__(self, name, value, *, dims=None, export_index_as_coords=False):
 
         if export_index_as_coords:
             model.add_coords(coords)
+        elif dims:
+            # Register new dimension lengths
+            for d, dname in enumerate(dims):
+                if dname not in model.dim_lengths:
+                    model.add_coord(dname, values=None, length=shared_object.shape[d])
 
         # To draw the node for this variable in the graphviz Digraph we need
         # its shape.
@@ -562,7 +567,7 @@ def __new__(self, name, value, *, dims=None, export_index_as_coords=False):
         return shared_object
 
     @staticmethod
-    def set_coords(model, value, dims=None):
+    def set_coords(model, value, dims=None) -> Dict[str, Sequence]:
         coords = {}
 
         # If value is a df or a series, we interpret the index as coords:
diff --git a/pymc3/model.py b/pymc3/model.py
index b4478108c0f..d47b58f5be6 100644
--- a/pymc3/model.py
+++ b/pymc3/model.py
@@ -18,10 +18,20 @@
 import warnings
 
 from sys import modules
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, TypeVar, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+)
 
 import aesara
-import aesara.graph.basic
 import aesara.sparse as sparse
 import aesara.tensor as at
 import numpy as np
@@ -32,6 +42,7 @@
 from aesara.graph.basic import Constant, Variable, graph_inputs
 from aesara.graph.fg import FunctionGraph, MissingInputError
 from aesara.tensor.random.opt import local_subtensor_rv_lift
+from aesara.tensor.sharedvar import ScalarSharedVariable
 from aesara.tensor.var import TensorVariable
 
 from pandas import Series
@@ -46,7 +57,7 @@
 from pymc3.blocking import DictToArrayBijection, RaveledVars
 from pymc3.data import GenTensorVariable, Minibatch
 from pymc3.distributions import logp_transform, logpt, logpt_sum
-from pymc3.exceptions import ImputationWarning, SamplingError
+from pymc3.exceptions import ImputationWarning, SamplingError, ShapeError
 from pymc3.math import flatten_list
 from pymc3.util import UNSET, WithMemoization, get_var_name, treedict, treelist
 from pymc3.vartypes import continuous_types, discrete_types, typefilter
@@ -606,8 +617,9 @@ def __new__(cls, *args, **kwargs):
 
     def __init__(self, name="", model=None, aesara_config=None, coords=None, check_bounds=True):
         self.name = name
-        self.coords = {}
-        self.RV_dims = {}
+        self._coords = {}
+        self._RV_dims = {}
+        self._dim_lengths = {}
         self.add_coords(coords)
         self.check_bounds = check_bounds
 
@@ -826,6 +838,27 @@ def basic_RVs(self):
         """
         return self.free_RVs + self.observed_RVs
 
+    @property
+    def RV_dims(self) -> Dict[str, Tuple[Union[str, None]]]:
+        """Tuples of dimension names for specific model variables.
+
+        Entries in the tuples may be ``None``, if the RV dimension was not given a name.
+        """
+        return self._RV_dims
+
+    @property
+    def coords(self) -> Dict[str, Union[Sequence, None]]:
+        """Coordinate values for model dimensions."""
+        return self._coords
+
+    @property
+    def dim_lengths(self) -> Dict[str, Tuple[Variable]]:
+        """The symbolic lengths of dimensions in the model.
+
+        The values are typically instances of ``TensorVariable`` or ``ScalarSharedVariable``.
+        """
+        return self._dim_lengths
+
     @property
     def unobserved_RVs(self):
         """List of all random variables, including deterministic ones.
@@ -913,20 +946,138 @@ def shape_from_dims(self, dims):
                 shape.extend(np.shape(self.coords[dim]))
         return tuple(shape)
 
-    def add_coords(self, coords):
+    def add_coord(
+        self,
+        name: str,
+        values: Optional[Sequence] = None,
+        *,
+        length: Optional[Variable] = None,
+    ):
+        """Registers a dimension coordinate with the model.
+
+        Parameters
+        ----------
+        name : str
+            Name of the dimension.
+            Forbidden: {"chain", "draw"}
+        values : optional, array-like
+            Coordinate values or ``None`` (for auto-numbering).
+            If ``None`` is passed, a ``length`` must be specified.
+        length : optional, scalar
+            A symbolic scalar of the dimension's length.
+            Defaults to ``aesara.shared(len(values))``.
+ """ + if name in {"draw", "chain"}: + raise ValueError( + "Dimensions can not be named `draw` or `chain`, as they are reserved for the sampler's outputs." + ) + if values is None and length is None: + raise ValueError( + f"Either `values` or `length` must be specified for the '{name}' dimension." + ) + if length is not None and not isinstance(length, Variable): + raise ValueError( + f"The `length` passed for the '{name}' coord must be an Aesara Variable or None." + ) + if name in self.coords: + if not values.equals(self.coords[name]): + raise ValueError("Duplicate and incompatiple coordinate: %s." % name) + else: + self._coords[name] = values + self._dim_lengths[name] = length or aesara.shared(len(values)) + + def add_coords( + self, + coords: Dict[str, Optional[Sequence]], + *, + lengths: Optional[Dict[str, Union[Variable, None]]] = None, + ): + """Vectorized version of ``Model.add_coord``.""" if coords is None: return + lengths = lengths or {} - for name in coords: - if name in {"draw", "chain"}: - raise ValueError( - "Dimensions can not be named `draw` or `chain`, as they are reserved for the sampler's outputs." + for name, values in coords.items(): + self.add_coord(name, values, length=lengths.get(name, None)) + + def set_data( + self, + name: str, + values: Dict[str, Optional[Sequence]], + coords: Optional[Dict[str, Sequence]] = None, + ): + """Changes the values of a data variable in the model. + + In contrast to pm.Data().set_value, this method can also + update the corresponding coordinates. + + Parameters + ---------- + name : str + Name of a shared variable in the model. + values : array-like + New values for the shared variable. + coords : optional, dict + New coordinate values for dimensions of the shared variable. + Must be provided for all named dimensions that change in length. + """ + shared_object = self[name] + if not isinstance(shared_object, SharedVariable): + raise TypeError( + f"The variable `{name}` must be defined as `pymc3.Data` inside the model to allow updating. " + f"The current type is: {type(shared_object)}" + ) + values = pandas_to_array(values) + dims = self.RV_dims.get(name, None) or () + coords = coords or {} + + if values.ndim != shared_object.ndim: + raise ValueError( + f"New values for '{name}' must have {shared_object.ndim} dimensions, just like the original." + ) + + for d, dname in enumerate(dims): + length_tensor = self.dim_lengths[dname] + old_length = length_tensor.eval() + new_length = values.shape[d] + original_coords = self.coords.get(dname, None) + new_coords = coords.get(dname, None) + + length_changed = new_length != old_length + + # Reject resizing if we already know that it would create shape problems. + # NOTE: If there are multiple pm.Data containers sharing this dim, but the user only + # changes the values for one of them, they will run into shape problems nonetheless. + if not isinstance(length_tensor, ScalarSharedVariable) and length_changed: + raise ShapeError( + f"Resizing dimension {dname} with values of length {new_length} would lead to incompatibilities, " + f"because the dimension was not initialized from a shared variable. " + f"Check if the dimension was defined implicitly before the shared variable '{name}' was created, " + f"for example by a model variable.", + actual=new_length, + expected=old_length, ) - if name in self.coords: - if not coords[name].equals(self.coords[name]): - raise ValueError("Duplicate and incompatiple coordinate: %s." 
-            else:
-                self.coords[name] = coords[name]
+            if original_coords is not None and length_changed:
+                if length_changed and new_coords is None:
+                    raise ValueError(
+                        f"The '{name}' variable already had {len(original_coords)} coord values defined for "
+                        f"its {dname} dimension. With the new values this dimension changes to length "
+                        f"{new_length}, so new coord values for the {dname} dimension are required."
+                    )
+            if new_coords is not None:
+                # Update the registered coord values (also if they were None)
+                if len(new_coords) != new_length:
+                    raise ShapeError(
+                        f"Length of new coordinate values for dimension '{dname}' does not match the provided values.",
+                        actual=len(new_coords),
+                        expected=new_length,
+                    )
+                self._coords[dname] = new_coords
+            if isinstance(length_tensor, ScalarSharedVariable) and new_length != old_length:
+                # Updating the shared variable resizes dependent nodes that use this dimension for their `size`.
+                length_tensor.set_value(new_length)
+
+        shared_object.set_value(values)
 
     def register_rv(self, rv_var, name, data=None, total_size=None, dims=None, transform=UNSET):
         """Register an (un)observed random variable with the model.
@@ -1132,7 +1283,7 @@ def create_value_var(self, rv_var: TensorVariable, transform: Any) -> TensorVari
 
         return value_var
 
-    def add_random_variable(self, var, dims=None):
+    def add_random_variable(self, var, dims: Optional[Tuple[Union[str, None]]] = None):
         """Add a random variable to the named variables of the model."""
         if self.named_vars.tree_contains(var.name):
             raise ValueError(f"Variable name {var.name} already exists.")
@@ -1140,8 +1291,8 @@ def add_random_variable(self, var, dims=None):
         if dims is not None:
             if isinstance(dims, str):
                 dims = (dims,)
-            assert all(dim in self.coords for dim in dims)
-            self.RV_dims[var.name] = dims
+            assert all(dim in self.coords or dim is None for dim in dims)
+            self._RV_dims[var.name] = dims
 
         self.named_vars[var.name] = var
         if not hasattr(self, self.name_of(var.name)):
@@ -1500,18 +1651,7 @@ def set_data(new_data, model=None):
     model = modelcontext(model)
 
     for variable_name, new_value in new_data.items():
-        if isinstance(model[variable_name], SharedVariable):
-            if isinstance(new_value, list):
-                new_value = np.array(new_value)
-            model[variable_name].set_value(pandas_to_array(new_value))
-        else:
-            message = (
-                "The variable `{}` must be defined as `pymc3."
-                "Data` inside the model to allow updating. The "
The " - "current type is: " - "{}.".format(variable_name, type(model[variable_name])) - ) - raise TypeError(message) + model.set_data(variable_name, new_value) def fn(outs, mode=None, model=None, *args, **kwargs): diff --git a/pymc3/tests/test_data_container.py b/pymc3/tests/test_data_container.py index 88a1432d488..dddc1dfb236 100644 --- a/pymc3/tests/test_data_container.py +++ b/pymc3/tests/test_data_container.py @@ -17,6 +17,8 @@ import pytest from aesara import shared +from aesara.tensor.sharedvar import ScalarSharedVariable +from aesara.tensor.var import TensorVariable import pymc3 as pm @@ -272,9 +274,15 @@ def test_explicit_coords(self): assert "rows" in pmodel.coords assert pmodel.coords["rows"] == ["R1", "R2", "R3", "R4", "R5"] + assert "rows" in pmodel.dim_lengths + assert isinstance(pmodel.dim_lengths["rows"], ScalarSharedVariable) + assert pmodel.dim_lengths["rows"].eval() == 5 assert "columns" in pmodel.coords assert pmodel.coords["columns"] == ["C1", "C2", "C3", "C4", "C5", "C6", "C7"] assert pmodel.RV_dims == {"observations": ("rows", "columns")} + assert "columns" in pmodel.dim_lengths + assert isinstance(pmodel.dim_lengths["columns"], ScalarSharedVariable) + assert pmodel.dim_lengths["columns"].eval() == 7 def test_implicit_coords_series(self): ser_sales = pd.Series( diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index 71b8a9b8dcf..1b061deb609 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -1087,16 +1087,15 @@ def test_sample_from_xarray_prior(self, point_list_arg_bug_fixture): with pmodel: prior = pm.sample_prior_predictive(samples=20) - - idat = pm.to_inference_data(trace, prior=prior) + idat = pm.to_inference_data(trace, prior=prior) with pmodel: pp = pm.sample_posterior_predictive(idat.prior, var_names=["d"]) def test_sample_from_xarray_posterior(self, point_list_arg_bug_fixture): pmodel, trace = point_list_arg_bug_fixture - idat = pm.to_inference_data(trace) with pmodel: + idat = pm.to_inference_data(trace) pp = pm.sample_posterior_predictive(idat.posterior, var_names=["d"])