diff --git a/botorch/acquisition/expected_feasible_improvement.py b/botorch/acquisition/expected_feasible_improvement.py
new file mode 100644
index 0000000000..759128b539
--- /dev/null
+++ b/botorch/acquisition/expected_feasible_improvement.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from __future__ import annotations
+
+from typing import Optional
+
+import torch
+from botorch.acquisition import AcquisitionFunction
+from botorch.acquisition.analytic import ExpectedImprovement
+from botorch.acquisition.monte_carlo import qExpectedImprovement
+from botorch.exceptions.errors import UnsupportedError
+from botorch.fit import fit_gpytorch_mll
+from botorch.models.skew_gp import ExactMarginalLogLikelihood_v2, SkewGPClassifier
+from botorch.utils.transforms import t_batch_mode_transform
+
+
+class SkewGPClassifierMixin:
+    """Mixin that equips an acquisition function with a `SkewGPClassifier` fit to
+    successful (training) vs. failed (pending) evaluation points."""
+
+    def __init__(
+        self,
+        X_pending: Optional[torch.Tensor] = None,
+        classifier: Optional[SkewGPClassifier] = None,
+    ) -> None:
+        self.set_X_pending(X_pending)
+        self._classifier = classifier
+
+        # Fit the classifier up front, so that fitting does not happen inside
+        # the query optimizer's closure.
+        self.classifier  # TODO: Improve me
+
+    def set_X_pending(self, *args, **kwargs) -> None:
+        AcquisitionFunction.set_X_pending(self, *args, **kwargs)
+        self._classifier = None
+
+    @property
+    def classifier(self) -> Optional[SkewGPClassifier]:
+        if self._classifier is None and self.X_pending is not None:
+            X_succ = self.model.train_inputs[0]
+            X_fail = self.X_pending
+            # deal with multi-output SingleTaskGP models (which have an additional batch dim)
+            if X_succ.ndim > X_fail.ndim:
+                if not all((X_ == X_succ[0]).all() for X_ in X_succ[1:]):
+                    # if we don't have a block design, things are ambiguous - give up
+                    raise UnsupportedError("Only block design models are supported")
+                X_succ = X_succ[0]
+            X = torch.cat([X_succ, X_fail], dim=0)
+            Y = torch.cat(
+                [
+                    torch.full(X_succ.shape[:-1], True),
+                    torch.full(X_fail.shape[:-1], False),
+                ],
+                dim=0,
+            )
+            model = self._classifier = SkewGPClassifier(train_X=X, train_Y=Y)
+            fit_gpytorch_mll(ExactMarginalLogLikelihood_v2(model.likelihood, model))
+        return self._classifier
+
+
+class ExpectedFeasibleImprovement(SkewGPClassifierMixin, ExpectedImprovement):
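+    r"""Analytic expected improvement weighted by the probability of feasibility.
+
+    The probability of feasibility is estimated by a `SkewGPClassifier` fit to the
+    model's training inputs (treated as successes) and `X_pending` (treated as
+    failures); see `SkewGPClassifierMixin.classifier`.
+
+    Example (illustrative; assumes a fitted single-outcome `model` and a set of
+    failed configurations `X_fail`):
+        >>> EFI = ExpectedFeasibleImprovement(model, best_f=0.2, X_pending=X_fail)
+        >>> efi = EFI(test_X)
+    """
+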
+    def __init__(
+        self,
+        *args,
+        X_pending: Optional[torch.Tensor] = None,
+        classifier: Optional[SkewGPClassifier] = None,
+        **kwargs,
+    ):
+        ExpectedImprovement.__init__(self, *args, **kwargs)
+        SkewGPClassifierMixin.__init__(self, X_pending=X_pending, classifier=classifier)
+
+    @t_batch_mode_transform(expected_q=1, assert_output_shape=False)
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        ei = super().forward(X)
+        if self.classifier is None:
+            return ei
+
+        p_feas = self.classifier.posterior_predictive(X)
+        return p_feas.mean.view(ei.shape) * ei
+
+
+class qExpectedFeasibleImprovement(SkewGPClassifierMixin, qExpectedImprovement):
+    """Monte Carlo expected improvement weighted by the probability of feasibility."""
+
+    def __init__(
+        self,
+        *args,
+        X_pending: Optional[torch.Tensor] = None,
+        classifier: Optional[SkewGPClassifier] = None,
+        **kwargs,
+    ):
+        qExpectedImprovement.__init__(self, *args, **kwargs)
+        SkewGPClassifierMixin.__init__(self, X_pending=X_pending, classifier=classifier)
+
+    @t_batch_mode_transform(expected_q=1, assert_output_shape=False)
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        ei = super().forward(X)
+        if self.classifier is None:
+            return ei
+
+        p_feas = self.classifier.posterior_predictive(X)
+        return p_feas.mean.view(ei.shape) * ei
diff --git a/botorch/acquisition/input_constructors.py b/botorch/acquisition/input_constructors.py
index 6e6f7dcd62..64e0863e71 100644
--- a/botorch/acquisition/input_constructors.py
+++ b/botorch/acquisition/input_constructors.py
@@ -41,6 +41,10 @@
     UpperConfidenceBound,
 )
 from botorch.acquisition.cost_aware import InverseCostWeightedUtility
+from botorch.acquisition.expected_feasible_improvement import (
+    ExpectedFeasibleImprovement,
+    qExpectedFeasibleImprovement,
+)
 from botorch.acquisition.fixed_feature import FixedFeatureAcquisitionFunction
 from botorch.acquisition.joint_entropy_search import qJointEntropySearch
 from botorch.acquisition.knowledge_gradient import (
@@ -448,7 +452,7 @@ def construct_inputs_qSimpleRegret(
     )
 
 
-@acqf_input_constructor(qExpectedImprovement)
+@acqf_input_constructor(qExpectedImprovement, qExpectedFeasibleImprovement)
 def construct_inputs_qEI(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
@@ -1253,3 +1257,8 @@ def construct_inputs_qJES(
         **kwargs,
     }
     return inputs
+
+
+@acqf_input_constructor(ExpectedFeasibleImprovement)
+def _construct_inputs_efi(*args, X_pending=None, **kwargs):
+    return {"X_pending": X_pending, **construct_inputs_best_f(*args, **kwargs)}
diff --git a/botorch/models/__init__.py b/botorch/models/__init__.py
index c6ce746be1..b472bc5075 100644
--- a/botorch/models/__init__.py
+++ b/botorch/models/__init__.py
@@ -33,6 +33,7 @@
     MultiTaskGP,
 )
 from botorch.models.pairwise_gp import PairwiseGP, PairwiseLaplaceMarginalLogLikelihood
+from botorch.models.skew_gp import SkewGP, SkewGPClassifier
 
 __all__ = [
     "AffineDeterministicModel",
@@ -56,4 +57,6 @@
     "SingleTaskGP",
     "SingleTaskMultiFidelityGP",
     "SingleTaskVariationalGP",
+    "SkewGP",
+    "SkewGPClassifier",
 ]
diff --git a/botorch/models/likelihoods/__init__.py b/botorch/models/likelihoods/__init__.py
index 8ae8f2a155..2fb2894cba 100644
--- a/botorch/models/likelihoods/__init__.py
+++ b/botorch/models/likelihoods/__init__.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
+from botorch.models.likelihoods.affine_probit import AffineProbitLikelihood
 from botorch.models.likelihoods.pairwise import (
     PairwiseLogitLikelihood,
     PairwiseProbitLikelihood,
@@ -11,6 +12,7 @@
 )
 
 __all__ = [
+    "AffineProbitLikelihood",
     "PairwiseProbitLikelihood",
     "PairwiseLogitLikelihood",
 ]
diff --git a/botorch/models/likelihoods/affine_probit.py b/botorch/models/likelihoods/affine_probit.py
new file mode 100644
index 0000000000..7097779b5f
--- /dev/null
+++ b/botorch/models/likelihoods/affine_probit.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
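+
+r"""
+Affine probit likelihoods of the form `P(f + e > 0)`, where `f = Ax + b` is an
+affine transformation of a Gaussian random vector `x` and `e` is a centered
+Gaussian noise vector. These likelihoods back the skew GP models in
+`botorch.models.skew_gp`.
+"""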
+
+from __future__ import annotations
+
+from typing import Optional, Tuple, Union
+
+import torch
+from botorch.utils.probability import TruncatedMultivariateNormal, UnifiedSkewNormal
+from gpytorch.distributions import MultivariateNormal
+from gpytorch.likelihoods import _GaussianLikelihoodBase
+from gpytorch.likelihoods.likelihood import Likelihood
+from gpytorch.likelihoods.noise_models import Noise
+from linear_operator.operators import DiagLinearOperator, LinearOperator
+from torch import BoolTensor, Tensor
+from torch.nn.functional import pad
+
+
+class AffineProbitLikelihood(_GaussianLikelihoodBase, Likelihood):
+    def __init__(
+        self,
+        weight: Union[LinearOperator, Tensor],
+        bias: Optional[Union[LinearOperator, Tensor]] = None,
+        noise_covar: Optional[Noise] = None,
+    ):
+        """Affine probit likelihood `P(f + e > 0)`, where `f = Ax + b` is an affine
+        transformation of an `n`-dimensional Gaussian random vector `x` and `e ~ Noise`
+        is an `m`-dimensional centered Gaussian noise vector.
+
+        Args:
+            weight: Matrix `A` with shape (... x m x n).
+            bias: Vector `b` with shape (... x m).
+            noise_covar: Noise covariance matrix with shape (... x m x m).
+        """
+        Likelihood.__init__(self)
+        self.weight = weight
+        self.bias = bias
+        self.noise_covar = noise_covar
+
+    def get_affine_transform(
+        self, diag: Optional[Tensor] = None
+    ) -> Tuple[Union[Tensor, LinearOperator], Optional[Union[Tensor, LinearOperator]]]:
+        """Returns the base affine transform with sign flips for negative labels.
+
+        Args:
+            diag: Scaling factors `d` for the affine transform such that `(DA, Db)` is
+                returned, where `D = diag(d)`.
+
+        Returns:
+            Tensor representation of the affine transform `(A, b)`.
+        """
+        if diag is None:
+            return self.weight, self.bias
+
+        D = DiagLinearOperator(diag)
+        return D @ self.weight, None if (self.bias is None) else D @ self.bias
+
+    def marginal(
+        self,
+        function_dist: MultivariateNormal,
+        observations: Optional[BoolTensor] = None,
+    ) -> TruncatedMultivariateNormal:
+        """Returns the truncated multivariate normal distribution of `h | h > 0`, where
+        `x` is a Gaussian random vector, `h = (Ax + b) + e`, and `e ~ Noise`."""
+        gauss_loc = function_dist.loc
+        gauss_cov = function_dist.covariance_matrix
+        signed_labels = (
+            None
+            if observations is None
+            else 2 * observations.to(dtype=gauss_loc.dtype, device=gauss_loc.device) - 1
+        )
+
+        A, b = self.get_affine_transform(diag=signed_labels)
+        trunc_loc = A @ gauss_loc if (b is None) else A @ gauss_loc + b
+        trunc_cov = A @ gauss_cov @ A.transpose(-1, -2)
+        if self.noise_covar is not None:
+            noise_diag = self.noise_covar(shape=trunc_cov.shape[:-1])
+            trunc_cov = (trunc_cov + noise_diag).to_dense()
+
+        return TruncatedMultivariateNormal(
+            loc=trunc_loc,
+            covariance_matrix=trunc_cov,
+            bounds=pad(torch.full_like(trunc_loc, float("inf")).unsqueeze(-1), (1, 0)),
+            validate_args=False,
+        )
+
+    def log_marginal(
+        self,
+        observations: BoolTensor,
+        function_dist: MultivariateNormal,
+    ) -> Tensor:
+        """Returns the log marginal likelihood `ln p(y) = ln P([2y - 1](f + e) > 0)`,
+        where `f = Ax + b` and `e ~ Noise`."""
+        return self.marginal(function_dist, observations=observations).log_partition
+
+    def latent_marginal(
+        self,
+        function_dist: MultivariateNormal,
+        observations: Optional[BoolTensor] = None,
+    ) -> UnifiedSkewNormal:
+        """Returns the UnifiedSkewNormal distribution of `x | f + e > 0`, where
+        `x` is a Gaussian random vector, `f = Ax + b`, and `e ~ Noise`."""
+        gauss_loc = function_dist.loc
+        gauss_cov = function_dist.covariance_matrix
+        signed_labels = (
+            None
+            if observations is None
+            else 2 * observations.to(dtype=gauss_loc.dtype, device=gauss_loc.device) - 1
+        )
+
+        A, b = self.get_affine_transform(diag=signed_labels)
+        trunc_loc = A @ gauss_loc if (b is None) else A @ gauss_loc + b
+        cross_cov = A @ gauss_cov
+        trunc_cov = cross_cov @ A.transpose(-1, -2)
+        if self.noise_covar is not None:
+            noise_diag = self.noise_covar(shape=trunc_cov.shape[:-1])
+            trunc_cov = (trunc_cov + noise_diag).to_dense()
+
+        trunc = TruncatedMultivariateNormal(
+            loc=trunc_loc,
+            covariance_matrix=trunc_cov,
+            bounds=pad(torch.full_like(trunc_loc, float("inf")).unsqueeze(-1), (1, 0)),
+            validate_args=False,
+        )
+
+        return UnifiedSkewNormal(
+            trunc=trunc,
+            gauss=function_dist,
+            cross_covariance_matrix=cross_cov,
+        )
diff --git a/botorch/models/skew_gp.py b/botorch/models/skew_gp.py
new file mode 100644
index 0000000000..85d53cf7ed
--- /dev/null
+++ b/botorch/models/skew_gp.py
@@ -0,0 +1,600 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+r"""
+Skew Gaussian process models and accompanying methods.
+For details, see [benavoli2020unified]_ and [benavoli2020skew]_.
+
+.. [benavoli2020unified]
+    A. Benavoli, D. Azzimonti, and D. Piga. A unified framework for closed-form
+    nonparametric regression, classification, preference and mixed problems with
+    Skew Gaussian Processes. arXiv preprint, 2020.
+
+.. [benavoli2020skew]
+    A. Benavoli, D. Azzimonti, and D. Piga. Skew Gaussian processes for
+    classification. Machine Learning, 2020.
+"""
+
+
+from __future__ import annotations
+
+from typing import Any, List, Optional, Tuple, Union
+
+import torch
+from botorch.exceptions import InputDataError, UnsupportedError
+from botorch.models.likelihoods.affine_probit import AffineProbitLikelihood
+from botorch.models.model import Model
+from botorch.models.transforms.input import InputTransform
+from botorch.models.utils import (
+    check_min_max_scaling,
+    check_no_nans,
+    gpt_posterior_settings,
+)
+from botorch.posteriors.torch import TorchPosterior
+from botorch.settings import _Flag, validate_input_scaling
+from botorch.utils.probability import TruncatedMultivariateNormal, UnifiedSkewNormal
+from gpytorch import settings as gpt_settings
+from gpytorch.distributions import MultivariateNormal
+from gpytorch.kernels import MaternKernel, ScaleKernel
+from gpytorch.likelihoods.noise_models import HomoskedasticNoise
+from gpytorch.means import ConstantMean
+from gpytorch.mlls import ExactMarginalLogLikelihood
+from gpytorch.models import GP
+from gpytorch.module import Module
+from gpytorch.priors.torch_priors import GammaPrior
+from linear_operator.operators import IdentityLinearOperator, LinearOperator, to_dense
+from torch import BoolTensor, Tensor
+from torch.distributions import Bernoulli, Distribution
+from torch.distributions.utils import lazy_property
+from torch.nn.functional import pad
+
+
+NoneType = type(None)
+
+
+class ExactMarginalLogLikelihood_v2(ExactMarginalLogLikelihood):
+    r"""
+    Same as `ExactMarginalLogLikelihood`, but calls `likelihood.log_marginal`
+    directly, passing in the target values from `forward`.
+
+    TODO: Incorporate these changes directly into `ExactMarginalLogLikelihood`.
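+
+    Example (illustrative; mirrors the usage in
+    `botorch.acquisition.expected_feasible_improvement`):
+        >>> model = SkewGPClassifier(train_X, train_Y)
+        >>> mll = ExactMarginalLogLikelihood_v2(model.likelihood, model)
+        >>> fit_gpytorch_mll(mll)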
+ """ + + def forward( + self, function_dist: Distribution, target: Tensor, *params: Tensor + ) -> MarginalLogLikelihood: + log_prob = self.likelihood.log_marginal(target, function_dist) + log_prob = self._add_other_terms(log_prob, params) + num_data = function_dist.event_shape.numel() + return log_prob.div_(num_data) + + +class SkewGPPredictionStrategy: + def __init__( + self, + train_prior: MultivariateNormal, + likelihood: AffineProbitLikelihood, + ): + self.train_prior = train_prior + self.likelihood = likelihood + + def __call__( + self, + test_prior: MultivariateNormal, + train_test_covar: Union[LinearOperator, Tensor], + predict_noise: Union[bool, Tensor] = False, + ) -> UnifiedSkewNormal: + r"""Returns the conditional distribution of test variables. + + Args: + test_prior: The prior distribution of test variables. + train_test_covar: The prior cross-covariance of training and test variables. + predict_noise: If True, use the likelihood's noise module to noise + the posterior distribution of latent random variables. If a Tensor, + add it directly to the latent random variables covariance matrix. + + Returns: + The conditional distribution of test variables. + """ + if predict_noise is True and self.likelihood.noise_covar is not None: + predict_noise = self.likelihood.noise_covar(shape=test_prior.mean.shape) + + if isinstance(predict_noise, (Tensor, LinearOperator)): + test_prior = MultivariateNormal( + mean=test_prior.mean, + covariance_matrix=test_prior.covariance_matrix + predict_noise, + ) + + A, _ = self.affine_transform + + # ensuring that trunc and gauss have the same batch_shape + trunc = self.train_prior_predictive + gauss = test_prior + batch_t = trunc.batch_shape + batch_g = gauss.batch_shape + if batch_t != batch_g: + batch_shape = torch.broadcast_shapes(batch_t, batch_g) + trunc = trunc.expand(batch_shape) + gauss = gauss.expand(batch_shape) + + return UnifiedSkewNormal( + trunc=trunc, + gauss=gauss, + cross_covariance_matrix=(A @ train_test_covar), + ) + + @lazy_property + def affine_transform(self) -> Tuple[Tensor, Optional[Tensor]]: + return self.likelihood.get_affine_transform() + + @property + def num_train(self): + return self.train_shape.numel() + + @lazy_property + def train_shape(self): + return self.train_prior.event_shape + + @lazy_property + def train_prior_predictive(self) -> TruncatedMultivariateNormal: + return self.likelihood.marginal(self.train_prior) + + +class SkewGP(Model, GP): + def __init__( + self, + likelihood: AffineProbitLikelihood, + train_X: Tensor, + train_Y: Optional[Tensor] = None, + mean_module: Optional[Module] = None, + covar_module: Optional[Module] = None, + input_transform: Optional[InputTransform] = None, + **kwargs: Any, + ) -> None: + r"""Infinite-dimensional analogue of a unified skew normal distribution. + + Args: + likelihood: An AffineProbitLikelihood that defines a set of affine constraints. + train_X: A `batch_shape x n x d` tensor of training features. + train_Y: A `batch_shape x n x 1` boolean tensor of training observations. + mean_module: The module used to compute prior means. + covar_module: The module used to compute prior covariances. + input_transform: An input transform that is applied in the model's + forward pass. 
+ """ + super().__init__() + self._validate_inputs(train_X=train_X, train_Y=train_Y) + with torch.no_grad(): + X = train_X if input_transform is None else input_transform(train_X) + + if mean_module is None: + mean_module = ConstantMean() + + if covar_module is None: + covar_module = ScaleKernel( + MaternKernel( + nu=2.5, + ard_num_dims=X.shape[-1], + lengthscale_prior=GammaPrior(3.0, 6.0), + ), + outputscale_prior=GammaPrior(2.0, 0.15), + ) + # Initialize to modest values to avoid potential numerical issues + covar_module.base_kernel.lengthscale = 0.25 * (X.shape[-1] ** 0.5) + + self._train_inputs = None + self._train_targets = None + self._prediction_strategy = None + self.mean_module = mean_module + self.covar_module = covar_module + self.likelihood = likelihood + if input_transform is not None: + self.input_transform = input_transform + + self.set_train_data( + inputs=(train_X,), + targets=None if train_Y is None else train_Y.squeeze(-1), + ) + self.to(train_X) + + def forward(self, x: Tensor) -> MultivariateNormal: + if self.training: + x = self.transform_inputs(x) + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + return MultivariateNormal(mean_x, covar_x) + + def __call__(self, *inputs, **kwargs): + # Training and prior modes + if self.training or gpt_settings.prior_mode.on() or self.train_inputs is None: + if self.training: + if self.train_inputs is None: + raise RuntimeError("train_inputs cannot be None in training mode.") + if gpt_settings.debug.on() and len(inputs): + if len(inputs) != len(self.train_inputs) or not all( + A.equal(B) for A, B in zip(self.train_inputs, inputs) + ): + raise RuntimeError("You must train on the training inputs!") + + return self.forward(*self.train_inputs) + + test_prior = self.forward(*inputs) + train_test_covar = self.covar_module(*self.train_inputs, *inputs) + with gpt_settings.cg_tolerance(gpt_settings.eval_cg_tolerance.value()): + return self.prediction_strategy( + test_prior=test_prior, train_test_covar=train_test_covar, **kwargs + ) + + def posterior(self, X: Tensor) -> TorchPosterior: + r"""Computes the posterior distribution of process values at test locations `X`. + + Args: + X: A `(batch_shape) x q x d`-dim Tensor, where `d` is the dimension + of the feature space and `q` is the number of points considered + jointly. + + Returns: + A `TorchPosterior` object with a `UnifiedSkewNormal` distribution, + representing a batch of `b` joint distributions over `q` points. + """ + self.eval() + with gpt_posterior_settings(): + usn = self(self.transform_inputs(X)) + return TorchPosterior(distribution=usn) + + def posterior_predictive( + self, X: Tensor, predict_noise: Union[bool, Tensor] = True + ) -> TorchPosterior: + r"""Computes the posterior distribution of observables at test locations `X`. + + Args: + X: A `(batch_shape) x q x d`-dim Tensor, where `d` is the dimension + of the feature space and `q` is the number of points considered + jointly. + predict_noise: If True, use the likelihood's noise module to noise + the posterior distribution of latent random variables. If a Tensor, + add it directly to the latent random variables covariance matrix. + + Returns: + A `TorchPosterior` object with a `UnifiedSkewNormal` distribution, + representing a batch of `b` joint distributions over `q` points. + Includes observation noise if specified. 
+ """ + self.eval() + with gpt_posterior_settings(): + usn = self(self.transform_inputs(X), predict_noise=predict_noise) + return TorchPosterior(distribution=usn) + + def set_train_data( + self, + inputs: Optional[Tuple[Tensor, ...]] = None, + targets: Optional[Tensor] = None, + strict: Optional[bool] = None, + ) -> None: + r"""Set training data (does not re-fit model hyper-parameters). + + Args: + inputs: The new training inputs. + targets: The new training targets. + strict: If `True`, the new inputs and targets must have + the same shape, dtype, and device as the current inputs and targets. + Otherwise, any shape/dtype/device are allowed. Default to `False` if + no training inputs/targets have not yet been defined; otherwise, `True`. + """ + if inputs is not None: + _strict = strict or (strict is None and self.train_inputs is not None) + if _strict and len(inputs) != len(self.train_inputs): + raise RuntimeError("Cannot modify number of input tensors.") + + if torch.is_tensor(inputs): + inputs = (inputs.unsqueeze(-1) if inputs.ndimension() == 1 else inputs,) + else: + inputs = (X.unsqueeze(-1) if X.ndimension() == 1 else X for X in inputs) + + if _strict: + for input_, t_input in zip(inputs, self.train_inputs): + for attr in {"shape", "dtype", "device"}: + expected_attr = getattr(t_input, attr, None) + found_attr = getattr(input_, attr, None) + if expected_attr != found_attr: + msg = "Cannot modify {attr} of inputs (expected {e_attr}, found {f_attr})." + msg = msg.format( + attr=attr, e_attr=expected_attr, f_attr=found_attr + ) + raise RuntimeError(msg) + self.train_inputs = inputs + else: + self.train_inputs = tuple(inputs) + + if targets is not None: + if strict or (strict is None and self.train_targets is not None): + for attr in {"shape", "dtype", "device"}: + expected_attr = getattr(self.train_targets, attr, None) + found_attr = getattr(targets, attr, None) + if expected_attr != found_attr: + msg = "Cannot modify {attr} of targets (expected {e_attr}, found {f_attr})." 
+                        msg = msg.format(
+                            attr=attr, e_attr=expected_attr, f_attr=found_attr
+                        )
+                        raise RuntimeError(msg)
+            self.train_targets = targets
+
+        self._prediction_strategy = None
+
+    @property
+    def train_inputs(self) -> Tuple[Tensor, ...]:
+        return self._train_inputs
+
+    @train_inputs.setter
+    def train_inputs(self, train_inputs: Tuple[Tensor, ...]) -> None:
+        self._train_inputs = train_inputs
+
+    @property
+    def train_targets(self) -> None:
+        return None
+
+    @train_targets.setter
+    def train_targets(self, train_targets: NoneType) -> None:
+        if train_targets is not None:
+            raise UnsupportedError("SkewGP does not have 'train_targets'.")
+
+    @property
+    def prediction_strategy(self) -> SkewGPPredictionStrategy:
+        if self._prediction_strategy is None:
+            self._prediction_strategy = SkewGPPredictionStrategy(
+                train_prior=self.forward(*self.train_inputs),
+                likelihood=self.likelihood,
+            )
+        return self._prediction_strategy
+
+    @classmethod
+    def _validate_inputs(
+        cls,
+        train_X: Tensor,
+        train_Y: Optional[Tensor],
+        raise_on_fail: bool = False,
+        ignore_X_dims: Optional[List[int]] = None,
+    ) -> None:
+        if train_Y is not None:
+            if train_Y.dtype != torch.bool:
+                raise ValueError("`train_Y` must be a boolean tensor.")
+            if train_Y.shape[-1] != 1:
+                raise ValueError("Only a single output is supported for `train_Y`.")
+
+        if validate_input_scaling.off():
+            return
+
+        check_no_nans(train_X)
+        check_min_max_scaling(
+            X=train_X, raise_on_fail=raise_on_fail, ignore_dims=ignore_X_dims
+        )
+
+
+class SkewGPClassifierPredictionStrategy(SkewGPPredictionStrategy):
+    def __init__(
+        self,
+        train_prior: MultivariateNormal,
+        likelihood: AffineProbitLikelihood,
+        train_targets: BoolTensor,
+    ):
+        super().__init__(train_prior=train_prior, likelihood=likelihood)
+        self.train_targets = train_targets
+        # per-instance flag that toggles between returning latent
+        # `UnifiedSkewNormal` posteriors and `Bernoulli` posterior predictives
+        self.predict_latents = type("PredictLatentsFlag", (_Flag,), {})
+
+    def __call__(self, *args, **kwargs) -> Union[Bernoulli, UnifiedSkewNormal]:
+        if self.predict_latents.on():
+            return super().__call__(*args, **kwargs)
+
+        return self.forward(*args, **kwargs)
+
+    def forward(
+        self,
+        test_prior: MultivariateNormal,
+        train_test_covar: Union[Tensor, LinearOperator],
+        test_targets: Optional[BoolTensor] = None,
+        predict_noise: Union[bool, Tensor] = True,
+    ) -> Bernoulli:
+        r"""Returns the conditional distribution of the Boolean random variable
+        indicating whether or not the test targets are correct.
+
+        Args:
+            test_prior: The prior distribution of the test variables.
+            train_test_covar: The prior cross-covariance of training and test variables.
+            test_targets: Target values for the test set; defaults to all-positive.
+            predict_noise: If True, use the likelihood's noise module to add noise to
+                the posterior distribution of the latent random variables. If a Tensor,
+                add it directly to the latent random variables' covariance matrix.
+
+        Returns:
+            The conditional `Bernoulli` distribution of the Boolean random variable
+            indicating whether or not the test targets are correct.
+ """ + test_loc = test_prior.loc + test_covar = test_prior.covariance_matrix + if predict_noise is True and self.likelihood.noise_covar is not None: + predict_noise = self.likelihood.noise_covar(shape=test_loc.shape) + + if isinstance(predict_noise, (Tensor, LinearOperator)): + test_covar = test_covar + predict_noise + + if test_targets is not None: + signs = 2 * test_targets.to(test_prior.loc.dtype) - 1 + test_loc = signs * test_prior.loc + test_covar = signs.unsqueeze(-1) * test_covar * signs.unsqueeze(-2) + train_test_covar = train_test_covar * signs.unsqueeze(-2) + + A, _ = self.affine_transform + train_solver = self.train_prior_predictive.solver + cross_covar = to_dense((A @ train_test_covar).transpose(-1, -2)) + train_perm = train_solver.perm.expand( + *cross_covar.shape[:-1], train_solver.perm.shape[-1] + ) + joint_solver = train_solver.expand(*test_covar.shape[:-2]).augment( + covariance_matrix=to_dense(test_covar), + cross_covariance_matrix=cross_covar.gather(-1, train_perm), + bounds=pad(-test_loc.unsqueeze(-1), (0, 1), value=float("inf")), + ) + log_probs = joint_solver.solve() - self.train_prior_predictive.log_partition + return Bernoulli(probs=log_probs.exp()) + + @lazy_property + def affine_transform(self) -> Tuple[Tensor, Optional[Tensor]]: + signed_train_targets = -1 + 2 * self.train_targets.to( + dtype=self.train_prior.loc.dtype, device=self.train_prior.loc.device + ) + return self.likelihood.get_affine_transform(diag=signed_train_targets) + + @lazy_property + def train_prior_predictive(self) -> TruncatedMultivariateNormal: + return self.likelihood.marginal(self.train_prior, self.train_targets) + + +class SkewGPClassifier(SkewGP): + def __init__( + self, + train_X: Tensor, + train_Y: BoolTensor, + likelihood: Optional[AffineProbitLikelihood] = None, + mean_module: Optional[Module] = None, + covar_module: Optional[Module] = None, + input_transform: Optional[InputTransform] = None, + outcome_transform: NoneType = None, + **kwargs: Any, + ) -> None: + r"""Infinite-dimensional analogue of pushing a (truncated) multivariate normal + random vector through a Heaviside step function, see [benavoli2020skew]. + + Let `f ~ GP(m, k)` be a Gaussian process and define observations as binary + random variables `y_i = 1_{f(x_i) + e(x_i) > 0}`, where `e(.)` is a centered + Gaussian noise model. Then, `P(Y) = P(Af(X) + e(X) > 0)`, where `A` is a + diagonal matrix such that `A_{ii} = 2y_i - 1`. + + Args: + train_X: A `batch_shape x n x d` tensor of training features. + train_Y: A `batch_shape x n x m` tensor of binary training labels. + likelihood: A likelihood. If omitted, use an AffineProbitLikelihood + with homoskedastic Gaussian noise. + covar_module: The module used to compute prior covariances. + input_transform: An input transform that is applied in the model's + forward pass. + """ + assert outcome_transform is None, UnsupportedError( + "SkewGPClassifier does not use an outcome transform." 
+        """
+        if outcome_transform is not None:
+            raise UnsupportedError(
+                "SkewGPClassifier does not use an outcome transform."
+            )
+
+        if likelihood is None:
+            weight = IdentityLinearOperator(
+                train_X.shape[:1], device=train_X.device, dtype=train_X.dtype
+            )
+            noise_covar = HomoskedasticNoise(noise_prior=GammaPrior(0.9, 10.0))
+            likelihood = AffineProbitLikelihood(weight=weight, noise_covar=noise_covar)
+
+        super().__init__(
+            train_X=train_X,
+            train_Y=train_Y,
+            likelihood=likelihood,
+            mean_module=mean_module,
+            covar_module=covar_module,
+            input_transform=input_transform,
+            **kwargs,
+        )
+
+    def forward(self, x: Tensor) -> MultivariateNormal:
+        if self.training:
+            x = self.transform_inputs(x)
+        mean_x = self.mean_module(x)
+        covar_x = self.covar_module(x)
+        return MultivariateNormal(mean_x, covar_x)
+
+    def posterior(self, X: Tensor) -> TorchPosterior:
+        r"""Computes the posterior distribution of process values at test locations `X`.
+
+        Args:
+            X: A `(batch_shape) x q x d`-dim Tensor, where `d` is the dimension
+                of the feature space and `q` is the number of points considered
+                jointly.
+
+        Returns:
+            A `TorchPosterior` object with a `UnifiedSkewNormal` distribution,
+            representing a batch of `b` joint distributions over `q` points.
+        """
+        self.eval()
+        with self.prediction_strategy.predict_latents(True), gpt_posterior_settings():
+            usn = self(self.transform_inputs(X))
+        return TorchPosterior(distribution=usn)
+
+    def posterior_predictive(
+        self,
+        X: Tensor,
+        predict_noise: Union[bool, Tensor] = True,
+        test_targets: Optional[BoolTensor] = None,
+        **kwargs: Any,
+    ) -> TorchPosterior:
+        r"""Computes the posterior distribution of the Boolean random variable
+        indicating whether or not the test targets are correct.
+
+        Args:
+            X: A `(batch_shape) x q x d`-dim Tensor, where `d` is the dimension
+                of the feature space and `q` is the number of points considered
+                jointly.
+            predict_noise: If True, use the likelihood's noise module to add
+                noise to the covariance of the latent random variables. If a Tensor,
+                add it directly to the aforementioned covariance
+                (must be of shape `(batch_shape) x q`).
+            test_targets: Target values for the test set, assumed positive by default.
+
+        Returns:
+            A `TorchPosterior` object with a `Bernoulli` distribution, representing a
+            batch of `b` joint distributions over `q` points.
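+
+        Example (illustrative, given a fitted classifier `model`):
+            >>> posterior = model.posterior_predictive(torch.rand(4, 2))
+            >>> p_feas = posterior.distribution.probs  # Bernoulli probabilities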
+ """ + self.eval() + with self.prediction_strategy.predict_latents(False), gpt_posterior_settings(): + bernoulli = self( + self.transform_inputs(X), + predict_noise=predict_noise, + test_targets=test_targets, + ) + return TorchPosterior(distribution=bernoulli) + + @property + def train_targets(self) -> Tensor: + return self._train_targets + + @train_targets.setter + def train_targets(self, train_targets: Tensor) -> None: + self._train_targets = train_targets + + @property + def prediction_strategy(self) -> SkewGPClassifierPredictionStrategy: + if self._prediction_strategy is None: + self._prediction_strategy = SkewGPClassifierPredictionStrategy( + train_prior=self.forward(*self.train_inputs), + likelihood=self.likelihood, + train_targets=self.train_targets, + ) + return self._prediction_strategy + + @classmethod + def _validate_inputs( + cls, + train_X: Tensor, + train_Y: BoolTensor, + train_Yvar: Optional[Tensor] = None, + raise_on_fail: bool = False, + ignore_X_dims: Optional[List[int]] = None, + ) -> None: + if validate_input_scaling.off(): + return + + check_no_nans(train_X) + check_no_nans(train_Y) + if train_Yvar is not None: + check_no_nans(train_Yvar) + if torch.any(train_Yvar < 0): + raise InputDataError("Input data contains negative variances.") + + check_min_max_scaling( + X=train_X, raise_on_fail=raise_on_fail, ignore_dims=ignore_X_dims + ) diff --git a/test/models/test_skew_gp.py b/test/models/test_skew_gp.py new file mode 100644 index 0000000000..002c7bd209 --- /dev/null +++ b/test/models/test_skew_gp.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from __future__ import annotations + +from contextlib import suppress + +from functools import lru_cache +from itertools import product +from typing import Iterator, NamedTuple, Optional, Tuple, TypeVar, Union +from unittest.mock import patch +from warnings import catch_warnings, filterwarnings + +import torch +from botorch.exceptions.errors import UnsupportedError +from botorch.exceptions.warnings import OptimizationWarning +from botorch.fit import fit_gpytorch_mll +from botorch.models.likelihoods.affine_probit import AffineProbitLikelihood +from botorch.models.skew_gp import ( + ExactMarginalLogLikelihood_v2, + SkewGP, + SkewGPClassifier, +) +from botorch.models.transforms.input import InputTransform, Normalize +from botorch.posteriors.torch import TorchPosterior +from botorch.utils.probability.unified_skew_normal import UnifiedSkewNormal +from botorch.utils.testing import BotorchTestCase +from gpytorch import settings as gpytorch_settings +from gpytorch.kernels import MaternKernel, ScaleKernel +from gpytorch.likelihoods.noise_models import HomoskedasticNoise +from gpytorch.means import ConstantMean +from gpytorch.priors import GammaPrior +from torch import Size, Tensor +from torch.distributions.bernoulli import Bernoulli + +T = TypeVar("T") +Choices = type("Choices", (tuple,), {}) +Option = Union[T, Tuple[T, ...]] + + +class TestConfigSkewGP(NamedTuple): + seed: Option[int] = Choices([0]) + dtype: Option[torch.dtype] = Choices([torch.float32, torch.float64]) + device: Option[torch.device] = Choices([torch.device("cpu")]) + input_dim: Option[int] = Choices([2]) + num_train: Option[int] = Choices([3]) + num_constraints: Option[int] = Choices([2]) + test_shapes: Option[Tuple[Size, ...]] = Choices([(Size([4]), Size([2, 4]))]) + transform_inputs: 
Option[bool] = Choices([False, True]) + + +class TestCacheSkewGP(NamedTuple): + model: SkewGP + train_X: Tensor + test_Xs: Tuple[Tensor, ...] + likelihood: AffineProbitLikelihood + input_transform: Optional[InputTransform] = None + + +class TestSkewGP(BotorchTestCase): + def setUp(self): + self.choices = TestConfigSkewGP() + + def gen_config(self) -> Iterator[NamedTuple]: + for config in map( + lambda tpl: type(self.choices)(*tpl), + product(*(x if isinstance(x, Choices) else (x,) for x in self.choices)), + ): + yield config + + @property + def cases(self) -> Iterator[Tuple[TestConfigSkewGP, TestCacheSkewGP]]: + for config in self.gen_config(): + yield config, self.get_case(config) + + @lru_cache(maxsize=None) + def get_case(self, config: TestConfigSkewGP) -> TestCacheSkewGP: + assert config.num_train > 1 + + tkwargs = {"device": config.device, "dtype": config.dtype} + with torch.random.fork_rng(): + torch.random.manual_seed(config.seed) + train_X = torch.rand(config.num_train, config.input_dim, **tkwargs) + test_Xs = tuple( + torch.rand(test_shape + (config.input_dim,), **tkwargs) + for test_shape in config.test_shapes + ) + + weight = torch.rand(config.num_constraints, config.num_train, **tkwargs) + bias = torch.rand(config.num_constraints, **tkwargs) + + noise_covar = HomoskedasticNoise(noise_prior=GammaPrior(0.9, 10.0)) + likelihood = AffineProbitLikelihood( + weight=weight, bias=bias, noise_covar=noise_covar + ) + + if config.transform_inputs: + warping = Normalize( + d=config.input_dim, + transform_on_train=True, + bounds=torch.tensor( + [config.input_dim * [-1.0], config.input_dim * [1.0]], **tkwargs + ), + ) + else: + warping = None + + model = SkewGP(train_X=train_X, likelihood=likelihood, input_transform=warping) + + return TestCacheSkewGP( + model=model.to(**tkwargs), + likelihood=likelihood, + train_X=train_X, + test_Xs=test_Xs, + input_transform=warping, + ) + + def test_init(self): + for _, cache in self.cases: + model = cache.model + self.assertIsInstance(model.mean_module, ConstantMean) + self.assertIsInstance(model.covar_module, ScaleKernel) + base_kernel = model.covar_module.base_kernel + self.assertIsInstance(base_kernel, MaternKernel) + self.assertIsInstance(base_kernel.lengthscale_prior, GammaPrior) + if cache.input_transform is not None: + self.assertEqual(model.input_transform, cache.input_transform) + + def test_call(self): + for _, cache in self.cases: + model = cache.model + model.train() + with patch.object(model, "_train_inputs", None): + with self.assertRaisesRegex(RuntimeError, "train_inputs cannot be Non"): + model(cache.train_X) + + with gpytorch_settings.debug(True): + with self.assertRaisesRegex(RuntimeError, "must train on the training"): + model(cache.test_Xs[0]) + + def test_set_train_data(self): + config, cache = next(self.cases) + + X = cache.test_Xs[0] + while X.ndim > 2: + X = X[0] + + with self.assertRaisesRegex(RuntimeError, "Cannot modify number of"): + cache.model.set_train_data(inputs=(), strict=True) + + with self.assertRaisesRegex(RuntimeError, "Cannot modify shape of"): + cache.model.set_train_data(inputs=(X[:1],), strict=True) + + with self.assertRaisesRegex(UnsupportedError, "SkewGP does not have"): + cache.model.set_train_data(targets=X) # fails regardless of strict + + def test_fit(self): + for _, cache in self.cases: + mll = ExactMarginalLogLikelihood_v2(cache.model.likelihood, cache.model) + with catch_warnings(): + filterwarnings("ignore", category=OptimizationWarning) + fit_gpytorch_mll( + mll, optimizer_kwargs={"options": 
{"maxiter": 1}}, max_attempts=1 + ) + + def test_eval(self): + for config, cache in self.cases: + model = cache.model + model.eval() + self.assertEqual(model.prediction_strategy.num_train, config.num_train) + self.assertEqual( + tuple(model.prediction_strategy.train_shape), (config.num_train,) + ) + if config.transform_inputs: + transformed_X = cache.input_transform(cache.train_X) + self.assertTrue(model.train_inputs[0].equal(transformed_X)) + + def test_posterior(self): + for _, cache in self.cases: + for test_X in cache.test_Xs: + posterior = cache.model.posterior(test_X) + self.assertIsInstance(posterior, TorchPosterior) + self.assertIsInstance(posterior.distribution, UnifiedSkewNormal) + self.assertEqual(posterior.event_shape, test_X.shape[:-1]) + + posterior_pred = cache.model.posterior_predictive(test_X) + self.assertIsInstance(posterior_pred, TorchPosterior) + self.assertIsInstance(posterior_pred.distribution, UnifiedSkewNormal) + self.assertEqual(posterior_pred.event_shape, test_X.shape[:-1]) + + +class TestConfigSkewGPClassifier(NamedTuple): + seed: Option[int] = Choices([0]) + dtype: Option[torch.dtype] = Choices([torch.float32, torch.float64]) + device: Option[torch.device] = Choices([torch.device("cpu")]) + input_dim: Option[int] = Choices([2]) + num_train: Option[int] = Choices([3]) + test_shapes: Option[Tuple[Size, ...]] = Choices([(Size([4]), Size([2, 4]))]) + transform_inputs: Option[bool] = Choices([False, True]) + + +class TestCacheSkewGPClassifier(NamedTuple): + model: SkewGP + train_X: Tensor + train_Y: Tensor + test_Xs: Tuple[Tensor, ...] + input_transform: Optional[InputTransform] = None + + +class TestSkewGPClassifier(TestSkewGP): + def setUp(self): + self.choices = TestConfigSkewGPClassifier() + + @lru_cache(maxsize=None) + def get_case(self, config: TestConfigSkewGPClassifier) -> TestCacheSkewGPClassifier: + assert config.num_train > 1 + + tkwargs = {"device": config.device, "dtype": config.dtype} + with torch.random.fork_rng(): + torch.random.manual_seed(config.seed) + train_X = torch.rand(config.num_train, config.input_dim, **tkwargs) + train_Y = torch.rand(config.num_train) > 0.5 + test_Xs = tuple( + torch.rand(test_shape + (config.input_dim,), **tkwargs) + for test_shape in config.test_shapes + ) + + if config.transform_inputs: + warping = Normalize( + d=config.input_dim, + transform_on_train=True, + bounds=torch.tensor( + [config.input_dim * [-1.0], config.input_dim * [1.0]], **tkwargs + ), + ) + else: + warping = None + + model = SkewGPClassifier( + train_X=train_X, train_Y=train_Y, input_transform=warping + ) + + return TestCacheSkewGPClassifier( + model=model.to(**tkwargs), + train_X=train_X, + train_Y=train_Y, + test_Xs=test_Xs, + input_transform=warping, + ) + + def test_set_train_data(self): + config, cache = next(self.cases) + + X = cache.test_Xs[0] + while X.ndim > 2: + X = X[0] + + with self.assertRaisesRegex(RuntimeError, "Cannot modify number of"): + cache.model.set_train_data(inputs=(), strict=True) + + with self.assertRaisesRegex(RuntimeError, "Cannot modify shape of"): + cache.model.set_train_data(inputs=(X[:1],), strict=True) + + with self.assertRaisesRegex(RuntimeError, "Cannot modify"): + cache.model.set_train_data(targets=X, strict=True) + + def test_posterior(self): + for _, cache in self.cases: + for test_X in cache.test_Xs: + # TODO: Batch-mode gradients currently disabled due to pivoting + with (torch.no_grad if test_X.ndim > 2 else suppress)(): + posterior = cache.model.posterior(test_X) + self.assertIsInstance(posterior, TorchPosterior) 
+ self.assertIsInstance(posterior.distribution, UnifiedSkewNormal) + self.assertEqual(posterior.event_shape, test_X.shape[:-1]) + + posterior_pred = cache.model.posterior_predictive(test_X) + self.assertIsInstance(posterior_pred, TorchPosterior) + self.assertIsInstance(posterior_pred.distribution, Bernoulli) + self.assertEqual(posterior_pred.event_shape, test_X.shape[:-2])