diff --git a/aepsych/acquisition/lookahead_utils.py b/aepsych/acquisition/lookahead_utils.py index 842282e67..5e3817ada 100644 --- a/aepsych/acquisition/lookahead_utils.py +++ b/aepsych/acquisition/lookahead_utils.py @@ -41,6 +41,7 @@ def posterior_at_xstar_xq( - Sigma_sq: (b x m) covariance between Xstar and each point in Xq. """ # Evaluate posterior and extract needed components + Xq = Xq.to(Xstar) Xext = torch.cat((Xstar, Xq), dim=-2) posterior = model.posterior(Xext, posterior_transform=posterior_transform) mu = posterior.mean[..., :, 0] diff --git a/aepsych/benchmark/problem.py b/aepsych/benchmark/problem.py index 2aea5ec45..966431c19 100644 --- a/aepsych/benchmark/problem.py +++ b/aepsych/benchmark/problem.py @@ -9,7 +9,7 @@ import numpy as np import torch -from aepsych.models.model_protocol import ModelProtocol +from aepsych.models.base import AEPsychModelMixin from aepsych.models.utils import p_below_threshold from aepsych.strategy import SequentialStrategy from aepsych.utils import make_scaled_sobol @@ -78,11 +78,11 @@ def sample_y( """ return bernoulli.rvs(self.p(x)) - def f_hat(self, model: ModelProtocol) -> torch.Tensor: + def f_hat(self, model: AEPsychModelMixin) -> torch.Tensor: """Generate mean predictions from the model over the evaluation grid. Args: - model (TensoModelProtocolr): Model to evaluate. + model (AEPsychModelMixin): Model to evaluate. Returns: torch.Tensor: Posterior mean from underlying model over the evaluation grid. @@ -109,11 +109,11 @@ def p_true(self) -> torch.Tensor: normal_dist = torch.distributions.Normal(0, 1) return normal_dist.cdf(self.f_true) - def p_hat(self, model: ModelProtocol) -> torch.Tensor: + def p_hat(self, model: AEPsychModelMixin) -> torch.Tensor: """Generate mean predictions from the model over the evaluation grid. Args: - model (TensoModelProtocolr): Model to evaluate. + model (AEPsychModelMixin): Model to evaluate. Returns: torch.Tensor: Posterior mean from underlying model over the evaluation grid. @@ -171,9 +171,9 @@ def evaluate( # eval in samp-based expectation over posterior instead of just mean fsamps = model.sample(self.eval_grid, num_samples=1000) try: - psamps = ( - model.sample(self.eval_grid, num_samples=1000, probability_space=True) # type: ignore - ) + psamps = model.sample( + self.eval_grid, num_samples=1000, probability_space=True + ) # type: ignore except ( TypeError ): # vanilla models don't have proba_space samps, TODO maybe we should add them diff --git a/aepsych/generators/acqf_grid_search_generator.py b/aepsych/generators/acqf_grid_search_generator.py index eef1c0dba..446d9033b 100644 --- a/aepsych/generators/acqf_grid_search_generator.py +++ b/aepsych/generators/acqf_grid_search_generator.py @@ -10,12 +10,11 @@ import numpy as np import torch -from aepsych.models.model_protocol import ModelProtocol +from aepsych.generators.grid_eval_acqf_generator import GridEvalAcqfGenerator +from aepsych.models.base import AEPsychModelMixin from aepsych.utils_logging import getLogger from numpy.random import choice -from .grid_eval_acqf_generator import GridEvalAcqfGenerator - logger = getLogger() @@ -25,7 +24,7 @@ class AcqfGridSearchGenerator(GridEvalAcqfGenerator): def _gen( self, num_points: int, - model: ModelProtocol, + model: AEPsychModelMixin, fixed_features: Optional[Dict[int, float]] = None, **gen_options, ) -> torch.Tensor: @@ -34,7 +33,7 @@ def _gen( Args: num_points (int): The number of points to query. - model (ModelProtocol): The fitted model used to evaluate the acquisition function. 
+ model (AEPsychModelMixin): The fitted model used to evaluate the acquisition function. fixed_features: (Dict[int, float], optional): Parameters that are fixed to specific values. gen_options (dict): Additional options for generating points, including: - "seed": Random seed for reproducibility. diff --git a/aepsych/generators/acqf_thompson_sampler_generator.py b/aepsych/generators/acqf_thompson_sampler_generator.py index 5dbc4afa5..7c4ae35d6 100644 --- a/aepsych/generators/acqf_thompson_sampler_generator.py +++ b/aepsych/generators/acqf_thompson_sampler_generator.py @@ -10,7 +10,7 @@ import numpy as np import torch -from aepsych.models.model_protocol import ModelProtocol +from aepsych.models.base import AEPsychModelMixin from aepsych.utils_logging import getLogger from numpy.random import choice @@ -25,7 +25,7 @@ class AcqfThompsonSamplerGenerator(GridEvalAcqfGenerator): def _gen( self, num_points: int, - model: ModelProtocol, + model: AEPsychModelMixin, fixed_features: Optional[Dict[int, float]] = None, **gen_options, ) -> torch.Tensor: @@ -34,7 +34,7 @@ def _gen( Args: num_points (int): The number of points to query. - model (ModelProtocol): The fitted model used to evaluate the acquisition function. + model (AEPsychModelMixin): The fitted model used to evaluate the acquisition function. fixed_features: (Dict[int, float], optional): Parameters that are fixed to specific values. gen_options (dict): Additional options for generating points, including: - "seed": Random seed for reproducibility. diff --git a/aepsych/generators/base.py b/aepsych/generators/base.py index 53bb28abd..9aeb29984 100644 --- a/aepsych/generators/base.py +++ b/aepsych/generators/base.py @@ -11,7 +11,7 @@ import torch from aepsych.config import Config, ConfigurableMixin -from aepsych.models.base import AEPsychMixin +from aepsych.models.base import AEPsychModelMixin from botorch.acquisition import ( AcquisitionFunction, LogNoisyExpectedImprovement, @@ -21,9 +21,7 @@ ) from botorch.acquisition.preference import AnalyticExpectedUtilityOfBestOption -from ..models.model_protocol import ModelProtocol - -AEPsychModelType = TypeVar("AEPsychModelType", bound=AEPsychMixin) +AEPsychModelType = TypeVar("AEPsychModelType", bound=AEPsychModelMixin) @runtime_checkable @@ -166,12 +164,14 @@ def _get_acqf_options( return extra_acqf_args - def _instantiate_acquisition_fn(self, model: ModelProtocol) -> AcquisitionFunction: + def _instantiate_acquisition_fn( + self, model: AEPsychModelMixin + ) -> AcquisitionFunction: """ Instantiates the acquisition function with the specified model and additional arguments. Args: - model (ModelProtocol): The model to use with the acquisition function. + model (AEPsychModelMixin): The model to use with the acquisition function. Returns: AcquisitionFunction: Configured acquisition function. 
@@ -193,6 +193,8 @@ def _instantiate_acquisition_fn(self, model: ModelProtocol) -> AcquisitionFuncti self.acqf_kwargs["ub"] = self.acqf_kwargs["ub"].to(model.device) if self.acqf in self.baseline_requiring_acqfs: + if model.train_inputs is None: + raise ValueError(f"model needs data as a baseline for {self.acqf}") return self.acqf(model, model.train_inputs[0], **self.acqf_kwargs) else: return self.acqf(model=model, **self.acqf_kwargs) diff --git a/aepsych/generators/epsilon_greedy_generator.py b/aepsych/generators/epsilon_greedy_generator.py index a35b9d95c..2a26a4b6b 100644 --- a/aepsych/generators/epsilon_greedy_generator.py +++ b/aepsych/generators/epsilon_greedy_generator.py @@ -9,8 +9,8 @@ import numpy as np import torch +from aepsych.models.base import AEPsychModelMixin -from ..models.model_protocol import ModelProtocol from .base import AEPsychGenerator from .optimize_acqf_generator import OptimizeAcqfGenerator @@ -65,7 +65,7 @@ def get_config_options( def gen( self, num_points: int, - model: ModelProtocol, + model: AEPsychModelMixin, fixed_features: Optional[Dict[int, float]] = None, **kwargs, ) -> torch.Tensor: @@ -73,7 +73,7 @@ def gen( Args: num_points (int): Number of points to query. - model (ModelProtocol): Model to use for generating points. + model (AEPsychModelMixin): Model to use for generating points. fixed_features: (Dict[int, float], optional): Parameters that are fixed to specific values. **kwargs: Passed to subgenerator if not exploring """ diff --git a/aepsych/generators/grid_eval_acqf_generator.py b/aepsych/generators/grid_eval_acqf_generator.py index 16a1aadad..215187f51 100644 --- a/aepsych/generators/grid_eval_acqf_generator.py +++ b/aepsych/generators/grid_eval_acqf_generator.py @@ -11,7 +11,7 @@ from aepsych.config import Config from aepsych.generators.base import AcqfGenerator, AEPsychGenerator from aepsych.generators.sobol_generator import SobolGenerator -from aepsych.models.model_protocol import ModelProtocol +from aepsych.models.base import AEPsychModelMixin from aepsych.utils_logging import getLogger from botorch.acquisition import AcquisitionFunction @@ -53,14 +53,14 @@ def __init__( def gen( self, num_points: int, - model: ModelProtocol, + model: AEPsychModelMixin, fixed_features: Optional[Dict[int, float]] = None, **gen_options, ) -> torch.Tensor: """Query next point(s) to run by optimizing the acquisition function. Args: num_points (int): Number of points to query. - model (ModelProtocol): Fitted model of the data. + model (AEPsychModelMixin): Fitted model of the data. Returns: torch.Tensor: Next set of point(s) to evaluate, [num_points x dim]. 
""" @@ -89,7 +89,7 @@ def gen( def _gen( self, num_points: int, - model: ModelProtocol, + model: AEPsychModelMixin, fixed_features: Optional[Dict[int, float]] = None, **gen_options, ) -> torch.Tensor: @@ -98,7 +98,7 @@ def _gen( def _eval_acqf( self, num_points: int, - model: ModelProtocol, + model: AEPsychModelMixin, fixed_features: Optional[Dict[int, float]] = None, **gen_options, ) -> Tuple[torch.Tensor, torch.Tensor]: diff --git a/aepsych/generators/manual_generator.py b/aepsych/generators/manual_generator.py index 729bdb39c..b9ec74ffe 100644 --- a/aepsych/generators/manual_generator.py +++ b/aepsych/generators/manual_generator.py @@ -11,7 +11,7 @@ import torch from aepsych.config import Config from aepsych.generators.base import AEPsychGenerator -from aepsych.models.base import AEPsychMixin +from aepsych.models.base import AEPsychModelMixin from aepsych.utils import _process_bounds from torch.quasirandom import SobolEngine @@ -53,14 +53,14 @@ def __init__( def gen( self, num_points: int = 1, - model: Optional[AEPsychMixin] = None, # included for API compatibility + model: Optional[AEPsychModelMixin] = None, # included for API compatibility fixed_features: Optional[Dict[int, float]] = None, **kwargs, # Ignored ) -> torch.Tensor: """Query next point(s) to run by quasi-randomly sampling the parameter space. Args: num_points (int): Number of points to query. Defaults to 1. - model (AEPsychMixin, optional): Model to use for generating points. Not used in this generator. Defaults to None. + model (AEPsychModelMixin, optional): Model to use for generating points. Not used in this generator. Defaults to None. fixed_features (Dict[int, float], optional): Ignored, kept for consistent API. **kwargs: Ignored, API compatibility diff --git a/aepsych/generators/optimize_acqf_generator.py b/aepsych/generators/optimize_acqf_generator.py index fd708b652..6d19897e1 100644 --- a/aepsych/generators/optimize_acqf_generator.py +++ b/aepsych/generators/optimize_acqf_generator.py @@ -14,7 +14,7 @@ from aepsych.acquisition.lookahead import LookaheadAcquisitionFunction from aepsych.config import Config from aepsych.generators.base import AcqfGenerator -from aepsych.models.model_protocol import ModelProtocol +from aepsych.models.base import AEPsychModelMixin from aepsych.utils_logging import getLogger from botorch.acquisition import AcquisitionFunction from botorch.optim import optimize_acqf @@ -60,14 +60,14 @@ def __init__( def gen( self, num_points: int, - model: ModelProtocol, + model: AEPsychModelMixin, fixed_features: Optional[Dict[int, float]] = None, **gen_options, ) -> torch.Tensor: """Query next point(s) to run by optimizing the acquisition function. Args: num_points (int): Number of points to query. - model (ModelProtocol): Fitted model of the data. + model (AEPsychModelMixin): Fitted model of the data. fixed_features (Dict[int, float], optional): The values where the specified parameters should be at when generating. Should be a dictionary where the keys are the indices of the parameters to fix and the values are the @@ -116,7 +116,7 @@ def gen( def _gen( self, num_points: int, - model: ModelProtocol, + model: AEPsychModelMixin, acqf: AcquisitionFunction, fixed_features: Optional[Dict[int, float]] = None, **gen_options: Dict[str, Any], @@ -126,7 +126,7 @@ def _gen( Args: num_points (int): Number of points to query. - model (ModelProtocol): Fitted model of the data. + model (AEPsychModelMixin): Fitted model of the data. acqf (AcquisitionFunction): Acquisition function. 
            fixed_features (Dict[int, float], optional): The values where the specified
                parameters should be at when generating. Should be a dictionary where
diff --git a/aepsych/generators/random_generator.py b/aepsych/generators/random_generator.py
index 96f593fd0..255fb763d 100644
--- a/aepsych/generators/random_generator.py
+++ b/aepsych/generators/random_generator.py
@@ -10,7 +10,7 @@
 import torch
 from aepsych.config import Config
 from aepsych.generators.base import AEPsychGenerator
-from aepsych.models.base import AEPsychMixin
+from aepsych.models.base import AEPsychModelMixin
 from aepsych.utils import _process_bounds
 
 
@@ -38,14 +38,14 @@ def __init__(
     def gen(
         self,
         num_points: int = 1,
-        model: Optional[AEPsychMixin] = None,  # included for API compatibility.
+        model: Optional[AEPsychModelMixin] = None,  # included for API compatibility.
         fixed_features: Optional[Dict[int, float]] = None,
         **kwargs,
     ) -> torch.Tensor:
         """Query next point(s) to run by randomly sampling the parameter space.
 
         Args:
             num_points (int): Number of points to query. Currently, only 1 point can be queried at a time.
-            model (AEPsychMixin, optional): Model to use for generating points. Not used in this generator.
+            model (AEPsychModelMixin, optional): Model to use for generating points. Not used in this generator.
             fixed_features: (Dict[int, float], optional): Parameters that are fixed to specific values.
             **kwargs: Ignored, API compatibility
diff --git a/aepsych/generators/sobol_generator.py b/aepsych/generators/sobol_generator.py
index a67b8f4cc..bdd7bfe9c 100644
--- a/aepsych/generators/sobol_generator.py
+++ b/aepsych/generators/sobol_generator.py
@@ -11,7 +11,7 @@
 import torch
 from aepsych.config import Config
 from aepsych.generators.base import AEPsychGenerator
-from aepsych.models.base import AEPsychMixin
+from aepsych.models.base import AEPsychModelMixin
 from aepsych.utils import _process_bounds
 from torch.quasirandom import SobolEngine
 
@@ -49,14 +49,14 @@ def __init__(
     def gen(
         self,
         num_points: int = 1,
-        model: Optional[AEPsychMixin] = None,  # included for API compatibility
+        model: Optional[AEPsychModelMixin] = None,  # included for API compatibility
         fixed_features: Optional[Dict[int, float]] = None,
         **kwargs,
     ) -> torch.Tensor:
         """Query next point(s) to run by quasi-randomly sampling the parameter space.
 
         Args:
             num_points (int): Number of points to query. Defaults to 1.
-            moodel (AEPsychMixin, optional): Model to use for generating points. Not used in this generator. Defaults to None.
+            model (AEPsychModelMixin, optional): Model to use for generating points. Not used in this generator. Defaults to None.
             fixed_features: (Dict[int, float], optional): Parameters that are fixed to specific values.
            **kwargs: Ignored, API compatibility
 
        Returns:
diff --git a/aepsych/models/base.py b/aepsych/models/base.py
index 111e6bf9e..1a62f0fa6 100644
--- a/aepsych/models/base.py
+++ b/aepsych/models/base.py
@@ -19,42 +19,99 @@
 from botorch.models.gpytorch import GPyTorchModel
 from botorch.posteriors import TransformedPosterior
 from gpytorch.mlls import MarginalLogLikelihood
-from torch.nn import Module
 
 logger = getLogger()
 
 
-class AEPsychMixin(GPyTorchModel, ConfigurableMixin):
+class AEPsychModelMixin(GPyTorchModel, ConfigurableMixin):
     """Mixin class that provides AEPsych-specific utility methods."""
 
     extremum_solver = "Nelder-Mead"
     outcome_types: List[str] = []
-    train_inputs: Optional[Tuple[torch.Tensor]]
-    train_targets: Optional[torch.Tensor]
     stimuli_per_trial: int = 1
 
-    def set_train_data(
-        self,
-        inputs: Optional[torch.Tensor] = None,
-        targets: Optional[torch.Tensor] = None,
-        strict: bool = False,
-    ):
+    dim: int
+    _train_inputs: Optional[Tuple[torch.Tensor]]
+    _train_targets: Optional[torch.Tensor]
+
+    def fit(self, train_x: torch.Tensor, train_y: torch.Tensor, **kwargs: Any) -> None:
+        """Fit underlying model. Must be overridden by subclasses.
+
+        Args:
+            train_x (torch.Tensor): Inputs.
+            train_y (torch.LongTensor): Responses.
+            **kwargs: Extra kwargs for fitting the model.
+        """
+        raise NotImplementedError
+
+    @property
+    def device(self) -> torch.device:
+        """Get the device of the model.
+
+        Returns:
+            torch.device: Device of the model.
+        """
+        # We assume all models have some parameters and all models will only use one device
+        # notice that this has no setting, don't let users set device, use .to().
+        return next(self.parameters()).device
+
+    @property
+    def train_inputs(self) -> Optional[Tuple[torch.Tensor]]:
+        """Get the training inputs.
+
+        Returns:
+            Optional[Tuple[torch.Tensor]]: Training inputs.
         """
-        Set the training data for the model.
+        if self._train_inputs is None:
+            return None
+
+        # makes sure the tensors are on the right device, move in place
+        for input in self._train_inputs:
+            input.to(self.device)
+
+        return self._train_inputs
+
+    @train_inputs.setter
+    def train_inputs(self, train_inputs: Optional[Tuple[torch.Tensor]]) -> None:
+        """Set the training inputs.
 
         Args:
-            inputs (torch.Tensor, optional): The new training inputs.
-            targets (torch.Tensor, optional): The new training targets.
-            strict (bool): Default is False. Ignored, just for compatibility.
+            train_inputs (Tuple[torch.Tensor]): Training inputs.
+        """
+        if train_inputs is None:
+            self._train_inputs = None
+        else:
+            for input in train_inputs:
+                input.to(self.device)
 
-            input transformers. TODO: actually use this arg or change input transforms
-            to not require it.
+            self._train_inputs = train_inputs
+
+    @property
+    def train_targets(self) -> Optional[torch.Tensor]:
+        """Get the training targets.
+
+        Returns:
+            Optional[torch.Tensor]: Training targets.
         """
-        if inputs is not None:
-            self.train_inputs = (inputs,)
+        if self._train_targets is None:
+            return None
 
-        if targets is not None:
-            self.train_targets = targets
+        # make sure the tensors are on the right device
+        self._train_targets = self._train_targets.to(self.device)
+
+        return self._train_targets
+
+    @train_targets.setter
+    def train_targets(self, train_targets: Optional[torch.Tensor]) -> None:
+        """Set the training targets.
+
+        Args:
+            train_targets (torch.Tensor, optional): Training targets.
+ """ + if train_targets is None: + self._train_targets = None + else: + self._train_targets = train_targets.to(self.device) def forward(self, x: torch.Tensor) -> gpytorch.distributions.MultivariateNormal: """Evaluate GP @@ -110,71 +167,6 @@ def _fit_mll( ) return res - @classmethod - def get_config_options( - cls, - config: Config, - name: Optional[str] = None, - options: Optional[Dict[str, Any]] = None, - ) -> Dict[str, Any]: - """. - - Args: - config (Config): Config to look for options in. - name (str, optional): The name of the strategy to warm start (Not actually optional here.) - options (Dict[str, Any], optional): options are ignored. - - Raises: - ValueError: the name of the strategy is necessary to identify warm start search criteria. - KeyError: the config specified this strategy should be warm started but the associated config section wasn't defined. - - Returns: - Dict[str, Any]: a dictionary of the search criteria described in the experiment's config - """ - # NOTE: This get_config_options implies there should be an __init__ in this base - # class, but because the exact order of superclasses in this class's - # subclasses is very particular to ensure the MRO is exactly right, we cannot - # have a __init__ here. Expect the arguments, dim, mean_module, covar_module, - # likelihood, max_fit_time, and options. Look at subclasses for typing. - - options = super().get_config_options(config=config, name=name, options=options) - - name = name or cls.__name__ - - # Missing dims - if "dim" not in options: - options["dim"] = get_dims(config) - - # Missing mean/covar modules - if ( - options.get("mean_module", None) is None - and options.get("mean_module", None) is None - ): - # Get the factory - mean_covar_factory = config.getobj( - name, "mean_covar_factory", fallback=default_mean_covar_factory - ) - - mean_module, covar_module = mean_covar_factory( - config, stimuli_per_trial=cls.stimuli_per_trial - ) - - options["mean_module"] = mean_module - options["covar_module"] = covar_module - - if "likelihood" in options and isinstance(options["likelihood"], type): - options["likelihood"] = options["likelihood"]() # Initialize it - - # Get optimize options, this is necessarily bespoke - options["optimizer_options"] = get_optimizer_options(config, name) - - return options - - -class AEPsychModelDeviceMixin(AEPsychMixin): - _train_inputs: Optional[Tuple[torch.Tensor]] - _train_targets: Optional[torch.Tensor] - def set_train_data( self, inputs: Optional[torch.Tensor] = None, @@ -198,86 +190,21 @@ def set_train_data( if targets is not None: self._train_targets = targets.to(self.device) - @property - def device(self) -> torch.device: - """Get the device of the model. - - Returns: - torch.device: Device of the model. - """ - # We assume all models have some parameters and all models will only use one device - # notice that this has no setting, don't let users set device, use .to(). - return next(self.parameters()).device - - @property - def train_inputs(self) -> Optional[Tuple[torch.Tensor]]: - """Get the training inputs. - - Returns: - Optional[Tuple[torch.Tensor]]: Training inputs. - """ - if self._train_inputs is None: - return None - - # makes sure the tensors are on the right device, move in place - for input in self._train_inputs: - input.to(self.device) - - return self._train_inputs - - @train_inputs.setter - def train_inputs(self, train_inputs: Optional[Tuple[torch.Tensor]]) -> None: - """Set the training inputs. - - Args: - train_inputs (Tuple[torch.Tensor]): Training inputs. 
- """ - if train_inputs is None: - self._train_inputs = None - else: - for input in train_inputs: - input.to(self.device) - - self._train_inputs = train_inputs - - @property - def train_targets(self) -> Optional[torch.Tensor]: - """Get the training targets. - - Returns: - Optional[torch.Tensor]: Training targets. - """ - if self._train_targets is None: - return None - - # make sure the tensors are on the right device - self._train_targets = self._train_targets.to(self.device) - - return self._train_targets - - @train_targets.setter - def train_targets(self, train_targets: Optional[torch.Tensor]) -> None: - """Set the training targets. - - Args: - train_targets (torch.Tensor, optional): Training targets. - """ - if train_targets is None: - self._train_targets = None - else: - self._train_targets = train_targets.to(self.device) - def predict( self, x: torch.Tensor, + *args, + **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: """Query the model for posterior mean and variance. Args: x (torch.Tensor): Points at which to predict from the model. + *args: Positional arguments for model-specific predict args. + **kwargs: Keyword arguments for model-specific predict kwargs. Returns: - Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at queries points. + Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points. """ with torch.no_grad(): x = x.to(self.device) @@ -285,7 +212,7 @@ def predict( mean = post.mean.squeeze() var = post.variance.squeeze() - return promote_0d(mean.to(self.device)), promote_0d(var.to(self.device)) + return mean.to(self.device), var.to(self.device) def predict_transform( self, @@ -302,7 +229,7 @@ def predict_transform( transformation is applied. Returns: - Tuple[torch.Tensor, torch.Tensor]: Transformed posterior mean and variance at queries points. + Tuple[torch.Tensor, torch.Tensor]: Transformed posterior mean and variance at query points. """ if transformed_posterior_cls is None: return self.predict(x) @@ -338,3 +265,65 @@ def update(self, train_x: torch.Tensor, train_y: torch.Tensor, **kwargs): train_y (torch.Tensor): Responses. """ return self.fit(train_x, train_y, **kwargs) + + @classmethod + def get_config_options( + cls, + config: Config, + name: Optional[str] = None, + options: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + """. + + Args: + config (Config): Config to look for options in. + name (str, optional): The name of the strategy to warm start (Not actually optional here.) + options (Dict[str, Any], optional): options are ignored. + + Raises: + ValueError: the name of the strategy is necessary to identify warm start search criteria. + KeyError: the config specified this strategy should be warm started but the associated config section wasn't defined. 
+ + Returns: + Dict[str, Any]: a dictionary of the search criteria described in the experiment's config + """ + name = name or cls.__name__ + options = super().get_config_options(config, name, options) + + dim = config.getint(name, "dim", fallback=None) + if dim is None: + dim = get_dims(config) + + mean_covar_factory = config.getobj( + name, "mean_covar_factory", fallback=default_mean_covar_factory + ) + + mean, covar = mean_covar_factory( + config, stimuli_per_trial=cls.stimuli_per_trial + ) + max_fit_time = config.getfloat(name, "max_fit_time", fallback=None) + + likelihood_cls = config.getobj(name, "likelihood", fallback=None) + + if likelihood_cls is not None: + if hasattr(likelihood_cls, "from_config"): + likelihood = likelihood_cls.from_config(config) + else: + likelihood = likelihood_cls() + else: + likelihood = None # fall back to __init__ default + + optimizer_options = get_optimizer_options(config, name) + + options.update( + { + "dim": dim, + "mean_module": mean, + "covar_module": covar, + "max_fit_time": max_fit_time, + "likelihood": likelihood, + "optimizer_options": optimizer_options, + } + ) + + return options diff --git a/aepsych/models/gp_classification.py b/aepsych/models/gp_classification.py index 04c952bfd..fa6b8f934 100644 --- a/aepsych/models/gp_classification.py +++ b/aepsych/models/gp_classification.py @@ -90,7 +90,7 @@ def predict( response probability instead of latent function value. Defaults to False. Returns: - Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at queries points. + Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points. """ if not probability_space: @@ -117,7 +117,7 @@ def predict_transform( transformation is applied. Returns: - Tuple[torch.Tensor, torch.Tensor]: Transformed posterior mean and variance at queries points. + Tuple[torch.Tensor, torch.Tensor]: Transformed posterior mean and variance at query points. """ return super().predict_transform( @@ -131,6 +131,6 @@ def predict_probability(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tens x (torch.Tensor): Points at which to predict from the model. Returns: - Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at queries points. + Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points. """ return self.predict(x, probability_space=True) diff --git a/aepsych/models/gp_regression.py b/aepsych/models/gp_regression.py index a2f00c0c7..8e6b973a7 100644 --- a/aepsych/models/gp_regression.py +++ b/aepsych/models/gp_regression.py @@ -13,7 +13,7 @@ import torch from aepsych.config import Config from aepsych.factory.default import default_mean_covar_factory -from aepsych.models.base import AEPsychModelDeviceMixin +from aepsych.models.base import AEPsychModelMixin from aepsych.utils import get_dims, get_optimizer_options, promote_0d from aepsych.utils_logging import getLogger from gpytorch.likelihoods import GaussianLikelihood, Likelihood @@ -22,7 +22,7 @@ logger = getLogger() -class GPRegressionModel(AEPsychModelDeviceMixin, ExactGP): +class GPRegressionModel(AEPsychModelMixin, ExactGP): """GP Regression model for continuous outcomes, using exact inference.""" _num_outputs = 1 diff --git a/aepsych/models/model_protocol.py b/aepsych/models/model_protocol.py deleted file mode 100644 index c08206da0..000000000 --- a/aepsych/models/model_protocol.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Meta Platforms, Inc. and affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Any, Optional, Protocol - -import torch -from botorch.posteriors import Posterior -from gpytorch.likelihoods import Likelihood - -from .transformed_posteriors import TransformedPosterior - - -class ModelProtocol(Protocol): - @property - def _num_outputs(self) -> int: - pass - - @property - def outcome_type(self) -> str: - pass - - @property - def extremum_solver(self) -> str: - pass - - @property - def train_inputs(self) -> torch.Tensor: - pass - - @property - def dim(self) -> int: - pass - - @property - def device(self) -> torch.device: - pass - - def posterior(self, X: torch.Tensor) -> Posterior: - pass - - def predict(self, x: torch.Tensor, **kwargs) -> torch.Tensor: - pass - - def predict_probability(self, x: torch.Tensor, **kwargs) -> torch.Tensor: - pass - - def predict_transform( - self, - x: torch.Tensor, - transformed_posterior_cls: Optional[type[TransformedPosterior]] = None, - **transform_kwargs, - ): - pass - - @property - def stimuli_per_trial(self) -> int: - pass - - @property - def likelihood(self) -> Likelihood: - pass - - def sample(self, x: torch.Tensor, num_samples: int) -> torch.Tensor: - pass - - def fit(self, train_x: torch.Tensor, train_y: torch.Tensor, **kwargs: Any) -> None: - pass - - def update( - self, train_x: torch.Tensor, train_y: torch.Tensor, **kwargs: Any - ) -> None: - pass diff --git a/aepsych/models/monotonic_projection_gp.py b/aepsych/models/monotonic_projection_gp.py index 947b07066..84df0c2bf 100644 --- a/aepsych/models/monotonic_projection_gp.py +++ b/aepsych/models/monotonic_projection_gp.py @@ -176,14 +176,17 @@ def posterior( # using numpy because torch doesn't support vectorized linspace, # pytorch/issues/61292 grid: Union[np.ndarray, torch.Tensor] = np.linspace( - self.lb[dim], - X[:, dim].numpy(), + self.lb[dim].cpu().numpy(), + X[:, dim].cpu().numpy(), s + 1, ) # (s+1 x n) grid = torch.tensor(grid[:-1, :], dtype=X.dtype) # Drop x; (s x n) X_aug[(1 + i * s) : (1 + (i + 1) * s), :, dim] = grid # X_aug[0, :, :] is X, and then subsequent indices are points in the grids # Predict marginal distributions on X_aug + + X = X.to(self.device) + X_aug = X_aug.to(self.device) with torch.no_grad(): post_aug = super().posterior(X=X_aug) mu_aug = post_aug.mean.squeeze() # (m*s+1 x n) @@ -198,12 +201,13 @@ def posterior( # Adjust the whole covariance matrix to accomadate the projected marginals with torch.no_grad(): post = super().posterior(X=X) - R = cov2corr(post.distribution.covariance_matrix.squeeze().numpy()) - S_proj = torch.tensor(corr2cov(R, sigma_proj.numpy()), dtype=X.dtype) + R = cov2corr(post.distribution.covariance_matrix.squeeze().cpu().numpy()) + S_proj = torch.tensor(corr2cov(R, sigma_proj.cpu().numpy()), dtype=X.dtype) mvn_proj = gpytorch.distributions.MultivariateNormal( - mu_proj.unsqueeze(0), - S_proj.unsqueeze(0), + mu_proj.unsqueeze(0).to(self.device), + S_proj.unsqueeze(0).to(self.device), ) + return GPyTorchPosterior(mvn_proj) def sample(self, x: torch.Tensor, num_samples: int) -> torch.Tensor: diff --git a/aepsych/models/pairwise_probit.py b/aepsych/models/pairwise_probit.py index f21e3538b..f4eddf441 100644 --- a/aepsych/models/pairwise_probit.py +++ b/aepsych/models/pairwise_probit.py @@ -11,7 +11,7 @@ import torch from aepsych.config import Config from aepsych.factory import default_mean_covar_factory -from aepsych.models.base import AEPsychModelDeviceMixin +from aepsych.models.base 
import AEPsychModelMixin from aepsych.utils import _process_bounds, get_dims, get_optimizer_options, promote_0d from aepsych.utils_logging import getLogger from botorch.fit import fit_gpytorch_mll @@ -22,7 +22,7 @@ logger = getLogger() -class PairwiseProbitModel(PairwiseGP, AEPsychModelDeviceMixin): +class PairwiseProbitModel(PairwiseGP, AEPsychModelMixin): _num_outputs = 1 stimuli_per_trial = 2 outcome_type = "binary" @@ -184,7 +184,7 @@ def predict( rereference (str): How to sample. Options are "x_min", "x_max", "f_min", "f_max". Defaults to "x_min". Returns: - Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at queries points. + Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points. """ if rereference is not None: samps = self.sample(x, num_samples, rereference) @@ -217,7 +217,7 @@ def predict_probability( rereference (str): How to sample. Options are "x_min", "x_max", "f_min", "f_max". Defaults to "x_min". Returns: - Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at queries points. + Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points. """ return self.predict( x, probability_space=True, num_samples=num_samples, rereference=rereference diff --git a/aepsych/models/semi_p.py b/aepsych/models/semi_p.py index b2ac1991d..8ab7c37cd 100644 --- a/aepsych/models/semi_p.py +++ b/aepsych/models/semi_p.py @@ -606,7 +606,7 @@ def predict( response probability instead of latent function value. Defaults to False. Returns: - Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at queries points. + Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points. """ if probability_space: if hasattr(self.likelihood, "objective"): diff --git a/aepsych/models/utils.py b/aepsych/models/utils.py index b29787d5c..9ad89c529 100644 --- a/aepsych/models/utils.py +++ b/aepsych/models/utils.py @@ -11,7 +11,7 @@ import numpy as np import torch -from aepsych.models.model_protocol import ModelProtocol +from aepsych.models.base import AEPsychModelMixin from aepsych.utils import dim_grid, get_jnd_multid, promote_0d from botorch.acquisition import PosteriorMean from botorch.acquisition.objective import ( @@ -159,7 +159,7 @@ def get_extremum( def get_min( - model: ModelProtocol, + model: AEPsychModelMixin, bounds: torch.Tensor, locked_dims: Optional[Mapping[int, float]] = None, probability_space: bool = False, @@ -168,7 +168,7 @@ def get_min( ) -> Tuple[float, torch.Tensor]: """Return the minimum of the modeled function, subject to constraints Args: - model (ModelProtocol): AEPsychModel to get the minimum of. + model (AEPsychModelMixin): AEPsychModel to get the minimum of. bounds (torch.Tensor): Bounds of the space to find the minimum. locked_dims (Mapping[int, float], optional): Dimensions to fix, so that the inverse is along a slice of the full surface. @@ -193,7 +193,7 @@ def get_min( def get_max( - model: ModelProtocol, + model: AEPsychModelMixin, bounds: torch.Tensor, locked_dims: Optional[Mapping[int, float]] = None, probability_space: bool = False, @@ -203,7 +203,7 @@ def get_max( """Return the maximum of the modeled function, subject to constraints Args: - model (ModelProtocol): AEPsychModel to get the maximum of. + model (AEPsychModelMixin): AEPsychModel to get the maximum of. bounds (torch.Tensor): Bounds of the space to find the maximum. locked_dims (Mapping[int, float], optional): Dimensions to fix, so that the inverse is along a slice of the full surface. Defaults to None. 
@@ -228,7 +228,7 @@ def get_max( def inv_query( - model: ModelProtocol, + model: AEPsychModelMixin, y: Union[float, torch.Tensor], bounds: torch.Tensor, locked_dims: Optional[Mapping[int, float]] = None, @@ -241,7 +241,7 @@ def inv_query( Return nearest x such that f(x) = queried y, and also return the value of f at that point. Args: - model (ModelProtocol): AEPsychModel to get the find the inverse from y. + model (AEPsychModelMixin): AEPsychModel to get the find the inverse from y. y (Union[float, torch.Tensor]): Points at which to find the inverse. bounds (torch.Tensor): Lower and upper bounds of the search space. locked_dims (Mapping[int, float], optional): Dimensions to fix, so that the @@ -288,7 +288,7 @@ def inv_query( def get_jnd( - model: ModelProtocol, + model: AEPsychModelMixin, lb: torch.Tensor, ub: torch.Tensor, dim: int, @@ -311,7 +311,7 @@ def get_jnd( Both definitions are equivalent for linear psychometric functions. Args: - model (ModelProtocol): Model to use for prediction. + model (AEPsychModelMixin): Model to use for prediction. lb (torch.Tensor): Lower bounds of the input space. ub (torch.Tensor): Upper bounds of the input space. dim (int): Dimensionality of the input space. @@ -389,7 +389,7 @@ def get_jnd( def p_below_threshold( - model: ModelProtocol, x: torch.Tensor, f_thresh: torch.Tensor + model: AEPsychModelMixin, x: torch.Tensor, f_thresh: torch.Tensor ) -> torch.Tensor: """Compute the probability that the latent function is below a threshold. @@ -417,7 +417,7 @@ def bernoulli_probit_prob_transform(mean: torch.Tensor, var: torch.Tensor): mean (torch.Tensor): The latent variance of a Bernoulli-probit model evaluated at a set of query points. Returns: - Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at queries points in probability space. + Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points in probability space. 
""" fmean = mean.squeeze() fvar = var.squeeze() diff --git a/aepsych/models/variationalgp.py b/aepsych/models/variationalgp.py index f99e1d4d7..8f88821e6 100644 --- a/aepsych/models/variationalgp.py +++ b/aepsych/models/variationalgp.py @@ -13,7 +13,7 @@ import torch from aepsych.config import Config from aepsych.factory.default import default_mean_covar_factory -from aepsych.models.base import AEPsychModelDeviceMixin +from aepsych.models.base import AEPsychModelMixin from aepsych.models.inducing_points import GreedyVarianceReduction from aepsych.models.inducing_points.base import InducingPointAllocator from aepsych.utils_logging import getLogger @@ -24,7 +24,7 @@ logger = getLogger() -class VariationalGPModel(AEPsychModelDeviceMixin, ApproximateGP): +class VariationalGPModel(AEPsychModelMixin, ApproximateGP): """Base GP model with variational inference""" _batch_size = 1 diff --git a/aepsych/strategy/strategy.py b/aepsych/strategy/strategy.py index de9b1217d..07a33edbd 100644 --- a/aepsych/strategy/strategy.py +++ b/aepsych/strategy/strategy.py @@ -14,7 +14,7 @@ import torch from aepsych.config import Config, ConfigurableMixin from aepsych.generators.base import AEPsychGenerator -from aepsych.models.base import AEPsychMixin +from aepsych.models.base import AEPsychModelMixin from aepsych.models.utils import get_max, get_min, inv_query from aepsych.strategy.utils import ensure_model_is_fresh from aepsych.transforms import ( @@ -45,7 +45,7 @@ def __init__( dim: Optional[int] = None, min_total_tells: int = 0, min_asks: int = 0, - model: Optional[AEPsychMixin] = None, + model: Optional[AEPsychModelMixin] = None, use_gpu_modeling: bool = False, use_gpu_generating: bool = False, refit_every: int = 1, @@ -69,7 +69,7 @@ def __init__( of lb and ub. min_total_tells (int): The minimum number of total observations needed to complete this strategy. min_asks (int): The minimum number of points that should be generated from this strategy. - model (ModelProtocol, optional): The AEPsych model of the data. + model (AEPsychModelMixin, optional): The AEPsych model of the data. use_gpu_modeling (bool): Whether to move the model to GPU fitting/predictions, defaults to False. use_gpu_generating (bool): Whether to use the GPU for generating points, defaults to False. refit_every (int): How often to refit the model from scratch. @@ -371,7 +371,9 @@ def inv_query( return val, arg @ensure_model_is_fresh - def predict(self, x: torch.Tensor, probability_space: bool = False) -> torch.Tensor: + def predict( + self, x: torch.Tensor, probability_space: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor]: """Predict the output value(s) for the given input(s). Args: @@ -379,21 +381,19 @@ def predict(self, x: torch.Tensor, probability_space: bool = False) -> torch.Ten probability_space (bool): Whether to return the output in probability space. Defaults to False. Returns: - torch.Tensor: The predicted output value(s). + Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points. """ assert self.model is not None, "model is None! Cannot predict without a model!" self.model.to(self.model_device) return self.model.predict(x=x, probability_space=probability_space) @ensure_model_is_fresh - def sample( - self, x: torch.Tensor, num_samples: Optional[int] = None - ) -> torch.Tensor: + def sample(self, x: torch.Tensor, num_samples: int = 1000) -> torch.Tensor: """Sample the output value(s) for the given input(s). Args: x (torch.Tensor): The input value(s). 
- num_samples (int, optional): The number of samples to generate. Defaults to None. + num_samples (int): The number of samples to generate. Defaults to 1000. Returns: torch.Tensor: The sampled output value(s). @@ -443,9 +443,9 @@ def finished(self) -> bool: self.model is not None ), "model is None! Cannot predict without a model!" fmean, _ = self.model.predict(self.eval_grid, probability_space=True) - meets_post_range = ( - (fmean.max() - fmean.min()) >= self.min_post_range - ).item() + meets_post_range = bool( + ((fmean.max() - fmean.min()) >= self.min_post_range).item() + ) else: meets_post_range = True finished = ( diff --git a/aepsych/transforms/parameters.py b/aepsych/transforms/parameters.py index 427e9654f..3e4f6e9dd 100644 --- a/aepsych/transforms/parameters.py +++ b/aepsych/transforms/parameters.py @@ -15,8 +15,7 @@ import torch from aepsych.config import Config, ConfigurableMixin from aepsych.generators.base import AcqfGenerator, AEPsychGenerator -from aepsych.models.base import AEPsychMixin -from aepsych.models.model_protocol import ModelProtocol +from aepsych.models.base import AEPsychModelMixin from aepsych.transforms.ops import Fixed, Log10Plus, NormalizeScale, Round from aepsych.transforms.ops.base import Transform from aepsych.utils import get_bounds @@ -377,7 +376,7 @@ def __init__( def gen( self, num_points: int = 1, - model: Optional[AEPsychMixin] = None, + model: Optional[AEPsychModelMixin] = None, fixed_features: Optional[Dict[int, float]] = None, **kwargs, ) -> torch.Tensor: @@ -385,7 +384,7 @@ def gen( Args: num_points (int): Number of points to query, defaults to 1. - model (AEPsychMixin, optional): The model to use to generate points, can be + model (AEPsychModelMixin, optional): The model to use to generate points, can be None if no model is needed. fixed_features: (Dict[int, float], optional): Parameters that are fixed to specific values. **kwargs: Kwargs to pass to the generator's generator. @@ -524,11 +523,11 @@ class ParameterTransformedModel(ParameterTransformWrapper, ConfigurableMixin): untransforms any outputs from the model back to raw parameter space. """ - _base_obj: ModelProtocol + _base_obj: AEPsychModelMixin def __init__( self, - model: Union[Type, ModelProtocol], + model: Union[Type, AEPsychModelMixin], transforms: ChainedInputTransform = ChainedInputTransform(**{}), **kwargs: Any, ) -> None: @@ -547,7 +546,7 @@ def __init__( The object's name will be ParameterTransformed. Args: - model (Union[Type, ModelProtocol]): Model to wrap, this could either be a + model (Union[Type, AEPsychModelMixin]): Model to wrap, this could either be a completely initialized model or just the model class. An initialized model is expected to have been initialized in the transformed parameter space (i.e., bounds are transformed). If a model class is @@ -597,9 +596,7 @@ def wrapper(self, *args, **kwargs) -> torch.Tensor: return wrapper @_promote_1d - def predict( - self, x: torch.Tensor, **kwargs - ) -> Union[torch.Tensor, Tuple[torch.Tensor]]: + def predict(self, x: torch.Tensor, **kwargs) -> Tuple[torch.Tensor, torch.Tensor]: """Query the model on its posterior given transformed x. Args: @@ -608,7 +605,7 @@ def predict( **kwargs: Keyword arguments to pass to the model.predict() call. Returns: - Union[Tensor, Tuple[Tensor]]: At least one Tensor will be returned. + Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points. 
""" x = self.transforms.transform(x) return self._base_obj.predict(x, **kwargs) @@ -616,7 +613,7 @@ def predict( @_promote_1d def predict_probability( self, x: torch.Tensor, **kwargs - ) -> Union[torch.Tensor, Tuple[torch.Tensor]]: + ) -> Tuple[torch.Tensor, torch.Tensor]: """Query the model on its posterior given transformed x and return units in response probability space. @@ -626,7 +623,7 @@ def predict_probability( **kwargs: Keyword arguments to pass to the model.predict() call. Returns: - Union[Tensor, Tuple[Tensor]]: At least one Tensor will be returned. + Tuple[torch.Tensor, torch.Tensor]: Posterior mean and variance at query points. """ x = self.transforms.transform(x) return self._base_obj.predict_probability(x, **kwargs) diff --git a/tests/models/test_monotonic_projection_gp.py b/tests/models/test_monotonic_projection_gp.py index cff554f16..b17fbd408 100644 --- a/tests/models/test_monotonic_projection_gp.py +++ b/tests/models/test_monotonic_projection_gp.py @@ -69,12 +69,7 @@ def test_posterior(self): Xtest[:, i] = torch.tensor([-1, 0, 1]) post = model.posterior(Xtest) mu = post.mean.squeeze() - self.assertTrue( - torch.equal( - torch.tensor([0, 1, 2], dtype=torch.long), - torch.argsort(mu), - ) - ) + self.assertTrue(mu[0] <= mu[1] and mu[1] <= mu[2]) # Check that min_f_val is respected config_str = """ diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 17148b13b..50906f686 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -80,7 +80,7 @@ def setUp(self): ) def unvectorized_p_below_threshold(self, x, f_thresh) -> torch.Tensor: - """this is the original p_below_threshold method in the AEPsychMixin that calculates model prediction + """this is the original p_below_threshold method in the AEPsychModelMixin that calculates model prediction of the probability of the stimulus being below a threshold for one single threshold""" f, var = self.model.predict(x) diff --git a/tests_gpu/models/test_monotonic_projection_gp.py b/tests_gpu/models/test_monotonic_projection_gp.py new file mode 100644 index 000000000..f1a8872b7 --- /dev/null +++ b/tests_gpu/models/test_monotonic_projection_gp.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import os +import unittest + +import torch + +# run on single threads to keep us from deadlocking weirdly in CI +if "CI" in os.environ or "SANDCASTLE" in os.environ: + torch.set_num_threads(1) + +import numpy as np +from aepsych.config import Config +from aepsych.transforms import ParameterTransformedModel +from sklearn.datasets import make_classification + + +class MonotonicProjectionGPtest(unittest.TestCase): + def setUp(self): + np.random.seed(1) + torch.manual_seed(1) + X, y = make_classification( + n_samples=100, + n_features=3, + n_redundant=0, + n_informative=3, + random_state=1, + n_clusters_per_class=1, + ) + self.X, self.y = torch.Tensor(X), torch.Tensor(y) + + def test_posterior(self): + X, y = self.X, self.y + config_str = """ + [common] + parnames = [x, y, z] + lb = [-4, -4, -4] + ub = [4, 4, 4] + stimuli_per_trial = 1 + outcome_types = [binary] + + strategy_names = [init_strat] + + [init_strat] + generator = OptimizeAcqfGenerator + model = MonotonicProjectionGP + + [MonotonicProjectionGP] + monotonic_dims = [0, 1] + inducing_size=100 + + [default_mean_covar_factory] + lengthscale_prior = gamma + fixed_kernel_amplitude = False + """ + config = Config(config_str=config_str) + model = ParameterTransformedModel.from_config(config, "MonotonicProjectionGP") + model.cuda() + model.fit(X, y) + + # Check that it is monotonic in both dims + for i in range(2): + Xtest = torch.zeros(3, 3) + Xtest[:, i] = torch.tensor([-1, 0, 1]) + post = model.posterior(Xtest) + mu = post.mean.squeeze() + self.assertTrue(mu[0] <= mu[1] and mu[1] <= mu[2]) + + # Check that min_f_val is respected + config_str = """ + [common] + parnames = [x, y, z] + lb = [-4, -4, -4] + ub = [4, 4, 4] + stimuli_per_trial = 1 + outcome_types = [binary] + + strategy_names = [init_strat] + + [init_strat] + generator = OptimizeAcqfGenerator + model = MonotonicProjectionGP + + [MonotonicProjectionGP] + monotonic_dims = [0] + inducing_size=10 + min_f_val = 5.0 + + [default_mean_covar_factory] + lengthscale_prior = gamma + fixed_kernel_amplitude = False + """ + config = Config(config_str=config_str) + model = ParameterTransformedModel.from_config(config, "MonotonicProjectionGP") + post = model.posterior(Xtest) + mu = post.mean.squeeze() + self.assertTrue(mu.min().item() >= 4.9) + # And in samples + samps = model.sample(Xtest, num_samples=10) + self.assertTrue(samps.min().item() >= 4.9) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests_gpu/models/test_semi_p.py b/tests_gpu/models/test_semi_p.py new file mode 100644 index 000000000..411d375b8 --- /dev/null +++ b/tests_gpu/models/test_semi_p.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import numpy as np +import numpy.testing as npt +import torch +from aepsych.acquisition import MCPosteriorVariance +from aepsych.acquisition.lookahead import GlobalMI +from aepsych.acquisition.objective import ( + FloorGumbelObjective, + FloorLogitObjective, + FloorProbitObjective, + ProbitObjective, +) +from aepsych.acquisition.objective.semi_p import ( + SemiPProbabilityObjective, + SemiPThresholdObjective, +) +from aepsych.generators import OptimizeAcqfGenerator, SobolGenerator +from aepsych.likelihoods import BernoulliObjectiveLikelihood +from aepsych.likelihoods.semi_p import LinearBernoulliLikelihood +from aepsych.models import HadamardSemiPModel, SemiParametricGPModel +from aepsych.models.semi_p import _hadamard_mvn_approx, semi_p_posterior_transform +from aepsych.strategy import SequentialStrategy, Strategy +from aepsych.utils import make_scaled_sobol +from gpytorch.distributions import MultivariateNormal +from parameterized import parameterized + + +def _hadamard_model_constructor( + stim_dim, + floor, + objective=FloorLogitObjective, +): + return HadamardSemiPModel( + dim=2, + stim_dim=stim_dim, + likelihood=BernoulliObjectiveLikelihood(objective=objective(floor=floor)), + inducing_size=10, + max_fit_time=0.5, + ) + + +def _semip_model_constructor( + stim_dim, + floor, + objective=FloorLogitObjective, +): + return SemiParametricGPModel( + dim=2, + stim_dim=stim_dim, + likelihood=LinearBernoulliLikelihood(objective=objective(floor=floor)), + inducing_size=10, + ) + + +links = [FloorLogitObjective, FloorProbitObjective, FloorGumbelObjective] +floors = [0, 0.3, 0.5] +constructors = [_semip_model_constructor, _hadamard_model_constructor] +test_configs = [[FloorLogitObjective, 0.3, _hadamard_model_constructor]] +# test_configs = list(product(links, floors, constructors)) # TODO too slow + + +class SemiPSmokeTests(unittest.TestCase): + def setUp(self): + self.seed = 1 + self.stim_dim = 0 + self.context_dim = 1 + np.random.seed(1) + torch.manual_seed(1) + X = np.random.randn(100, 2) / 3 + xcontext = X[..., self.context_dim] + xintensity = X[..., self.stim_dim] + # polynomial context + slope = xcontext - 0.7 * xcontext**2 + 0.3 * xcontext**3 - 0.1 * xcontext**4 + intercept = ( + xcontext + 0.03 * xcontext**5 - 0.2 * xcontext**3 - 0.7 * xcontext**4 + ) + # multiply by intensity + self.f = torch.Tensor(slope * (intercept + xintensity)).unsqueeze(-1) + X[:, 0] = X[:, 0] * 100 + X[:, 1] = X[:, 1] / 100 + self.lb = torch.tensor([-100.0, -0.01]) + self.ub = torch.tensor([100.0, 0.01]) + self.X = torch.Tensor(X).cuda() + self.inducing_size = 10 + + def test_analytic_lookahead_generation(self): + floor = 0 + objective = FloorProbitObjective + model = _semip_model_constructor( + stim_dim=self.stim_dim, + floor=floor, + objective=objective, + ) + model.cuda() + + generator = OptimizeAcqfGenerator( + acqf=GlobalMI, + acqf_kwargs={ + "posterior_transform": semi_p_posterior_transform, + "target": 0.75, + "query_set_size": 100, + "Xq": make_scaled_sobol(self.lb, self.ub, 100), + "lb": self.lb, + "ub": self.ub, + }, + max_gen_time=0.2, + lb=self.lb, + ub=self.ub, + ) + link = objective(floor=floor) + y = torch.bernoulli(link(self.f)) + + model.set_train_data( + self.X[:10], y[:10] + ) # no need to fit for checking gen shapes + + next_x = generator.gen(num_points=1, model=model) + self.assertEqual( + next_x.shape, + ( + 1, + 2, + ), + ) + + @parameterized.expand(test_configs) + def test_memorize_data(self, objective, floor, model_constructor): + """ + see approximate accuracy on 
easy logistic ps that only varies in 1d + (no slope and intercept) + accuracy determined by average performance on training data + """ + with self.subTest( + objective=objective.__name__, + floor=floor, + model_constructor=model_constructor, + ): + link = objective(floor=floor) + y = torch.bernoulli(link(self.f)) + + model = model_constructor( + stim_dim=self.stim_dim, + floor=floor, + objective=objective, + ) + model.cuda() + + model.fit(train_x=self.X[:50], train_y=y[:50]) + + pm, _ = model.predict(self.X[:50]) + pred = (link(pm) > 0.5).cpu().numpy() + npt.assert_allclose(pred, y[:50].numpy(), atol=1) # mismatch at most one + + model.update(self.X, y) + + pm, _ = model.predict(self.X[50:]) + pred = (link(pm) > 0.5).cpu().numpy() + npt.assert_allclose(pred, y[50:].numpy(), atol=1) + + +if __name__ == "__main__": + unittest.main()
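
Reviewer note (not part of the patch): a minimal usage sketch of the consolidated `AEPsychModelMixin` surface exercised above. It assumes an already-constructed and fitted model that subclasses the mixin (e.g. `GPRegressionModel`); the `summarize_model` helper and its arguments are hypothetical and only illustrate the `predict` / `train_inputs` / `device` contract introduced here.

```python
# Hypothetical helper, not from the patch: illustrates the AEPsychModelMixin
# contract after this change. `model` is any fitted AEPsychModelMixin subclass
# (e.g. GPRegressionModel); construction and fitting are assumed elsewhere.
import torch


def summarize_model(model, x: torch.Tensor) -> None:
    # predict() now returns a (mean, variance) tuple; inputs are moved to the
    # model's device inside predict(), and the outputs live on model.device.
    mean, var = model.predict(x)
    print("posterior mean/var shapes:", mean.shape, var.shape)

    # train_inputs / train_targets are now properties intended to keep training
    # data on the model's device, and may be None before any data has been set.
    if model.train_inputs is None:
        # Mirrors the new guard in AcqfGenerator._instantiate_acquisition_fn:
        # baseline-requiring acquisition functions need training data.
        raise ValueError("model needs data before baseline-requiring acqfs can be built")

    baseline = model.train_inputs[0]
    print("baseline lives on:", baseline.device, "model on:", model.device)
```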