diff --git a/doc/spec/model_selection.rst b/doc/spec/model_selection.rst new file mode 100644 index 000000000..17f4d75e1 --- /dev/null +++ b/doc/spec/model_selection.rst @@ -0,0 +1,43 @@ +.. _model_selection: + +================= +Model Selection +================= + +Estimators that derive from :class:`._OrthoLearner` fit first stage nuisance models on different folds of the data and then fit a final model. +In many cases it will make sense to perform model selection over a number of first-stage models, and the library facilitates this by allowing +a flexible specification of the first-stage models, as any of the following: + + * An sklearn-compatible estimator + + * If the estimator is a known class that performs its own hyperparameter selection via cross-validation (such as :class:`~sklearn.linear_model.LassoCV`), + then this will be done once and then the selected hyperparameters will be used when cross-fitting on each fold + + * If a custom class is used, then it should support a `fit` method and either a `predict` method if the target is continuous or `predict_proba` if the target is discrete. + + * One of the following strings; the exact set of models supported by each of these keywords may vary depending on the version of our package: + + ``"linear"`` + Selects over linear models regularized by L1 or L2 norm + + ``"poly"`` + Selects over regularized linear models with polynomial features of different degrees + + ``"forest"`` + Selects over random forest models + + ``"gbf"`` + Selects over gradient boosting models + + ``"nnet"`` + Selects over neural network models + + ``"automl"`` + Selects over all of the above (note that this will be potentially time consuming) + + * A list of any of the above + + * An implementation of :class:`.ModelSelector`, which is a class that supports a two-stage model selection and fitting process + (this is used internally by our library and is not generally intended to be used directly by end users). + +Most subclasses also use the string `"auto"`` as a special default value to automatically select a model from an appropriate smaller subset of models than would be generated by "automl". diff --git a/doc/spec/spec.rst b/doc/spec/spec.rst index a85af4e08..38917c742 100644 --- a/doc/spec/spec.rst +++ b/doc/spec/spec.rst @@ -12,6 +12,7 @@ EconML User Guide estimation_iv estimation_dynamic inference + model_selection interpretability federated_learning references diff --git a/econml/_ortho_learner.py b/econml/_ortho_learner.py index 9423617c2..4db1e2921 100644 --- a/econml/_ortho_learner.py +++ b/econml/_ortho_learner.py @@ -28,6 +28,7 @@ class in this module implements the general logic in a very versatile way from collections import namedtuple from warnings import warn from abc import abstractmethod +from typing import List, Union import inspect from collections import defaultdict import re @@ -85,23 +86,17 @@ def _fit_fold(model, train_idxs, test_idxs, calculate_scores, args, kwargs): -Tuple containing: nuisance_temp (tuple): Predictions or values of interest from the model. fitted_model: The fitted model after training. - test_idxs (array-like): Indices of the test data. score_temp (tuple or None): Scores calculated after fitting if `calculate_scores` is True, otherwise None. """ model = clone(model, safe=False) - if len(np.intersect1d(train_idxs, test_idxs)) > 0: - raise AttributeError( - "Invalid crossfitting fold structure. Train and test indices of each fold must be disjoint. 
{},{}".format( - train_idxs, test_idxs)) - args_train = tuple(var[train_idxs] if var is not None else None for var in args) args_test = tuple(var[test_idxs] if var is not None else None for var in args) kwargs_train = {key: var[train_idxs] for key, var in kwargs.items()} kwargs_test = {key: var[test_idxs] for key, var in kwargs.items()} - model.train(False, *args_train, **kwargs_train) + model.train(False, None, *args_train, **kwargs_train) nuisance_temp = model.predict(*args_test, **kwargs_test) if not isinstance(nuisance_temp, tuple): @@ -113,21 +108,22 @@ def _fit_fold(model, train_idxs, test_idxs, calculate_scores, args, kwargs): if not isinstance(score_temp, tuple): score_temp = (score_temp,) - return nuisance_temp, model, test_idxs, (score_temp if calculate_scores else None) + return nuisance_temp, model, (score_temp if calculate_scores else None) -def _crossfit(model: ModelSelector, folds, use_ray, ray_remote_fun_option, *args, **kwargs): +def _crossfit(models: Union[ModelSelector, List[ModelSelector]], folds, use_ray, ray_remote_fun_option, + *args, **kwargs): """ General crossfit based calculation of nuisance parameters. Parameters ---------- - model : ModelSelector - An object that has train and predict methods. - The train method must take an 'is_selecting' argument first, and then - accept positional arguments `args` and keyword arguments `kwargs`; the predict method - just takes those `args` and `kwargs`. The train - method selects or estimates a model of the nuisance function, based on the input + models : ModelSelector or List[ModelSelector] + One or more objects that have train and predict methods. + The train method must take an 'is_selecting' argument first, a set of folds second + (which will be None when not selecting) and then accept positional arguments `args` + and keyword arguments `kwargs`; the predict method just takes those `args` and `kwargs`. + The train method selects or estimates a model of the nuisance function, based on the input data to fit. Predict evaluates the fitted nuisance function on the input data to predict. 
folds : list of tuple or None @@ -179,11 +175,14 @@ def _crossfit(model: ModelSelector, folds, use_ray, ray_remote_fun_option, *args class Wrapper: def __init__(self, model): self._model = model - def train(self, is_selecting, X, y, W=None): + def train(self, is_selecting, folds, X, y, W=None): self._model.fit(X, y) return self def predict(self, X, y, W=None): return self._model.predict(X) + @property + def needs_fit(self): + return False np.random.seed(123) X = np.random.normal(size=(5000, 3)) y = X[:, 0] + np.random.normal(size=(5000,)) @@ -206,65 +205,109 @@ def predict(self, X, y, W=None): model_list = [] kwargs = filter_none_kwargs(**kwargs) - model.train(True, *args, **kwargs) - - calculate_scores = hasattr(model, 'score') - # remove None arguments - - if folds is None: # skip crossfitting - model_list.append(clone(model, safe=False)) - model_list[0].train(True, *args, **kwargs) - model_list[0].train(False, *args, **kwargs) # fit the selected model - nuisances = model_list[0].predict(*args, **kwargs) - scores = model_list[0].score(*args, **kwargs) if calculate_scores else None - - if not isinstance(nuisances, tuple): - nuisances = (nuisances,) - if not isinstance(scores, tuple): - scores = (scores,) - - # scores entries should be lists of scores, so make each entry a singleton list - scores = tuple([s] for s in scores) - - first_arr = args[0] if args else kwargs.items()[0][1] - return nuisances, model_list, np.arange(first_arr.shape[0]), scores - - folds = list(folds) - fold_refs = [] - if use_ray: - # Adding the kwargs to ray object store to be used by remote functions for each fold to avoid IO overhead - ray_args = ray.put(kwargs) + + n = (args[0] if args else kwargs.items()[0][1]).shape[0] + + # fully materialize folds so that they can be reused across models + # and precompute fitted indices so that we fail fast if there's an issue with them + if folds is not None: + folds = list(folds) + fitted_inds = [] for idx, (train_idxs, test_idxs) in enumerate(folds): - fold_refs.append( - ray.remote(_fit_fold).options(**ray_remote_fun_option).remote(model, train_idxs, test_idxs, - calculate_scores, args, ray_args)) - fitted_inds = [] - for idx, (train_idxs, test_idxs) in enumerate(folds): - if use_ray: - nuisance_temp, model, test_idxs, score_temp = ray.get(fold_refs[idx]) + if len(np.intersect1d(train_idxs, test_idxs)) > 0: + raise AttributeError(f"Invalid crossfitting fold structure. Train and test indices of fold {idx+1} " + f"are not disjoint: {train_idxs}, {test_idxs}") + common_idxs = np.intersect1d(fitted_inds, test_idxs) + if len(common_idxs) > 0: + raise AttributeError(f"Invalid crossfitting fold structure. The indexes {common_idxs} in fold {idx+1} " + f"have appeared in previous folds") + fitted_inds = np.concatenate((fitted_inds, test_idxs)) + fitted_inds = np.sort(fitted_inds.astype(int)) + else: + fold_vals = [(np.arange(n), np.arange(n))] + fitted_inds = np.arange(n) + + accumulated_nuisances = () + # NOTE: if any model is missing scores we will just return None even if another model + # has scores. 
this is because we don't know how many scores are missing + # for the models that are missing them, so we don't know how to pad the array + calculate_scores = True + accumulated_scores = () + + # for convenience we allos a single model to be passed in lieu of a singleton list + # in that case, we will also unwrap the model output + unwrap_model_output = False + if not isinstance(models, list): + unwrap_model_output = True + models = [models] + + for model in models: + # when there is more than one model, nuisances from previous models + # come first as positional arguments + accumulated_args = accumulated_nuisances + args + if model.needs_fit: + model.train(True, fold_vals if folds is None else folds, *accumulated_args, **kwargs) + + calculate_scores &= hasattr(model, 'score') + + model_list.append([]) # add a new empty list of clones for this model + + if folds is None: # skip crossfitting + model_list[-1].append(clone(model, safe=False)) + model_list[-1][0].train(False, None, *accumulated_args, **kwargs) # fit the selected model + nuisances = model_list[-1][0].predict(*accumulated_args, **kwargs) + if not isinstance(nuisances, tuple): + nuisances = (nuisances,) + + if calculate_scores: + scores = model_list[-1][0].score(*accumulated_args, **kwargs) + if not isinstance(scores, tuple): + scores = (scores,) + # scores entries should be lists of scores, so make each entry a singleton list + scores = tuple([s] for s in scores) + else: - nuisance_temp, model, test_idxs, score_temp = _fit_fold(model, train_idxs, test_idxs, - calculate_scores, args, kwargs) + fold_refs = [] + if use_ray: + # Adding the kwargs to ray object store to be used by remote functions + # for each fold to avoid IO overhead + ray_args = ray.put(kwargs) + for idx, (train_idxs, test_idxs) in enumerate(folds): + fold_refs.append( + ray.remote(_fit_fold).options(**ray_remote_fun_option).remote(model, train_idxs, test_idxs, + calculate_scores, + accumulated_args, ray_args)) + for idx, (train_idxs, test_idxs) in enumerate(folds): + if use_ray: + nuisance_temp, model_out, score_temp = ray.get(fold_refs[idx]) + else: + nuisance_temp, model_out, score_temp = _fit_fold(model, train_idxs, test_idxs, + calculate_scores, accumulated_args, kwargs) - if len(np.intersect1d(fitted_inds, test_idxs)) > 0: - raise AttributeError( - "Invalid crossfitting fold structure. 
The same index appears in two test folds.") - fitted_inds = np.concatenate((fitted_inds, test_idxs)) - if idx == 0: - nuisances = tuple([np.full((args[0].shape[0],) + nuis.shape[1:], np.nan) for nuis in nuisance_temp]) + if idx == 0: + nuisances = tuple([np.full((n,) + nuis.shape[1:], np.nan) + for nuis in nuisance_temp]) - for it, nuis in enumerate(nuisance_temp): - nuisances[it][test_idxs] = nuis + for it, nuis in enumerate(nuisance_temp): + nuisances[it][test_idxs] = nuis - if calculate_scores: - if idx == 0: - scores = tuple([] for _ in score_temp) - for it, score in enumerate(score_temp): - scores[it].append(score) + if calculate_scores: + if idx == 0: + scores = tuple([] for _ in score_temp) + for it, score in enumerate(score_temp): + scores[it].append(score) - model_list.append(model) + model_list[-1].append(model_out) - return nuisances, model_list, np.sort(fitted_inds.astype(int)), (scores if calculate_scores else None) + accumulated_nuisances += nuisances + if calculate_scores: + accumulated_scores += scores + else: + accumulated_scores = None + + if unwrap_model_output: + model_list = model_list[0] + return accumulated_nuisances, model_list, fitted_inds, accumulated_scores CachedValues = namedtuple('CachedValues', ['nuisances', @@ -403,12 +446,15 @@ class ModelNuisance: def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def train(self, is_selecting, Y, T, W=None): + def train(self, is_selecting, folds, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self def predict(self, Y, T, W=None): return Y - self._model_y.predict(W), T - self._model_t.predict(W) + @property + def needs_fit(self): + return False class ModelFinal: def __init__(self): return @@ -457,12 +503,15 @@ class ModelNuisance: def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def train(self, is_selecting, Y, T, W=None): + def train(self, is_selecting, folds, Y, T, W=None): self._model_t.fit(W, np.matmul(T, np.arange(1, T.shape[1]+1))) self._model_y.fit(W, Y) return self def predict(self, Y, T, W=None): return Y - self._model_y.predict(W), T - self._model_t.predict_proba(W)[:, 1:] + @property + def needs_fit(self): + return False class ModelFinal: def __init__(self): return @@ -554,11 +603,11 @@ def _gen_ortho_learner_model_nuisance(self): Returns ------- - model_nuisance: selector - The selector for fitting the nuisance function. The returned estimator must implement + model_nuisance: list of selector + The selector(s) for fitting the nuisance function. The returned estimators must implement `train` and `predict` methods that both have signatures:: - model_nuisance.train(is_selecting, Y, T, X=X, W=W, Z=Z, + model_nuisance.train(is_selecting, folds, Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight) model_nuisance.predict(Y, T, X=X, W=W, Z=Z, sample_weight=sample_weight) @@ -805,7 +854,7 @@ def _fit_nuisances(Y, T, X, W, Z, sample_weight, groups): for nuisance_mc_variants in zip(*all_nuisances)) else: raise ValueError( - "Parameter `mc_agg` must be one of {'mean', 'median'}. Got {}".format(self.mc_agg)) + f"Parameter `mc_agg` must be one of {{'mean', 'median'}}. 
Got {self.mc_agg}") Y, T, X, W, Z, sample_weight, freq_weight, sample_var = (self._subinds_check_none(arr, fitted_inds) for arr in (Y, T, X, W, Z, sample_weight, @@ -1035,26 +1084,45 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): if self.discrete_outcome: Y = self.outcome_transformer.transform(Y).reshape(-1, 1) n_iters = len(self._models_nuisance) - n_splits = len(self._models_nuisance[0]) + # self._models_nuisance will be a list of lists or a list of list of lists + # so we use self._ortho_learner_model_nuisance to determine the nesting level + expand_list = not isinstance(self._ortho_learner_model_nuisance, list) + if expand_list: + n_selectors = 1 + n_splits = len(self._models_nuisance[0]) + else: + n_selectors = len(self._models_nuisance[0]) + n_splits = len(self._models_nuisance[0][0]) - # for each mc iteration - for i, models_nuisances in enumerate(self._models_nuisance): - # for each model under cross fit setting - for j, mdl in enumerate(models_nuisances): - nuisance_temp = mdl.predict(Y, T, **filter_none_kwargs(X=X, W=W, Z=Z, groups=groups)) - if not isinstance(nuisance_temp, tuple): - nuisance_temp = (nuisance_temp,) + kwargs = filter_none_kwargs(X=X, W=W, Z=Z, groups=groups) - if i == 0 and j == 0: - nuisances = [np.zeros((n_iters * n_splits,) + nuis.shape) for nuis in nuisance_temp] + accumulated_nuisances = [] - for it, nuis in enumerate(nuisance_temp): - nuisances[it][j * n_iters + i] = nuis + for sel in range(n_selectors): + accumulated_args = tuple(accumulated_nuisances) + (Y, T) + # for each mc iteration + for i, models_nuisances in enumerate(self._models_nuisance): + if expand_list: + models_nuisances = [models_nuisances] + + # for each model under cross fit setting + for j, mdl in enumerate(models_nuisances[sel]): + nuisance_temp = mdl.predict(*accumulated_args, **kwargs) + if not isinstance(nuisance_temp, tuple): + nuisance_temp = (nuisance_temp,) + + if i == 0 and j == 0: + nuisances = [np.zeros((n_iters * n_splits,) + nuis.shape) for nuis in nuisance_temp] + + for it, nuis in enumerate(nuisance_temp): + nuisances[it][j * n_iters + i] = nuis + + for it in range(len(nuisances)): + nuisances[it] = np.mean(nuisances[it], axis=0) - for it in range(len(nuisances)): - nuisances[it] = np.mean(nuisances[it], axis=0) + accumulated_nuisances += nuisances - return self._ortho_learner_model_final.score(Y, T, nuisances=nuisances, + return self._ortho_learner_model_final.score(Y, T, nuisances=accumulated_nuisances, **filter_none_kwargs(X=X, W=W, Z=Z, sample_weight=sample_weight, groups=groups)) diff --git a/econml/dml/_rlearner.py b/econml/dml/_rlearner.py index b4c346b26..1d6318481 100644 --- a/econml/dml/_rlearner.py +++ b/econml/dml/_rlearner.py @@ -1,484 +1,493 @@ -# Copyright (c) PyWhy contributors. All rights reserved. -# Licensed under the MIT License. - -""" - -The R Learner is an approach for estimating flexible non-parametric models -of conditional average treatment effects in the setting with no unobserved confounders. -The method is based on the idea of Neyman orthogonality and estimates a CATE -whose mean squared error is robust to the estimation errors of auxiliary submodels -that also need to be estimated from data: - - 1) the outcome or regression model - 2) the treatment or propensity or policy or logging policy model - -References ----------- - -Xinkun Nie, Stefan Wager (2017). Quasi-Oracle Estimation of Heterogeneous Treatment Effects. - https://arxiv.org/abs/1712.04912 - -Dylan Foster, Vasilis Syrgkanis (2019). 
Orthogonal Statistical Learning. - ACM Conference on Learning Theory. https://arxiv.org/abs/1901.09036 - -Chernozhukov et al. (2017). Double/debiased machine learning for treatment and structural parameters. - The Econometrics Journal. https://arxiv.org/abs/1608.00060 -""" - -from abc import abstractmethod -import numpy as np -import copy -from warnings import warn - -from ..sklearn_extensions.model_selection import ModelSelector -from ..utilities import (shape, reshape, ndim, hstack, filter_none_kwargs, _deprecate_positional) -from sklearn.linear_model import LinearRegression -from sklearn.base import clone -from .._ortho_learner import _OrthoLearner - - -class _ModelNuisance(ModelSelector): - """ - Nuisance model fits the model_y and model_t at fit time and at predict time - calculates the residual Y and residual T based on the fitted models and returns - the residuals as two nuisance parameters. - """ - - def __init__(self, model_y: ModelSelector, model_t: ModelSelector): - self._model_y = model_y - self._model_t = model_t - - def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - assert Z is None, "Cannot accept instrument!" - self._model_t.train(is_selecting, X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) - self._model_y.train(is_selecting, X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) - return self - - def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - # note that groups are not passed to score because they are only used for fitting - T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight)) - Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight)) - return Y_score, T_score - - def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - Y_pred = self._model_y.predict(X, W) - T_pred = self._model_t.predict(X, W) - if (X is None) and (W is None): # In this case predict above returns a single row - Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) - T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1)) - Y_res = Y - Y_pred.reshape(Y.shape) - T_res = T - T_pred.reshape(T.shape) - return Y_res, T_res - - -class _ModelFinal: - """ - Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient - that depends on X, i.e. - - .. math :: - Y - E[Y | X, W] = \\theta(X) \\cdot (T - E[T | X, W]) + \\epsilon - - and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final - residual on residual regression. 
- """ - - def __init__(self, model_final): - self._model_final = model_final - - def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, - sample_weight=None, freq_weight=None, sample_var=None, groups=None): - Y_res, T_res = nuisances - self._model_final.fit(X, T, T_res, Y_res, sample_weight=sample_weight, - freq_weight=freq_weight, sample_var=sample_var) - return self - - def predict(self, X=None): - return self._model_final.predict(X) - - def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, groups=None): - Y_res, T_res = nuisances - if Y_res.ndim == 1: - Y_res = Y_res.reshape((-1, 1)) - if T_res.ndim == 1: - T_res = T_res.reshape((-1, 1)) - effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) - Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) - if sample_weight is not None: - return np.mean(np.average((Y_res - Y_res_pred) ** 2, weights=sample_weight, axis=0)) - else: - return np.mean((Y_res - Y_res_pred) ** 2) - - -class _RLearner(_OrthoLearner): - """ - Base class for CATE learners that residualize treatment and outcome and run residual on residual regression. - The estimator is a special of an :class:`._OrthoLearner` estimator, - so it follows the two - stage process, where a set of nuisance functions are estimated in the first stage in a crossfitting - manner and a final stage estimates the CATE model. See the documentation of - :class:`._OrthoLearner` for a description of this two stage process. - - In this estimator, the CATE is estimated by using the following estimating equations: - - .. math :: - Y - \\E[Y | X, W] = \\Theta(X) \\cdot (T - \\E[T | X, W]) + \\epsilon - - Thus if we estimate the nuisance functions :math:`q(X, W) = \\E[Y | X, W]` and - :math:`f(X, W)=\\E[T | X, W]` in the first stage, we can estimate the final stage cate for each - treatment t, by running a regression, minimizing the residual on residual square loss: - - .. math :: - \\hat{\\theta} = \\arg\\min_{\\Theta}\ - \\E_n\\left[ (\\tilde{Y} - \\Theta(X) \\cdot \\tilde{T})^2 \\right] - - Where :math:`\\tilde{Y}=Y - \\E[Y | X, W]` and :math:`\\tilde{T}=T-\\E[T | X, W]` denotes the - residual outcome and residual treatment. - - Parameters - ---------- - discrete_outcome: bool - Whether the outcome should be treated as binary - - discrete_treatment: bool - Whether the treatment values should be treated as categorical, rather than continuous, quantities - - treatment_featurizer : :term:`transformer` or None - Must support fit_transform and transform. Used to create composite treatment in the final CATE regression. - The final CATE will be trained on the outcome of featurizer.fit_transform(T). - If featurizer=None, then CATE is trained on T. - - categories: 'auto' or list - The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). - The first category will be treated as the control treatment. - - cv: int, cross-validation generator or an iterable - Determines the cross-validation splitting strategy. - Possible inputs for cv are: - - - None, to use the default 3-fold cross-validation, - - integer, to specify the number of folds. - - :term:`CV splitter` - - An iterable yielding (train, test) splits as arrays of indices. - - For integer/None inputs, if the treatment is discrete - :class:`~sklearn.model_selection.StratifiedKFold` is used, else, - :class:`~sklearn.model_selection.KFold` is used - (with a random shuffle in either case). 
- - Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all - W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. - - random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None - If int, random_state is the seed used by the random number generator; - If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; - If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used - by :mod:`np.random`. - - mc_iters: int, optional - The number of times to rerun the first stage models to reduce the variance of the nuisances. - - mc_agg: {'mean', 'median'}, default 'mean' - How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of - cross-fitting. - - allow_missing: bool - Whether to allow missing values in X, W. If True, will need to supply nuisance models that can handle - missing values. - - use_ray: bool, default False - Whether to use Ray to speed up the cross-fitting step. - - ray_remote_func_options : dict, optional - Options to pass to ray.remote function decorator. - see more at https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote.html - - Examples - -------- - - The example code below implements a very simple version of the double machine learning - method on top of the :class:`._RLearner` class, for expository purposes. - For a more elaborate implementation of a Double Machine Learning child class of the class - checkout :class:`.DML` and its child classes: - - .. testcode:: - - import numpy as np - from sklearn.linear_model import LinearRegression - from econml.dml._rlearner import _RLearner - from econml.sklearn_extensions.model_selection import SingleModelSelector - from sklearn.base import clone - class ModelFirst: - def __init__(self, model): - self._model = clone(model, safe=False) - def fit(self, X, W, Y, sample_weight=None): - self._model.fit(np.hstack([X, W]), Y) - return self - def predict(self, X, W): - return self._model.predict(np.hstack([X, W])) - class ModelSelector(SingleModelSelector): - def __init__(self, model): - self._model = ModelFirst(model) - def train(self, is_selecting, X, W, Y, sample_weight=None): - self._model.fit(X, W, Y, sample_weight=sample_weight) - return self - @property - def best_model(self): - return self._model - @property - def best_score(self): - return 0 - class ModelFinal: - def fit(self, X, T, T_res, Y_res, sample_weight=None, freq_weight=None, sample_var=None): - self.model = LinearRegression(fit_intercept=False).fit(X * T_res.reshape(-1, 1), - Y_res) - return self - def predict(self, X): - return self.model.predict(X) - class RLearner(_RLearner): - def _gen_model_y(self): - return ModelSelector(LinearRegression()) - def _gen_model_t(self): - return ModelSelector(LinearRegression()) - def _gen_rlearner_model_final(self): - return ModelFinal() - np.random.seed(123) - X = np.random.normal(size=(1000, 3)) - y = X[:, 0] + X[:, 1] + np.random.normal(0, 0.01, size=(1000,)) - est = RLearner(cv=2, discrete_outcome=False, discrete_treatment=False, - treatment_featurizer=None, categories='auto', random_state=None) - est.fit(y, X[:, 0], X=np.ones((X.shape[0], 1)), W=X[:, 1:]) - - >>> est.const_marginal_effect(np.ones((1,1))) - array([0.999631...]) - >>> est.effect(np.ones((1,1)), T0=0, T1=10) - array([9.996314...]) - >>> est.score(y, X[:, 0], X=np.ones((X.shape[0], 1)), W=X[:, 1:]) - 9.73638006...e-05 - >>> est.rlearner_model_final_.model - 
LinearRegression(fit_intercept=False) - >>> est.rlearner_model_final_.model.coef_ - array([0.999631...]) - >>> est.score_ - 9.82623204...e-05 - >>> [mdl._model for mdls in est.models_y for mdl in mdls] - [LinearRegression(), LinearRegression()] - >>> [mdl._model for mdls in est.models_t for mdl in mdls] - [LinearRegression(), LinearRegression()] - - Attributes - ---------- - models_y: nested list of objects of type(model_y) - A nested list of instances of the model_y object. Number of sublist equals to number of monte carlo - iterations, each element in the sublist corresponds to a crossfitting - fold and is the model instance that was fitted for that training fold. - models_t: nested list of objects of type(model_t) - A nested list of instances of the model_t object. Number of sublist equals to number of monte carlo - iterations, each element in the sublist corresponds to a crossfitting - fold and is the model instance that was fitted for that training fold. - rlearner_model_final_ : object of type(model_final) - An instance of the model_final object that was fitted after calling fit. - score_ : float - The MSE in the final residual on residual regression - nuisance_scores_y : nested list of float - The out-of-sample scores for each outcome model - nuisance_scores_t : nested list of float - The out-of-sample scores for each treatment model - - .. math:: - \\frac{1}{n} \\sum_{i=1}^n (Y_i - \\hat{E}[Y|X_i, W_i]\ - - \\hat{\\theta}(X_i)\\cdot (T_i - \\hat{E}[T|X_i, W_i]))^2 - - If `sample_weight` is not None at fit time, then a weighted average is returned. If the outcome Y - is multidimensional, then the average of the MSEs for each dimension of Y is returned. - """ - - def __init__(self, - *, - discrete_outcome, - discrete_treatment, - treatment_featurizer, - categories, - cv, - random_state, - mc_iters=None, - mc_agg='mean', - allow_missing=False, - use_ray=False, - ray_remote_func_options=None): - super().__init__(discrete_outcome=discrete_outcome, - discrete_treatment=discrete_treatment, - treatment_featurizer=treatment_featurizer, - discrete_instrument=False, # no instrument, so doesn't matter - categories=categories, - cv=cv, - random_state=random_state, - mc_iters=mc_iters, - mc_agg=mc_agg, - allow_missing=allow_missing, - use_ray=use_ray, - ray_remote_func_options=ray_remote_func_options) - - @abstractmethod - def _gen_model_y(self): - """ - Returns - ------- - model_y: selector for the estimator of E[Y | X, W] - The estimator for fitting the response to the features and controls. Must implement - `fit` and `predict` methods. Unlike sklearn estimators both methods must - take an extra second argument (the controls), i.e. :: - - model_y.fit(X, W, Y, sample_weight=sample_weight) - model_y.predict(X, W) - """ - raise NotImplementedError("Abstract method") - - @abstractmethod - def _gen_model_t(self): - """ - Returns - ------- - model_t: selector for the estimator of E[T | X, W] - The estimator for fitting the treatment to the features and controls. Must implement - `fit` and `predict` methods. Unlike sklearn estimators both methods must - take an extra second argument (the controls), i.e. :: - - model_t.fit(X, W, T, sample_weight=sample_weight) - model_t.predict(X, W) - """ - raise NotImplementedError("Abstract method") - - @abstractmethod - def _gen_rlearner_model_final(self): - """ - Returns - ------- - model_final: estimator for fitting the response residuals to the features and treatment residuals - Must implement `fit` and `predict` methods. 
Unlike sklearn estimators the fit methods must - take an extra second argument (the treatment residuals). Predict, on the other hand, - should just take the features and return the constant marginal effect. More, concretely:: - - model_final.fit(X, T_res, Y_res, - sample_weight=sample_weight, freq_weight=freq_weight, sample_var=sample_var) - model_final.predict(X) - """ - raise NotImplementedError("Abstract method") - - def _gen_ortho_learner_model_nuisance(self): - return _ModelNuisance(self._gen_model_y(), self._gen_model_t()) - - def _gen_ortho_learner_model_final(self): - return _ModelFinal(self._gen_rlearner_model_final()) - - def fit(self, Y, T, *, X=None, W=None, sample_weight=None, freq_weight=None, sample_var=None, groups=None, - cache_values=False, inference=None): - """ - Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. - - Parameters - ---------- - Y: (n, d_y) matrix or vector of length n - Outcomes for each sample - T: (n, d_t) matrix or vector of length n - Treatments for each sample - X:(n, d_x) matrix, optional - Features for each sample - W:(n, d_w) matrix, optional - Controls for each sample - sample_weight : (n,) array_like, optional - Individual weights for each sample. If None, it assumes equal weight. - freq_weight: (n, ) array_like of int, optional - Weight for the observation. Observation i is treated as the mean - outcome of freq_weight[i] independent observations. - When ``sample_var`` is not None, this should be provided. - sample_var : {(n,), (n, d_y)} nd array_like, optional - Variance of the outcome(s) of the original freq_weight[i] observations that were used to - compute the mean outcome represented by observation i. - groups: (n,) vector, optional - All rows corresponding to the same group will be kept together during splitting. - If groups is not None, the `cv` argument passed to this class's initializer - must support a 'groups' argument to its split method. - cache_values: bool, default False - Whether to cache inputs and first stage results, which will allow refitting a different final model - inference: str, :class:`.Inference` instance, or None - Method for performing inference. This estimator supports 'bootstrap' - (or an instance of:class:`.BootstrapInference`). - - Returns - ------- - self: _RLearner instance - """ - # Replacing fit from _OrthoLearner, to enforce Z=None and improve the docstring - return super().fit(Y, T, X=X, W=W, - sample_weight=sample_weight, freq_weight=freq_weight, sample_var=sample_var, groups=groups, - cache_values=cache_values, - inference=inference) - - def score(self, Y, T, X=None, W=None, sample_weight=None): - """ - Score the fitted CATE model on a new data set. Generates nuisance parameters - for the new data set based on the fitted residual nuisance models created at fit time. - It uses the mean prediction of the models fitted by the different crossfit folds. - Then calculates the MSE of the final residual Y on residual T regression. - - If model_final does not have a score method, then it raises an :exc:`.AttributeError` - - Parameters - ---------- - Y: (n, d_y) matrix or vector of length n - Outcomes for each sample - T: (n, d_t) matrix or vector of length n - Treatments for each sample - X:(n, d_x) matrix, optional - Features for each sample - W:(n, d_w) matrix, optional - Controls for each sample - sample_weight:(n,) vector, optional - Weights for each samples - - - Returns - ------- - score: float - The MSE of the final CATE model on the new data. 
- """ - # Replacing score from _OrthoLearner, to enforce Z=None and improve the docstring - return super().score(Y, T, X=X, W=W, sample_weight=sample_weight) - - @property - def rlearner_model_final_(self): - # NOTE: important to get parent's wrapped copy so that - # after training wrapped featurizer is also trained, etc. - return self.ortho_learner_model_final_._model_final - - @property - def models_y(self): - return [[mdl._model_y.best_model for mdl in mdls] for mdls in super().models_nuisance_] - - @property - def models_t(self): - return [[mdl._model_t.best_model for mdl in mdls] for mdls in super().models_nuisance_] - - @property - def nuisance_scores_y(self): - return self.nuisance_scores_[0] - - @property - def nuisance_scores_t(self): - return self.nuisance_scores_[1] - - @property - def residuals_(self): - """ - A tuple (y_res, T_res, X, W), of the residuals from the first stage estimation - along with the associated X and W. Samples are not guaranteed to be in the same - order as the input order. - """ - if not hasattr(self, '_cached_values'): - raise AttributeError("Estimator is not fitted yet!") - if self._cached_values is None: - raise AttributeError("`fit` was called with `cache_values=False`. " - "Set to `True` to enable residual storage.") - Y_res, T_res = self._cached_values.nuisances - return Y_res, T_res, self._cached_values.X, self._cached_values.W +# Copyright (c) PyWhy contributors. All rights reserved. +# Licensed under the MIT License. + +""" + +The R Learner is an approach for estimating flexible non-parametric models +of conditional average treatment effects in the setting with no unobserved confounders. +The method is based on the idea of Neyman orthogonality and estimates a CATE +whose mean squared error is robust to the estimation errors of auxiliary submodels +that also need to be estimated from data: + + 1) the outcome or regression model + 2) the treatment or propensity or policy or logging policy model + +References +---------- + +Xinkun Nie, Stefan Wager (2017). Quasi-Oracle Estimation of Heterogeneous Treatment Effects. + https://arxiv.org/abs/1712.04912 + +Dylan Foster, Vasilis Syrgkanis (2019). Orthogonal Statistical Learning. + ACM Conference on Learning Theory. https://arxiv.org/abs/1901.09036 + +Chernozhukov et al. (2017). Double/debiased machine learning for treatment and structural parameters. + The Econometrics Journal. https://arxiv.org/abs/1608.00060 +""" + +from abc import abstractmethod +import numpy as np +import copy +from warnings import warn + +from ..sklearn_extensions.model_selection import ModelSelector +from ..utilities import (shape, reshape, ndim, hstack, filter_none_kwargs, _deprecate_positional) +from sklearn.linear_model import LinearRegression +from sklearn.base import clone +from .._ortho_learner import _OrthoLearner + + +class _ModelNuisance(ModelSelector): + """ + Nuisance model fits the model_y and model_t at fit time and at predict time + calculates the residual Y and residual T based on the fitted models and returns + the residuals as two nuisance parameters. + """ + + def __init__(self, model_y: ModelSelector, model_t: ModelSelector): + self._model_y = model_y + self._model_t = model_t + + def train(self, is_selecting, folds, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + assert Z is None, "Cannot accept instrument!" 
+ self._model_t.train(is_selecting, folds, X, W, T, ** + filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + self._model_y.train(is_selecting, folds, X, W, Y, ** + filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + return self + + def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + # note that groups are not passed to score because they are only used for fitting + T_score = self._model_t.score(X, W, T, **filter_none_kwargs(sample_weight=sample_weight)) + Y_score = self._model_y.score(X, W, Y, **filter_none_kwargs(sample_weight=sample_weight)) + return Y_score, T_score + + def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + Y_pred = self._model_y.predict(X, W) + T_pred = self._model_t.predict(X, W) + if (X is None) and (W is None): # In this case predict above returns a single row + Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) + T_pred = np.tile(T_pred.reshape(1, -1), (T.shape[0], 1)) + Y_res = Y - Y_pred.reshape(Y.shape) + T_res = T - T_pred.reshape(T.shape) + return Y_res, T_res + + @property + def needs_fit(self): + return self._model_y.needs_fit or self._model_t.needs_fit + + +class _ModelFinal: + """ + Final model at fit time, fits a residual on residual regression with a heterogeneous coefficient + that depends on X, i.e. + + .. math :: + Y - E[Y | X, W] = \\theta(X) \\cdot (T - E[T | X, W]) + \\epsilon + + and at predict time returns :math:`\\theta(X)`. The score method returns the MSE of this final + residual on residual regression. + """ + + def __init__(self, model_final): + self._model_final = model_final + + def fit(self, Y, T, X=None, W=None, Z=None, nuisances=None, + sample_weight=None, freq_weight=None, sample_var=None, groups=None): + Y_res, T_res = nuisances + self._model_final.fit(X, T, T_res, Y_res, sample_weight=sample_weight, + freq_weight=freq_weight, sample_var=sample_var) + return self + + def predict(self, X=None): + return self._model_final.predict(X) + + def score(self, Y, T, X=None, W=None, Z=None, nuisances=None, sample_weight=None, groups=None): + Y_res, T_res = nuisances + if Y_res.ndim == 1: + Y_res = Y_res.reshape((-1, 1)) + if T_res.ndim == 1: + T_res = T_res.reshape((-1, 1)) + effects = self._model_final.predict(X).reshape((-1, Y_res.shape[1], T_res.shape[1])) + Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape) + if sample_weight is not None: + return np.mean(np.average((Y_res - Y_res_pred) ** 2, weights=sample_weight, axis=0)) + else: + return np.mean((Y_res - Y_res_pred) ** 2) + + +class _RLearner(_OrthoLearner): + """ + Base class for CATE learners that residualize treatment and outcome and run residual on residual regression. + The estimator is a special of an :class:`._OrthoLearner` estimator, + so it follows the two + stage process, where a set of nuisance functions are estimated in the first stage in a crossfitting + manner and a final stage estimates the CATE model. See the documentation of + :class:`._OrthoLearner` for a description of this two stage process. + + In this estimator, the CATE is estimated by using the following estimating equations: + + .. math :: + Y - \\E[Y | X, W] = \\Theta(X) \\cdot (T - \\E[T | X, W]) + \\epsilon + + Thus if we estimate the nuisance functions :math:`q(X, W) = \\E[Y | X, W]` and + :math:`f(X, W)=\\E[T | X, W]` in the first stage, we can estimate the final stage cate for each + treatment t, by running a regression, minimizing the residual on residual square loss: + + .. 
math :: + \\hat{\\theta} = \\arg\\min_{\\Theta}\ + \\E_n\\left[ (\\tilde{Y} - \\Theta(X) \\cdot \\tilde{T})^2 \\right] + + Where :math:`\\tilde{Y}=Y - \\E[Y | X, W]` and :math:`\\tilde{T}=T-\\E[T | X, W]` denotes the + residual outcome and residual treatment. + + Parameters + ---------- + discrete_outcome: bool + Whether the outcome should be treated as binary + + discrete_treatment: bool + Whether the treatment values should be treated as categorical, rather than continuous, quantities + + treatment_featurizer : :term:`transformer` or None + Must support fit_transform and transform. Used to create composite treatment in the final CATE regression. + The final CATE will be trained on the outcome of featurizer.fit_transform(T). + If featurizer=None, then CATE is trained on T. + + categories: 'auto' or list + The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values). + The first category will be treated as the control treatment. + + cv: int, cross-validation generator or an iterable + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 3-fold cross-validation, + - integer, to specify the number of folds. + - :term:`CV splitter` + - An iterable yielding (train, test) splits as arrays of indices. + + For integer/None inputs, if the treatment is discrete + :class:`~sklearn.model_selection.StratifiedKFold` is used, else, + :class:`~sklearn.model_selection.KFold` is used + (with a random shuffle in either case). + + Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all + W, X are None, then we call `split(ones((T.shape[0], 1)), T)`. + + random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None + If int, random_state is the seed used by the random number generator; + If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator; + If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used + by :mod:`np.random`. + + mc_iters: int, optional + The number of times to rerun the first stage models to reduce the variance of the nuisances. + + mc_agg: {'mean', 'median'}, default 'mean' + How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of + cross-fitting. + + allow_missing: bool + Whether to allow missing values in X, W. If True, will need to supply nuisance models that can handle + missing values. + + use_ray: bool, default False + Whether to use Ray to speed up the cross-fitting step. + + ray_remote_func_options : dict, optional + Options to pass to ray.remote function decorator. + see more at https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote.html + + Examples + -------- + + The example code below implements a very simple version of the double machine learning + method on top of the :class:`._RLearner` class, for expository purposes. + For a more elaborate implementation of a Double Machine Learning child class of the class + checkout :class:`.DML` and its child classes: + + .. 
testcode:: + + import numpy as np + from sklearn.linear_model import LinearRegression + from econml.dml._rlearner import _RLearner + from econml.sklearn_extensions.model_selection import SingleModelSelector + from sklearn.base import clone + class ModelFirst: + def __init__(self, model): + self._model = clone(model, safe=False) + def fit(self, X, W, Y, sample_weight=None): + self._model.fit(np.hstack([X, W]), Y) + return self + def predict(self, X, W): + return self._model.predict(np.hstack([X, W])) + class ModelSelector(SingleModelSelector): + def __init__(self, model): + self._model = ModelFirst(model) + def train(self, is_selecting, folds, X, W, Y, sample_weight=None): + self._model.fit(X, W, Y, sample_weight=sample_weight) + return self + @property + def best_model(self): + return self._model + @property + def best_score(self): + return 0 + @property + def needs_fit(self): + return False + class ModelFinal: + def fit(self, X, T, T_res, Y_res, sample_weight=None, freq_weight=None, sample_var=None): + self.model = LinearRegression(fit_intercept=False).fit(X * T_res.reshape(-1, 1), + Y_res) + return self + def predict(self, X): + return self.model.predict(X) + class RLearner(_RLearner): + def _gen_model_y(self): + return ModelSelector(LinearRegression()) + def _gen_model_t(self): + return ModelSelector(LinearRegression()) + def _gen_rlearner_model_final(self): + return ModelFinal() + np.random.seed(123) + X = np.random.normal(size=(1000, 3)) + y = X[:, 0] + X[:, 1] + np.random.normal(0, 0.01, size=(1000,)) + est = RLearner(cv=2, discrete_outcome=False, discrete_treatment=False, + treatment_featurizer=None, categories='auto', random_state=None) + est.fit(y, X[:, 0], X=np.ones((X.shape[0], 1)), W=X[:, 1:]) + + >>> est.const_marginal_effect(np.ones((1,1))) + array([0.999631...]) + >>> est.effect(np.ones((1,1)), T0=0, T1=10) + array([9.996314...]) + >>> est.score(y, X[:, 0], X=np.ones((X.shape[0], 1)), W=X[:, 1:]) + 9.73638006...e-05 + >>> est.rlearner_model_final_.model + LinearRegression(fit_intercept=False) + >>> est.rlearner_model_final_.model.coef_ + array([0.999631...]) + >>> est.score_ + 9.82623204...e-05 + >>> [mdl._model for mdls in est.models_y for mdl in mdls] + [LinearRegression(), LinearRegression()] + >>> [mdl._model for mdls in est.models_t for mdl in mdls] + [LinearRegression(), LinearRegression()] + + Attributes + ---------- + models_y: nested list of objects of type(model_y) + A nested list of instances of the model_y object. Number of sublist equals to number of monte carlo + iterations, each element in the sublist corresponds to a crossfitting + fold and is the model instance that was fitted for that training fold. + models_t: nested list of objects of type(model_t) + A nested list of instances of the model_t object. Number of sublist equals to number of monte carlo + iterations, each element in the sublist corresponds to a crossfitting + fold and is the model instance that was fitted for that training fold. + rlearner_model_final_ : object of type(model_final) + An instance of the model_final object that was fitted after calling fit. + score_ : float + The MSE in the final residual on residual regression + nuisance_scores_y : nested list of float + The out-of-sample scores for each outcome model + nuisance_scores_t : nested list of float + The out-of-sample scores for each treatment model + + .. 
math:: + \\frac{1}{n} \\sum_{i=1}^n (Y_i - \\hat{E}[Y|X_i, W_i]\ + - \\hat{\\theta}(X_i)\\cdot (T_i - \\hat{E}[T|X_i, W_i]))^2 + + If `sample_weight` is not None at fit time, then a weighted average is returned. If the outcome Y + is multidimensional, then the average of the MSEs for each dimension of Y is returned. + """ + + def __init__(self, + *, + discrete_outcome, + discrete_treatment, + treatment_featurizer, + categories, + cv, + random_state, + mc_iters=None, + mc_agg='mean', + allow_missing=False, + use_ray=False, + ray_remote_func_options=None): + super().__init__(discrete_outcome=discrete_outcome, + discrete_treatment=discrete_treatment, + treatment_featurizer=treatment_featurizer, + discrete_instrument=False, # no instrument, so doesn't matter + categories=categories, + cv=cv, + random_state=random_state, + mc_iters=mc_iters, + mc_agg=mc_agg, + allow_missing=allow_missing, + use_ray=use_ray, + ray_remote_func_options=ray_remote_func_options) + + @abstractmethod + def _gen_model_y(self): + """ + Returns + ------- + model_y: selector for the estimator of E[Y | X, W] + The estimator for fitting the response to the features and controls. Must implement + `fit` and `predict` methods. Unlike sklearn estimators both methods must + take an extra second argument (the controls), i.e. :: + + model_y.fit(X, W, Y, sample_weight=sample_weight) + model_y.predict(X, W) + """ + raise NotImplementedError("Abstract method") + + @abstractmethod + def _gen_model_t(self): + """ + Returns + ------- + model_t: selector for the estimator of E[T | X, W] + The estimator for fitting the treatment to the features and controls. Must implement + `fit` and `predict` methods. Unlike sklearn estimators both methods must + take an extra second argument (the controls), i.e. :: + + model_t.fit(X, W, T, sample_weight=sample_weight) + model_t.predict(X, W) + """ + raise NotImplementedError("Abstract method") + + @abstractmethod + def _gen_rlearner_model_final(self): + """ + Returns + ------- + model_final: estimator for fitting the response residuals to the features and treatment residuals + Must implement `fit` and `predict` methods. Unlike sklearn estimators the fit methods must + take an extra second argument (the treatment residuals). Predict, on the other hand, + should just take the features and return the constant marginal effect. More, concretely:: + + model_final.fit(X, T_res, Y_res, + sample_weight=sample_weight, freq_weight=freq_weight, sample_var=sample_var) + model_final.predict(X) + """ + raise NotImplementedError("Abstract method") + + def _gen_ortho_learner_model_nuisance(self): + return _ModelNuisance(self._gen_model_y(), self._gen_model_t()) + + def _gen_ortho_learner_model_final(self): + return _ModelFinal(self._gen_rlearner_model_final()) + + def fit(self, Y, T, *, X=None, W=None, sample_weight=None, freq_weight=None, sample_var=None, groups=None, + cache_values=False, inference=None): + """ + Estimate the counterfactual model from data, i.e. estimates function :math:`\\theta(\\cdot)`. + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + X:(n, d_x) matrix, optional + Features for each sample + W:(n, d_w) matrix, optional + Controls for each sample + sample_weight : (n,) array_like, optional + Individual weights for each sample. If None, it assumes equal weight. + freq_weight: (n, ) array_like of int, optional + Weight for the observation. 
Observation i is treated as the mean + outcome of freq_weight[i] independent observations. + When ``sample_var`` is not None, this should be provided. + sample_var : {(n,), (n, d_y)} nd array_like, optional + Variance of the outcome(s) of the original freq_weight[i] observations that were used to + compute the mean outcome represented by observation i. + groups: (n,) vector, optional + All rows corresponding to the same group will be kept together during splitting. + If groups is not None, the `cv` argument passed to this class's initializer + must support a 'groups' argument to its split method. + cache_values: bool, default False + Whether to cache inputs and first stage results, which will allow refitting a different final model + inference: str, :class:`.Inference` instance, or None + Method for performing inference. This estimator supports 'bootstrap' + (or an instance of:class:`.BootstrapInference`). + + Returns + ------- + self: _RLearner instance + """ + # Replacing fit from _OrthoLearner, to enforce Z=None and improve the docstring + return super().fit(Y, T, X=X, W=W, + sample_weight=sample_weight, freq_weight=freq_weight, sample_var=sample_var, groups=groups, + cache_values=cache_values, + inference=inference) + + def score(self, Y, T, X=None, W=None, sample_weight=None): + """ + Score the fitted CATE model on a new data set. Generates nuisance parameters + for the new data set based on the fitted residual nuisance models created at fit time. + It uses the mean prediction of the models fitted by the different crossfit folds. + Then calculates the MSE of the final residual Y on residual T regression. + + If model_final does not have a score method, then it raises an :exc:`.AttributeError` + + Parameters + ---------- + Y: (n, d_y) matrix or vector of length n + Outcomes for each sample + T: (n, d_t) matrix or vector of length n + Treatments for each sample + X:(n, d_x) matrix, optional + Features for each sample + W:(n, d_w) matrix, optional + Controls for each sample + sample_weight:(n,) vector, optional + Weights for each samples + + + Returns + ------- + score: float + The MSE of the final CATE model on the new data. + """ + # Replacing score from _OrthoLearner, to enforce Z=None and improve the docstring + return super().score(Y, T, X=X, W=W, sample_weight=sample_weight) + + @property + def rlearner_model_final_(self): + # NOTE: important to get parent's wrapped copy so that + # after training wrapped featurizer is also trained, etc. + return self.ortho_learner_model_final_._model_final + + @property + def models_y(self): + return [[mdl._model_y.best_model for mdl in mdls] for mdls in super().models_nuisance_] + + @property + def models_t(self): + return [[mdl._model_t.best_model for mdl in mdls] for mdls in super().models_nuisance_] + + @property + def nuisance_scores_y(self): + return self.nuisance_scores_[0] + + @property + def nuisance_scores_t(self): + return self.nuisance_scores_[1] + + @property + def residuals_(self): + """ + A tuple (y_res, T_res, X, W), of the residuals from the first stage estimation + along with the associated X and W. Samples are not guaranteed to be in the same + order as the input order. + """ + if not hasattr(self, '_cached_values'): + raise AttributeError("Estimator is not fitted yet!") + if self._cached_values is None: + raise AttributeError("`fit` was called with `cache_values=False`. 
" + "Set to `True` to enable residual storage.") + Y_res, T_res = self._cached_values.nuisances + return Y_res, T_res, self._cached_values.X, self._cached_values.W diff --git a/econml/dml/causal_forest.py b/econml/dml/causal_forest.py index a3affed39..bbe44008f 100644 --- a/econml/dml/causal_forest.py +++ b/econml/dml/causal_forest.py @@ -268,35 +268,23 @@ class CausalForestDML(_BaseDML): Parameters ---------- - model_y: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - Determines how to fit the treatment to the features. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_y: estimator, default ``'auto'`` + Determines how to fit the outcome to the features. - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - model_t: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Determines how to fit the treatment to the features. str in a sentence - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_t: estimator, default ``'auto'`` + Determines how to fit the treatment to the features. - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise featurizer : :term:`transformer`, optional Must support fit_transform and transform. Used to create composite features in the final CATE regression. @@ -569,10 +557,10 @@ class CausalForestDML(_BaseDML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.88518..., 1.25061..., 0.81112...]) + array([0.62947..., 1.64576..., 0.68496... ]) >>> est.effect_interval(X[:3]) - (array([0.40163..., 0.75023..., 0.46629...]), - array([1.36873..., 1.75099..., 1.15596...])) + (array([0.19136... 
, 1.17143..., 0.10789...]), + array([1.06758..., 2.12009..., 1.26203...])) Attributes ---------- diff --git a/econml/dml/dml.py b/econml/dml/dml.py index 7c1a2bd2d..884d820e2 100644 --- a/econml/dml/dml.py +++ b/econml/dml/dml.py @@ -82,7 +82,7 @@ def __init__(self, model: SingleModelSelector, discrete_target): self._model = clone(model, safe=False) self._discrete_target = discrete_target - def train(self, is_selecting, X, W, Target, sample_weight=None, groups=None): + def train(self, is_selecting, folds, X, W, Target, sample_weight=None, groups=None): if self._discrete_target: # In this case, the Target is the one-hot-encoding of the treatment variable # We need to go back to the label representation of the one-hot so as to call @@ -92,7 +92,7 @@ def train(self, is_selecting, X, W, Target, sample_weight=None, groups=None): "don't contain all treatments") Target = inverse_onehot(Target) - self._model.train(is_selecting, _combine(X, W, Target.shape[0]), Target, + self._model.train(is_selecting, folds, _combine(X, W, Target.shape[0]), Target, **filter_none_kwargs(groups=groups, sample_weight=sample_weight)) return self @@ -104,6 +104,10 @@ def best_model(self): def best_score(self): return self._model.best_score + @property + def needs_fit(self): + return self._model.needs_fit + def _make_first_stage_selector(model, is_discrete, random_state): if model == 'auto': @@ -354,35 +358,23 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn Parameters ---------- - model_y: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - Determines how to fit the treatment to the features. + model_y: estimator, default ``'auto'`` + Determines how to fit the outcome to the features. - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. - - model_t: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto + model_t: estimator, default ``'auto'`` Determines how to fit the treatment to the features. - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. 
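As a sketch of the classifier/regressor requirement described in these docstrings (the concrete estimators below are illustrative, not defaults)::

    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
    from econml.dml import LinearDML

    # With a discrete treatment, a user-supplied model_t should be a classifier,
    # while model_y stays a regressor for a continuous outcome.
    est = LinearDML(model_y=GradientBoostingRegressor(),
                    model_t=GradientBoostingClassifier(),
                    discrete_treatment=True)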
- - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise model_final: estimator The estimator for fitting the response residuals to the treatment residuals. Must implement @@ -486,19 +478,19 @@ class takes as input the parameter `model_t`, which is an arbitrary scikit-learn est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.65142..., 1.82917..., 0.79287...]) + array([0.63382..., 1.78225..., 0.71859...]) >>> est.effect_interval(X[:3]) - (array([0.28936..., 1.31239..., 0.47626...]), - array([1.01348..., 2.34594..., 1.10949...])) + (array([0.27937..., 1.27619..., 0.42091...]), + array([0.98827... , 2.28831..., 1.01628...])) >>> est.coef_ - array([ 0.32570..., -0.05311..., -0.03973..., 0.01598..., -0.11045...]) + array([ 0.42857..., 0.04488..., -0.03317..., 0.02258..., -0.14875...]) >>> est.coef__interval() - (array([ 0.13791..., -0.20081..., -0.17941..., -0.12073..., -0.25769...]), - array([0.51348..., 0.09458..., 0.09993..., 0.15269..., 0.03679...])) + (array([ 0.25179..., -0.10558..., -0.16723... , -0.11916..., -0.28759...]), + array([ 0.60535..., 0.19536..., 0.10088..., 0.16434..., -0.00990...])) >>> est.intercept_ - 1.02940... + 1.01166... >>> est.intercept__interval() - (0.88754..., 1.17125...) + (0.87125..., 1.15207...) """ def __init__(self, *, @@ -622,35 +614,23 @@ class LinearDML(StatsModelsCateEstimatorMixin, DML): Parameters ---------- - model_y: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - Determines how to fit the treatment to the features. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_y: estimator, default ``'auto'`` + Determines how to fit the outcome to the features. - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - model_t: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' + model_t: estimator, default ``'auto'`` Determines how to fit the treatment to the features. - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. 
+ - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise featurizer : :term:`transformer`, optional Must support fit_transform and transform. Used to create composite features in the final CATE regression. @@ -743,19 +723,19 @@ class LinearDML(StatsModelsCateEstimatorMixin, DML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.60257..., 1.74564..., 0.72062...]) + array([0.49977..., 1.91668..., 0.70799...]) >>> est.effect_interval(X[:3]) - (array([0.25760..., 1.24005..., 0.41770...]), - array([0.94754..., 2.25123..., 1.02354...])) + (array([0.15122..., 1.40176..., 0.40954...]), + array([0.84831..., 2.43159..., 1.00644...])) >>> est.coef_ - array([ 0.41635..., 0.00287..., -0.01831..., -0.01197..., -0.11620...]) + array([ 0.48825..., 0.00105..., 0.00244..., 0.02217..., -0.08471...]) >>> est.coef__interval() - (array([ 0.24496..., -0.13418..., -0.14852..., -0.13947..., -0.25089...]), - array([0.58775..., 0.13993..., 0.11189..., 0.11551..., 0.01848...])) + (array([ 0.30469..., -0.13904..., -0.12790..., -0.11514..., -0.22505... ]), + array([0.67180..., 0.14116..., 0.13278..., 0.15949..., 0.05562...])) >>> est.intercept_ - 0.97162... + 1.01247... >>> est.intercept__interval() - (0.83640..., 1.10684...) + (0.87480..., 1.15015...) """ def __init__(self, *, @@ -869,35 +849,23 @@ class SparseLinearDML(DebiasedLassoCateEstimatorMixin, DML): Parameters ---------- - model_y: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - Determines how to fit the treatment to the features. + model_y: estimator, default ``'auto'`` + Determines how to fit the outcome to the features. - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. - - model_t: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' + model_t: estimator, default ``'auto'`` Determines how to fit the treatment to the features. 
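Per the options described in :ref:`model_selection`, these arguments can also name several candidates, in which case the best-fitting one is selected; a sketch (the candidate list is illustrative)::

    from sklearn.linear_model import LassoCV
    from econml.dml import SparseLinearDML

    # Select among keyword-named model families and a concrete estimator for each stage.
    est = SparseLinearDML(model_y=['linear', 'forest'],
                          model_t=[LassoCV(), 'forest'])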
- - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise alpha: str or float, default 'auto' CATE L1 regularization applied through the debiased lasso in the final model. @@ -1016,19 +984,19 @@ class SparseLinearDML(DebiasedLassoCateEstimatorMixin, DML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.59812..., 1.75138..., 0.71770...]) + array([0.50083..., 1.91663..., 0.70386...]) >>> est.effect_interval(X[:3]) - (array([0.25046..., 1.24249..., 0.42606...]), - array([0.94577..., 2.26028..., 1.00935... ])) + (array([0.14616..., 1.40364..., 0.40674...]), + array([0.85550... , 2.42962... , 1.00099...])) >>> est.coef_ - array([ 0.41820..., 0.00506..., -0.01831..., -0.00778..., -0.11965...]) + array([ 0.49123..., 0.00495..., 0.00007..., 0.02302..., -0.08483...]) >>> est.coef__interval() - (array([ 0.25058..., -0.13713..., -0.15469..., -0.13932..., -0.26252...]), - array([0.58583..., 0.14726..., 0.11806..., 0.12376..., 0.02320...])) + (array([ 0.31323..., -0.13848..., -0.13721..., -0.11141..., -0.22961...]), + array([0.66923..., 0.14839... , 0.13735..., 0.15745..., 0.05993...])) >>> est.intercept_ - 0.97131... + 1.01476... >>> est.intercept__interval() - (0.83363..., 1.10899...) + (0.87620..., 1.15332...) """ def __init__(self, *, @@ -1168,32 +1136,23 @@ class KernelDML(DML): Parameters ---------- - model_y: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - Determines how to fit the treatment to the features. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_y: estimator, default ``'auto'`` + Determines how to fit the outcome to the features. - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - model_t: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' + model_t: estimator, default ``'auto'`` Determines how to fit the treatment to the features. 
- - If an estimator, will use the model as is for fitting. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise fit_cate_intercept : bool, default True Whether the linear CATE model should have a constant term. @@ -1283,7 +1242,7 @@ class KernelDML(DML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.64124..., 1.46561..., 0.68568...]) + array([0.63041..., 1.86098..., 0.74218...]) """ def __init__(self, model_y='auto', model_t='auto', @@ -1393,32 +1352,23 @@ class NonParamDML(_BaseDML): Parameters ---------- - model_y: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - Determines how to fit the treatment to the features. + model_y: estimator, default ``'auto'`` + Determines how to fit the outcome to the features. - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. - - model_t: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' + model_t: estimator, default ``'auto'`` Determines how to fit the treatment to the features. - - If an estimator, will use the model as is for fitting. - - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise model_final: estimator The estimator for fitting the response residuals to the treatment residuals. 
Must implement @@ -1515,7 +1465,7 @@ class NonParamDML(_BaseDML): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.32389..., 0.85703..., 0.97468...]) + array([0.35318..., 1.28760..., 0.83506...]) """ def __init__(self, *, diff --git a/econml/dr/_drlearner.py b/econml/dr/_drlearner.py index 02560fd81..f51bc99eb 100644 --- a/econml/dr/_drlearner.py +++ b/econml/dr/_drlearner.py @@ -72,7 +72,7 @@ def __init__(self, def _combine(self, X, W): return np.hstack([arr for arr in [X, W] if arr is not None]) - def train(self, is_selecting, Y, T, X=None, W=None, *, sample_weight=None, groups=None): + def train(self, is_selecting, folds, Y, T, X=None, W=None, *, sample_weight=None, groups=None): if Y.ndim != 1 and (Y.ndim != 2 or Y.shape[1] != 1): raise ValueError("The outcome matrix must be of shape ({0}, ) or ({0}, 1), " "instead got {1}.".format(len(X), Y.shape)) @@ -84,8 +84,8 @@ def train(self, is_selecting, Y, T, X=None, W=None, *, sample_weight=None, group XW = self._combine(X, W) filtered_kwargs = filter_none_kwargs(sample_weight=sample_weight) - self._model_propensity.train(is_selecting, XW, inverse_onehot(T), groups=groups, **filtered_kwargs) - self._model_regression.train(is_selecting, np.hstack([XW, T]), Y, groups=groups, **filtered_kwargs) + self._model_propensity.train(is_selecting, folds, XW, inverse_onehot(T), groups=groups, **filtered_kwargs) + self._model_regression.train(is_selecting, folds, np.hstack([XW, T]), Y, groups=groups, **filtered_kwargs) return self def score(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): @@ -126,6 +126,10 @@ def predict(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None): propensities_weight = np.sum(propensities * T_complete, axis=1) return Y_pred.reshape(Y.shape + (T.shape[1] + 1,)), propensities_weight.reshape((n,)) + @property + def needs_fit(self): + return self._model_propensity.needs_fit or self._model_regression.needs_fit + def _make_first_stage_selector(model, is_discrete, random_state): if model == "auto": @@ -247,35 +251,22 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc Parameters ---------- - model_propensity : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_propensity: estimator, default ``'auto'`` + Classifier for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - - 'linear' - LogisticRegressionCV - - 'forest' - RandomForestClassifier - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict', and 'predict_proba'. + - Otherwise, see :ref:`model_selection` for the range of supported options - model_regression : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' + model_regression: estimator, default ``'auto'`` Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) concatenated. The one-hot-encoding excludes the baseline treatment. - - If an estimator, will use the model as is for fitting. 
- - If str, will use model associated with the keyword. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise model_final : estimator for the final cate model. Trained on regressing the doubly robust potential outcomes @@ -374,29 +365,20 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc est.fit(y, T, X=X, W=None) >>> est.const_marginal_effect(X[:2]) - array([[0.520977..., 1.244073...], - [0.365645..., 0.749762...]]) + array([[0.516931..., 0.995704...], + [0.356427..., 0.671870...]]) >>> est.effect(X[:2], T0=0, T1=1) - array([0.520977..., 0.365645...]) + array([0.516931..., 0.356427...]) >>> est.score_ - 3.15958089... + 2.84365756... >>> est.score(y, T, X=X) - 2.60965712... + 1.06259465... >>> est.model_cate(T=1).coef_ - array([0.369069..., 0.016610..., 0.019072...]) + array([ 0.447095..., -0.001013... , 0.018982...]) >>> est.model_cate(T=2).coef_ - array([ 0.768336..., 0.082106..., -0.030475...]) + array([ 0.925055..., -0.012357... , 0.033489...]) >>> est.cate_feature_names() ['X0', 'X1', 'X2'] - >>> [mdl.coef_ for mdls in est.models_regression for mdl in mdls] - [array([ 1.463..., 0.006..., -0.006..., 0.726..., 2.029...]), - array([ 1.466..., -0.002..., 0..., 0.646..., 2.014...])] - >>> [mdl.coef_ for mdls in est.models_propensity for mdl in mdls] - [array([[-0.67903093, 0.04261741, -0.05969718], - [ 0.034..., -0.013..., -0.013...], - [ 0.644..., -0.028..., 0.073...]]), array([[-0.831..., 0.100..., 0.090...], - [ 0.084..., 0.013..., -0.154...], - [ 0.747..., -0.113..., 0.063...]])] Beyond default models: @@ -418,19 +400,19 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc est.fit(y, T, X=X, W=None) >>> est.score_ - 3.7... + 1.7... >>> est.const_marginal_effect(X[:3]) - array([[0.64..., 1.23...], - [0.49..., 0.92...], - [0.20..., 0.26...]]) + array([[0.68..., 1.10...], + [0.56..., 0.79... ], + [0.34..., 0.10... ]]) >>> est.model_cate(T=2).coef_ - array([0.72..., 0. , 0. ]) + array([0.74..., 0. , 0. ]) >>> est.model_cate(T=2).intercept_ - 2.0... + 1.9... >>> est.model_cate(T=1).coef_ - array([0.31..., 0.01..., 0.00...]) + array([0.24..., 0.00..., 0. ]) >>> est.model_cate(T=1).intercept_ - 0.97... + 0.94... Attributes ---------- @@ -809,35 +791,22 @@ class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner): Parameters ---------- - model_propensity : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_propensity: estimator, default ``'auto'`` + Classifier for Pr[T=t | X, W]. 
Trained by regressing treatments on (features, controls) concatenated. - - 'linear' - LogisticRegressionCV - - 'forest' - RandomForestClassifier - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict', and 'predict_proba'. + - Otherwise, see :ref:`model_selection` for the range of supported options - model_regression : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' + model_regression: estimator, default ``'auto'`` Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) concatenated. The one-hot-encoding excludes the baseline treatment. - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise featurizer : :term:`transformer`, optional Must support fit_transform and transform. Used to create composite features in the final CATE regression. @@ -924,17 +893,17 @@ class LinearDRLearner(StatsModelsCateEstimatorDiscreteMixin, DRLearner): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.457602..., 0.335707..., 0.011288...]) + array([ 0.432476..., 0.359739..., -0.085326...]) >>> est.effect_interval(X[:3]) - (array([ 0.164623..., -0.098980..., -0.493464...]), array([0.750582..., 0.77039... , 0.516041...])) + (array([ 0.084145... , -0.178020..., -0.734688...]), array([0.780807..., 0.897500..., 0.564035...])) >>> est.coef_(T=1) - array([0.338061..., 0.025654..., 0.044389...]) + array([ 0.450620..., -0.008792..., 0.075242...]) >>> est.coef__interval(T=1) - (array([ 0.135677..., -0.155845..., -0.143376...]), array([0.540446..., 0.207155..., 0.232155...])) + (array([ 0.156233... , -0.252177..., -0.159805...]), array([0.745007..., 0.234592..., 0.310290...])) >>> est.intercept_(T=1) - 0.78646497... + 0.90916103... >>> est.intercept__interval(T=1) - (0.60344468..., 0.96948526...) + (0.66855287..., 1.14976919...) Attributes ---------- @@ -1101,35 +1070,22 @@ class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner): Parameters ---------- - model_propensity : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_propensity: estimator, default ``'auto'`` + Classifier for Pr[T=t | X, W]. 
Trained by regressing treatments on (features, controls) concatenated. - - 'linear' - LogisticRegressionCV - - 'forest' - RandomForestClassifier - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict', and 'predict_proba'. + - Otherwise, see :ref:`model_selection` for the range of supported options - model_regression : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' + model_regression: estimator, default ``'auto'`` Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) concatenated. The one-hot-encoding excludes the baseline treatment. - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise featurizer : :term:`transformer`, optional Must support fit_transform and transform. Used to create composite features in the final CATE regression. @@ -1242,17 +1198,17 @@ class SparseLinearDRLearner(DebiasedLassoCateEstimatorDiscreteMixin, DRLearner): est.fit(y, T, X=X, W=None) >>> est.effect(X[:3]) - array([0.45..., 0.33..., 0.01...]) + array([ 0.43..., 0.35..., -0.08... ]) >>> est.effect_interval(X[:3]) - (array([ 0.11..., -0.13..., -0.54...]), array([0.79..., 0.80..., 0.57...])) + (array([-0.01..., -0.26..., -0.81...]), array([0.87..., 0.98..., 0.65...])) >>> est.coef_(T=1) - array([0.33..., 0.02..., 0.04...]) + array([ 0.44..., -0.00..., 0.07...]) >>> est.coef__interval(T=1) - (array([ 0.14..., -0.15..., -0.14...]), array([0.53..., 0.20..., 0.23...])) + (array([ 0.19... , -0.24..., -0.17...]), array([0.70..., 0.22..., 0.32...])) >>> est.intercept_(T=1) - 0.78... + 0.90... >>> est.intercept__interval(T=1) - (0.60..., 0.96...) + (0.66..., 1.14...) Attributes ---------- @@ -1403,35 +1359,22 @@ class ForestDRLearner(ForestModelFinalCateEstimatorDiscreteMixin, DRLearner): Parameters ---------- - model_propensity : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. + model_propensity: estimator, default ``'auto'`` + Classifier for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. 
+ - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV - - 'forest' - RandomForestClassifier - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - Otherwise, see :ref:`model_selection` for the range of supported options - User-supplied estimators should support 'fit' and 'predict', and 'predict_proba'. - - model_regression : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' + model_regression: estimator, default ``'auto'`` Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) concatenated. The one-hot-encoding excludes the baseline treatment. - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise discrete_outcome: bool, default False Whether the outcome should be treated as binary diff --git a/econml/iv/dml/_dml.py b/econml/iv/dml/_dml.py index 27e85fe7d..5ce9d4dcb 100644 --- a/econml/iv/dml/_dml.py +++ b/econml/iv/dml/_dml.py @@ -56,15 +56,16 @@ def __init__(self, else: self._model_z_xw = model_z - def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - self._model_y_xw.train(is_selecting, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) - self._model_t_xw.train(is_selecting, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + def train(self, is_selecting, folds, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + self._model_y_xw.train(is_selecting, folds, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + self._model_t_xw.train(is_selecting, folds, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) if self._projection: # concat W and Z WZ = _combine(W, Z, Y.shape[0]) - self._model_t_xwz.train(is_selecting, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + self._model_t_xwz.train(is_selecting, folds, X=X, W=WZ, Target=T, + sample_weight=sample_weight, groups=groups) else: - self._model_z_xw.train(is_selecting, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + self._model_z_xw.train(is_selecting, folds, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) return self def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): @@ -118,6 +119,12 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) Z_res = Z - Z_pred.reshape(Z.shape) return Y_res, T_res, Z_res + @property + def needs_fit(self): + return 
(self._model_y_xw.needs_fit or self._model_t_xw.needs_fit or + (self._projection and self._model_t_xwz.needs_fit) or + (not self._projection and self._model_z_xw.needs_fit)) + class _OrthoIVModelFinal: def __init__(self, model_final, featurizer, fit_cate_intercept): @@ -204,65 +211,41 @@ class OrthoIV(LinearModelFinalCateEstimatorMixin, _OrthoLearner): Parameters ---------- - model_y_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Y | X, W]`. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + model_y_xw: estimator, default ``'auto'`` + Determines how to fit the outcome to the features and controls (:math:`\\E[Y | X, W]`). - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - model_t_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - model to estimate :math:`\\E[T | X, W]`. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_t_xw: estimator, default ``'auto'`` + Determines how to fit the treatment to the features and controls (:math:`\\E[T | X, W]`). - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise - model_t_xwz : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - model to estimate :math:`\\E[T | X, W, Z]`. + model_t_xwz: estimator, default ``'auto'`` + Determines how to fit the treatment to the features, controls, and instrument (:math:`\\E[T | X, W, Z]`). - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. 
+ - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + model_z_xw: estimator, default ``'auto'`` + Determines how to fit the instrument to the features and controls (:math:`\\E[Z | X, W]`). - model_z_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Z | X, W]`. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete_instrument=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_instrument=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_instrument=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_instrument` is True + and a regressor otherwise projection: bool, default False If True, we fit a slight variant of OrthoIV where we use E[T|X, W, Z] as the instrument as opposed to Z, @@ -369,19 +352,19 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-4.49594..., 5.79852..., -2.88049...]) + array([-4.28045... , 6.02945..., -2.86851...]) >>> est.effect_interval(X[:3]) - (array([-7.40954..., 1.47475..., -5.32889...]), - array([-1.58235..., 10.12229..., -0.43209...])) + (array([-7.20729..., 1.75412..., -5.20897...]), + array([-1.35361..., 10.30478..., -0.52805...])) >>> est.coef_ - array([ 5.27614..., 0.92092..., 0.57579..., -0.22810..., -0.16952...]) + array([ 4.51659..., 0.78512..., 0.23706..., 0.24126... , -0.47167...]) >>> est.coef__interval() - (array([ 3.93362..., -0.22159..., -0.59863..., -1.39139..., -1.34549...]), - array([6.61866..., 2.06345..., 1.75022..., 0.93518..., 1.00644...])) + (array([ 3.15602..., -0.35785..., -0.89798..., -0.90530..., -1.62445...]), + array([5.87715... , 1.92810... , 1.37211..., 1.38783..., 0.68110...])) >>> est.intercept_ - -0.29110... + -0.13672... >>> est.intercept__interval() - (-1.45607..., 0.87386...) + (-1.27036..., 0.99690...) 
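A construction sketch contrasting the two instrument choices controlled by ``projection`` (the arguments shown are illustrative)::

    from econml.iv.dml import OrthoIV

    # Default: Z itself is the instrument, so E[Z | X, W] (model_z_xw) is fitted.
    est = OrthoIV(discrete_treatment=True, discrete_instrument=True)

    # Projection variant: E[T | X, W, Z] is used as the instrument,
    # so model_t_xwz is fitted and model_z_xw is disabled.
    est_proj = OrthoIV(projection=True, discrete_treatment=True, discrete_instrument=True)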
""" def __init__(self, *, @@ -747,12 +730,14 @@ def __init__(self, model_y_xw: ModelSelector, model_t_xw: ModelSelector, model_t self._model_t_xw = model_t_xw self._model_t_xwz = model_t_xwz - def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - self._model_y_xw.train(is_selecting, X, W, Y, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) - self._model_t_xw.train(is_selecting, X, W, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + def train(self, is_selecting, folds, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + self._model_y_xw.train(is_selecting, folds, X, W, Y, ** + filter_none_kwargs(sample_weight=sample_weight, groups=groups)) + self._model_t_xw.train(is_selecting, folds, X, W, T, ** + filter_none_kwargs(sample_weight=sample_weight, groups=groups)) # concat W and Z WZ = _combine(W, Z, Y.shape[0]) - self._model_t_xwz.train(is_selecting, X, WZ, T, + self._model_t_xwz.train(is_selecting, folds, X, WZ, T, **filter_none_kwargs(sample_weight=sample_weight, groups=groups)) return self @@ -788,6 +773,10 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) T_res = TXZ_pred.reshape(T.shape) - TX_pred.reshape(T.shape) return Y_res, T_res + @property + def needs_fit(self): + return self._model_y_xw.needs_fit or self._model_t_xw.needs_fit or self._model_t_xwz.needs_fit + class _BaseDMLIVModelFinal(_ModelFinal): """ @@ -1034,50 +1023,32 @@ class DMLIV(_BaseDMLIV): Parameters ---------- - model_y_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Y | X, W]`. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + model_y_xw: estimator, default ``'auto'`` + Determines how to fit the outcome to the features and controls (:math:`\\E[Y | X, W]`). - model_t_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Model to estimate :math:`\\E[T | X, W]`. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + model_t_xw: estimator, default ``'auto'`` + Determines how to fit the treatment to the features and controls (:math:`\\E[T | X, W]`). 
- User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - model_t_xwz : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Model to estimate :math:`\\E[T | X, W, Z]`. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_t_xwz: estimator, default ``'auto'`` + Determines how to fit the treatment to the features, controls, and instrument (:math:`\\E[T | X, W, Z]`). - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise model_final : estimator (default is :class:`.StatsModelsLinearRegression`) final model that at fit time takes as input :math:`(Y-\\E[Y|X])`, :math:`(\\E[T|X,Z]-\\E[T|X])` and X @@ -1183,11 +1154,11 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-6.83575..., 9.40666..., -4.27123...]) + array([-3.83383..., 5.31902..., -2.78082...]) >>> est.coef_ - array([ 8.07179..., 1.51080..., 0.87328..., -0.06944..., -0.47404...]) + array([ 4.03889..., 0.89335..., 0.12043..., 0.37958..., -0.66097...]) >>> est.intercept_ - -0.20555... + -0.18482... """ @@ -1442,50 +1413,32 @@ class NonParamDMLIV(_BaseDMLIV): Parameters ---------- - model_y_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Y | X, W]`. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + model_y_xw: estimator, default ``'auto'`` + Determines how to fit the outcome to the features and controls (:math:`\\E[Y | X, W]`). - model_t_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Model to estimate :math:`\\E[T | X, W]`. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - If an estimator, will use the model as is for fitting. 
- - If str, will use model associated with the keyword. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + model_t_xw: estimator, default ``'auto'`` + Determines how to fit the treatment to the features and controls (:math:`\\E[T | X, W]`). - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - model_t_xwz : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Model to estimate :math:`\\E[T | X, W, Z]`. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_t_xwz: estimator, default ``'auto'`` + Determines how to fit the treatment to the features, controls, and instrument (:math:`\\E[T | X, W, Z]`). - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. 
+ - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise model_final : estimator final model for predicting :math:`\\tilde{Y}` from X with sample weights V(X) @@ -1592,7 +1545,7 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-6.18157..., 8.70189..., -4.06004...]) + array([-5.98517..., 9.03610..., -3.56684...]) """ diff --git a/econml/iv/dr/_dr.py b/econml/iv/dr/_dr.py index 8b205a747..0de96c444 100644 --- a/econml/iv/dr/_dr.py +++ b/econml/iv/dr/_dr.py @@ -44,15 +44,13 @@ def _combine(W, Z, n_samples): class _BaseDRIVNuisanceSelector(ModelSelector): - def __init__(self, *, prel_model_effect, model_y_xw, model_t_xw, model_tz_xw, model_z, - projection, fit_cov_directly, + def __init__(self, *, prel_model_effect, model_y_xw, model_t_xw, model_z, + projection, discrete_treatment, discrete_instrument): self._prel_model_effect = prel_model_effect self._model_y_xw = model_y_xw self._model_t_xw = model_t_xw - self._model_tz_xw = model_tz_xw self._projection = projection - self._fit_cov_directly = fit_cov_directly self._discrete_treatment = discrete_treatment self._discrete_instrument = discrete_instrument if self._projection: @@ -60,92 +58,20 @@ def __init__(self, *, prel_model_effect, model_y_xw, model_t_xw, model_tz_xw, mo else: self._model_z_xw = model_z - def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + def train(self, is_selecting, folds, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): # T and Z only allow single continuous or binary, keep the shape of (n,) for continuous and (n,1) for binary T = T.ravel() if not self._discrete_treatment else T Z = Z.ravel() if not self._discrete_instrument else Z - self._model_y_xw.train(is_selecting, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) - self._model_t_xw.train(is_selecting, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) - if is_selecting and self._fit_cov_directly: - # need to fit, too, since we call predict later inside this train method - self._model_t_xw.train(False, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) + self._model_y_xw.train(is_selecting, folds, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + self._model_t_xw.train(is_selecting, folds, X=X, W=W, Target=T, sample_weight=sample_weight, groups=groups) if self._projection: WZ = _combine(W, Z, Y.shape[0]) - self._model_t_xwz.train(is_selecting, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) - if is_selecting: - # need to fit, too, since we call predict later inside this train method - self._model_t_xwz.train(False, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) - else: - self._model_z_xw.train(is_selecting, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) - if is_selecting: - # need to fit, too, since we call predict later inside this train method - self._model_z_xw.train(False, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) - - if self._projection: - T_proj = self._model_t_xwz.predict(X, WZ).reshape(T.shape) - if self._fit_cov_directly: - # We're projecting, so we're treating E[T|X,Z] as the instrument (ignoring W for simplicity) - # Then beta(X) = E[T̃ (E[T|X,Z]-E[E[T|X,Z]|X)|X] and we can apply the tower rule several times to get - # = E[(E[T|X,Z]-E[T|X])^2|X] - # and also = E[(E[T|X,Z]-T)^2|X] - # 
so we can compute it either from (T_proj-T_pred)^2 or from (T_proj-T)^2 - T_pred = self._model_t_xw.predict(X, W) - if X is None and W is None: - T_pred = np.broadcast_to(T_pred, T.shape) - else: - T_pred = T_pred.reshape(T.shape) - target = (T_proj - T_pred)**2 - self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) - else: - # return shape (n,) - target = (T * T_proj).reshape(T.shape[0],) - self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) + self._model_t_xwz.train(is_selecting, folds, X=X, W=WZ, Target=T, + sample_weight=sample_weight, groups=groups) else: - if self._fit_cov_directly: - Z_pred = self._model_z_xw.predict(X, W) - T_pred = self._model_t_xw.predict(X, W) - - if X is None and W is None: - Z_pred = np.broadcast_to(Z_pred, Z.shape) - T_pred = np.broadcast_to(T_pred, T.shape) - else: - Z_pred = Z_pred.reshape(Z.shape) - T_pred = T_pred.reshape(T.shape) - - Z_res = Z - Z_pred - T_res = T - T_pred - - # need to avoid erroneous broadcasting when one of Z_res or T_res is (n,1) and the other is (n,) - assert Z_res.shape[0] == T_res.shape[0] and (Z_res.ndim == 1 or Z_res.shape[1:] == ( - 1,)) and (T_res.ndim == 1 or T_res.shape[1:] == (1,)) - target_shape = Z_res.shape if Z_res.ndim > 1 else T_res.shape - target = T_res.reshape(target_shape) * Z_res.reshape(target_shape) - # TODO: if the T and Z models overfit, then this will be biased towards 0; - # consider using nested cross-fitting - # a similar comment applies to the projection case - self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) - else: - if self._discrete_treatment: - if self._discrete_instrument: - # target will be discrete and will be inversed from FirstStageWrapper, shape (n,1) - target = T * Z - else: - # shape (n,) - target = inverse_onehot(T) * Z - else: - if self._discrete_instrument: - # shape (n,) - target = T * inverse_onehot(Z) - else: - # shape(n,) - target = T * Z - self._model_tz_xw.train(is_selecting, X=X, W=W, Target=target, - sample_weight=sample_weight, groups=groups) + self._model_z_xw.train(is_selecting, folds, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) # TODO: prel_model_effect could allow sample_var and freq_weight? 
if self._discrete_instrument: @@ -187,14 +113,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): else: t_xwz_score = None - if hasattr(self._model_tz_xw, 'score'): - T_proj = self._model_t_xwz.predict(X, WZ).reshape(T.shape) - # if discrete, return shape (n,1); if continuous return shape (n,) - target = (T * T_proj).reshape(T.shape[0],) - tz_xw_score = self._model_tz_xw.score(X=X, W=W, Target=target, sample_weight=sample_weight) - else: - tz_xw_score = None - return y_xw_score, t_xw_score, t_xwz_score, tz_xw_score, effect_score + return y_xw_score, t_xw_score, t_xwz_score, effect_score else: if hasattr(self._model_z_xw, 'score'): @@ -202,28 +121,11 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): else: z_xw_score = None - if hasattr(self._model_tz_xw, 'score'): - if self._discrete_treatment: - if self._discrete_instrument: - # target will be discrete and will be inversed from FirstStageWrapper - target = T * Z - else: - target = inverse_onehot(T) * Z - else: - if self._discrete_instrument: - target = T * inverse_onehot(Z) - else: - target = T * Z - tz_xw_score = self._model_tz_xw.score(X=X, W=W, Target=target, sample_weight=sample_weight) - else: - tz_xw_score = None - - return y_xw_score, t_xw_score, z_xw_score, tz_xw_score, effect_score + return y_xw_score, t_xw_score, z_xw_score, effect_score def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): Y_pred = self._model_y_xw.predict(X, W) T_pred = self._model_t_xw.predict(X, W) - TZ_pred = self._model_tz_xw.predict(X, W) prel_theta = self._prel_model_effect.effect(X) if X is None: @@ -231,7 +133,6 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) if W is None: Y_pred = np.tile(Y_pred.reshape(1, -1), (Y.shape[0], 1)) T_pred = np.tile(T_pred.reshape(1, -1), (Y.shape[0], 1)) - TZ_pred = np.tile(TZ_pred.reshape(1, -1), (Y.shape[0], 1)) # for convenience, reshape Z,T to a vector since they are either binary or single dimensional continuous T = T.reshape(T.shape[0],) @@ -239,7 +140,6 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) # reshape the predictions Y_pred = Y_pred.reshape(Y.shape) T_pred = T_pred.reshape(T.shape) - TZ_pred = TZ_pred.reshape(T.shape) Y_res = Y - Y_pred T_res = T - T_pred @@ -249,30 +149,135 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) WZ = _combine(W, Z, Y.shape[0]) T_proj = self._model_t_xwz.predict(X, WZ).reshape(T.shape) Z_res = T_proj - T_pred + else: + Z_pred = self._model_z_xw.predict(X, W) + if X is None and W is None: + Z_pred = np.tile(Z_pred.reshape(1, -1), (Z.shape[0], 1)) + Z_pred = Z_pred.reshape(Z.shape) + Z_res = Z - Z_pred + + # check nuisances outcome shape + # Y_res could be a vector or 1-dimensional 2d-array + assert T_res.ndim == 1, "Nuisance outcome should be vector!" + assert Z_res.ndim == 1, "Nuisance outcome should be vector!" 
+ + return prel_theta, Y_res, T_res, Z_res + + @property + def needs_fit(self): + return (self._model_y_xw.needs_fit or self._model_t_xw.needs_fit or + (self._projection and self._model_t_xwz.needs_fit) or + (not self._projection and self._model_z_xw.needs_fit)) + + +class _BaseDRIVNuisanceCovarianceSelector(ModelSelector): + def __init__(self, *, model_tz_xw, + projection, fit_cov_directly, + discrete_treatment, discrete_instrument): + self._model_tz_xw = model_tz_xw + self._projection = projection + self._fit_cov_directly = fit_cov_directly + self._discrete_treatment = discrete_treatment + self._discrete_instrument = discrete_instrument + + def _get_target(self, T_res, Z_res, T, Z): + T = T.ravel() if not self._discrete_treatment else T + Z = Z.ravel() if not self._discrete_instrument else Z + if self._projection: + if self._fit_cov_directly: + # We're projecting, so we're treating E[T|X,Z] as the instrument (ignoring W for simplicity) + # Then beta(X) = E[T̃ (E[T|X,Z]-E[E[T|X,Z]|X)|X] and we can apply the tower rule several times to get + # = E[(E[T|X,Z]-E[T|X])^2|X] + # and also = E[(E[T|X,Z]-T)^2|X] + # so we can compute it either from (T_proj-T_pred)^2 or from (T_proj-T)^2 + # The first of these is just Z_res^2 + target = Z_res**2 + else: + # fit on T*T_proj, covariance will be computed by E[T_res * T_proj] = E[T*T_proj] - E[T]^2 + # return shape (n,) + T_pred = T - T_res.reshape(T.shape) + T_proj = T_pred + Z_res.reshape(T.shape) + target = (T * T_proj).reshape(T.shape[0],) + else: + if self._fit_cov_directly: + # we will fit on the covariance (T_res*Z_res) directly + target_shape = Z_res.shape if Z_res.ndim > 1 else T_res.shape + target = T_res.reshape(target_shape) * Z_res.reshape(target_shape) + else: + # fit on TZ, covariance will be computed by E[T_res * Z_res] = TZ_pred - T_pred * Z_pred + if self._discrete_treatment: + if self._discrete_instrument: + # target will be discrete and will be inversed from FirstStageWrapper, shape (n,1) + target = T * Z + else: + # shape (n,) + target = inverse_onehot(T) * Z + else: + if self._discrete_instrument: + # shape (n,) + target = T * inverse_onehot(Z) + else: + # shape(n,) + target = T * Z + return target + + def train(self, is_selecting, folds, + prel_theta, Y_res, T_res, Z_res, + Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + # T and Z only allow single continuous or binary, keep the shape of (n,) for continuous and (n,1) for binary + target = self._get_target(T_res, Z_res, T, Z) + self._model_tz_xw.train(is_selecting, folds, X=X, W=W, Target=target, + sample_weight=sample_weight, groups=groups) + + return self + + def score(self, prel_theta, Y_res, T_res, Z_res, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + # T and Z only allow single continuous or binary, keep the shape of (n,) for continuous and (n,1) for binary + if hasattr(self._model_tz_xw, 'score'): + target = self._get_target(T_res, Z_res, T, Z) + tz_xw_score = self._model_tz_xw.score(X=X, W=W, Target=target, sample_weight=sample_weight) + else: + tz_xw_score = None + + return (tz_xw_score,) + + def predict(self, prel_theta, Y_res, T_res, Z_res, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + TZ_pred = self._model_tz_xw.predict(X, W) + + if X is None and W is None: + TZ_pred = np.tile(TZ_pred.reshape(1, -1), (Y.shape[0], 1)) + + # for convenience, reshape Z,T to a vector since they are either binary or single dimensional continuous + T = T.reshape(T.shape[0],) + Z = Z.reshape(Z.shape[0],) + + # reshape the predictions + 
TZ_pred = TZ_pred.reshape(T.shape) + + if self._projection: if self._fit_cov_directly: cov = TZ_pred else: + T_pred = T - T_res cov = TZ_pred - T_pred**2 # in the projection case, this is a variance and should always be non-negative cov = np.maximum(cov, 0) else: - Z_pred = self._model_z_xw.predict(X, W) - if X is None and W is None: - Z_pred = np.tile(Z_pred.reshape(1, -1), (Z.shape[0], 1)) - Z_pred = Z_pred.reshape(Z.shape) - Z_res = Z - Z_pred if self._fit_cov_directly: cov = TZ_pred else: + T_pred = T - T_res + Z_pred = Z - Z_res cov = TZ_pred - T_pred * Z_pred # check nuisances outcome shape - # Y_res could be a vector or 1-dimensional 2d-array - assert T_res.ndim == 1, "Nuisance outcome should be vector!" - assert Z_res.ndim == 1, "Nuisance outcome should be vector!" assert cov.ndim == 1, "Nuisance outcome should be vector!" - return prel_theta, Y_res, T_res, Z_res, cov + return (cov,) + + @property + def needs_fit(self): + return self._model_tz_xw.needs_fit class _BaseDRIVModelFinal: @@ -693,15 +698,18 @@ def _gen_ortho_learner_model_nuisance(self): is_discrete=self.discrete_instrument, random_state=self.random_state) - return _BaseDRIVNuisanceSelector(prel_model_effect=self._gen_prel_model_effect(), - model_y_xw=model_y_xw, - model_t_xw=model_t_xw, - model_tz_xw=model_tz_xw, - model_z=model_z, - projection=self.projection, - fit_cov_directly=self.fit_cov_directly, - discrete_treatment=self.discrete_treatment, - discrete_instrument=self.discrete_instrument) + return [_BaseDRIVNuisanceSelector(prel_model_effect=self._gen_prel_model_effect(), + model_y_xw=model_y_xw, + model_t_xw=model_t_xw, + model_z=model_z, + projection=self.projection, + discrete_treatment=self.discrete_treatment, + discrete_instrument=self.discrete_instrument), + _BaseDRIVNuisanceCovarianceSelector(model_tz_xw=model_tz_xw, + projection=self.projection, + fit_cov_directly=self.fit_cov_directly, + discrete_treatment=self.discrete_treatment, + discrete_instrument=self.discrete_instrument)] class DRIV(_DRIV): @@ -711,83 +719,51 @@ class DRIV(_DRIV): Parameters ---------- - model_y_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Y | X, W]`. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + model_y_xw: estimator, default ``'auto'`` + Determines how to fit the outcome to the features and controls (:math:`\\E[Y | X, W]`). - model_t_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Model to estimate :math:`\\E[T | X, W]`. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. 
+ - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + model_t_xw: estimator, default ``'auto'`` + Determines how to fit the treatment to the features and controls (:math:`\\E[T | X, W]`). - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - model_z_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Z | X, W]`. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_z_xw: estimator, default ``'auto'`` + Determines how to fit the instrument to the features and controls (:math:`\\E[Z | X, W]`). - - 'linear' - LogisticRegressionCV if discrete_instrument=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_instrument=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_instrument=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_instrument` is True + and a regressor otherwise - model_t_xwz : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Model to estimate :math:`\\E[T | X, W, Z]`. + model_t_xwz: estimator, default ``'auto'`` + Determines how to fit the treatment to the features, controls, and instrument (:math:`\\E[T | X, W, Z]`). - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. 
- - If 'auto', model will select over linear and forest models + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + model_tz_xw: estimator, default ``'auto'`` + Determines how to fit the covariance to the features and controls (:math:`\\E[T*Z | X, W]` or + :math:`\\E[\\tilde{T}*\\tilde{Z} | X, W]` depending on `fit_cov_directly`). - model_tz_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - model to estimate :math:`\\E[T*Z | X, W]` or :math:`\\E[\\tilde{T}*\\tilde{Z} | X, W]` - depending on `fit_cov_directly`. - Target will be discrete if discrete instrument and discrete treatment with `fit_cov_directly=False`, - else target will be continuous. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete target else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete target else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete target. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise fit_cov_directly : bool, default True Whether to fit :math:`\\E[\\tilde{T}*\\tilde{Z} | X, W]` instead of :math:`\\E[T*Z | X, W]`. @@ -934,7 +910,7 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-4.07330..., 6.01693..., -2.71813...]) + array([-4.15079..., 5.99291..., -2.86514...]) """ def __init__(self, *, @@ -1022,6 +998,7 @@ def _gen_prel_model_effect(self): opt_reweighted=self.prel_opt_reweighted, discrete_instrument=self.discrete_instrument, discrete_treatment=self.discrete_treatment, + discrete_outcome=self.discrete_outcome, categories=self.categories, cv=self.prel_cv, mc_iters=self.mc_iters, @@ -1035,6 +1012,7 @@ def _gen_prel_model_effect(self): model_final=clone(self.flexible_model_effect, safe=False), discrete_instrument=self.discrete_instrument, discrete_treatment=self.discrete_treatment, + discrete_outcome=self.discrete_outcome, featurizer=self._gen_featurizer(), categories=self.categories, cv=self.prel_cv, @@ -1111,7 +1089,7 @@ def models_y_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_y_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_y_xw.best_model._model for mdl in mdls[0]] for mdls in super().models_nuisance_] @property def models_t_xw(self): @@ -1125,7 +1103,7 @@ def models_t_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. 
""" - return [[mdl._model_t_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xw.best_model._model for mdl in mdls[0]] for mdls in super().models_nuisance_] @property def models_z_xw(self): @@ -1141,7 +1119,7 @@ def models_z_xw(self): """ if self.projection: raise AttributeError("Projection model is fitted for instrument! Use models_t_xwz.") - return [[mdl._model_z_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_z_xw.best_model._model for mdl in mdls[0]] for mdls in super().models_nuisance_] @property def models_t_xwz(self): @@ -1157,7 +1135,7 @@ def models_t_xwz(self): """ if not self.projection: raise AttributeError("Direct model is fitted for instrument! Use models_z_xw.") - return [[mdl._model_t_xwz.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_t_xwz.best_model._model for mdl in mdls[0]] for mdls in super().models_nuisance_] @property def models_tz_xw(self): @@ -1171,7 +1149,7 @@ def models_tz_xw(self): iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._model_tz_xw.best_model._model for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._model_tz_xw.best_model._model for mdl in mdls[1]] for mdls in super().models_nuisance_] @property def models_prel_model_effect(self): @@ -1185,7 +1163,7 @@ def models_prel_model_effect(self): of monte carlo iterations, each element in the sublist corresponds to a crossfitting fold and is the model instance that was fitted for that training fold. """ - return [[mdl._prel_model_effect for mdl in mdls] for mdls in super().models_nuisance_] + return [[mdl._prel_model_effect for mdl in mdls[0]] for mdls in super().models_nuisance_] @property def nuisance_scores_y_xw(self): @@ -1220,16 +1198,16 @@ def nuisance_scores_t_xwz(self): return self.nuisance_scores_[2] @property - def nuisance_scores_tz_xw(self): + def nuisance_scores_prel_model_effect(self): """ - Get the scores for tz_xw model on the out-of-sample training data + Get the scores for prel_model_effect model on the out-of-sample training data """ return self.nuisance_scores_[3] @property - def nuisance_scores_prel_model_effect(self): + def nuisance_scores_tz_xw(self): """ - Get the scores for prel_model_effect model on the out-of-sample training data + Get the scores for tz_xw model on the out-of-sample training data """ return self.nuisance_scores_[4] @@ -1244,83 +1222,51 @@ class LinearDRIV(StatsModelsCateEstimatorMixin, DRIV): Parameters ---------- - model_y_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Y | X, W]`. + model_y_xw: estimator, default ``'auto'`` + Determines how to fit the outcome to the features and controls (:math:`\\E[Y | X, W]`). - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. 
- - If 'auto', model will select over linear and forest models + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + model_t_xw: estimator, default ``'auto'`` + Determines how to fit the treatment to the features and controls (:math:`\\E[T | X, W]`). - model_t_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - model to estimate :math:`\\E[T | X, W]`. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + model_z_xw: estimator, default ``'auto'`` + Determines how to fit the instrument to the features and controls (:math:`\\E[Z | X, W]`). - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - model_z_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Z | X, W]`. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_instrument` is True + and a regressor otherwise - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_t_xwz: estimator, default ``'auto'`` + Determines how to fit the treatment to the features, controls, and instrument (:math:`\\E[T | X, W, Z]`). - - 'linear' - LogisticRegressionCV if discrete_instrument=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_instrument=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_instrument=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise - model_t_xwz : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - model to estimate :math:`\\E[T | X, W, Z]`. 
+ model_tz_xw: estimator, default ``'auto'`` + Determines how to fit the covariance to the features and controls (:math:`\\E[T*Z | X, W]` or + :math:`\\E[\\tilde{T}*\\tilde{Z} | X, W]` depending on `fit_cov_directly`). - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. - - model_tz_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - model to estimate :math:`\\E[T*Z | X, W]` or :math:`\\E[\\tilde{T}*\\tilde{Z} | X, W]` - depending on `fit_cov_directly`. - Target will be discrete if discrete instrument and discrete treatment with `fit_cov_directly=False`, - else target will be continuous. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete target else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete target else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete target. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise fit_cov_directly : bool, default True Whether to fit :math:`\\E[\\tilde{T}*\\tilde{Z} | X, W]` instead of :math:`\\E[T*Z | X, W]`. @@ -1464,19 +1410,19 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-4.29809..., 5.94280..., -3.00977...]) + array([-4.27803..., 5.84999..., -2.98296...]) >>> est.effect_interval(X[:3]) - (array([-7.09165..., 1.79692..., -5.46033...]), - array([-1.50452..., 10.08868..., -0.55922...])) + (array([-7.16141..., 1.71887..., -5.41441...]), + array([-1.39465..., 9.98110..., -0.55151...])) >>> est.coef_ - array([ 4.84900..., 0.82084..., 0.24269..., -0.04771..., -0.29325...]) + array([ 4.65225..., 0.93347..., 0.23315..., 0.22843..., -0.42850...]) >>> est.coef__interval() - (array([ 3.67882..., -0.35547..., -0.97063..., -1.15410..., -1.50482...]), - array([6.01917..., 1.99716..., 1.45603..., 1.05867..., 0.91831...])) + (array([ 3.40045..., -0.19165..., -0.95122..., -0.88662..., -1.56024...]), + array([5.90404..., 2.05861..., 1.41753..., 1.34349..., 0.70324...])) >>> est.intercept_ - -0.16276... + -0.12823... >>> est.intercept__interval() - (-1.32713..., 1.00160...) + (-1.27155..., 1.01508...) 
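As an illustrative aside (not taken from this changeset; the import path and estimator choices are assumptions): under the rewritten parameter docs a single estimator is still used as-is, while a list of candidates triggers selection over them, for example:

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LassoCV
    from econml.iv.dr import LinearDRIV

    est = LinearDRIV(
        model_y_xw=LassoCV(),                             # a single estimator is used as-is
        model_t_xw=[LassoCV(), RandomForestRegressor()],  # a list selects the better candidate
        model_z_xw='auto',                                # default: choose among linear and forest models
    )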
""" def __init__(self, *, @@ -1614,83 +1560,51 @@ class SparseLinearDRIV(DebiasedLassoCateEstimatorMixin, DRIV): Parameters ---------- - model_y_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Y | X, W]`. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_y_xw: estimator, default ``'auto'`` + Determines how to fit the outcome to the features and controls (:math:`\\E[Y | X, W]`). - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - model_t_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - model to estimate :math:`\\E[T | X, W]`. + model_t_xw: estimator, default ``'auto'`` + Determines how to fit the treatment to the features and controls (:math:`\\E[T | X, W]`). - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + model_z_xw: estimator, default ``'auto'`` + Determines how to fit the instrument to the features and controls (:math:`\\E[Z | X, W]`). - model_z_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Z | X, W]`. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_instrument` is True + and a regressor otherwise - - 'linear' - LogisticRegressionCV if discrete_instrument=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_instrument=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. 
- - If 'auto', model will select over linear and forest models + model_t_xwz: estimator, default ``'auto'`` + Determines how to fit the treatment to the features, controls, and instrument (:math:`\\E[T | X, W, Z]`). - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_instrument=True. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - model_t_xwz : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Model to estimate :math:`\\E[T | X, W, Z]`. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_tz_xw: estimator, default ``'auto'`` + Determines how to fit the covariance to the features and controls (:math:`\\E[T*Z | X, W]` or + :math:`\\E[\\tilde{T}*\\tilde{Z} | X, W]` depending on `fit_cov_directly`). - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. - - model_tz_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - model to estimate :math:`\\E[T*Z | X, W]` or :math:`\\E[\\tilde{T}*\\tilde{Z} | X, W]` - depending on `fit_cov_directly`. - Target will be discrete if discrete instrument and discrete treatment with `fit_cov_directly=False`, - else target will be continuous. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete target else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete target else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete target. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise fit_cov_directly : bool, default True Whether to fit :math:`\\E[\\tilde{T}*\\tilde{Z} | X, W]` instead of :math:`\\E[T*Z | X, W]`. @@ -1864,19 +1778,19 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-4.26791..., 5.98882..., -3.02154...]) + array([-4.23929..., 5.89223..., -3.01208...]) >>> est.effect_interval(X[:3]) - (array([-7.06828..., 2.00060..., -5.46554...]), - array([-1.46754..., 9.97704..., -0.57754...])) + (array([-6.99789..., 1.96351..., -5.41963...]), + array([-1.48069..., 9.82096..., -0.60454...])) >>> est.coef_ - array([ 4.84189..., 0.81844... 
, 0.20681..., -0.04660..., -0.28790...]) + array([ 4.65819..., 0.94689..., 0.18314..., 0.23012..., -0.40375...]) >>> est.coef__interval() - (array([ 3.68288..., -0.35434..., -0.98986..., -1.18770..., -1.48722...]), - array([6.00090..., 1.99122..., 1.40349..., 1.09449..., 0.91141...])) + (array([ 3.51647..., -0.20839..., -0.99568..., -0.89394..., -1.58518...]), + array([5.79991..., 2.10218... , 1.36197..., 1.35420... , 0.77767...])) >>> est.intercept_ - -0.12298... + -0.06539... >>> est.intercept__interval() - (-1.28204..., 1.03607...) + (-1.20716..., 1.07637...) """ def __init__(self, *, @@ -2030,83 +1944,51 @@ class ForestDRIV(ForestModelFinalCateEstimatorMixin, DRIV): Parameters ---------- - model_y_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Y | X, W]`. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_y_xw: estimator, default ``'auto'`` + Determines how to fit the outcome to the features and controls (:math:`\\E[Y | X, W]`). - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - model_t_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - model to estimate :math:`\\E[T | X, W]`. + model_t_xw: estimator, default ``'auto'`` + Determines how to fit the treatment to the features and controls (:math:`\\E[T | X, W]`). - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + model_z_xw: estimator, default ``'auto'`` + Determines how to fit the instrument to the features and controls (:math:`\\E[Z | X, W]`). - model_z_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Z | X, W]`. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - If an estimator, will use the model as is for fitting. 
- - If str, will use model associated with the keyword. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_instrument` is True + and a regressor otherwise - - 'linear' - LogisticRegressionCV if discrete_instrument=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_instrument=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + model_t_xwz: estimator, default ``'auto'`` + Determines how to fit the treatment to the features, controls, and instrument (:math:`\\E[T | X, W, Z]`). - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_instrument=True. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - model_t_xwz : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - model to estimate :math:`\\E[T | X, W, Z]`. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_tz_xw: estimator, default ``'auto'`` + Determines how to fit the covariance to the features and controls (:math:`\\E[T*Z | X, W]` + or :math:`\\E[\\tilde{T}*\\tilde{Z} | X, W]` depending on `fit_cov_directly`). - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. - - model_tz_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - model to estimate :math:`\\E[T*Z | X, W]` or :math:`\\E[\\tilde{T}*\\tilde{Z} | X, W]` - depending on `fit_cov_directly`. - Target will be discrete if discrete instrument and discrete treatment with `fit_cov_directly=False`, - else target will be continuous. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete target else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete target else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete target. 
+ - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise fit_cov_directly : bool, default True Whether to fit :math:`\\E[\\tilde{T}*\\tilde{Z} | X, W]` instead of :math:`\\E[T*Z | X, W]`. @@ -2352,10 +2234,10 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-1.60489..., 5.40611..., -3.46904...]) + array([-2.11667..., 6.31903..., -3.65700...]) >>> est.effect_interval(X[:3]) - (array([-5.37171..., 0.73055..., -7.15266...]), - array([ 2.16192..., 10.08168..., 0.21457...])) + (array([-5.53359..., 2.40420..., -7.14977...]), + array([ 1.30025..., 10.23385..., -0.16424...])) """ def __init__(self, *, @@ -2521,12 +2403,12 @@ def __init__(self, self._dummy_z = dummy_z self._prel_model_effect = prel_model_effect - def train(self, is_selecting, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): - self._model_y_xw.train(is_selecting, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) + def train(self, is_selecting, folds, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None): + self._model_y_xw.train(is_selecting, folds, X=X, W=W, Target=Y, sample_weight=sample_weight, groups=groups) # concat W and Z WZ = _combine(W, Z, Y.shape[0]) - self._model_t_xwz.train(is_selecting, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) - self._dummy_z.train(is_selecting, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) + self._model_t_xwz.train(is_selecting, folds, X=X, W=WZ, Target=T, sample_weight=sample_weight, groups=groups) + self._dummy_z.train(is_selecting, folds, X=X, W=W, Target=Z, sample_weight=sample_weight, groups=groups) # we need to undo the one-hot encoding for calling effect, # since it expects raw values self._prel_model_effect.fit(Y, inverse_onehot(T), Z=inverse_onehot(Z), X=X, W=W, @@ -2582,6 +2464,10 @@ def predict(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None) return prel_theta, Y_res, T_res, Z_res, beta + @property + def needs_fit(self): + return self._model_y_xw.needs_fit or self._model_t_xwz.needs_fit or self._dummy_z.needs_fit + class _DummyClassifier: """ @@ -2694,35 +2580,23 @@ class IntentToTreatDRIV(_IntentToTreatDRIV): Parameters ---------- - model_y_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Y | X, W]`. + model_y_xw: estimator, default ``'auto'`` + Determines how to fit the outcome to the features and controls (:math:`\\E[Y | X, W]`). - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. 
+ model_t_xwz: estimator, default ``'auto'`` + Determines how to fit the treatment to the features, controls, and instrument (:math:`\\E[T | X, W, Z]`). - model_t_xwz : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - model to estimate :math:`\\E[T | X, W, Z]`. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise flexible_model_effect : estimator or 'auto' (default is 'auto') a flexible model for a preliminary version of the CATE, must accept sample_weight at fit time. @@ -2852,7 +2726,7 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-3.71724..., 6.39915..., -2.14545...]) + array([-4.52641..., 6.38726..., -2.67055...]) """ def __init__(self, *, @@ -3013,36 +2887,23 @@ class LinearIntentToTreatDRIV(StatsModelsCateEstimatorMixin, IntentToTreatDRIV): Parameters ---------- - model_y_xw : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Y | X, W]`. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models - - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + model_y_xw: estimator, default ``'auto'`` + Determines how to fit the outcome to the features and controls (:math:`\\E[Y | X, W]`). + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - model_t_xwz : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - model to estimate :math:`\\E[T | X, W, Z]`. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_t_xwz: estimator, default ``'auto'`` + Determines how to fit the treatment to the features, controls, and instrument (:math:`\\E[T | X, W, Z]`). 
- - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise flexible_model_effect : estimator or 'auto' (default is 'auto') a flexible model for a preliminary version of the CATE, must accept sample_weight at fit time. @@ -3173,19 +3034,19 @@ def true_heterogeneity_function(X): est.fit(Y=y, T=T, Z=Z, X=X) >>> est.effect(X[:3]) - array([-4.05294..., 6.44603..., -2.49535...]) + array([-4.80489..., 6.10521... , -2.94904...]) >>> est.effect_interval(X[:3]) - (array([-8.42902..., 0.05595..., -6.34202...]), - array([ 0.32313..., 12.83612..., 1.35131...])) + (array([-9.20176..., -0.47031... , -6.67354...]), + array([-0.40802..., 12.68073..., 0.77546...])) >>> est.coef_ - array([ 4.99132..., 0.35043..., 0.41963..., -0.63553..., -0.33972...]) + array([ 5.52418..., 0.96276..., 0.68158..., -0.16803..., -0.13056...]) >>> est.coef__interval() - (array([ 3.11828..., -1.44768..., -1.46377..., -2.36080..., -2.18746...]), - array([6.86435..., 2.14856..., 2.30303..., 1.08973..., 1.50802...])) + (array([ 3.61373..., -0.81856..., -1.12589..., -1.90193... , -1.92331...]), + array([7.43462..., 2.74409... , 2.48906..., 1.56587..., 1.66218...])) >>> est.intercept_ - -0.25633... + -0.28940... >>> est.intercept__interval() - (-2.07961..., 1.56695...) + (-2.07653..., 1.49771...) """ def __init__(self, *, diff --git a/econml/panel/dml/_dml.py b/econml/panel/dml/_dml.py index b44bf0178..900a07ffe 100644 --- a/econml/panel/dml/_dml.py +++ b/econml/panel/dml/_dml.py @@ -45,27 +45,57 @@ def __init__(self, model_y, model_t, n_periods): self._model_t = model_t self.n_periods = n_periods - def train(self, is_selecting, Y, T, X=None, W=None, sample_weight=None, groups=None): + def train(self, is_selecting, folds, Y, T, X=None, W=None, sample_weight=None, groups=None): """Fit a series of nuisance models for each period or period pairs.""" assert Y.shape[0] % self.n_periods == 0, \ "Length of training data should be an integer multiple of time periods." 
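As an illustrative aside (the names and values below are hypothetical, not the module's API): the fold translation introduced just below assumes the panel rows for each unit are contiguous and ordered by period, so a fold expressed in original row indices can be remapped into each period's filtered sub-array:

    import numpy as np

    # three units observed over two periods; rows for each unit are contiguous
    groups = np.array([0, 0, 1, 1, 2, 2])
    # per-period row indices that a period filter would yield for this layout
    period_filters = [np.array([0, 2, 4]), np.array([1, 3, 5])]

    fold_train = np.array([2, 3, 4, 5])                          # fold in original row indices
    inds = fold_train[np.isin(fold_train, period_filters[0])]    # keep only period-0 rows: [2, 4]
    print(np.searchsorted(period_filters[0], inds))              # [1 2]: positions within period-0 data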
period_filters = _get_groups_period_filter(groups, self.n_periods) - if is_selecting: # create the per-period y and t models + if not hasattr(self, '_model_y_trained'): # create the per-period y and t models self._model_y_trained = {t: clone(self._model_y, safe=False) for t in np.arange(self.n_periods)} self._model_t_trained = {j: {t: clone(self._model_t, safe=False) for t in np.arange(j + 1)} for j in np.arange(self.n_periods)} + + # we have to filter the folds because they contain the indices in the original data, not + # the indices in the period-filtered data + + def _translate_inds(t, inds): + # translate the indices in a fold to the indices in the period-filtered data + # if groups was [3,3,4,4,5,5,6,6,1,1,2,2,0,0] (the group ids can be in any order, but the + # time periods for each group should be contiguous), and we had [10,11,0,1] as the indices in a fold + # (so the fold is taking the entries corresponding to groups 2 and 3) + # then group_period_filter(0) is [0,2,4,6,8,10,12] and gpf(1) is [1,3,5,7,9,11,13] + # so for period t=0, the fold should be [10,0] => [5,0] (the positions of entries 10 and 0 in the t=0 data) + # and for period t=1, the fold should be [11,1] => [5,0] again (the positions of entries 11 and 1 in the t=1 data) + + # filter to the indices for the time period + inds = inds[np.isin(inds, period_filters[t])] + + # now find their index in the period-filtered data, which is always sorted + return np.searchsorted(period_filters[t], inds) + + if folds is not None: + translated_folds = [] + for (train, test) in folds: + translated_folds.append((_translate_inds(0, train), _translate_inds(0, test))) + # sanity check that the folds are the same no matter the time period + for t in range(1, self.n_periods): + assert np.array_equal(_translate_inds(t, train), _translate_inds(0, train)) + assert np.array_equal(_translate_inds(t, test), _translate_inds(0, test)) + else: + translated_folds = None + for t in np.arange(self.n_periods): self._model_y_trained[t].train( - is_selecting, + is_selecting, translated_folds, self._index_or_None(X, period_filters[t]), self._index_or_None( W, period_filters[t]), Y[period_filters[self.n_periods - 1]]) for j in np.arange(t, self.n_periods): self._model_t_trained[j][t].train( - is_selecting, + is_selecting, translated_folds, self._index_or_None(X, period_filters[t]), self._index_or_None(W, period_filters[t]), T[period_filters[j]]) @@ -141,6 +171,10 @@ def _get_shape_formatter(self, X, W): def _index_or_None(self, X, filter_idx): return None if X is None else X[filter_idx] + @property + def needs_fit(self): + return self._model_t.needs_fit or self._model_y.needs_fit + class _DynamicModelFinal: """ @@ -344,35 +378,23 @@ class DynamicDML(LinearModelFinalCateEstimatorMixin, _OrthoLearner): Parameters ---------- - model_y: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' - model to estimate :math:`\\E[Y | X, W]`. - - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + model_y: estimator, default ``'auto'`` + Determines how to fit the outcome to the features. - - 'linear' - LogisticRegressionCV if discrete_outcome=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_outcome=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting.
- - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_outcome=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise - model_t: estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' + model_t: estimator, default ``'auto'`` Determines how to fit the treatment to the features. - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV if discrete_treatment=True else WeightedLassoCVWrapper - - 'forest' - RandomForestClassifier if discrete_treatment=True else RandomForestRegressor - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods, - and additionally 'predict_proba' if discrete_treatment=True. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_treatment` is True + and a regressor otherwise featurizer : :term:`transformer`, optional Must support fit_transform and transform. Used to create composite features in the final CATE regression. @@ -451,31 +473,31 @@ class DynamicDML(LinearModelFinalCateEstimatorMixin, _OrthoLearner): est.fit(y, T, X=X, W=None, groups=groups, inference="auto") >>> est.const_marginal_effect(X[:2]) - array([[-0.345..., -0.056..., -0.044..., 0.046..., -0.202..., + array([[-0.363..., -0.049..., -0.044..., 0.042..., -0.202..., 0.023...], - [-0.120..., 0.434..., 0.052..., -0.201..., -0.115..., - -0.134...]]) + [-0.128..., 0.424..., 0.050... , -0.203..., -0.115..., + -0.135...]]) >>> est.effect(X[:2], T0=0, T1=1) - array([-0.579..., -0.085...]) + array([-0.594..., -0.107...]) >>> est.effect(X[:2], T0=np.zeros((2, n_periods*T.shape[1])), T1=np.ones((2, n_periods*T.shape[1]))) - array([-0.579..., -0.085...]) + array([-0.594..., -0.107...]) >>> est.coef_ - array([[ 0.108...], - [ 0.235...], - [ 0.046...], - [-0.119...], - [ 0.042...], - [-0.075...]]) + array([[ 0.112... ], + [ 0.227...], + [ 0.045...], + [-0.118...], + [ 0.041...], + [-0.076...]]) >>> est.coef__interval() - (array([[-0.042...], - [-0.001...], + (array([[-0.060...], + [-0.008...], [-0.120...], - [-0.393...], + [-0.392...], [-0.120...], - [-0.256...]]), array([[0.258...], - [0.473...], - [0.212...], - [0.154...], + [-0.257...]]), array([[0.286...], + [0.463...], + [0.212... ], + [0.156...], [0.204...], [0.104...]])) """ diff --git a/econml/policy/_drlearner.py b/econml/policy/_drlearner.py index 2ee38c158..a3018d197 100644 --- a/econml/policy/_drlearner.py +++ b/econml/policy/_drlearner.py @@ -239,34 +239,22 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc Parameters ---------- - model_propensity : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. 
+ model_propensity: estimator, default ``'auto'`` + Classifier for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV - - 'forest' - RandomForestClassifier - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - Otherwise, see :ref:`model_selection` for the range of supported options - User-supplied estimators should support 'fit' and 'predict', and 'predict_proba'. - - model_regression : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' + model_regression: estimator, default ``'auto'`` Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) concatenated. The one-hot-encoding excludes the baseline treatment. - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV - - 'forest' - RandomForestClassifier - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise featurizer : :term:`transformer`, optional Must support fit_transform and transform. Used to create composite features in the final CATE regression. @@ -651,34 +639,22 @@ class takes as input the parameter ``model_regressor``, which is an arbitrary sc Parameters ---------- - model_propensity : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto', default 'auto' - Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. + model_propensity: estimator, default ``'auto'`` + Classifier for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated. - - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - - 'linear' - LogisticRegressionCV - - 'forest' - RandomForestClassifier - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - Otherwise, see :ref:`model_selection` for the range of supported options - User-supplied estimators should support 'fit' and 'predict', and 'predict_proba'. - - model_regression : estimator, {'linear', 'forest'}, list of str/estimator, or 'auto' + model_regression: estimator, default ``'auto'`` Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments) concatenated. The one-hot-encoding excludes the baseline treatment. 
- - If an estimator, will use the model as is for fitting. - - If str, will use model associated with the keyword. - - - 'linear' - LogisticRegressionCV - - 'forest' - RandomForestClassifier - - If list, will perform model selection on the supplied list, which can be a mix of str and estimators, \ - and then use the best estimator for fitting. - - If 'auto', model will select over linear and forest models + - If ``'auto'``, the model will be the best-fitting of a set of linear and forest models - User-supplied estimators should support 'fit' and 'predict' methods. + - Otherwise, see :ref:`model_selection` for the range of supported options; + if a single model is specified it should be a classifier if `discrete_outcome` is True + and a regressor otherwise featurizer : :term:`transformer`, optional Must support fit_transform and transform. Used to create composite features in the final CATE regression. diff --git a/econml/sklearn_extensions/model_selection.py b/econml/sklearn_extensions/model_selection.py index dda389eaf..5cab620c6 100644 --- a/econml/sklearn_extensions/model_selection.py +++ b/econml/sklearn_extensions/model_selection.py @@ -2,7 +2,10 @@ # Licensed under the MIT License. """Collection of scikit-learn extensions for model selection techniques.""" +from inspect import signature +import inspect import numbers +from typing import List, Optional import warnings import abc @@ -12,7 +15,8 @@ import sklearn from joblib import Parallel, delayed from sklearn.base import BaseEstimator, clone, is_classifier -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.ensemble import (GradientBoostingClassifier, GradientBoostingRegressor, + RandomForestClassifier, RandomForestRegressor) from sklearn.exceptions import FitFailedWarning from sklearn.linear_model import (ElasticNet, ElasticNetCV, Lasso, LassoCV, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLasso, MultiTaskLassoCV, Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV, @@ -23,7 +27,9 @@ # TODO: conisder working around relying on sklearn implementation details from sklearn.model_selection._validation import (_check_is_permutation, _fit_and_predict) -from sklearn.preprocessing import LabelEncoder, StandardScaler +from sklearn.neural_network import MLPClassifier, MLPRegressor +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler from sklearn.utils import check_random_state, indexable from sklearn.utils.multiclass import type_of_target from sklearn.utils.validation import _num_samples @@ -276,9 +282,10 @@ class ModelSelector(metaclass=abc.ABCMeta): """ @abc.abstractmethod - def train(self, is_selecting: bool, *args, **kwargs): + def train(self, is_selecting: bool, folds: Optional[List], *args, **kwargs): """ Either selects a model or fits a model, depending on the value of `is_selecting`. + If `is_selecting` is `False`, then `folds` should not be provided because they are only during selection. 
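# --- Editorial sketch (not part of this patch): a duck-typed illustration of
# the `train`/`predict`/`needs_fit` contract described above, in the spirit of
# the test wrappers later in this diff; `FixedRegressorSelector` is a
# hypothetical name, not a class in the library. ---
from sklearn.base import clone
from sklearn.linear_model import LinearRegression

class FixedRegressorSelector:
    """Wraps a single regressor, so there is nothing to select between."""

    def __init__(self, model=None):
        self.model = model if model is not None else LinearRegression()

    def train(self, is_selecting, folds, X, y, groups=None):
        # `folds` is only meaningful during selection; with a single fixed
        # model, both passes simply fit the wrapped estimator on the data
        self.model = clone(self.model, safe=False).fit(X, y)
        return self

    def predict(self, X, y, groups=None):
        return self.model.predict(X)

    @property
    def needs_fit(self):
        # a single fixed model does not require a separate selection pass
        return False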
""" raise NotImplementedError("Abstract method") @@ -298,6 +305,15 @@ def score(self, *args, **kwargs): """ raise NotImplementedError("Abstract method") + @property + @abc.abstractmethod + def needs_fit(self): + """ + Whether the model selector needs to be fit before it can be used for prediction or scoring; + in many cases this is equivalent to whether the selector is choosing between multiple models + """ + raise NotImplementedError("Abstract method") + class SingleModelSelector(ModelSelector): """ @@ -333,7 +349,7 @@ def score(self, *args, **kwargs): return None -def _fit_with_groups(model, X, y, *, groups, **kwargs): +def _fit_with_groups(model, X, y, *, sub_model=None, groups, **kwargs): """ Fits a model while correctly handling grouping if necessary. @@ -349,8 +365,10 @@ def _fit_with_groups(model, X, y, *, groups, **kwargs): rows that GroupKFold would have generated rather than using GroupKFold as the cv instance. """ if groups is not None: - if hasattr(model, 'cv'): - old_cv = model.cv + if sub_model is None: + sub_model = model + if hasattr(sub_model, 'cv'): + old_cv = sub_model.cv # logic copied from check_cv cv = 5 if old_cv is None else old_cv if isinstance(cv, numbers.Integral): @@ -360,10 +378,10 @@ def _fit_with_groups(model, X, y, *, groups, **kwargs): splits = list(cv.split(X, y, groups=groups)) try: - model.cv = splits + sub_model.cv = splits return model.fit(X, y, **kwargs) # drop groups from arg list finally: - model.cv = old_cv + sub_model.cv = old_cv # drop groups from arg list, which were already used at the outer level and may not be supported by the model return model.fit(X, y, **kwargs) @@ -371,19 +389,31 @@ def _fit_with_groups(model, X, y, *, groups, **kwargs): class FixedModelSelector(SingleModelSelector): """ - Model selection class that always selects the given model + Model selection class that always selects the given sklearn-compatible model """ def __init__(self, model): self.model = clone(model, safe=False) - def train(self, is_selecting, *args, groups=None, **kwargs): - # whether selecting or not, need to train the model on the data - _fit_with_groups(self.model, *args, groups=groups, **kwargs) - if is_selecting and hasattr(self.model, 'score'): - # TODO: we need to alter this to use out-of-sample score here, which - # will require cross-validation, but should respect grouping, stratifying, etc. 
- self._score = self.model.score(*args, **kwargs) + def train(self, is_selecting, folds: Optional[List], X, y, groups=None, **kwargs): + if is_selecting: + # since needs_fit is False, is_selecting will only be true if + # the score needs to be compared to another model's + # so we don't need to fit the model itself, just get the out-of-sample score + assert hasattr(self.model, 'score'), (f"Can't select between a fixed {type(self.model)} model and others " + "because it doesn't have a score method") + scores = [] + for train, test in folds: + # use _fit_with_groups instead of just fit to handle nested grouping + _fit_with_groups(self.model, X[train], y[train], + groups=None if groups is None else groups[train], + **{key: val[train] for key, val in kwargs.items()}) + scores.append(self.model.score(X[test], y[test])) + self._score = np.mean(scores) + else: + # we need to train the model on the data + _fit_with_groups(self.model, X, y, groups=groups, **kwargs) + return self @property @@ -394,22 +424,22 @@ def best_model(self): def best_score(self): return self._score + @property + def needs_fit(self): + return False # We have only a single model so we can skip the selection process + def _copy_to(m1, m2, attrs, insert_underscore=False): for attr in attrs: setattr(m2, attr, getattr(m1, attr + "_" if insert_underscore else attr)) -def _convert_linear_model(model, new_cls, extra_attrs=[]): +def _convert_linear_model(model, new_cls): new_model = new_cls() # copy common parameters - _copy_to(model, new_model, ["fit_intercept", "max_iter", - "tol", - "random_state"]) + _copy_to(model, new_model, ["fit_intercept"]) # copy common fitted variables - _copy_to(model, new_model, ["coef_", "intercept_", "n_features_in_", "n_iter_"]) - # copy attributes unique to this class - _copy_to(model, new_model, extra_attrs) + _copy_to(model, new_model, ["coef_", "intercept_", "n_features_in_"]) return new_model @@ -418,7 +448,8 @@ def _to_logisticRegression(model: LogisticRegressionCV): _copy_to(model, lr, ["penalty", "dual", "intercept_scaling", "class_weight", "solver", "multi_class", - "verbose", "n_jobs"]) + "verbose", "n_jobs", + "tol", "max_iter", "random_state", "n_iter_"]) _copy_to(model, lr, ["classes_"]) _copy_to(model, lr, ["C", "l1_ratio"], True) # these are arrays in LogisticRegressionCV, need to convert them next @@ -435,21 +466,26 @@ def _to_logisticRegression(model: LogisticRegressionCV): def _convert_linear_regression(model, new_cls, extra_attrs=["positive"]): - new_model = _convert_linear_model(model, new_cls, ["copy_X", - "n_iter_"]) + new_model = _convert_linear_model(model, new_cls) _copy_to(model, new_model, ["alpha"], True) return new_model -def _to_elasticNet(model: ElasticNetCV, is_lasso=False, cls=None, extra_attrs=[]): +def _to_elasticNet(model: ElasticNetCV, args, kwargs, is_lasso=False, cls=None, extra_attrs=[]): + # We need an R^2 score to compare to other models; ElasticNetCV doesn't provide it, + # but we can calculate it ourselves from the MSE plus the variance of the target y + y = signature(model.fit).bind(*args, **kwargs).arguments["y"] cls = cls or (Lasso if is_lasso else ElasticNet) - new_model = _convert_linear_regression(model, cls, extra_attrs + ['selection', 'warm_start', - 'dual_gap_']) + new_model = _convert_linear_regression(model, cls, extra_attrs + ['selection', 'warm_start', 'dual_gap_', + 'tol', 'max_iter', 'random_state', 'n_iter_', + 'copy_X']) if not is_lasso: # l1 ratio doesn't apply to Lasso, only ElasticNet _copy_to(model, new_model, ["l1_ratio"], True) - 
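# --- Editorial check (not part of this patch): the replacement score computed
# just below relies on the identity R^2 = 1 - MSE / Var(y), which holds when
# the variance uses the population (ddof=0) convention that np.var defaults to;
# a quick stand-alone verification against sklearn's r2_score follows. In the
# CV setting the MSE is fold-averaged, so the resulting number is an
# out-of-sample analogue of this quantity. ---
import numpy as np
from sklearn.metrics import r2_score

rng = np.random.default_rng(0)
y_true = rng.normal(size=100)
y_pred = y_true + rng.normal(scale=0.3, size=100)
mse = np.mean((y_true - y_pred) ** 2)
assert np.isclose(1 - mse / np.var(y_true), r2_score(y_true, y_pred))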
max_score = np.max(np.mean(model.mse_path_, axis=-1)) # last dimension in mse_path is folds, so average over that - return new_model, max_score + # max R^2 corresponds to min MSE + min_mse = np.min(np.mean(model.mse_path_, axis=-1)) # last dimension in mse_path is folds, so average over that + r2 = 1 - min_mse / np.var(y) # R^2 = 1 - MSE / Var(y) + return new_model, r2 def _to_ridge(model, cls=Ridge, extra_attrs=["positive"]): @@ -472,38 +508,66 @@ def convertible_types(): @staticmethod def can_wrap(model): + if isinstance(model, Pipeline): + return SklearnCVSelector.can_wrap(model.steps[-1][1]) return any(isinstance(model, model_type) for model_type in SklearnCVSelector.convertible_types()) @staticmethod def _model_mapping(): - return {LogisticRegressionCV: _to_logisticRegression, - ElasticNetCV: _to_elasticNet, - LassoCV: lambda model: _to_elasticNet(model, True, None, ["positive"]), - RidgeCV: _to_ridge, - RidgeClassifierCV: lambda model: _to_ridge(model, RidgeClassifier, ["positive", "class_weight", - "_label_binarizer"]), - MultiTaskElasticNetCV: lambda model: _to_elasticNet(model, False, MultiTaskElasticNet, extra_attrs=[]), - MultiTaskLassoCV: lambda model: _to_elasticNet(model, True, MultiTaskLasso, extra_attrs=[]), - WeightedLassoCVWrapper: lambda model: _to_elasticNet(model, True, WeightedLassoWrapper, - extra_attrs=[]), + return {LogisticRegressionCV: lambda model, _args, _kwargs: _to_logisticRegression(model), + ElasticNetCV: lambda model, args, kwargs: _to_elasticNet(model, args, kwargs), + LassoCV: lambda model, args, kwargs: _to_elasticNet(model, args, kwargs, True, None, ["positive"]), + RidgeCV: lambda model, _args, _kwargs: _to_ridge(model), + RidgeClassifierCV: lambda model, _args, _kwargs: _to_ridge(model, RidgeClassifier, + ["positive", "class_weight", + "_label_binarizer"]), + MultiTaskElasticNetCV: lambda model, args, kwargs: _to_elasticNet(model, args, kwargs, + False, MultiTaskElasticNet, + extra_attrs=[]), + MultiTaskLassoCV: lambda model, args, kwargs: _to_elasticNet(model, args, kwargs, + True, MultiTaskLasso, extra_attrs=[]), + WeightedLassoCVWrapper: lambda model, args, kwargs: _to_elasticNet(model, args, kwargs, + True, WeightedLassoWrapper, + extra_attrs=[]), } - def train(self, is_selecting: bool, *args, groups=None, **kwargs): + @staticmethod + def _convert_model(model, args, kwargs): + if isinstance(model, Pipeline): + name, inner_model = model.steps[-1] + best_model, score = SklearnCVSelector._convert_model(inner_model, args, kwargs) + return Pipeline(steps=[*model.steps[:-1], (name, best_model)]), score + + if isinstance(model, GridSearchCV) or isinstance(model, RandomizedSearchCV): + return model.best_estimator_, model.best_score_ + + for known_type in SklearnCVSelector._model_mapping().keys(): + if isinstance(model, known_type): + converter = SklearnCVSelector._model_mapping()[known_type] + return converter(model, args, kwargs) + + def train(self, is_selecting: bool, folds: Optional[List], *args, groups=None, **kwargs): if is_selecting: - _fit_with_groups(self.searcher, *args, groups=groups, **kwargs) + sub_model = self.searcher + if isinstance(self.searcher, Pipeline): + sub_model = self.searcher.steps[-1][1] + + init_params = inspect.signature(sub_model.__init__).parameters + if 'cv' in init_params: + default_cv = init_params['cv'].default + else: + # constructor takes cv as a positional or kwarg, just pull it out of a new instance + default_cv = type(sub_model)().cv - if isinstance(self.searcher, GridSearchCV) or isinstance(self.searcher, 
RandomizedSearchCV): - self._best_model = self.searcher.best_estimator_ - self._best_score = self.searcher.best_score_ + if sub_model.cv != default_cv: + warnings.warn(f"Model {sub_model} has a non-default cv attribute, which will be ignored") + sub_model.cv = folds - for known_type in self._model_mapping().keys(): - if isinstance(self.searcher, known_type): - converter = self._model_mapping()[known_type] - self._best_model, self._best_score = converter(self.searcher) - return self + self.searcher.fit(*args, **kwargs) + + self._best_model, self._best_score = self._convert_model(self.searcher, args, kwargs) else: - # don't need to use _fit_with_groups here since none of these models support it self.best_model.fit(*args, **kwargs) return self @@ -515,6 +579,11 @@ def best_model(self): def best_score(self): return self._best_score + @property + def needs_fit(self): + return True # strictly speaking, could be false if the hyperparameters are fixed + # but it would be complicated to check that + class ListSelector(SingleModelSelector): """ @@ -532,18 +601,19 @@ def __init__(self, models, unwrap=True): self.models = [clone(model, safe=False) for model in models] self.unwrap = unwrap - def train(self, is_selecting, *args, **kwargs): + def train(self, is_selecting, folds: Optional[List], *args, **kwargs): + assert len(self.models) > 0, "ListSelector must have at least one model" if is_selecting: scores = [] for model in self.models: - model.train(is_selecting, *args, **kwargs) + model.train(is_selecting, folds, *args, **kwargs) scores.append(model.best_score) self._all_scores = scores self._best_score = np.max(scores) self._best_model = self.models[np.argmax(scores)] else: - self._best_model.train(is_selecting, *args, **kwargs) + self._best_model.train(is_selecting, folds, *args, **kwargs) @property def best_model(self): @@ -557,14 +627,31 @@ def best_model(self): def best_score(self): return self._best_score + @property + def needs_fit(self): + # technically, if there is just one model and it doesn't need to be fit we don't need to fit it, + # but that complicates the training logic so we don't bother with that case + return True + def get_selector(input, is_discrete, *, random_state=None, cv=None, wrapper=GridSearchCV): named_models = { 'linear': (LogisticRegressionCV(random_state=random_state, cv=cv) if is_discrete else WeightedLassoCVWrapper(random_state=random_state, cv=cv)), + 'poly': ([make_pipeline(PolynomialFeatures(d), + (LogisticRegressionCV(random_state=random_state, cv=cv) if is_discrete + else WeightedLassoCVWrapper(random_state=random_state, cv=cv))) + for d in range(1, 4)]), 'forest': (GridSearchCV(RandomForestClassifier(random_state=random_state) if is_discrete else RandomForestRegressor(random_state=random_state), param_grid={}, cv=cv)), + 'gbf': (GridSearchCV(GradientBoostingClassifier(random_state=random_state) if is_discrete + else GradientBoostingRegressor(random_state=random_state), + param_grid={}, cv=cv)), + 'nnet': (GridSearchCV(MLPClassifier(random_state=random_state) if is_discrete + else MLPRegressor(random_state=random_state), + param_grid={}, cv=cv)), + 'automl': ["poly", "forest", "gbf", "nnet"], } if isinstance(input, ModelSelector): # we've already got a model selector, don't need to do anything return input diff --git a/econml/tests/test_cate_interpreter.py b/econml/tests/test_cate_interpreter.py index 13af13a5b..eff4393cc 100644 --- a/econml/tests/test_cate_interpreter.py +++ b/econml/tests/test_cate_interpreter.py @@ -4,6 +4,7 @@ import numpy as np import 
unittest import pytest +import matplotlib from econml.cate_interpreter import SingleTreeCateInterpreter, SingleTreePolicyInterpreter from econml.dml import LinearDML from sklearn.linear_model import LinearRegression, LogisticRegression @@ -16,7 +17,6 @@ except Exception: graphviz_works = False -import matplotlib matplotlib.use('Agg') diff --git a/econml/tests/test_missing_values.py b/econml/tests/test_missing_values.py index e59c8811c..8784e222e 100644 --- a/econml/tests/test_missing_values.py +++ b/econml/tests/test_missing_values.py @@ -27,7 +27,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def train(self, is_selecting, Y, T, W=None): + def train(self, is_selecting, folds, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -35,6 +35,10 @@ def train(self, is_selecting, Y, T, W=None): def predict(self, Y, T, W=None): return Y - self._model_y.predict(W), T - self._model_t.predict(W) + @property + def needs_fit(self): + return False + class ModelFinal: diff --git a/econml/tests/test_model_selection.py b/econml/tests/test_model_selection.py new file mode 100644 index 000000000..efb24f64f --- /dev/null +++ b/econml/tests/test_model_selection.py @@ -0,0 +1,135 @@ +# Copyright (c) PyWhy contributors. All rights reserved. +# Licensed under the MIT License. + +import unittest +import numpy as np +from scipy.special import expit +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import (ElasticNetCV, Lasso, LassoCV, LinearRegression, LogisticRegression, + LogisticRegressionCV, MultiTaskElasticNetCV, MultiTaskLassoCV, + RidgeCV, RidgeClassifierCV) +from sklearn.model_selection import GridSearchCV, RandomizedSearchCV +from sklearn.preprocessing import PolynomialFeatures +from econml.dml import LinearDML +from econml.sklearn_extensions.linear_model import WeightedLassoCVWrapper + + +class TestModelSelection(unittest.TestCase): + + def _simple_dgp(self, n, d_x, d_w, discrete_treatment): + n = 500 # keep the data set small since we're testing a lot of models and don't care about the results + X = np.random.normal(size=(n, d_x)) + W = np.random.normal(size=(n, d_w)) + alpha = np.random.normal(size=(X.shape[1])) + n_f = d_w + d_x + beta = np.random.normal(size=(n_f,)) + gamma = np.random.normal(size=(n_f,)) + XW = np.hstack([X, W]) + if discrete_treatment: + T = np.random.binomial(1, expit(XW @ beta)) + else: + T = XW @ beta + np.random.normal(size=(n,)) + Y = (X @ alpha) * T + XW @ gamma + np.random.normal(size=(n,)) + return Y, T, X, W + + def test_poly(self): + # tests that we can recover the right degree of polynomial features + # implicitly also tests ability to handle pipelines + # since 'poly' uses pipelines containing PolynomialFeatures + n = 5000 + X = np.random.normal(size=(n, 2)) + W = np.random.normal(size=(n, 3)) + + for true_d in range(1, 4): + with self.subTest(true_d=true_d): + pf = PolynomialFeatures(degree=true_d) + + fts = pf.fit_transform(np.hstack([X, W])) + k = fts.shape[1] + m = X.shape[1] + W.shape[1] + + alpha_x = np.random.normal(size=(X.shape[1],)) + alpha_1 = np.random.normal(size=()) + beta = np.random.normal(size=(k,)) + gamma = np.random.normal(size=(k,)) + + # generate larger coefficients in a set of high degree features, + # weighted towards higher degree features + ft_inds_beta = np.random.choice(k, size=m, replace=False, p=np.arange(k) / np.sum(np.arange(k))) + ft_inds_gamma = np.random.choice(k, size=m, replace=False, p=np.arange(k) / np.sum(np.arange(k))) + + 
beta[ft_inds_beta] = 10 * np.random.normal(1, size=(m,)) + gamma[ft_inds_gamma] = 10 * np.random.normal(1, size=(m,)) + + t = np.random.normal(size=(n,)) + fts @ beta + np.random.normal(scale=0.5, size=(n,)) + y = np.random.normal(size=(n,)) + t * (alpha_1 + X @ alpha_x) + fts @ gamma + + # just test a polynomial T model, since for Y the correct degree also depends on + # the interation of T and X + mdl = LinearDML(model_t='poly', + model_y=LinearRegression()).fit(y, t, X=X, W=W) + for t in mdl.models_t[0]: + self.assertEqual(t[0].degree, true_d) + + def test_all_strings(self): + for discrete_treatment in [True, False]: + Y, T, X, W = self._simple_dgp(500, 2, 3, discrete_treatment) + for model_t in ['auto', 'linear', 'poly', 'forest', 'gbf', 'nnet', 'automl']: + with self.subTest(model_t=model_t, discrete_treatment=discrete_treatment): + mdl = LinearDML(model_t=model_t, + discrete_treatment=discrete_treatment, + model_y=LinearRegression()) + mdl.fit(Y, T, X=X, W=W) + + model_t = 'some_random_string' + with self.subTest(model_t=model_t, discrete_treatment=True): + mdl = LinearDML(model_t=model_t, + discrete_treatment=discrete_treatment, + model_y=LinearRegression()) + with self.assertRaises(ValueError): + mdl.fit(Y, T, X=X, W=W) + + def test_list_selection(self): + Y, T, X, W = self._simple_dgp(500, 2, 3, False) + + # test corner case with just one model in a list + mdl = LinearDML(model_t=[LinearRegression()], + model_y=LinearRegression()) + mdl.fit(Y, T, X=X, W=W) + + # test corner case with empty list + with self.assertRaises(Exception): + mdl = LinearDML(model_t=[], + model_y=LinearRegression()) + mdl.fit(Y, T, X=X, W=W) + + # test selecting between two fixed models + mdl = LinearDML(model_t=[LinearRegression(), RandomForestRegressor()], + model_y=LinearRegression()) + mdl.fit(Y, T, X=X, W=W) + # DGP is a linear model, so linear regression should fit better + assert isinstance(mdl.models_t[0][0], LinearRegression) + + T2 = T + 10 * (X[:, 1] > 0) # add a non-linear effect + mdl.fit(Y, T2, X=X, W=W) + # DGP is now non-linear, so random forest should fit better + assert isinstance(mdl.models_t[0][0], RandomForestRegressor) + + def test_sklearn_model_selection(self): + for is_discrete, mdls in [(True, [LogisticRegressionCV(), RidgeClassifierCV(), + GridSearchCV(LogisticRegression(), {'C': [1, 10]}), + RandomizedSearchCV(LogisticRegression(), {'C': [1, 10]})]), + (False, [ElasticNetCV(), LassoCV(), RidgeCV(), + MultiTaskElasticNetCV(), MultiTaskLassoCV(), WeightedLassoCVWrapper(), + GridSearchCV(Lasso(), {'alpha': [0.1, 1]}), + RandomizedSearchCV(Lasso(), {'alpha': [0.1, 1]})])]: + Y, T, X, W = self._simple_dgp(500, 2, 3, is_discrete) + T2 = np.tile(T.reshape(-1, 1), (1, 2)) # multi-column T + for mdl in mdls: + # these models only work on multi-output data + use_array = isinstance(mdl, (MultiTaskElasticNetCV, MultiTaskLassoCV)) + with self.subTest(model=mdl): + est = LinearDML(model_t=mdl, + discrete_treatment=is_discrete, + model_y=LinearRegression()) + est.fit(Y, T2 if use_array else T, X=X, W=W) diff --git a/econml/tests/test_ortho_learner.py b/econml/tests/test_ortho_learner.py index 0c65358ea..fa8c3a7f4 100644 --- a/econml/tests/test_ortho_learner.py +++ b/econml/tests/test_ortho_learner.py @@ -3,7 +3,6 @@ from sklearn.datasets import make_regression from econml._ortho_learner import _OrthoLearner, _crossfit -from sklearn.linear_model import LinearRegression from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LinearRegression, LassoCV, Lasso from 
sklearn.model_selection import KFold @@ -28,7 +27,7 @@ class Wrapper: def __init__(self, model): self._model = model - def train(self, is_selecting, X, y, Q, W=None): + def train(self, is_selecting, folds, X, y, Q, W=None): self._model.fit(X, y) return self @@ -38,6 +37,10 @@ def predict(self, X, y, Q, W=None): def score(self, X, y, Q, W=None): return self._model.score(X, y) + @property + def needs_fit(self): + return False + np.random.seed(123) X = np.random.normal(size=(5000, 3)) y = X[:, 0] + np.random.normal(size=(5000,)) @@ -108,13 +111,17 @@ class Wrapper: def __init__(self, model): self._model = model - def train(self, is_selecting, X, y, W=None): + def train(self, is_selecting, folds, X, y, W=None): self._model.fit(X, y) return self def predict(self, X, y, W=None): return self._model.predict(X), y - self._model.predict(X), X + @property + def needs_fit(self): + return False + np.random.seed(123) X = np.random.normal(size=(5000, 3)) y = X[:, 0] + np.random.normal(size=(5000,)) @@ -178,7 +185,7 @@ class Wrapper: def __init__(self, model): self._model = model - def train(self, is_selecting, X, y, Q, W=None): + def train(self, is_selecting, folds, X, y, Q, W=None): self._model.fit(X, y) return self @@ -188,6 +195,10 @@ def predict(self, X, y, Q, W=None): def score(self, X, y, Q, W=None): return self._model.score(X, y) + @property + def needs_fit(self): + return False + # Generate synthetic data X, y = make_regression(n_samples=10, n_features=5, noise=0.1, random_state=42) folds = list(KFold(2).split(X, y)) @@ -218,7 +229,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def train(self, is_selecting, Y, T, W=None): + def train(self, is_selecting, folds, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -226,6 +237,10 @@ def train(self, is_selecting, Y, T, W=None): def predict(self, Y, T, W=None): return Y - self._model_y.predict(W), T - self._model_t.predict(W) + @property + def needs_fit(self): + return False + class ModelFinal: def __init__(self): @@ -330,7 +345,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def train(self, is_selecting, Y, T, W=None): + def train(self, is_selecting, folds, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -338,6 +353,10 @@ def train(self, is_selecting, Y, T, W=None): def predict(self, Y, T, W=None): return Y - self._model_y.predict(W), T - self._model_t.predict(W) + @property + def needs_fit(self): + return False + class ModelFinal: def __init__(self): @@ -378,7 +397,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def train(self, is_selecting, Y, T, W=None): + def train(self, is_selecting, folds, Y, T, W=None): self._model_t.fit(W, T) self._model_y.fit(W, Y) return self @@ -389,6 +408,10 @@ def predict(self, Y, T, W=None): def score(self, Y, T, W=None): return (self._model_t.score(W, Y), self._model_y.score(W, T)) + @property + def needs_fit(self): + return False + class ModelFinal: def __init__(self): @@ -435,7 +458,7 @@ def __init__(self, model_t, model_y): self._model_t = model_t self._model_y = model_y - def train(self, is_selecting, Y, T, W=None): + def train(self, is_selecting, folds, Y, T, W=None): self._model_t.fit(W, np.matmul(T, np.arange(1, T.shape[1] + 1))) self._model_y.fit(W, Y) return self @@ -443,6 +466,10 @@ def train(self, is_selecting, Y, T, W=None): def predict(self, Y, T, W=None): return Y - self._model_y.predict(W), T - 
self._model_t.predict_proba(W)[:, 1:] + @property + def needs_fit(self): + return False + class ModelFinal: def __init__(self): diff --git a/econml/tests/test_shap.py b/econml/tests/test_shap.py index 6c524a536..fecbb202c 100644 --- a/econml/tests/test_shap.py +++ b/econml/tests/test_shap.py @@ -4,10 +4,10 @@ import numpy as np import unittest import shap -from econml.dml import * -from econml.orf import * -from econml.dr import * -from econml.metalearners import * +from econml.dml import LinearDML, CausalForestDML, NonParamDML +from econml.orf import DMLOrthoForest, DROrthoForest +from econml.dr import DRLearner, ForestDRLearner +from econml.metalearners import TLearner, SLearner, XLearner, DomainAdaptationLearner from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.preprocessing import PolynomialFeatures diff --git a/econml/tests/test_statsmodels.py b/econml/tests/test_statsmodels.py index 958d538c4..db667e18e 100644 --- a/econml/tests/test_statsmodels.py +++ b/econml/tests/test_statsmodels.py @@ -416,7 +416,7 @@ def test_inference(self): "{}, {}".format(est.coef__interval()[1][t], np.array([scipy.stats.norm.ppf(.975, loc=1, scale=1)] + [scipy.stats.norm.ppf(.975, loc=0, scale=1)] * (d - 1))) - assert np.all(np.abs(est.intercept_[t]) <= 1e-12), "{}, {}".format(est.intercept_[t]) + assert np.all(np.abs(est.intercept_[t]) <= 1e-12), "{}".format(est.intercept_[t]) assert np.all(np.abs(est.intercept_stderr_[t]) <= 1e-12), "{}".format(est.intercept_stderr_[t]) assert np.all(np.abs(est.intercept__interval()[0][t]) <= 1e-12), "{}".format(est.intercept__interval()[0][t]) @@ -446,7 +446,7 @@ def test_inference(self): "{}, {}".format(est.coef__interval()[1][t], np.array([scipy.stats.norm.ppf(.975, loc=1, scale=np.sqrt(2))] + [scipy.stats.norm.ppf(.975, loc=0, scale=np.sqrt(2))] * (d - 1))) - assert np.all(np.abs(est.intercept_[t]) <= 1e-12), "{}, {}".format(est.intercept_[t]) + assert np.all(np.abs(est.intercept_[t]) <= 1e-12), "{}".format(est.intercept_[t]) assert np.all(np.abs(est.intercept_stderr_[t] - 1) <= 1e-12), "{}".format(est.intercept_stderr_[t]) assert np.all(np.abs(est.intercept__interval()[0][t] - scipy.stats.norm.ppf(.025, loc=0, scale=1)) <= 1e-12), \ diff --git a/monte_carlo_tests/monte_carlo_statsmodels.py b/monte_carlo_tests/monte_carlo_statsmodels.py index 1d6301cc5..9c061e3f8 100644 --- a/monte_carlo_tests/monte_carlo_statsmodels.py +++ b/monte_carlo_tests/monte_carlo_statsmodels.py @@ -14,8 +14,6 @@ import warnings import joblib from sklearn.model_selection import GridSearchCV -from statsmodels.tools.tools import add_constant -from econml.utilities import cross_product from sklearn.multioutput import MultiOutputRegressor @@ -251,7 +249,7 @@ def true_effect(x, t): (hetero_coef * X[:, [0]] + 1) * np.random.normal(0, 1, size=(n, p)) XT = np.hstack([X, T]) - X1, X2, y1, y2, X_final_first, X_final_sec, y_sum_first, y_sum_sec,\ + X1, X2, y1, y2, X_final_first, X_final_sec, y_sum_first, y_sum_sec, \ n_sum_first, n_sum_sec, var_first, var_sec = _summarize(XT, y) X = np.vstack([X1, X2]) y = np.concatenate((y1, y2)) @@ -420,7 +418,8 @@ def first_stage(): min_samples_leaf=10, random_state=123), MultiOutputRegressor(GradientBoostingRegressor(n_estimators=20, max_depth=3, - min_samples_leaf=10, random_state=123))], + min_samples_leaf=10, + random_state=123))], param_grid_list=[{}, {}, {},