From 97033ab2dff6a737bcb4f595244b0aae0d6cbf35 Mon Sep 17 00:00:00 2001 From: Adrian Seyboldt Date: Thu, 16 Sep 2021 18:25:50 +0200 Subject: [PATCH 01/12] Fix issue in hmc gradient storage --- pymc3/step_methods/hmc/nuts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pymc3/step_methods/hmc/nuts.py b/pymc3/step_methods/hmc/nuts.py index 267c20659fa..e2a09eaa99a 100644 --- a/pymc3/step_methods/hmc/nuts.py +++ b/pymc3/step_methods/hmc/nuts.py @@ -254,7 +254,7 @@ def __init__(self, ndim, integrator, start, step_size, Emax): self.left = self.right = start self.proposal = Proposal( - start.q.data, start.q_grad.data, start.energy, 1.0, start.model_logp + start.q.data, start.q_grad, start.energy, 1.0, start.model_logp ) self.depth = 0 self.log_size = 0 @@ -350,7 +350,7 @@ def _single_step(self, left, epsilon): log_size = -energy_change proposal = Proposal( right.q.data, - right.q_grad.data, + right.q_grad, right.energy, log_p_accept_weighted, right.model_logp, From 71db94dd8217cd31e68a5e826bf2828c4b3f2454 Mon Sep 17 00:00:00 2001 From: Adrian Seyboldt Date: Thu, 16 Sep 2021 18:26:57 +0200 Subject: [PATCH 02/12] Skip first samples during NUTS adaptation --- pymc3/sampling.py | 20 +++++ pymc3/step_methods/hmc/quadpotential.py | 103 +++++++++++++++++++++++- 2 files changed, 120 insertions(+), 3 deletions(-) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index e8131ce642f..b9ae3d3e5f8 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -516,6 +516,7 @@ def sample( random_seed=random_seed, progressbar=progressbar, jitter_max_retries=jitter_max_retries, + tune=tune, **kwargs, ) if start is None: @@ -2078,6 +2079,7 @@ def init_nuts( random_seed=None, progressbar=True, jitter_max_retries=10, + tune=None, **kwargs, ): """Set up the mass matrix initialization for NUTS. 
@@ -2174,6 +2176,24 @@ def init_nuts( var = np.ones_like(mean) n = len(var) potential = quadpotential.QuadPotentialDiagAdapt(n, mean, var, 10) + elif init == "jitter+adapt_diag_grad": + start = _init_jitter(model, model.initial_point, chains, jitter_max_retries) + mean = np.mean([DictToArrayBijection.map(vals).data for vals in start], axis=0) + var = np.ones_like(mean) + n = len(var) + + if tune is not None and tune > 200: + stop_adaptation = tune - 50 + else: + stop_adaptation = None + + potential = quadpotential.QuadPotentialDiagAdaptExp( + n, + mean, + alpha=0.02, + use_grads=True, + stop_adaptation=stop_adaptation, + ) elif init == "advi+adapt_diag_grad": approx: pm.MeanField = pm.fit( random_seed=random_seed, diff --git a/pymc3/step_methods/hmc/quadpotential.py b/pymc3/step_methods/hmc/quadpotential.py index 2bb2f196c50..f29948b5bce 100644 --- a/pymc3/step_methods/hmc/quadpotential.py +++ b/pymc3/step_methods/hmc/quadpotential.py @@ -154,6 +154,10 @@ def __init__( adaptation_window=101, adaptation_window_multiplier=1, dtype=None, + discard_window=50, + initial_weights=None, + early_update=False, + store_mass_matrix_trace=False, ): """Set up a diagonal mass matrix.""" if initial_diag is not None and initial_diag.ndim != 1: @@ -175,12 +179,20 @@ def __init__( self.dtype = dtype self._n = n + self._discard_window = discard_window + self._early_update = early_update + self._initial_mean = initial_mean self._initial_diag = initial_diag self._initial_weight = initial_weight self.adaptation_window = adaptation_window self.adaptation_window_multiplier = float(adaptation_window_multiplier) + self._store_mass_matrix_trace = store_mass_matrix_trace + self._mass_trace = [] + + self._initial_weights = initial_weights + self.reset() def reset(self): @@ -222,12 +234,18 @@ def _update_from_weightvar(self, weightvar): def update(self, sample, grad, tune): """Inform the potential about a new sample during tuning.""" + if self._store_mass_matrix_trace: + self._mass_trace.append(self._stds.copy()) + if not tune: return - self._foreground_var.add_sample(sample, weight=1) - self._background_var.add_sample(sample, weight=1) - self._update_from_weightvar(self._foreground_var) + if self._n_samples > self._discard_window: + self._foreground_var.add_sample(sample, weight=1) + self._background_var.add_sample(sample, weight=1) + + if self._early_update or self._n_samples > self.adaptation_window: + self._update_from_weightvar(self._foreground_var) if self._n_samples > 0 and self._n_samples % self.adaptation_window == 0: self._foreground_var = self._background_var @@ -342,6 +360,8 @@ def __init__( def add_sample(self, x, weight): x = np.asarray(x) + if weight != 1: + raise ValueError("weight is unused and broken") self.n_samples += 1 old_diff = x - self.mean self.mean[:] += old_diff / self.n_samples @@ -360,6 +380,83 @@ def current_mean(self): return self.mean.copy(dtype=self._dtype) +class _ExpWeightedVariance: + def __init__(self, n_vars, *, init_mean, init_var, alpha): + self._variance = init_var + self._mean = init_mean + self._alpha = alpha + + def add_sample(self, value): + alpha = self._alpha + delta = value - self._mean + self._mean += alpha * delta + self._variance = (1 - alpha) * (self._variance + alpha * delta ** 2) + + def current_variance(self, out=None): + if out is None: + out = np.empty_like(self._variance) + np.copyto(out, self._variance) + return out + + def current_mean(self, out=None): + if out is None: + out = np.empty_like(self._mean) + np.copyto(out, self._mean) + return out + + +class 
QuadPotentialDiagAdaptExp(QuadPotentialDiagAdapt): + def __init__(self, *args, alpha, use_grads=False, stop_adaptation=None, **kwargs): + super().__init__(*args, **kwargs) + self._alpha = alpha + self._use_grads = use_grads + self._stop_adaptation = stop_adaptation + + def update(self, sample, grad, tune): + if tune and self._n_samples < self._stop_adaptation: + if self._n_samples > self._discard_window: + self._variance_estimator.add_sample(sample) + if self._use_grads: + self._variance_estimator_grad.add_sample(grad) + elif self._n_samples == self._discard_window: + self._variance_estimator = _ExpWeightedVariance( + self._n, + init_mean=sample.copy(), + init_var=np.zeros_like(sample), + alpha=self._alpha, + ) + if self._use_grads: + self._variance_estimator_grad = _ExpWeightedVariance( + self._n, + init_mean=grad.copy(), + init_var=np.zeros_like(grad), + alpha=self._alpha, + ) + + if self._n_samples > 2 * self._discard_window: + if self._use_grads: + self._update_from_variances( + self._variance_estimator, self._variance_estimator_grad + ) + else: + self._update_from_weightvar(self._variance_estimator) + + self._n_samples += 1 + + if self._store_mass_matrix_trace: + self._mass_trace.append(self._stds.copy()) + + def _update_from_variances(self, var_estimator, inv_var_estimator): + var = var_estimator.current_variance() + inv_var = inv_var_estimator.current_variance() + # print(inv_var) + updated = np.sqrt(var / inv_var) + self._var[:] = updated + # updated = np.exp((np.log(var) - np.log(inv_var)) / 2) + np.sqrt(updated, out=self._stds) + np.divide(1, self._stds, out=self._inv_stds) + + class QuadPotentialDiag(QuadPotential): """Quad potential using a diagonal covariance matrix.""" From 0076325c89e577f0962d977b43acab344f8a79b4 Mon Sep 17 00:00:00 2001 From: Adrian Seyboldt Date: Mon, 20 Sep 2021 12:38:44 +0200 Subject: [PATCH 03/12] Add test and doc for jitter+adapt_diag_grad --- pymc3/sampling.py | 3 +++ pymc3/step_methods/hmc/nuts.py | 4 +--- pymc3/tests/test_sampling.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index b9ae3d3e5f8..fdc15f75284 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -2101,6 +2101,9 @@ def init_nuts( as starting point. * jitter+adapt_diag: Same as ``adapt_diag``, but use test value plus a uniform jitter in [-1, 1] as starting point in each chain. + * jitter+adapt_diag_grad: + An experimental initialization method that uses information from gradients and samples + during tuning. * advi+adapt_diag: Run ADVI and then adapt the resulting diagonal mass matrix based on the sample variance of the tuning samples. 
* advi+adapt_diag_grad: Run ADVI and then adapt the resulting diagonal mass matrix based diff --git a/pymc3/step_methods/hmc/nuts.py b/pymc3/step_methods/hmc/nuts.py index e2a09eaa99a..ef1375e9986 100644 --- a/pymc3/step_methods/hmc/nuts.py +++ b/pymc3/step_methods/hmc/nuts.py @@ -253,9 +253,7 @@ def __init__(self, ndim, integrator, start, step_size, Emax): self.start_energy = np.array(start.energy) self.left = self.right = start - self.proposal = Proposal( - start.q.data, start.q_grad, start.energy, 1.0, start.model_logp - ) + self.proposal = Proposal(start.q.data, start.q_grad, start.energy, 1.0, start.model_logp) self.depth = 0 self.log_size = 0 self.log_weighted_accept_sum = -np.inf diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index 0b6ce99a15b..00fc04f3e1d 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -87,7 +87,7 @@ def test_sample(self): def test_sample_init(self): with self.model: - for init in ("advi", "advi_map", "map"): + for init in ("advi", "advi_map", "map", "jitter+adapt_diag_grad"): pm.sample( init=init, tune=0, From a4121bc9f42f085527d40be7e337a06350cc9472 Mon Sep 17 00:00:00 2001 From: Adrian Seyboldt Date: Mon, 20 Sep 2021 14:29:38 +0200 Subject: [PATCH 04/12] Improve tests of init methods --- pymc3/step_methods/hmc/quadpotential.py | 6 +++--- pymc3/tests/test_sampling.py | 15 +++++++++++++-- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/pymc3/step_methods/hmc/quadpotential.py b/pymc3/step_methods/hmc/quadpotential.py index f29948b5bce..15108f33d63 100644 --- a/pymc3/step_methods/hmc/quadpotential.py +++ b/pymc3/step_methods/hmc/quadpotential.py @@ -361,7 +361,7 @@ def __init__( def add_sample(self, x, weight): x = np.asarray(x) if weight != 1: - raise ValueError("weight is unused and broken") + raise ValueError("Setting weight != 1 is not supported.") self.n_samples += 1 old_diff = x - self.mean self.mean[:] += old_diff / self.n_samples @@ -389,8 +389,8 @@ def __init__(self, n_vars, *, init_mean, init_var, alpha): def add_sample(self, value): alpha = self._alpha delta = value - self._mean - self._mean += alpha * delta - self._variance = (1 - alpha) * (self._variance + alpha * delta ** 2) + self._mean[...] += alpha * delta + self._variance[...] 
= (1 - alpha) * (self._variance + alpha * delta ** 2) def current_variance(self, out=None): if out is None: diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index 00fc04f3e1d..50b56e9a20b 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -87,10 +87,21 @@ def test_sample(self): def test_sample_init(self): with self.model: - for init in ("advi", "advi_map", "map", "jitter+adapt_diag_grad"): + for init in ( + "advi", + "advi_map", + "map", + "adapt_diag", + "jitter+adapt_diag", + "jitter+adapt_diag_grad", + "advi+adapt_diag_grad", + "advi+adapt_diag", + "adapt_full", + "jitter+adapt_full", + ): pm.sample( init=init, - tune=0, + tune=120, n_init=1000, draws=50, random_seed=self.random_seed, From 2b399031dbd2974187bc0dded26179727da4de5a Mon Sep 17 00:00:00 2001 From: Adrian Seyboldt Date: Mon, 20 Sep 2021 14:34:32 +0200 Subject: [PATCH 05/12] Add new tuning method to release notes --- RELEASE-NOTES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index cdf96b03796..e47178fe833 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -23,6 +23,8 @@ - The `OrderedMultinomial` distribution has been added for use on ordinal data which are _aggregated_ by trial, like multinomial observations, whereas `OrderedLogistic` only accepts ordinal data in a _disaggregated_ format, like categorical observations (see [#4773](https://github.com/pymc-devs/pymc3/pull/4773)). - The `Polya-Gamma` distribution has been added (see [#4531](https://github.com/pymc-devs/pymc3/pull/4531)). To make use of this distribution, the [`polyagamma>=1.3.1`](https://pypi.org/project/polyagamma/) library must be installed and available in the user's environment. +- A small change to the default mass matrix tuning method jitter+adapt_diag improves performance early on during tuning for some models. [#5004](https://github.com/pymc-devs/pymc3/pull/5004) +- New experimental mass matrix tuning method jitter+adapt_diag_grad. [#5004](https://github.com/pymc-devs/pymc3/pull/5004) - ... 
### Maintenance From ad45bd8da319e9e0fbb0c73874da4dcf341ccf87 Mon Sep 17 00:00:00 2001 From: Adrian Seyboldt Date: Mon, 20 Sep 2021 14:57:40 +0200 Subject: [PATCH 06/12] Fix quadpotential for short tuning periods --- pymc3/step_methods/hmc/quadpotential.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pymc3/step_methods/hmc/quadpotential.py b/pymc3/step_methods/hmc/quadpotential.py index 15108f33d63..2e05aacae00 100644 --- a/pymc3/step_methods/hmc/quadpotential.py +++ b/pymc3/step_methods/hmc/quadpotential.py @@ -410,6 +410,9 @@ def __init__(self, *args, alpha, use_grads=False, stop_adaptation=None, **kwargs super().__init__(*args, **kwargs) self._alpha = alpha self._use_grads = use_grads + + if stop_adaptation is None: + stop_adaptation = np.inf self._stop_adaptation = stop_adaptation def update(self, sample, grad, tune): From 99330914f6ac939879d91c35dd0f27bd2d2af490 Mon Sep 17 00:00:00 2001 From: Adrian Seyboldt Date: Mon, 20 Sep 2021 15:30:10 +0200 Subject: [PATCH 07/12] Fix release notes --- RELEASE-NOTES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index e47178fe833..e1f8705b033 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -23,7 +23,7 @@ - The `OrderedMultinomial` distribution has been added for use on ordinal data which are _aggregated_ by trial, like multinomial observations, whereas `OrderedLogistic` only accepts ordinal data in a _disaggregated_ format, like categorical observations (see [#4773](https://github.com/pymc-devs/pymc3/pull/4773)). - The `Polya-Gamma` distribution has been added (see [#4531](https://github.com/pymc-devs/pymc3/pull/4531)). To make use of this distribution, the [`polyagamma>=1.3.1`](https://pypi.org/project/polyagamma/) library must be installed and available in the user's environment. -- A small change to the default mass matrix tuning method jitter+adapt_diag improves performance early on during tuning for some models. [#5004](https://github.com/pymc-devs/pymc3/pull/5004) +- A small change to the mass matrix tuning methods jitter+adapt_diag (the default) and adapt_diag improves performance early on during tuning for some models. [#5004](https://github.com/pymc-devs/pymc3/pull/5004) - New experimental mass matrix tuning method jitter+adapt_diag_grad. [#5004](https://github.com/pymc-devs/pymc3/pull/5004) - ... From 6263f99a3644605115e0c527e6457f9bae8aa72d Mon Sep 17 00:00:00 2001 From: Adrian Seyboldt Date: Mon, 20 Sep 2021 15:31:08 +0200 Subject: [PATCH 08/12] Remove old gradient mass matrix adaptation --- pymc3/sampling.py | 41 +------------------------ pymc3/step_methods/hmc/quadpotential.py | 41 ------------------------- 2 files changed, 1 insertion(+), 81 deletions(-) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index fdc15f75284..f068ae582e2 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -287,25 +287,7 @@ def sample( by default. See ``discard_tuned_samples``. init : str Initialization method to use for auto-assigned NUTS samplers. - - * auto: Choose a default initialization method automatically. - Currently, this is ``jitter+adapt_diag``, but this can change in the future. - If you depend on the exact behaviour, choose an initialization method explicitly. - * adapt_diag: Start with a identity mass matrix and then adapt a diagonal based on the - variance of the tuning samples. All chains use the test value (usually the prior mean) - as starting point. 
- * jitter+adapt_diag: Same as ``adapt_diag``, but add uniform jitter in [-1, 1] to the - starting point in each chain. - * advi+adapt_diag: Run ADVI and then adapt the resulting diagonal mass matrix based on the - sample variance of the tuning samples. - * advi+adapt_diag_grad: Run ADVI and then adapt the resulting diagonal mass matrix based - on the variance of the gradients during tuning. This is **experimental** and might be - removed in a future release. - * advi: Run ADVI to estimate posterior mean and diagonal mass matrix. - * advi_map: Initialize ADVI with MAP and use MAP as starting point. - * map: Use the MAP as starting point. This is discouraged. - * adapt_full: Adapt a dense mass matrix using the sample covariances - + See `pm.init_nuts` for a list of all options. step : function or iterable of functions A step function or collection of functions. If there are variables without step methods, step methods for those variables will be assigned automatically. By default the NUTS step @@ -2106,9 +2088,6 @@ def init_nuts( during tuning. * advi+adapt_diag: Run ADVI and then adapt the resulting diagonal mass matrix based on the sample variance of the tuning samples. - * advi+adapt_diag_grad: Run ADVI and then adapt the resulting diagonal mass matrix based - on the variance of the gradients during tuning. This is **experimental** and might be - removed in a future release. * advi: Run ADVI to estimate posterior mean and diagonal mass matrix. * advi_map: Initialize ADVI with MAP and use MAP as starting point. * map: Use the MAP as starting point. This is discouraged. @@ -2197,24 +2176,6 @@ def init_nuts( use_grads=True, stop_adaptation=stop_adaptation, ) - elif init == "advi+adapt_diag_grad": - approx: pm.MeanField = pm.fit( - random_seed=random_seed, - n=n_init, - method="advi", - model=model, - callbacks=cb, - progressbar=progressbar, - obj_optimizer=pm.adagrad_window, - ) - start = approx.sample(draws=chains) - start = list(start) - std_apoint = approx.std.eval() - cov = std_apoint ** 2 - mean = approx.mean.get_value() - weight = 50 - n = len(cov) - potential = quadpotential.QuadPotentialDiagAdaptGrad(n, mean, cov, weight) elif init == "advi+adapt_diag": approx = pm.fit( random_seed=random_seed, diff --git a/pymc3/step_methods/hmc/quadpotential.py b/pymc3/step_methods/hmc/quadpotential.py index 2e05aacae00..bd668a2fab0 100644 --- a/pymc3/step_methods/hmc/quadpotential.py +++ b/pymc3/step_methods/hmc/quadpotential.py @@ -293,47 +293,6 @@ def raise_ok(self, map_info): raise ValueError("\n".join(errmsg)) -class QuadPotentialDiagAdaptGrad(QuadPotentialDiagAdapt): - """Adapt a diagonal mass matrix from the variances of the gradients. - - This is experimental, and may be removed without prior deprication. 
- """ - - def reset(self): - super().reset() - self._grads1 = np.zeros(self._n, dtype=self.dtype) - self._ngrads1 = 0 - self._grads2 = np.zeros(self._n, dtype=self.dtype) - self._ngrads2 = 0 - - def _update(self, var): - self._var[:] = var - np.sqrt(self._var, out=self._stds) - np.divide(1, self._stds, out=self._inv_stds) - self._var_aesara.set_value(self._var) - - def update(self, sample, grad, tune): - """Inform the potential about a new sample during tuning.""" - if not tune: - return - - self._grads1[:] += np.abs(grad) - self._grads2[:] += np.abs(grad) - self._ngrads1 += 1 - self._ngrads2 += 1 - - if self._n_samples <= 150: - super().update(sample, grad, tune) - else: - self._update((self._ngrads1 / self._grads1) ** 2) - - if self._n_samples > 100 and self._n_samples % 100 == 50: - self._ngrads1 = self._ngrads2 - self._ngrads2 = 1 - self._grads1[:] = self._grads2 - self._grads2[:] = 1 - - class _WeightedVariance: """Online algorithm for computing mean of variance.""" From d2684c0c25187c90f3c9d4f3bb12b2f39c93b43e Mon Sep 17 00:00:00 2001 From: Adrian Seyboldt Date: Mon, 20 Sep 2021 15:31:45 +0200 Subject: [PATCH 09/12] Change default for stop_adaptation --- pymc3/sampling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index f068ae582e2..665071f6639 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -2164,7 +2164,7 @@ def init_nuts( var = np.ones_like(mean) n = len(var) - if tune is not None and tune > 200: + if tune is not None and tune > 250: stop_adaptation = tune - 50 else: stop_adaptation = None From ad777b153998dfe11cc42a6f53df28a2baaf9507 Mon Sep 17 00:00:00 2001 From: Adrian Seyboldt Date: Tue, 21 Sep 2021 20:41:10 +0200 Subject: [PATCH 10/12] Add docstrings for quadpotential --- pymc3/step_methods/hmc/quadpotential.py | 65 ++++++++++++++++++++++--- 1 file changed, 59 insertions(+), 6 deletions(-) diff --git a/pymc3/step_methods/hmc/quadpotential.py b/pymc3/step_methods/hmc/quadpotential.py index bd668a2fab0..d60b4e9e7a4 100644 --- a/pymc3/step_methods/hmc/quadpotential.py +++ b/pymc3/step_methods/hmc/quadpotential.py @@ -155,11 +155,40 @@ def __init__( adaptation_window_multiplier=1, dtype=None, discard_window=50, - initial_weights=None, early_update=False, store_mass_matrix_trace=False, ): - """Set up a diagonal mass matrix.""" + """Set up a diagonal mass matrix. + + Parameters + ---------- + n : int + The number of parameters. + initial_mean : np.ndarray + An initial guess for the posterior mean of each parameter. + initial_diag : np.ndarray + An estimate of the posterior variance of each parameter. + initial_weight : int + How much weight the initial guess has compared to new samples during tuning. + Measured in equivalent number of samples. + adaptation_window : int + The size of the adaptation window during tuning. It specifies how many samples + are used to estimate the mass matrix in each section of the adaptation. + adaptation_window_multiplier : float + The factor with which we increase the adaptation window after each adaptation + window. + dtype : np.dtype + The dtype used to store the mass matrix + discard_window : int + The number of initial samples that are just discarded and not used to estimate + the mass matrix. + early_update : bool + Whether to update the mass matrix live during the first half of the first + adaptation window. + store_mass_matrix_trace : bool + If true, store the mass matrix at each step of the adaptation. Only for debugging + purposes. 
+ """ if initial_diag is not None and initial_diag.ndim != 1: raise ValueError("Initial diagonal must be one-dimensional.") if initial_mean.ndim != 1: @@ -191,8 +220,6 @@ def __init__( self._store_mass_matrix_trace = store_mass_matrix_trace self._mass_trace = [] - self._initial_weights = initial_weights - self.reset() def reset(self): @@ -366,6 +393,34 @@ def current_mean(self, out=None): class QuadPotentialDiagAdaptExp(QuadPotentialDiagAdapt): def __init__(self, *args, alpha, use_grads=False, stop_adaptation=None, **kwargs): + """Set up a diagonal mass matrix. + + Parameters + ---------- + n : int + The number of parameters. + initial_mean : np.ndarray + An initial guess for the posterior mean of each parameter. + initial_diag : np.ndarray + An estimate of the posterior variance of each parameter. + alpha : float + Decay rate of the exponetial weighted variance. + use_grads : bool + Use gradients, not only samples to estimate the mass matrix. + stop_adaptation : int + Stop the mass matrix adaptation after this many samples. + dtype : np.dtype + The dtype used to store the mass matrix + discard_window : int + The number of initial samples that are just discarded and not used to estimate + the mass matrix. + store_mass_matrix_trace : bool + If true, store the mass matrix at each step of the adaptation. Only for debugging + purposes. + """ + if len(args) > 3: + raise ValueError("Unsupported arguments to QuadPotentialDiagAdaptExp") + super().__init__(*args, **kwargs) self._alpha = alpha self._use_grads = use_grads @@ -411,10 +466,8 @@ def update(self, sample, grad, tune): def _update_from_variances(self, var_estimator, inv_var_estimator): var = var_estimator.current_variance() inv_var = inv_var_estimator.current_variance() - # print(inv_var) updated = np.sqrt(var / inv_var) self._var[:] = updated - # updated = np.exp((np.log(var) - np.log(inv_var)) / 2) np.sqrt(updated, out=self._stds) np.divide(1, self._stds, out=self._inv_stds) From 2f9e5d185f81014973b6724e1cf8bdc11d12f3f4 Mon Sep 17 00:00:00 2001 From: Adrian Seyboldt Date: Tue, 21 Sep 2021 23:13:52 +0200 Subject: [PATCH 11/12] Fix tests for deleted init methods --- pymc3/tests/test_quadpotential.py | 6 ------ pymc3/tests/test_sampling.py | 2 -- 2 files changed, 8 deletions(-) diff --git a/pymc3/tests/test_quadpotential.py b/pymc3/tests/test_quadpotential.py index c869511a606..a6ac9950ddc 100644 --- a/pymc3/tests/test_quadpotential.py +++ b/pymc3/tests/test_quadpotential.py @@ -279,9 +279,3 @@ def test_full_adapt_sampling(seed=289586): pot = quadpotential.QuadPotentialFullAdapt(initial_point_size, np.zeros(initial_point_size)) step = pymc3.NUTS(model=model, potential=pot) pymc3.sample(draws=10, tune=1000, random_seed=seed, step=step, cores=1, chains=1) - - -def test_issue_3965(): - with pymc3.Model(): - pymc3.Normal("n") - pymc3.sample(100, tune=300, chains=1, init="advi+adapt_diag_grad") diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index 50b56e9a20b..04767ae9b68 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -94,8 +94,6 @@ def test_sample_init(self): "adapt_diag", "jitter+adapt_diag", "jitter+adapt_diag_grad", - "advi+adapt_diag_grad", - "advi+adapt_diag", "adapt_full", "jitter+adapt_full", ): From b0b56ce1601426adc1f7d6f8439bb139193efb86 Mon Sep 17 00:00:00 2001 From: Adrian Seyboldt Date: Wed, 22 Sep 2021 01:23:45 +0200 Subject: [PATCH 12/12] Remove weight argument in quadpotential add_sample --- pymc3/step_methods/hmc/quadpotential.py | 21 +++++++++------------ 
pymc3/tests/test_quadpotential.py | 4 ++-- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/pymc3/step_methods/hmc/quadpotential.py b/pymc3/step_methods/hmc/quadpotential.py index d60b4e9e7a4..21c24907dbc 100644 --- a/pymc3/step_methods/hmc/quadpotential.py +++ b/pymc3/step_methods/hmc/quadpotential.py @@ -183,8 +183,7 @@ def __init__( The number of initial samples that are just discarded and not used to estimate the mass matrix. early_update : bool - Whether to update the mass matrix live during the first half of the first - adaptation window. + Whether to update the mass matrix live during the first adaptation window. store_mass_matrix_trace : bool If true, store the mass matrix at each step of the adaptation. Only for debugging purposes. @@ -268,8 +267,8 @@ def update(self, sample, grad, tune): return if self._n_samples > self._discard_window: - self._foreground_var.add_sample(sample, weight=1) - self._background_var.add_sample(sample, weight=1) + self._foreground_var.add_sample(sample) + self._background_var.add_sample(sample) if self._early_update or self._n_samples > self.adaptation_window: self._update_from_weightvar(self._foreground_var) @@ -344,15 +343,13 @@ def __init__( if self.mean.shape != (nelem,): raise ValueError("Invalid shape for initial mean.") - def add_sample(self, x, weight): + def add_sample(self, x): x = np.asarray(x) - if weight != 1: - raise ValueError("Setting weight != 1 is not supported.") self.n_samples += 1 old_diff = x - self.mean self.mean[:] += old_diff / self.n_samples new_diff = x - self.mean - self.raw_var[:] += weight * old_diff * new_diff + self.raw_var[:] += old_diff * new_diff def current_variance(self, out=None): if self.n_samples == 0: @@ -666,8 +663,8 @@ def update(self, sample, grad, tune): # Steps since previous update delta = self._n_samples - self._previous_update - self._foreground_cov.add_sample(sample, weight=1) - self._background_cov.add_sample(sample, weight=1) + self._foreground_cov.add_sample(sample) + self._background_cov.add_sample(sample) # Update the covariance matrix and recompute the Cholesky factorization # every "update_window" steps @@ -726,13 +723,13 @@ def __init__( if self.mean.shape != (nelem,): raise ValueError("Invalid shape for initial mean.") - def add_sample(self, x, weight): + def add_sample(self, x): x = np.asarray(x) self.n_samples += 1 old_diff = x - self.mean self.mean[:] += old_diff / self.n_samples new_diff = x - self.mean - self.raw_cov[:] += weight * new_diff[:, None] * old_diff[None, :] + self.raw_cov[:] += new_diff[:, None] * old_diff[None, :] def current_covariance(self, out=None): if self.n_samples == 0: diff --git a/pymc3/tests/test_quadpotential.py b/pymc3/tests/test_quadpotential.py index a6ac9950ddc..4bf342047b8 100644 --- a/pymc3/tests/test_quadpotential.py +++ b/pymc3/tests/test_quadpotential.py @@ -169,7 +169,7 @@ def test_weighted_covariance(ndim=10, seed=5432): est = quadpotential._WeightedCovariance(ndim) for sample in samples: - est.add_sample(sample, 1) + est.add_sample(sample) mu_est = est.current_mean() cov_est = est.current_covariance() @@ -184,7 +184,7 @@ def test_weighted_covariance(ndim=10, seed=5432): 10, ) for sample in samples[10:]: - est2.add_sample(sample, 1) + est2.add_sample(sample) mu_est2 = est2.current_mean() cov_est2 = est2.current_covariance()
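
A minimal NumPy sketch of the exponentially weighted mean/variance recursion that `_ExpWeightedVariance.add_sample` implements in the patches above; the helper name `exp_weighted_moments` and the synthetic data are illustrative assumptions, and only the update formulas are taken from the diff:

```python
import numpy as np


def exp_weighted_moments(samples, alpha, init_mean, init_var):
    """Replay the update rule from _ExpWeightedVariance.add_sample."""
    mean = np.array(init_mean, dtype=float)
    var = np.array(init_var, dtype=float)
    for value in samples:
        delta = value - mean
        mean += alpha * delta
        var = (1 - alpha) * (var + alpha * delta ** 2)
    return mean, var


rng = np.random.default_rng(1234)
draws = rng.normal(loc=2.0, scale=3.0, size=(5000, 4))

# As in QuadPotentialDiagAdaptExp.update: the estimator is seeded with the first
# sample after the discard window and an initial variance of zero.
mean, var = exp_weighted_moments(
    draws[1:], alpha=0.02, init_mean=draws[0], init_var=np.zeros(4)
)
print(mean, np.sqrt(var))  # roughly 2 and 3 in every dimension
```

With `alpha=0.02` the estimator forgets old draws over an effective window of roughly `1 / alpha` samples, which is presumably what helps adaptation early in tuning, as the release-notes entry above suggests.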
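And a hedged usage sketch of the new initialization from these patches, assuming the series is applied on top of pymc3; the toy model is a placeholder, the init string comes from patch 02, and the call loosely mirrors the sampler call from the removed `test_issue_3965`:

```python
import pymc3 as pm

with pm.Model():
    pm.Normal("n")
    # "jitter+adapt_diag_grad" is the experimental init added in patch 02.
    # Because tune > 250, mass matrix adaptation stops 50 iterations before the
    # end of tuning (patch 09); shorter tuning periods adapt until tuning ends.
    trace = pm.sample(100, tune=300, chains=1, init="jitter+adapt_diag_grad")
```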