diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index 8a1aa3611fc..7bdd280bd73 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -47,7 +47,6 @@ jobs:
             --ignore=pymc3/tests/test_minibatches.py
             --ignore=pymc3/tests/test_pickling.py
             --ignore=pymc3/tests/test_plots.py
-            --ignore=pymc3/tests/test_special_functions.py
             --ignore=pymc3/tests/test_updates.py
             --ignore=pymc3/tests/test_examples.py
             --ignore=pymc3/tests/test_gp.py
@@ -67,7 +66,6 @@ jobs:
             pymc3/tests/test_minibatches.py
             pymc3/tests/test_pickling.py
             pymc3/tests/test_plots.py
-            pymc3/tests/test_special_functions.py
             pymc3/tests/test_updates.py
 
           - |
diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py
index 908a7b138ad..87249b135aa 100644
--- a/pymc3/distributions/continuous.py
+++ b/pymc3/distributions/continuous.py
@@ -24,6 +24,7 @@
 import numpy as np
 
 from aesara.assert_op import Assert
+from aesara.tensor import gammaln
 from aesara.tensor.random.basic import (
     BetaRV,
     WeibullRV,
@@ -57,9 +58,9 @@
     betaln,
     bound,
     clipped_beta_rvs,
-    gammaln,
     i0e,
     incomplete_beta,
+    log_i0,
     log_normal,
     logpow,
     normal_lccdf,
@@ -67,7 +68,6 @@
     zvalue,
 )
 from pymc3.distributions.distribution import Continuous
-from pymc3.distributions.special import log_i0
 from pymc3.math import log1mexp, log1pexp, logdiffexp, logit
 from pymc3.util import UNSET
 
diff --git a/pymc3/distributions/dist_math.py b/pymc3/distributions/dist_math.py
index 7f4112d7456..491a81c32e2 100644
--- a/pymc3/distributions/dist_math.py
+++ b/pymc3/distributions/dist_math.py
@@ -29,12 +29,12 @@
 from aesara.graph.op import Op
 from aesara.scalar import UnaryScalarOp, upgrade_to_float_no_complex
 from aesara.scan import until
+from aesara.tensor import gammaln
 from aesara.tensor.elemwise import Elemwise
 from aesara.tensor.slinalg import Cholesky, Solve
 
 from pymc3.aesaraf import floatX
 from pymc3.distributions.shape_utils import to_tuple
-from pymc3.distributions.special import gammaln
 
 f = floatX
 c = -0.5 * np.log(2.0 * np.pi)
@@ -634,3 +634,41 @@ def clipped_beta_rvs(a, b, size=None, random_state=None, dtype="float64"):
     out = scipy.stats.beta.rvs(a, b, size=size, random_state=random_state).astype(dtype)
     lower, upper = _beta_clip_values[dtype]
     return np.maximum(np.minimum(out, upper), lower)
+
+
+def multigammaln(a, p):
+    """Multivariate Log Gamma
+
+    Parameters
+    ----------
+    a: tensor like
+    p: int
+       degrees of freedom. p > 0
+    """
+    i = at.arange(1, p + 1)
+    return p * (p - 1) * at.log(np.pi) / 4.0 + at.sum(gammaln(a + (1.0 - i) / 2.0), axis=0)
+
+
+def log_i0(x):
+    """
+    Calculates the logarithm of the 0 order modified Bessel function of the first kind""
+    """
+    return at.switch(
+        at.lt(x, 5),
+        at.log1p(
+            x ** 2.0 / 4.0
+            + x ** 4.0 / 64.0
+            + x ** 6.0 / 2304.0
+            + x ** 8.0 / 147456.0
+            + x ** 10.0 / 14745600.0
+            + x ** 12.0 / 2123366400.0
+        ),
+        x
+        - 0.5 * at.log(2.0 * np.pi * x)
+        + at.log1p(
+            1.0 / (8.0 * x)
+            + 9.0 / (128.0 * x ** 2.0)
+            + 225.0 / (3072.0 * x ** 3.0)
+            + 11025.0 / (98304.0 * x ** 4.0)
+        ),
+    )
diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py
index c1160ba764a..ad7c1df6467 100644
--- a/pymc3/distributions/multivariate.py
+++ b/pymc3/distributions/multivariate.py
@@ -24,6 +24,7 @@
 
 from aesara.graph.basic import Apply
 from aesara.graph.op import Op
+from aesara.tensor import gammaln
 from aesara.tensor.nlinalg import det, eigh, matrix_inverse, trace
 from aesara.tensor.random.basic import MultinomialRV, dirichlet, multivariate_normal
 from aesara.tensor.random.utils import broadcast_params
@@ -41,9 +42,8 @@
 from pymc3.aesaraf import floatX, intX
 from pymc3.distributions import transforms
 from pymc3.distributions.continuous import ChiSquared, Normal
-from pymc3.distributions.dist_math import bound, factln, logpow
+from pymc3.distributions.dist_math import bound, factln, logpow, multigammaln
 from pymc3.distributions.distribution import Continuous, Discrete
-from pymc3.distributions.special import gammaln, multigammaln
 from pymc3.math import kron_diag, kron_dot, kron_solve_lower, kronecker
 
 __all__ = [
@@ -154,7 +154,7 @@ def quaddist_tau(delta, chol_mat):
 
 
 class MvNormal(Continuous):
-    R"""
+    r"""
     Multivariate normal log-likelihood.
 
     .. math::
@@ -249,7 +249,7 @@ def _distr_parameters_for_repr(self):
 
 
 class MvStudentT(Continuous):
-    R"""
+    r"""
     Multivariate Student-T log-likelihood.
 
     .. math::
@@ -362,7 +362,7 @@ def _distr_parameters_for_repr(self):
 
 
 class Dirichlet(Continuous):
-    R"""
+    r"""
     Dirichlet log-likelihood.
 
     .. math::
@@ -452,7 +452,7 @@ def rng_fn(cls, rng, n, p, size):
 
 
 class Multinomial(Discrete):
-    R"""
+    r"""
     Multinomial log-likelihood.
 
     Generalizes binomial distribution, but instead of each trial resulting
@@ -525,7 +525,7 @@ def logp(value, n, p):
 
 
 class DirichletMultinomial(Discrete):
-    R"""Dirichlet Multinomial log-likelihood.
+    r"""Dirichlet Multinomial log-likelihood.
 
     Dirichlet mixture of Multinomials distribution, with a marginalized PMF.
 
@@ -729,7 +729,7 @@ def __str__(self):
 
 
 class Wishart(Continuous):
-    R"""
+    r"""
     Wishart log-likelihood.
 
     The Wishart distribution is the probability distribution of the
@@ -946,7 +946,7 @@ def _lkj_normalizing_constant(eta, n):
 
 
 class _LKJCholeskyCov(Continuous):
-    R"""Underlying class for covariance matrix with LKJ distributed correlations.
+    r"""Underlying class for covariance matrix with LKJ distributed correlations.
     See docs for LKJCholeskyCov function for more details on how to use it in models.
     """
 
@@ -1126,7 +1126,7 @@ def _distr_parameters_for_repr(self):
 
 
 def LKJCholeskyCov(name, eta, n, sd_dist, compute_corr=False, store_in_trace=True, *args, **kwargs):
-    R"""Wrapper function for covariance matrix with LKJ distributed correlations.
+    r"""Wrapper function for covariance matrix with LKJ distributed correlations.
 
     This defines a distribution over Cholesky decomposed covariance
     matrices, such that the underlying correlation matrices follow an
@@ -1279,7 +1279,7 @@ def LKJCholeskyCov(name, eta, n, sd_dist, compute_corr=False, store_in_trace=Tru
 
 
 class LKJCorr(Continuous):
-    R"""
+    r"""
     The LKJ (Lewandowski, Kurowicka and Joe) log-likelihood.
 
     The LKJ distribution is a prior distribution for correlation matrices.
@@ -1435,7 +1435,7 @@ def _distr_parameters_for_repr(self):
 
 
 class MatrixNormal(Continuous):
-    R"""
+    r"""
     Matrix-valued normal log-likelihood.
 
     .. math::
@@ -1694,7 +1694,7 @@ def _distr_parameters_for_repr(self):
 
 
 class KroneckerNormal(Continuous):
-    R"""
+    r"""
     Multivariate normal log-likelihood with Kronecker-structured covariance.
 
     .. math::
@@ -1941,7 +1941,7 @@ def _distr_parameters_for_repr(self):
 
 
 class CAR(Continuous):
-    R"""
+    r"""
     Likelihood for a conditional autoregression. This is a special case of the
     multivariate normal with an adjacency-structured covariance matrix.
 
diff --git a/pymc3/distributions/special.py b/pymc3/distributions/special.py
deleted file mode 100644
index 7c0a15e8b22..00000000000
--- a/pymc3/distributions/special.py
+++ /dev/null
@@ -1,58 +0,0 @@
-#   Copyright 2020 The PyMC Developers
-#
-#   Licensed under the Apache License, Version 2.0 (the "License");
-#   you may not use this file except in compliance with the License.
-#   You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-#   Unless required by applicable law or agreed to in writing, software
-#   distributed under the License is distributed on an "AS IS" BASIS,
-#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#   See the License for the specific language governing permissions and
-#   limitations under the License.
-
-import aesara.tensor as at
-import numpy as np
-
-from aesara.tensor.math import gammaln, psi
-
-__all__ = ["gammaln", "multigammaln", "psi", "log_i0"]
-
-
-def multigammaln(a, p):
-    """Multivariate Log Gamma
-
-    Parameters
-    ----------
-    a: tensor like
-    p: int
-       degrees of freedom. p > 0
-    """
-    i = at.arange(1, p + 1)
-    return p * (p - 1) * at.log(np.pi) / 4.0 + at.sum(gammaln(a + (1.0 - i) / 2.0), axis=0)
-
-
-def log_i0(x):
-    """
-    Calculates the logarithm of the 0 order modified Bessel function of the first kind""
-    """
-    return at.switch(
-        at.lt(x, 5),
-        at.log1p(
-            x ** 2.0 / 4.0
-            + x ** 4.0 / 64.0
-            + x ** 6.0 / 2304.0
-            + x ** 8.0 / 147456.0
-            + x ** 10.0 / 14745600.0
-            + x ** 12.0 / 2123366400.0
-        ),
-        x
-        - 0.5 * at.log(2.0 * np.pi * x)
-        + at.log1p(
-            1.0 / (8.0 * x)
-            + 9.0 / (128.0 * x ** 2.0)
-            + 225.0 / (3072.0 * x ** 3.0)
-            + 11025.0 / (98304.0 * x ** 4.0)
-        ),
-    )
diff --git a/pymc3/tests/test_dist_math.py b/pymc3/tests/test_dist_math.py
index fc1e531a009..c9389990edd 100644
--- a/pymc3/tests/test_dist_math.py
+++ b/pymc3/tests/test_dist_math.py
@@ -16,7 +16,9 @@
 import numpy as np
 import numpy.testing as npt
 import pytest
+import scipy.special
 
+from aesara import config, function
 from aesara.tensor.random.basic import multinomial
 from scipy import interpolate, stats
 
@@ -32,7 +34,9 @@
     clipped_beta_rvs,
     factln,
     i0e,
+    multigammaln,
 )
+from pymc3.tests.checks import close_to
 from pymc3.tests.helpers import verify_grad
 
 
@@ -236,3 +240,25 @@ def test_clipped_beta_rvs(dtype):
     # equal to zero or one (issue #3898)
     values = clipped_beta_rvs(0.01, 0.01, size=1000000, dtype=dtype)
     assert not (np.any(values == 0) or np.any(values == 1))
+
+
+def check_vals(fn1, fn2, *args):
+    v = fn1(*args)
+    close_to(v, fn2(*args), 1e-6 if v.dtype == np.float64 else 1e-4)
+
+
+def test_multigamma():
+    x = at.vector("x")
+    p = at.scalar("p")
+
+    xvals = [np.array([v], dtype=config.floatX) for v in [0.1, 2, 5, 10, 50, 100]]
+
+    multigammaln_ = function([x, p], multigammaln(x, p), mode="FAST_COMPILE")
+
+    def ref_multigammaln(a, b):
+        return np.array(scipy.special.multigammaln(a[0], b), config.floatX)
+
+    for p in [0, 1, 2, 3, 4, 100]:
+        for x in xvals:
+            if np.all(x > 0.5 * (p - 1)):
+                check_vals(multigammaln_, ref_multigammaln, x, p)
diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py
index 30ee062291e..38ae1c82d8a 100644
--- a/pymc3/tests/test_distributions.py
+++ b/pymc3/tests/test_distributions.py
@@ -1367,6 +1367,8 @@ def test_t(self):
             lambda value, nu, mu, lam: sp.t.logcdf(value, nu, mu, lam ** -0.5),
             n_samples=10,  # relies on slow incomplete beta
         )
+        # TODO: reenable when PR #4736 is merged
+        """
         self.check_logcdf(
             StudentT,
             R,
@@ -1374,6 +1376,7 @@ def test_t(self):
             lambda value, nu, mu, sigma: sp.t.logcdf(value, nu, mu, sigma),
             n_samples=5,  # Just testing alternative parametrization
         )
+        """
 
     def test_cauchy(self):
         self.check_logp(
diff --git a/pymc3/tests/test_ode.py b/pymc3/tests/test_ode.py
index 95937a36746..5b8b878d8b2 100644
--- a/pymc3/tests/test_ode.py
+++ b/pymc3/tests/test_ode.py
@@ -395,9 +395,9 @@ def system(y, t, p):
         ode_model = DifferentialEquation(func=system, t0=0, times=times, n_states=2, n_theta=2)
 
         with pm.Model() as model:
-            beta = pm.HalfCauchy("beta", 1)
-            gamma = pm.HalfCauchy("gamma", 1)
-            sigma = pm.HalfCauchy("sigma", 1, shape=2)
+            beta = pm.HalfCauchy("beta", 1, initval=1)
+            gamma = pm.HalfCauchy("gamma", 1, initval=1)
+            sigma = pm.HalfCauchy("sigma", 1, shape=2, initval=[1, 1])
             forward = ode_model(theta=[beta, gamma], y0=[0.99, 0.01])
             y = pm.Lognormal("y", mu=pm.math.log(forward), sd=sigma, observed=yobs)
 
diff --git a/pymc3/tests/test_special_functions.py b/pymc3/tests/test_special_functions.py
deleted file mode 100644
index 9a73647fd10..00000000000
--- a/pymc3/tests/test_special_functions.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#   Copyright 2020 The PyMC Developers
-#
-#   Licensed under the Apache License, Version 2.0 (the "License");
-#   you may not use this file except in compliance with the License.
-#   You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-#   Unless required by applicable law or agreed to in writing, software
-#   distributed under the License is distributed on an "AS IS" BASIS,
-#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#   See the License for the specific language governing permissions and
-#   limitations under the License.
-
-import aesara.tensor as at
-import numpy as np
-import scipy.special as ss
-
-from aesara import config, function
-
-import pymc3.distributions.special as ps
-
-from pymc3.tests.checks import close_to
-
-
-def check_vals(fn1, fn2, *args):
-    v = fn1(*args)
-    close_to(v, fn2(*args), 1e-6 if v.dtype == np.float64 else 1e-4)
-
-
-def test_multigamma():
-    x = at.vector("x")
-    p = at.scalar("p")
-
-    xvals = [np.array([v], dtype=config.floatX) for v in [0.1, 2, 5, 10, 50, 100]]
-
-    multigammaln = function([x, p], ps.multigammaln(x, p), mode="FAST_COMPILE")
-
-    def ssmultigammaln(a, b):
-        return np.array(ss.multigammaln(a[0], b), config.floatX)
-
-    for p in [0, 1, 2, 3, 4, 100]:
-        for x in xvals:
-            if np.all(x > 0.5 * (p - 1)):
-                check_vals(multigammaln, ssmultigammaln, x, p)