From 46294ff9346e74a6c110cf682aad94e8a2d84a3a Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Thu, 11 Feb 2021 21:25:43 -0600 Subject: [PATCH] Rename Theano to Aesara --- .github/ISSUE_TEMPLATE.md | 2 +- .github/workflows/arviz_compat.yml | 2 +- .github/workflows/jaxtests.yml | 2 +- .github/workflows/pytest.yml | 2 +- .github/workflows/windows.yml | 2 +- README.rst | 8 +- benchmarks/benchmarks/benchmarks.py | 10 +- .../Advanced_usage_of_Theano_in_PyMC3.rst | 54 +- docs/source/Gaussian_Processes.rst | 4 +- docs/source/Probability_Distributions.rst | 2 +- docs/source/PyMC3_and_Theano.rst | 106 ++-- docs/source/about.rst | 8 +- docs/source/api/math.rst | 4 +- docs/source/conf.py | 2 +- docs/source/developer_guide.rst | 130 ++-- docs/source/index.rst | 6 +- pymc3/__init__.py | 10 +- pymc3/{theanof.py => aesaraf.py} | 125 ++-- pymc3/backends/base.py | 4 +- pymc3/blocking.py | 4 +- pymc3/data.py | 46 +- pymc3/distributions/bound.py | 8 +- pymc3/distributions/continuous.py | 596 +++++++++--------- pymc3/distributions/discrete.py | 293 ++++----- pymc3/distributions/dist_math.py | 220 +++---- pymc3/distributions/distribution.py | 123 ++-- pymc3/distributions/mixture.py | 66 +- pymc3/distributions/multivariate.py | 288 ++++----- pymc3/distributions/posterior_predictive.py | 49 +- pymc3/distributions/special.py | 25 +- pymc3/distributions/timeseries.py | 64 +- pymc3/distributions/transforms.py | 102 +-- pymc3/glm/families.py | 10 +- pymc3/glm/linear.py | 10 +- pymc3/glm/utils.py | 22 +- pymc3/gp/cov.py | 167 ++--- pymc3/gp/gp.py | 88 +-- pymc3/gp/mean.py | 12 +- pymc3/gp/util.py | 16 +- pymc3/math.py | 96 +-- pymc3/model.py | 252 ++++---- pymc3/model_graph.py | 24 +- pymc3/ode/ode.py | 29 +- pymc3/ode/utils.py | 22 +- pymc3/parallel_sampling.py | 6 +- pymc3/sampling.py | 2 +- pymc3/sampling_jax.py | 14 +- pymc3/smc/smc.py | 22 +- pymc3/step_methods/arraystep.py | 16 +- pymc3/step_methods/elliptical_slice.py | 8 +- pymc3/step_methods/gibbs.py | 4 +- pymc3/step_methods/hmc/base_hmc.py | 10 +- pymc3/step_methods/hmc/hmc.py | 2 +- pymc3/step_methods/hmc/nuts.py | 4 +- pymc3/step_methods/hmc/quadpotential.py | 26 +- pymc3/step_methods/metropolis.py | 12 +- pymc3/step_methods/mlda.py | 23 +- pymc3/step_methods/pgbart.py | 10 +- pymc3/step_methods/sgmcmc.py | 28 +- pymc3/step_methods/slicer.py | 2 +- pymc3/tests/backend_fixtures.py | 12 +- pymc3/tests/conftest.py | 14 +- pymc3/tests/helpers.py | 18 +- pymc3/tests/models.py | 32 +- pymc3/tests/sampler_fixtures.py | 8 +- .../{test_theanof.py => test_aesaraf.py} | 32 +- pymc3/tests/test_data_container.py | 4 +- pymc3/tests/test_dist_math.py | 102 +-- pymc3/tests/test_distributions.py | 83 +-- pymc3/tests/test_distributions_random.py | 4 +- pymc3/tests/test_distributions_timeseries.py | 2 +- pymc3/tests/test_examples.py | 14 +- pymc3/tests/test_gp.py | 240 +++---- pymc3/tests/test_hmc.py | 2 +- pymc3/tests/test_math.py | 30 +- pymc3/tests/test_minibatches.py | 70 +- pymc3/tests/test_mixture.py | 22 +- pymc3/tests/test_model.py | 48 +- pymc3/tests/test_model_graph.py | 2 +- pymc3/tests/test_model_helpers.py | 47 +- pymc3/tests/test_models_utils.py | 10 +- pymc3/tests/test_ode.py | 14 +- pymc3/tests/test_parallel_sampling.py | 15 +- pymc3/tests/test_posdef_sym.py | 10 +- pymc3/tests/test_posteriors.py | 4 +- pymc3/tests/test_quadpotential.py | 2 +- pymc3/tests/test_random.py | 30 +- pymc3/tests/test_sampling.py | 40 +- pymc3/tests/test_shape_handling.py | 4 +- pymc3/tests/test_shared.py | 6 +- pymc3/tests/test_smc.py | 12 +- 
pymc3/tests/test_special_functions.py | 12 +- pymc3/tests/test_step.py | 86 +-- pymc3/tests/test_transforms.py | 78 +-- pymc3/tests/test_types.py | 18 +- pymc3/tests/test_updates.py | 10 +- pymc3/tests/test_variational_inference.py | 96 +-- pymc3/tuning/scaling.py | 2 +- pymc3/tuning/starting.py | 4 +- pymc3/util.py | 4 +- pymc3/variational/approximations.py | 68 +- pymc3/variational/flows.py | 74 +-- pymc3/variational/inference.py | 6 +- pymc3/variational/operators.py | 8 +- pymc3/variational/opvi.py | 166 ++--- pymc3/variational/stein.py | 16 +- pymc3/variational/test_functions.py | 26 +- pymc3/variational/updates.py | 135 ++-- pymc3/vartypes.py | 7 - requirements.txt | 4 +- scripts/test.sh | 2 +- setup.py | 2 +- 112 files changed, 2562 insertions(+), 2499 deletions(-) rename pymc3/{theanof.py => aesaraf.py} (79%) rename pymc3/tests/{test_theanof.py => test_aesaraf.py} (90%) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c9dfdbc6bf6..0988bfa4e95 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -18,7 +18,7 @@ If you have questions about a specific use case, or you are not sure whether thi ## Versions and main components * PyMC3 Version: -* Theano Version: +* Aesara Version: * Python Version: * Operating system: * How did you install PyMC3: (conda/pip) diff --git a/.github/workflows/arviz_compat.yml b/.github/workflows/arviz_compat.yml index 2bbf0762054..55405d0624e 100644 --- a/.github/workflows/arviz_compat.yml +++ b/.github/workflows/arviz_compat.yml @@ -19,7 +19,7 @@ jobs: runs-on: ${{ matrix.os }} env: TEST_SUBSET: ${{ matrix.test-subset }} - THEANO_FLAGS: floatX=${{ matrix.floatx }},gcc__cxxflags='-march=native' + AESARA_FLAGS: floatX=${{ matrix.floatx }},gcc__cxxflags='-march=native' defaults: run: shell: bash -l {0} diff --git a/.github/workflows/jaxtests.yml b/.github/workflows/jaxtests.yml index c5b3f23963d..2e2f16b33ad 100644 --- a/.github/workflows/jaxtests.yml +++ b/.github/workflows/jaxtests.yml @@ -17,7 +17,7 @@ jobs: runs-on: ${{ matrix.os }} env: TEST_SUBSET: ${{ matrix.test-subset }} - THEANO_FLAGS: floatX=${{ matrix.floatx }},gcc__cxxflags='-march=native' + AESARA_FLAGS: floatX=${{ matrix.floatx }},gcc__cxxflags='-march=native' defaults: run: shell: bash -l {0} diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index e492c7e705c..7c36909811e 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -62,7 +62,7 @@ jobs: runs-on: ${{ matrix.os }} env: TEST_SUBSET: ${{ matrix.test-subset }} - THEANO_FLAGS: floatX=${{ matrix.floatx }},gcc__cxxflags='-march=native' + AESARA_FLAGS: floatX=${{ matrix.floatx }},gcc__cxxflags='-march=native' defaults: run: shell: bash -l {0} diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 8a81e97b217..b5f34623a32 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -17,7 +17,7 @@ jobs: runs-on: ${{ matrix.os }} env: TEST_SUBSET: ${{ matrix.test-subset }} - THEANO_FLAGS: floatX=${{ matrix.floatx }},gcc__cxxflags='-march=core2' + AESARA_FLAGS: floatX=${{ matrix.floatx }},gcc__cxxflags='-march=core2' defaults: run: shell: bash -l {0} diff --git a/README.rst b/README.rst index cc2c5fba08c..9f4e3b36138 100644 --- a/README.rst +++ b/README.rst @@ -15,13 +15,13 @@ Check out the `getting started guide `__ forum. 
-The future of PyMC3 & Theano +The future of PyMC3 & Aesara ============================ -There have been many questions and uncertainty around the future of PyMC3 since Theano +There have been many questions and uncertainty around the future of PyMC3 since Aesara stopped getting developed by the original authors, and we started experiments with PyMC4. -We are happy to announce that PyMC3 on Theano (which we are `developing further `__) +We are happy to announce that PyMC3 on Aesara (which we are `developing further `__) with a new JAX backend is the future. PyMC4 will not be developed further. See the `full announcement `__ @@ -39,7 +39,7 @@ Features - **Variational inference**: `ADVI `__ for fast approximate posterior estimation as well as mini-batch ADVI for large data sets. -- Relies on `Theano-PyMC `__ which provides: +- Relies on `Aesara `__ which provides: * Computation optimization and dynamic C or JAX compilation * Numpy broadcasting and advanced indexing * Linear algebra operators diff --git a/benchmarks/benchmarks/benchmarks.py b/benchmarks/benchmarks/benchmarks.py index 489befbefc0..eb0e3b008d2 100644 --- a/benchmarks/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks/benchmarks.py @@ -14,11 +14,11 @@ import time import timeit +import aesara +import aesara.tensor as aet import arviz as az import numpy as np import pandas as pd -import theano -import theano.tensor as tt import pymc3 as pm @@ -27,7 +27,7 @@ def glm_hierarchical_model(random_seed=123): """Sample glm hierarchical model to use in benchmarks""" np.random.seed(random_seed) data = pd.read_csv(pm.get_data("radon.csv")) - data["log_radon"] = data["log_radon"].astype(theano.config.floatX) + data["log_radon"] = data["log_radon"].astype(aesara.config.floatX) county_idx = data.county_code.values n_counties = len(data.county.unique()) @@ -61,8 +61,8 @@ def mixture_model(random_seed=1234): mu = pm.Normal("mu", mu=0.0, sd=10.0, shape=w_true.shape) enforce_order = pm.Potential( "enforce_order", - tt.switch(mu[0] - mu[1] <= 0, 0.0, -np.inf) - + tt.switch(mu[1] - mu[2] <= 0, 0.0, -np.inf), + aet.switch(mu[0] - mu[1] <= 0, 0.0, -np.inf) + + aet.switch(mu[1] - mu[2] <= 0, 0.0, -np.inf), ) tau = pm.Gamma("tau", alpha=1.0, beta=1.0, shape=w_true.shape) pm.NormalMixture("x_obs", w=w, mu=mu, tau=tau, observed=x) diff --git a/docs/source/Advanced_usage_of_Theano_in_PyMC3.rst b/docs/source/Advanced_usage_of_Theano_in_PyMC3.rst index ba0df7cad46..2815c99bf40 100644 --- a/docs/source/Advanced_usage_of_Theano_in_PyMC3.rst +++ b/docs/source/Advanced_usage_of_Theano_in_PyMC3.rst @@ -4,20 +4,20 @@ _referenced in docs/source/notebooks/table_of_contents_tutorials.js ================================= -Advanced usage of Theano in PyMC3 +Advanced usage of Aesara in PyMC3 ================================= Using shared variables ====================== -Shared variables allow us to use values in theano functions that are +Shared variables allow us to use values in aesara functions that are not considered an input to the function, but can still be changed later. They are very similar to global variables in may ways:: - a = tt.scalar('a') + a = aet.scalar('a') # Create a new shared variable with initial value of 0.1 - b = theano.shared(0.1) - func = theano.function([a], a * b) + b = aesara.shared(0.1) + func = aesara.function([a], a * b) assert func(2.) == 0.2 b.set_value(10.) 
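The shared-variable behaviour described in the hunk above can be checked end to end with a minimal, self-contained sketch (illustrative only; it assumes a working Aesara installation and mirrors the names used in the documentation excerpt)::

    import aesara
    import aesara.tensor as aet

    a = aet.scalar("a")
    # A shared variable is not an input of the compiled function,
    # but its current value is used whenever the function is called.
    b = aesara.shared(0.1)
    func = aesara.function([a], a * b)

    assert func(2.0) == 0.2   # uses the initial value of b
    b.set_value(10.0)         # update the shared value in place
    assert func(2.0) == 20.0  # the same compiled function sees the new value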
@@ -34,7 +34,7 @@ be time consuming if the number of datasets is large):: true_mu = [np.random.randn() for _ in range(10)] observed_data = [mu + np.random.randn(20) for mu in true_mu] - data = theano.shared(observed_data[0]) + data = aesara.shared(observed_data[0]) with pm.Model() as model: mu = pm.Normal('mu', 0, 10) pm.Normal('y', mu=mu, sigma=1, observed=data) @@ -55,7 +55,7 @@ variable for our observations:: x = np.random.randn(100) y = x > 0 - x_shared = theano.shared(x) + x_shared = aesara.shared(x) with pm.Model() as model: coeff = pm.Normal('x', mu=0, sigma=1) @@ -74,10 +74,10 @@ not possible to change the shape of a shared variable if that would also change the shape of one of the variables. -Writing custom Theano Ops +Writing custom Aesara Ops ========================= -While Theano includes a wide range of operations, there are cases where +While Aesara includes a wide range of operations, there are cases where it makes sense to write your own. But before doing this it is a good idea to think hard if it is actually necessary. Especially if you want to use algorithms that need gradient information — this includes NUTS and @@ -87,22 +87,22 @@ debugging skills for the gradients. Good reasons for defining a custom Op might be the following: -- You require an operation that is not available in Theano and can't - be build up out of existing Theano operations. This could for example +- You require an operation that is not available in Aesara and can't + be build up out of existing Aesara operations. This could for example include models where you need to solve differential equations or integrals, or find a root or minimum of a function that depends on your parameters. - You want to connect your PyMC3 model to some existing external code. - After carefully considering different parametrizations and a lot of profiling your model is still too slow, but you know of a faster - way to compute the gradient than what theano is doing. This faster + way to compute the gradient than what aesara is doing. This faster way might be anything from clever maths to using more hardware. There is nothing stopping anyone from using a cluster via MPI in a custom node, if a part of the gradient computation is slow enough and sufficiently parallelizable to make the cost worth it. We would definitely like to hear about any such examples. -Theano has extensive `documentation, `_ +Aesara has extensive `documentation, `_ about how to write new Ops. @@ -158,7 +158,7 @@ We can now use `scipy.optimize.newton` to find the root:: def mu_from_theta(theta): return optimize.newton(func, 1, fprime=jac, args=(theta,)) -We could wrap `mu_from_theta` with `theano.compile.ops.as_op` and use gradient-free +We could wrap `mu_from_theta` with `aesara.compile.ops.as_op` and use gradient-free methods like Metropolis, but to get NUTS and ADVI working, we also need to define the derivative of `mu_from_theta`. 
We can find this derivative using the implicit function theorem, or equivalently we @@ -181,16 +181,16 @@ We get \frac{d}{d\theta}\mu(\theta) = - \frac{\mu(\theta)^2}{1 + \theta\mu(\theta) + e^{-\theta\mu(\theta)}} -Now, we use this to define a theano op, that also computes the gradient:: +Now, we use this to define a aesara op, that also computes the gradient:: - import theano - import theano.tensor as tt - import theano.tests.unittest_tools - from theano.graph.op import Op + import aesara + import aesara.tensor as aet + import aesara.tests.unittest_tools + from aesara.graph.op import Op class MuFromTheta(Op): - itypes = [tt.dscalar] - otypes = [tt.dscalar] + itypes = [aet.dscalar] + otypes = [aet.dscalar] def perform(self, node, inputs, outputs): theta, = inputs @@ -201,23 +201,23 @@ Now, we use this to define a theano op, that also computes the gradient:: theta, = inputs mu = self(theta) thetamu = theta * mu - return [- g[0] * mu ** 2 / (1 + thetamu + tt.exp(-thetamu))] + return [- g[0] * mu ** 2 / (1 + thetamu + aet.exp(-thetamu))] If you value your sanity, always check that the gradient is ok:: - theano.tests.unittest_tools.verify_grad(MuFromTheta(), [np.array(0.2)]) - theano.tests.unittest_tools.verify_grad(MuFromTheta(), [np.array(1e-5)]) - theano.tests.unittest_tools.verify_grad(MuFromTheta(), [np.array(1e5)]) + aesara.gradient.verify_grad(MuFromTheta(), [np.array(0.2)]) + aesara.gradient.verify_grad(MuFromTheta(), [np.array(1e-5)]) + aesara.gradient.verify_grad(MuFromTheta(), [np.array(1e5)]) We can now define our model using this new op:: import pymc3 as pm - tt_mu_from_theta = MuFromTheta() + aet_mu_from_theta = MuFromTheta() with pm.Model() as model: theta = pm.HalfNormal('theta', sigma=1) - mu = pm.Deterministic('mu', tt_mu_from_theta(theta)) + mu = pm.Deterministic('mu', aet_mu_from_theta(theta)) pm.Normal('y', mu=mu, sigma=0.1, observed=[0.2, 0.21, 0.3]) trace = pm.sample() diff --git a/docs/source/Gaussian_Processes.rst b/docs/source/Gaussian_Processes.rst index 3f4583a80c0..40c987acd7f 100644 --- a/docs/source/Gaussian_Processes.rst +++ b/docs/source/Gaussian_Processes.rst @@ -113,7 +113,7 @@ which allows users to combine covariance functions into new ones, for example: After the covariance function is defined, it is now a function that is evaluated by calling :code:`cov_func(x, x)` (or :code:`mean_func(x)`). Since -PyMC3 is built on top of Theano, it is relatively easy to define and experiment +PyMC3 is built on top of Aesara, it is relatively easy to define and experiment with non-standard covariance and mean functons. For more information check out the tutorial on covariance functions. @@ -158,7 +158,7 @@ other type of random variable. The first argument is the name of the random variable representing the function we are placing the prior over. The second argument is the inputs to the function that the prior is over, :code:`X`. The inputs are usually known and present in the data, but they can -also be PyMC3 random variables. If the inputs are a Theano tensor or a +also be PyMC3 random variables. If the inputs are a Aesara tensor or a PyMC3 random variable, the :code:`shape` needs to be given. Usually at this point, inference is performed on the model. 
The diff --git a/docs/source/Probability_Distributions.rst b/docs/source/Probability_Distributions.rst index 8c49af6eaa5..f15c43ecb91 100644 --- a/docs/source/Probability_Distributions.rst +++ b/docs/source/Probability_Distributions.rst @@ -27,7 +27,7 @@ A variable requires at least a ``name`` argument, and zero or more model paramet p = pm.Beta('p', 1, 1, shape=(3, 3)) -Probability distributions are all subclasses of ``Distribution``, which in turn has two major subclasses: ``Discrete`` and ``Continuous``. In terms of data types, a ``Continuous`` random variable is given whichever floating point type is defined by ``theano.config.floatX``, while ``Discrete`` variables are given ``int16`` types when ``theano.config.floatX`` is ``float32``, and ``int64`` otherwise. +Probability distributions are all subclasses of ``Distribution``, which in turn has two major subclasses: ``Discrete`` and ``Continuous``. In terms of data types, a ``Continuous`` random variable is given whichever floating point type is defined by ``aesara.config.floatX``, while ``Discrete`` variables are given ``int16`` types when ``aesara.config.floatX`` is ``float32``, and ``int64`` otherwise. All distributions in ``pm.distributions`` will have two important methods: ``random()`` and ``logp()`` with the following signatures: diff --git a/docs/source/PyMC3_and_Theano.rst b/docs/source/PyMC3_and_Theano.rst index d2c521ad3f5..c3f0794ee5a 100644 --- a/docs/source/PyMC3_and_Theano.rst +++ b/docs/source/PyMC3_and_Theano.rst @@ -4,24 +4,24 @@ _href from docs/source/index.rst ================ -PyMC3 and Theano +PyMC3 and Aesara ================ -What is Theano +What is Aesara ============== -Theano is a package that allows us to define functions involving array +Aesara is a package that allows us to define functions involving array operations and linear algebra. When we define a PyMC3 model, we implicitly -build up a Theano function from the space of our parameters to +build up a Aesara function from the space of our parameters to their posterior probability density up to a constant factor. We then use symbolic manipulations of this function to also get access to its gradient. -Note that the original developers have stopped maintaining Theano, so -PyMC3 uses `Theano-PyMC `_, -a fork of Theano maintained by the PyMC3 developers. +Note that the original developers have stopped maintaining Aesara, so +PyMC3 uses `Aesara `_, +a fork of Aesara maintained by the PyMC3 developers. -For a thorough introduction to Theano see the -`theano docs `_, +For a thorough introduction to Aesara see the +`aesara docs `_, but for the most part you don't need detailed knowledge about it as long as you are not trying to define new distributions or other extensions of PyMC3. But let's look at a simple example to get a rough @@ -37,14 +37,14 @@ arbitrarily chosen) function First, we need to define symbolic variables for our inputs (this is similar to eg SymPy's `Symbol`):: - import theano - import theano.tensor as tt + import aesara + import aesara.tensor as aet # We don't specify the dtype of our input variables, so it # defaults to using float64 without any special config. - a = tt.scalar('a') - x = tt.vector('x') - # `tt.ivector` creates a symbolic vector of integers. - y = tt.ivector('y') + a = aet.scalar('a') + x = aet.vector('x') + # `aet.ivector` creates a symbolic vector of integers. + y = aet.ivector('y') Next, we use those variables to build up a symbolic representation of the output of our function. 
Note that no computation is actually @@ -52,24 +52,24 @@ being done at this point. We only record what operations we need to do to compute the output:: inner = a * x**3 + y**2 - out = tt.exp(inner).sum() + out = aet.exp(inner).sum() .. note:: - In this example we use `tt.exp` to create a symbolic representation + In this example we use `aet.exp` to create a symbolic representation of the exponential of `inner`. Somewhat surprisingly, it would also have worked if we used `np.exp`. This is because numpy gives objects it operates on a chance to define the results of - operations themselves. Theano variables do this for a large number - of operations. We usually still prefer the theano + operations themselves. Aesara variables do this for a large number + of operations. We usually still prefer the aesara functions instead of the numpy versions, as that makes it clear that we are working with symbolic input instead of plain arrays. -Now we can tell Theano to build a function that does this computation. -With a typical configuration, Theano generates C code, compiles it, +Now we can tell Aesara to build a function that does this computation. +With a typical configuration, Aesara generates C code, compiles it, and creates a python function which wraps the C function:: - func = theano.function([a, x, y], [out]) + func = aesara.function([a, x, y], [out]) We can call this function with actual arrays as many times as we want:: @@ -79,22 +79,22 @@ We can call this function with actual arrays as many times as we want:: out = func(a_val, x_vals, y_vals) -For the most part the symbolic Theano variables can be operated on -like NumPy arrays. Most NumPy functions are available in `theano.tensor` -(which is typically imported as `tt`). A lot of linear algebra operations -can be found in `tt.nlinalg` and `tt.slinalg` (the NumPy and SciPy +For the most part the symbolic Aesara variables can be operated on +like NumPy arrays. Most NumPy functions are available in `aesara.tensor` +(which is typically imported as `aet`). A lot of linear algebra operations +can be found in `aet.nlinalg` and `aet.slinalg` (the NumPy and SciPy operations respectively). Some support for sparse matrices is available -in `theano.sparse`. For a detailed overview of available operations, -see `the theano api docs `_. +in `aesara.sparse`. For a detailed overview of available operations, +see `the aesara api docs `_. -A notable exception where theano variables do *not* behave like +A notable exception where aesara variables do *not* behave like NumPy arrays are operations involving conditional execution. Code like this won't work as expected:: - a = tt.vector('a') + a = aet.vector('a') if (a > 0).all(): - b = tt.sqrt(a) + b = aet.sqrt(a) else: b = -a @@ -104,17 +104,17 @@ and according to the rules for this conversion, things that aren't empty containers or zero are converted to `True`. So the code is equivalent to this:: - a = tt.vector('a') - b = tt.sqrt(a) + a = aet.vector('a') + b = aet.sqrt(a) -To get the desired behaviour, we can use `tt.switch`:: +To get the desired behaviour, we can use `aet.switch`:: - a = tt.vector('a') - b = tt.switch((a > 0).all(), tt.sqrt(a), -a) + a = aet.vector('a') + b = aet.switch((a > 0).all(), aet.sqrt(a), -a) Indexing also works similarly to NumPy:: - a = tt.vector('a') + a = aet.vector('a') # Access the 10th element. This will fail when a function build # from this expression is executed with an array that is too short. 
b = a[10] @@ -122,21 +122,21 @@ Indexing also works similarly to NumPy:: # Extract a subvector b = a[[1, 2, 10]] -Changing elements of an array is possible using `tt.set_subtensor`:: +Changing elements of an array is possible using `aet.set_subtensor`:: - a = tt.vector('a') - b = tt.set_subtensor(a[:10], 1) + a = aet.vector('a') + b = aet.set_subtensor(a[:10], 1) - # is roughly equivalent to this (although theano avoids + # is roughly equivalent to this (although aesara avoids # the copy if `a` isn't used anymore) a = np.random.randn(10) b = a.copy() b[:10] = 1 -How PyMC3 uses Theano +How PyMC3 uses Aesara ===================== -Now that we have a basic understanding of Theano we can look at what +Now that we have a basic understanding of Aesara we can look at what happens if we define a PyMC3 model. Let's look at a simple example:: true_mu = 0.1 @@ -163,7 +163,7 @@ where with the normal likelihood :math:`N(x|μ,σ^2)` To build that function we need to keep track of two things: The parameter space (the *free variables*) and the logp function. For each free variable -we generate a Theano variable. And for each variable (observed or otherwise) +we generate a Aesara variable. And for each variable (observed or otherwise) we add a term to the global logp. In the background something similar to this is happening:: @@ -171,7 +171,7 @@ this is happening:: # in exactly this way! model = pm.Model() - mu = tt.scalar('mu') + mu = aet.scalar('mu') model.add_free_variable(mu) model.add_logp_term(pm.Normal.dist(0, 1).logp(mu)) @@ -181,7 +181,7 @@ So calling `pm.Normal()` modifies the model: It changes the logp function of the model. If the `observed` keyword isn't set it also creates a new free variable. In contrast, `pm.Normal.dist()` doesn't care about the model, it just creates an object that represents the normal distribution. Calling -`logp` on this object creates a theano variable for the logp probability +`logp` on this object creates a aesara variable for the logp probability or log probability density of the distribution, but again without changing the model in any way. @@ -199,27 +199,27 @@ is roughly equivalent to this:: # For illustration only, not real code! model = pm.Model() - mu = tt.scalar('mu') + mu = aet.scalar('mu') model.add_free_variable(mu) model.add_logp_term(pm.Normal.dist(0, 1).logp(mu)) - sd_log__ = tt.scalar('sd_log__') + sd_log__ = aet.scalar('sd_log__') model.add_free_variable(sd_log__) model.add_logp_term(corrected_logp_half_normal(sd_log__)) - sd = tt.exp(sd_log__) + sd = aet.exp(sd_log__) model.add_deterministic_variable(sd) model.add_logp_term(pm.Normal.dist(mu, sd).logp(data)) The return values of the variable constructors are subclasses -of theano variables, so when we define a variable we can use any -theano operation on them:: +of aesara variables, so when we define a variable we can use any +aesara operation on them:: design_matrix = np.array([[...]]) with pm.Model() as model: - # beta is a tt.dvector + # beta is a aet.dvector beta = pm.Normal('beta', 0, 1, shape=len(design_matrix)) - predict = tt.dot(design_matrix, beta) + predict = aet.dot(design_matrix, beta) sd = pm.HalfCauchy('sd', beta=2.5) pm.Normal('y', mu=predict, sigma=sd, observed=data) diff --git a/docs/source/about.rst b/docs/source/about.rst index 20f111caa5a..56cf3355555 100644 --- a/docs/source/about.rst +++ b/docs/source/about.rst @@ -27,7 +27,7 @@ PyMC3 strives to make Bayesian modeling as simple and painless as possible, all * Includes a large suite of well-documented statistical distributions. 
-* Uses Theano as the computational backend, allowing for fast expression evaluation, automatic gradient calculation, and GPU computing. +* Uses Aesara as the computational backend, allowing for fast expression evaluation, automatic gradient calculation, and GPU computing. * Built-in support for Gaussian process modeling. @@ -45,7 +45,7 @@ PyMC3 strives to make Bayesian modeling as simple and painless as possible, all What's new in version 3 ======================= -The third major version of PyMC has benefitted from being re-written from scratch. Substantial improvements in the user interface and performance have resulted from this. While PyMC2 relied on Fortran extensions (via f2py) for most of the computational heavy-lifting, PyMC3 leverages Theano, a library from the Montréal Institute for Learning Algorithms (MILA), for array-based expression evaluation, to perform its computation. What this provides, above all else, is fast automatic differentiation, which is at the heart of the gradient-based sampling and optimization methods currently providing inference for probabilistic programming. +The third major version of PyMC has benefitted from being re-written from scratch. Substantial improvements in the user interface and performance have resulted from this. While PyMC2 relied on Fortran extensions (via f2py) for most of the computational heavy-lifting, PyMC3 leverages Aesara, a library from the Montréal Institute for Learning Algorithms (MILA), for array-based expression evaluation, to perform its computation. What this provides, above all else, is fast automatic differentiation, which is at the heart of the gradient-based sampling and optimization methods currently providing inference for probabilistic programming. Major changes from previous versions: @@ -65,7 +65,7 @@ Major changes from previous versions: * Much more! -While the addition of Theano adds a level of complexity to the development of PyMC, fundamentally altering how the underlying computation is performed, we have worked hard to maintain the elegant simplicity of the original PyMC model specification syntax. +While the addition of Aesara adds a level of complexity to the development of PyMC, fundamentally altering how the underlying computation is performed, we have worked hard to maintain the elegant simplicity of the original PyMC model specification syntax. History @@ -90,7 +90,7 @@ plotting, csv table output, improved imputation syntax, and posterior predictive check plots. PyMC 2.3 was released on October 31, 2013. It included Python 3 compatibility, improved summary plots, and some important bug fixes. -In 2011, John Salvatier began thinking about implementing gradient-based MCMC samplers, and developed the ``mcex`` package to experiment with his ideas. The following year, John was invited by the team to re-engineer PyMC to accomodate Hamiltonian Monte Carlo sampling. This led to the adoption of Theano as the computational back end, and marked the beginning of PyMC3's development. The first alpha version of PyMC3 was released in June 2015. Over the following 2 years, the core development team grew to 12 members, and the first release, PyMC3 3.0, was launched in January 2017. +In 2011, John Salvatier began thinking about implementing gradient-based MCMC samplers, and developed the ``mcex`` package to experiment with his ideas. The following year, John was invited by the team to re-engineer PyMC to accomodate Hamiltonian Monte Carlo sampling. 
This led to the adoption of Aesara as the computational back end, and marked the beginning of PyMC3's development. The first alpha version of PyMC3 was released in June 2015. Over the following 2 years, the core development team grew to 12 members, and the first release, PyMC3 3.0, was launched in January 2017. .. _support: diff --git a/docs/source/api/math.rst b/docs/source/api/math.rst index c548d132163..8842a77c334 100644 --- a/docs/source/api/math.rst +++ b/docs/source/api/math.rst @@ -3,8 +3,8 @@ Math ==== This submodule contains various mathematical functions. Most of them are imported directly -from theano.tensor (see there for more details). Doing any kind of math with PyMC3 random -variables, or defining custom likelihoods or priors requires you to use these theano +from aesara.tensor (see there for more details). Doing any kind of math with PyMC3 random +variables, or defining custom likelihoods or priors requires you to use these aesara expressions rather than NumPy or Python code. .. currentmodule:: pymc3.math diff --git a/docs/source/conf.py b/docs/source/conf.py index 9b23a323af7..0b7d3a1cc2c 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -158,7 +158,7 @@ ("About PyMC3", "about"), ], # "fixed_sidebar": "false", - # "description": "Probabilistic Programming in Python: Bayesian Modeling and Probabilistic Machine Learning with Theano" + # "description": "Probabilistic Programming in Python: Bayesian Modeling and Probabilistic Machine Learning with Aesara" } # Add any paths that contain custom themes here, relative to this directory. diff --git a/docs/source/developer_guide.rst b/docs/source/developer_guide.rst index 64463cd5b41..4f7b6e45248 100644 --- a/docs/source/developer_guide.rst +++ b/docs/source/developer_guide.rst @@ -9,7 +9,7 @@ PyMC3 Developer Guide `PyMC3 `__ is a Python package for Bayesian statistical modeling built on top of -`Theano `__. This +`Aesara `__. This document aims to explain the design and implementation of probabilistic programming in PyMC3, with comparisons to other PPL like TensorFlow Probability (TFP) and Pyro in mind. A user-facing API @@ -110,7 +110,7 @@ elementary. As long as you have a well-behaved density function, we can use it in the model to build the model log-likelihood function. Random number generation is great to have, but sometimes there might not be efficient random number generator for some densities. Since a function -is all you need, you can wrap almost any Theano function into a +is all you need, you can wrap almost any Aesara function into a distribution using ``pm.DensityDist`` https://docs.pymc.io/Probability\_Distributions.html#custom-distributions @@ -147,7 +147,7 @@ density function `__ .. math:: X:=f(x) = \frac{1}{\sigma \sqrt{2 \pi}} \exp^{- 0.5 (\frac{x - \mu}{\sigma})^2}\vert_{\mu = 0, \sigma=1} = \frac{1}{\sqrt{2 \pi}} \exp^{- 0.5 x^2} -Within a model context, RVs are essentially Theano tensors (more on that +Within a model context, RVs are essentially Aesara tensors (more on that below). This is different than TFP and pyro, where you need to be more explicit about the conversion. For example: @@ -156,7 +156,7 @@ explicit about the conversion. For example: .. code:: python with pm.Model() as model: - z = pm.Normal('z', mu=0., sigma=5.) # ==> pymc3.model.FreeRV, or theano.tensor with logp + z = pm.Normal('z', mu=0., sigma=5.) # ==> pymc3.model.FreeRV, or aesara.tensor with logp x = pm.Normal('x', mu=z, sigma=1., observed=5.) 
# ==> pymc3.model.ObservedRV, also has logp properties x.logp({'z': 2.5}) # ==> -4.0439386 model.logp({'z': 2.5}) # ==> -6.6973152 @@ -194,7 +194,7 @@ Random method and logp method, very different behind the curtain ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In short, the random method is scipy/numpy-based, and the logp method is -Theano-based. The ``logp`` method is straightforward - it is a Theano +Aesara-based. The ``logp`` method is straightforward - it is a Aesara function within each distribution. It has the following signature: .. code:: python @@ -202,20 +202,20 @@ function within each distribution. It has the following signature: def logp(self, value): # GET PARAMETERS param1, param2, ... = self.params1, self.params2, ... - # EVALUATE LOG-LIKELIHOOD FUNCTION, all inputs are (or array that could be convert to) theano tensor + # EVALUATE LOG-LIKELIHOOD FUNCTION, all inputs are (or array that could be convert to) aesara tensor total_log_prob = f(param1, param2, ..., value) return total_log_prob -In the ``logp`` method, parameters and values are either Theano tensors, +In the ``logp`` method, parameters and values are either Aesara tensors, or could be converted to tensors. It is rather convenient as the evaluation of logp is represented as a tensor (``RV.logpt``), and when we linked different ``logp`` together (e.g., summing all ``RVs.logpt`` -to get the model totall logp) the dependence is taken care of by Theano +to get the model totall logp) the dependence is taken care of by Aesara when the graph is built and compiled. Again, since the compiled function depends on the nodes that already in the graph, whenever you want to generate a new function that takes new input tensors you either need to regenerate the graph with the appropriate dependencies, or replace the node by editing the existing graph. -In PyMC3 we use the second approach by using ``theano.clone()`` when it is needed. +In PyMC3 we use the second approach by using ``aesara.clone_replace()`` when it is needed. As explained above, distribution in a ``pm.Model()`` context automatically turn into a tensor with distribution property (pymc3 @@ -225,7 +225,7 @@ itself `__ -(representated as a tensor also) property to a Theano tensor (thus +(representated as a tensor also) property to a Aesara tensor (thus making it a random variable). For a ``TransformedRV``, it transforms the distribution into a ``TransformedDistribution``, and then ``model.Var`` is called again to added the RV associated with the @@ -494,10 +494,10 @@ the model logp), and also deterministic transformation (as bookkeeping): named\_vars, free\_RVs, observed\_RVs, deterministics, potentials, missing\_values. The model context then computes some simple model properties, builds a bijection mapping that transforms between -dictionary and numpy/Theano ndarray, thus allowing the ``logp``/``dlogp`` functions +dictionary and numpy/Aesara ndarray, thus allowing the ``logp``/``dlogp`` functions to have two equivalent versions: one takes a ``dict`` as input and the other takes an ``ndarray`` as input. More importantly, a ``pm.Model()`` contains methods -to compile Theano functions that take Random Variables (that are also +to compile Aesara functions that take Random Variables (that are also initialised within the same model) as input, for example: .. 
code:: python @@ -559,20 +559,20 @@ sum them together to get the model logp: @property def logpt(self): - """Theano scalar of log-probability of the model""" + """Aesara scalar of log-probability of the model""" with self: factors = [var.logpt for var in self.basic_RVs] + self.potentials - logp = tt.sum([tt.sum(factor) for factor in factors]) + logp = aet.sum([aet.sum(factor) for factor in factors]) ... return logp -which returns a Theano tensor that its value depends on the free -parameters in the model (i.e., its parent nodes from the Theano +which returns a Aesara tensor that its value depends on the free +parameters in the model (i.e., its parent nodes from the Aesara graph).You can evaluate or compile into a python callable (that you can pass numpy as input args). Note that the logp tensor depends on its -input in the Theano graph, thus you cannot pass new tensor to generate a +input in the Aesara graph, thus you cannot pass new tensor to generate a logp function. For similar reason, in PyMC3 we do graph copying a lot -using theano.clone to replace the inputs to a tensor. +using aesara.clone_replace to replace the inputs to a tensor. .. code:: python @@ -587,7 +587,7 @@ using theano.clone to replace the inputs to a tensor. .. code:: python - type(m.logpt) # ==> theano.tensor.var.TensorVariable + type(m.logpt) # ==> aesara.tensor.var.TensorVariable .. code:: python @@ -620,14 +620,14 @@ logp/dlogp function: return ValueGradFunction(self.logpt, grad_vars, extra_vars, **kwargs) ``ValueGradFunction`` is a callable class which isolates part of the -Theano graph to compile additional Theano functions. PyMC3 relies on -``theano.clone`` to copy the ``model.logpt`` and replace its input. It +Aesara graph to compile additional Aesara functions. PyMC3 relies on +``aesara.clone_replace`` to copy the ``model.logpt`` and replace its input. It does not edit or rewrite the graph directly. .. code:: python class ValueGradFunction: - """Create a theano function that computes a value and its gradient. + """Create a aesara function that computes a value and its gradient. ... """ def __init__(self, logpt, grad_vars, extra_vars=[], dtype=None, @@ -646,31 +646,31 @@ does not edit or rewrite the graph directly. # Extra vars are a subset of free_RVs that are not input to the compiled function. # But nonetheless logpt depends on these RVs. - # This is set up as a dict of theano.shared tensors, but givens (a list of - # tuple(free_RVs, theano.shared)) is the actual list that goes into the theano function + # This is set up as a dict of aesara.shared tensors, but givens (a list of + # tuple(free_RVs, aesara.shared)) is the actual list that goes into the aesara function givens = [] self._extra_vars_shared = {} for var in extra_vars: - shared = theano.shared(var.tag.test_value, var.name + '_shared__') + shared = aesara.shared(var.tag.test_value, var.name + '_shared__') self._extra_vars_shared[var.name] = shared givens.append((var, shared)) # See the implementation below. 
Basically, it clones the logpt and replaces its - # input with a *single* 1d theano tensor + # input with a *single* 1d aesara tensor self._vars_joined, self._logpt_joined = self._build_joined( self._logpt, grad_vars, self._ordering.vmap) - grad = tt.grad(self._logpt_joined, self._vars_joined) + grad = aet.grad(self._logpt_joined, self._vars_joined) grad.name = '__grad' inputs = [self._vars_joined] - self._theano_function = theano.function( + self._aesara_function = aesara.function( inputs, [self._logpt_joined, grad], givens=givens, **kwargs) def _build_joined(self, logpt, args, vmap): - args_joined = tt.vector('__args_joined') + args_joined = aet.vector('__args_joined') args_joined.tag.test_value = np.zeros(self.size, dtype=self.dtype) joined_slices = {} @@ -680,12 +680,12 @@ does not edit or rewrite the graph directly. joined_slices[vmap.var] = sliced replace = {var: joined_slices[var.name] for var in args} - return args_joined, theano.clone(logpt, replace=replace) + return args_joined, aesara.clone_replace(logpt, replace=replace) def __call__(self, array, grad_out=None, extra_vars=None): ... - logp, dlogp = self._theano_function(array) + logp, dlogp = self._aesara_function(array) return logp, dlogp @@ -773,12 +773,12 @@ gradient easily. Here is a taste of how it works in action: So why is this necessary? One can imagine that we just compile one logp function, and do bookkeeping ourselves. For example, we can build the -logp function in Theano directly: +logp function in Aesara directly: .. code:: python - import theano - func = theano.function(m.free_RVs, m.logpt) + import aesara + func = aesara.function(m.free_RVs, m.logpt) func(*inputlist) @@ -790,8 +790,8 @@ logp function in Theano directly: .. code:: python - logpt_grad = theano.grad(m.logpt, m.free_RVs) - func_d = theano.function(m.free_RVs, logpt_grad) + logpt_grad = aesara.grad(m.logpt, m.free_RVs) + func_d = aesara.function(m.free_RVs, logpt_grad) func_d(*inputlist) @@ -808,12 +808,12 @@ Similarly, build a conditional logp: .. code:: python - shared = theano.shared(inputlist[1]) - func2 = theano.function([m.free_RVs[0]], m.logpt, givens=[(m.free_RVs[1], shared)]) + shared = aesara.shared(inputlist[1]) + func2 = aesara.function([m.free_RVs[0]], m.logpt, givens=[(m.free_RVs[1], shared)]) print(func2(inputlist[0])) - logpt_grad2 = theano.grad(m.logpt, m.free_RVs[0]) - func_d2 = theano.function([m.free_RVs[0]], logpt_grad2, givens=[(m.free_RVs[1], shared)]) + logpt_grad2 = aesara.grad(m.logpt, m.free_RVs[0]) + func_d2 = aesara.function([m.free_RVs[0]], logpt_grad2, givens=[(m.free_RVs[1], shared)]) print(func_d2(inputlist[0])) @@ -830,7 +830,7 @@ everything into a single function: .. code:: python - func_logp_and_grad = theano.function(m.free_RVs, [m.logpt, logpt_grad]) # ==> ERROR + func_logp_and_grad = aesara.function(m.free_RVs, [m.logpt, logpt_grad]) # ==> ERROR We want to have a function that return the evaluation and its gradient @@ -838,23 +838,23 @@ re each input: ``value, grad = f(x)``, but the naive implementation does not work. We can of course wrap 2 functions - one for logp one for dlogp - and output a list. But that would mean we need to call 2 functions. In addition, when we write code using python logic to do bookkeeping when -we build our conditional logp. Using ``theano.clone``, we always have -the input to the Theano function being a 1d vector (instead of a list of +we build our conditional logp. 
Using ``aesara.clone_replace``, we always have +the input to the Aesara function being a 1d vector (instead of a list of RV that each can have very different shape), thus it is very easy to do matrix operation like rotation etc. Notes ~~~~~ -| The current setup is quite powerful, as the Theano compiled function +| The current setup is quite powerful, as the Aesara compiled function is fairly fast to compile and to call. Also, when we are repeatedly calling a conditional logp function, external RV only need to reset once. However, there are still significant overheads when we are - passing values between Theano graph and numpy. That is the reason we + passing values between Aesara graph and numpy. That is the reason we often see no advantage in using GPU, because the data is copying between GPU and CPU at each function call - and for a small model, the result is a slower inference under GPU than CPU. -| Also, ``theano.clone`` is too convenient (pymc internal joke is that +| Also, ``aesara.clone_replace`` is too convenient (pymc internal joke is that it is like a drug - very addictive). If all the operation happens in the graph (including the conditioning and setting value), I see no need to isolate part of the graph (via graph copying or graph @@ -927,10 +927,10 @@ Dynamic HMC ^^^^^^^^^^^ We love NUTS, or to be more precise Dynamic HMC with complex stopping -rules. This part is actually all done outside of Theano, for NUTS, it +rules. This part is actually all done outside of Aesara, for NUTS, it includes: the leapfrog, dual averaging, tunning of mass matrix and step size, the tree building, sampler related statistics like divergence and -energy checking. We actually have a Theano version of HMC, but it has never +energy checking. We actually have a Aesara version of HMC, but it has never been used, and has been removed from the main repository. It can still be found in the `git history `__, @@ -940,7 +940,7 @@ Variational Inference (VI) ~~~~~~~~~~~~~~~~~~~~~~~~~~ The design of the VI module takes a different approach than -MCMC - it has a functional design, and everything is done within Theano +MCMC - it has a functional design, and everything is done within Aesara (i.e., Optimization and building the variational objective). The base class of variational inference is `pymc3.variational.Inference `__, @@ -1006,7 +1006,7 @@ skip this for now and only consider ``SingleGroupApproximation`` like `variational/opvi `__, strip away the normalizing term, ``datalogp`` and ``varlogp`` are expectation of the variational free\_RVs and data logp - we clone the -datalogp and varlogp from the model, replace its input with Theano +datalogp and varlogp from the model, replace its input with Aesara tensor that `samples from the variational posterior `__. For ADVI, these samples are from `a @@ -1021,7 +1021,7 @@ straightforward to evaluate `__ - `Laplace approximation in pymc3.ipynb `__ - Connecting it to other library within a model - - `Using “black box” likelihood function by creating a custom Theano Op `__ + - `Using “black box” likelihood function by creating a custom Aesara Op `__ - Using emcee - Using other library for inference - Connecting to Julia for solving ODE (with gradient for solution that can be used in NUTS) @@ -1115,14 +1115,14 @@ Random methods in numpy There is a lot of complex logic for sampling from random variables, and because it is all in Python, we can't transform a sampling graph -further. Unfortunately, Theano does not have code to sample from various +further. 
Unfortunately, Aesara does not have code to sample from various distributions and we didn't want to write that our own. Samplers are in Python ~~~~~~~~~~~~~~~~~~~~~~ While having the samplers be written in Python allows for a lot of -flexibility and intuitive for experiment (writing e.g. NUTS in Theano is +flexibility and intuitive for experiment (writing e.g. NUTS in Aesara is also very difficult), it comes at a performance penalty and makes sampling on the GPU very inefficient because memory needs to be copied for every logp evaluation. diff --git a/docs/source/index.rst b/docs/source/index.rst index 053e1962f14..dd4effa4a47 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -97,10 +97,10 @@ - +
-PyMC3 and Theano
-Theano is the deep-learning library PyMC3 uses to construct probability distributions and then access the gradient in order to implement cutting edge inference algorithms. More advanced models may be built by understanding this layer.
+PyMC3 and Aesara
+Aesara is the library PyMC3 uses to construct probability distributions and then access the gradient in order to implement cutting edge inference algorithms. More advanced models may be built by understanding this layer.
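As a minimal illustration of the "access the gradient" step mentioned in this blurb (a hedged sketch, not taken from the patch; it only assumes that Aesara is importable)::

    import aesara
    import aesara.tensor as aet
    from aesara.gradient import grad

    x = aet.dscalar("x")
    y = x ** 2                       # a simple symbolic expression
    dy_dx = grad(y, x)               # symbolic derivative: 2 * x
    f = aesara.function([x], dy_dx)
    assert f(3.0) == 6.0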
diff --git a/pymc3/__init__.py b/pymc3/__init__.py index 1e51deeb647..8f33feef09d 100644 --- a/pymc3/__init__.py +++ b/pymc3/__init__.py @@ -29,16 +29,17 @@ def __set_compiler_flags(): - # Workarounds for Theano compiler problems on various platforms - import theano + # Workarounds for Aesara compiler problems on various platforms + import aesara - current = theano.config.gcc__cxxflags - theano.config.gcc__cxxflags = f"{current} -Wno-c++11-narrowing" + current = aesara.config.gcc__cxxflags + aesara.config.gcc__cxxflags = f"{current} -Wno-c++11-narrowing" __set_compiler_flags() from pymc3 import gp, ode, sampling +from pymc3.aesaraf import * from pymc3.backends import load_trace, save_trace from pymc3.backends.tracetab import * from pymc3.blocking import * @@ -63,7 +64,6 @@ def __set_compiler_flags(): from pymc3.smc import * from pymc3.step_methods import * from pymc3.tests import test -from pymc3.theanof import * from pymc3.tuning import * from pymc3.variational import * from pymc3.vartypes import * diff --git a/pymc3/theanof.py b/pymc3/aesaraf.py similarity index 79% rename from pymc3/theanof.py rename to pymc3/aesaraf.py index c40311da6e8..87b370e55fd 100644 --- a/pymc3/theanof.py +++ b/pymc3/aesaraf.py @@ -12,14 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import aesara import numpy as np -import theano -from theano import scalar -from theano import tensor as tt -from theano.graph.basic import Apply, graph_inputs -from theano.graph.op import Op -from theano.sandbox.rng_mrg import MRG_RandomStream as RandomStream +from aesara import scalar +from aesara import tensor as aet +from aesara.gradient import grad +from aesara.graph.basic import Apply, graph_inputs +from aesara.graph.op import Op +from aesara.sandbox.rng_mrg import MRG_RandomStream as RandomStream +from aesara.tensor.elemwise import Elemwise +from aesara.tensor.var import TensorVariable from pymc3.blocking import ArrayOrdering from pymc3.data import GeneratorAdapter @@ -39,34 +42,34 @@ "join_nonshared_inputs", "make_shared_replacements", "generator", - "set_tt_rng", - "tt_rng", + "set_aet_rng", + "aet_rng", "take_along_axis", ] def inputvars(a): """ - Get the inputs into a theano variables + Get the inputs into a aesara variables Parameters ---------- - a: theano variable + a: aesara variable Returns ------- r: list of tensor variables that are inputs """ - return [v for v in graph_inputs(makeiter(a)) if isinstance(v, tt.TensorVariable)] + return [v for v in graph_inputs(makeiter(a)) if isinstance(v, TensorVariable)] def cont_inputs(f): """ - Get the continuous inputs into a theano variables + Get the continuous inputs into a aesara variables Parameters ---------- - a: theano variable + a: aesara variable Returns ------- @@ -77,13 +80,13 @@ def cont_inputs(f): def floatX(X): """ - Convert a theano tensor or numpy array to theano.config.floatX type. + Convert a aesara tensor or numpy array to aesara.config.floatX type. """ try: - return X.astype(theano.config.floatX) + return X.astype(aesara.config.floatX) except AttributeError: # Scalar passed - return np.asarray(X, dtype=theano.config.floatX) + return np.asarray(X, dtype=aesara.config.floatX) _conversion_map = {"float64": "int32", "float32": "int16", "float16": "int8", "float8": "int8"} @@ -91,9 +94,9 @@ def floatX(X): def intX(X): """ - Convert a theano tensor or numpy array to theano.tensor.int32 type. + Convert a aesara tensor or numpy array to aesara.tensor.int32 type. 
""" - intX = _conversion_map[theano.config.floatX] + intX = _conversion_map[aesara.config.floatX] try: return X.astype(intX) except AttributeError: @@ -111,16 +114,16 @@ def smartfloatX(x): """ -Theano derivative functions +Aesara derivative functions """ def gradient1(f, v): """flat gradient of f wrt v""" - return tt.flatten(tt.grad(f, v, disconnected_inputs="warn")) + return aet.flatten(grad(f, v, disconnected_inputs="warn")) -empty_gradient = tt.zeros(0, dtype="float32") +empty_gradient = aet.zeros(0, dtype="float32") def gradient(f, vars=None): @@ -128,20 +131,20 @@ def gradient(f, vars=None): vars = cont_inputs(f) if vars: - return tt.concatenate([gradient1(f, v) for v in vars], axis=0) + return aet.concatenate([gradient1(f, v) for v in vars], axis=0) else: return empty_gradient def jacobian1(f, v): """jacobian of f wrt v""" - f = tt.flatten(f) - idx = tt.arange(f.shape[0], dtype="int32") + f = aet.flatten(f) + idx = aet.arange(f.shape[0], dtype="int32") def grad_i(i): return gradient1(f[i], v) - return theano.map(grad_i, idx)[0] + return aesara.map(grad_i, idx)[0] def jacobian(f, vars=None): @@ -149,43 +152,43 @@ def jacobian(f, vars=None): vars = cont_inputs(f) if vars: - return tt.concatenate([jacobian1(f, v) for v in vars], axis=1) + return aet.concatenate([jacobian1(f, v) for v in vars], axis=1) else: return empty_gradient def jacobian_diag(f, x): - idx = tt.arange(f.shape[0], dtype="int32") + idx = aet.arange(f.shape[0], dtype="int32") def grad_ii(i): - return theano.grad(f[i], x)[i] + return grad(f[i], x)[i] - return theano.scan(grad_ii, sequences=[idx], n_steps=f.shape[0], name="jacobian_diag")[0] + return aesara.scan(grad_ii, sequences=[idx], n_steps=f.shape[0], name="jacobian_diag")[0] -@theano.config.change_flags(compute_test_value="ignore") +@aesara.config.change_flags(compute_test_value="ignore") def hessian(f, vars=None): return -jacobian(gradient(f, vars), vars) -@theano.config.change_flags(compute_test_value="ignore") +@aesara.config.change_flags(compute_test_value="ignore") def hessian_diag1(f, v): g = gradient1(f, v) - idx = tt.arange(g.shape[0], dtype="int32") + idx = aet.arange(g.shape[0], dtype="int32") def hess_ii(i): return gradient1(g[i], v)[i] - return theano.map(hess_ii, idx)[0] + return aesara.map(hess_ii, idx)[0] -@theano.config.change_flags(compute_test_value="ignore") +@aesara.config.change_flags(compute_test_value="ignore") def hessian_diag(f, vars=None): if vars is None: vars = cont_inputs(f) if vars: - return -tt.concatenate([hessian_diag1(f, v) for v in vars], axis=0) + return -aet.concatenate([hessian_diag1(f, v) for v in vars], axis=0) else: return empty_gradient @@ -235,16 +238,16 @@ def make_shared_replacements(vars, model): Dict of variable -> new shared variable """ othervars = set(model.vars) - set(vars) - return {var: theano.shared(var.tag.test_value, var.name + "_shared") for var in othervars} + return {var: aesara.shared(var.tag.test_value, var.name + "_shared") for var in othervars} def join_nonshared_inputs(xs, vars, shared, make_shared=False): """ - Takes a list of theano Variables and joins their non shared inputs into a single input. + Takes a list of aesara Variables and joins their non shared inputs into a single input. 
Parameters ---------- - xs: list of theano tensors + xs: list of aesara tensors vars: list of variables to join Returns @@ -256,13 +259,13 @@ def join_nonshared_inputs(xs, vars, shared, make_shared=False): if not vars: raise ValueError("Empty list of variables.") - joined = tt.concatenate([var.ravel() for var in vars]) + joined = aet.concatenate([var.ravel() for var in vars]) if not make_shared: tensor_type = joined.type inarray = tensor_type("inarray") else: - inarray = theano.shared(joined.tag.test_value, "inarray") + inarray = aesara.shared(joined.tag.test_value, "inarray") ordering = ArrayOrdering(vars) inarray.tag.test_value = joined.tag.test_value @@ -275,7 +278,7 @@ def join_nonshared_inputs(xs, vars, shared, make_shared=False): replace.update(shared) - xs_special = [theano.clone(x, replace, strict=False) for x in xs] + xs_special = [aesara.clone_replace(x, replace, strict=False) for x in xs] return xs_special, inarray @@ -303,16 +306,16 @@ def __call__(self, input): input: TensorVariable """ (oldinput,) = inputvars(self.tensor) - return theano.clone(self.tensor, {oldinput: input}, strict=False) + return aesara.clone_replace(self.tensor, {oldinput: input}, strict=False) scalar_identity = IdentityOp(scalar.upgrade_to_float, name="scalar_identity") -identity = tt.Elemwise(scalar_identity, name="identity") +identity = Elemwise(scalar_identity, name="identity") class GeneratorOp(Op): """ - Generator Op is designed for storing python generators inside theano graph. + Generator Op is designed for storing python generators inside aesara graph. __call__ creates TensorVariable It has 2 new methods @@ -351,7 +354,7 @@ def perform(self, node, inputs, output_storage, params=None): def do_constant_folding(self, fgraph, node): return False - __call__ = theano.config.change_flags(compute_test_value="off")(Op.__call__) + __call__ = aesara.config.change_flags(compute_test_value="off")(Op.__call__) def set_gen(self, gen): if not isinstance(gen, GeneratorAdapter): @@ -394,10 +397,10 @@ def generator(gen, default=None): return GeneratorOp(gen, default)() -_tt_rng = RandomStream() +_aet_rng = RandomStream() -def tt_rng(random_seed=None): +def aet_rng(random_seed=None): """ Get the package-level random number generator or new with specified seed. @@ -405,36 +408,36 @@ def tt_rng(random_seed=None): ---------- random_seed: int If not None - returns *new* theano random generator without replacing package global one + returns *new* aesara random generator without replacing package global one Returns ------- - `theano.tensor.random.utils.RandomStream` instance - `theano.tensor.random.utils.RandomStream` - instance passed to the most recent call of `set_tt_rng` + `aesara.tensor.random.utils.RandomStream` instance + `aesara.tensor.random.utils.RandomStream` + instance passed to the most recent call of `set_aet_rng` """ if random_seed is None: - return _tt_rng + return _aet_rng else: ret = RandomStream(random_seed) return ret -def set_tt_rng(new_rng): +def set_aet_rng(new_rng): """ Set the package-level random number generator. Parameters ---------- - new_rng: `theano.tensor.random.utils.RandomStream` instance + new_rng: `aesara.tensor.random.utils.RandomStream` instance The random number generator to use. 
""" # pylint: disable=global-statement - global _tt_rng + global _aet_rng # pylint: enable=global-statement if isinstance(new_rng, int): new_rng = RandomStream(new_rng) - _tt_rng = new_rng + _aet_rng = new_rng def floatX_array(x): @@ -443,7 +446,7 @@ def floatX_array(x): def ix_(*args): """ - Theano np.ix_ analog + Aesara np.ix_ analog See numpy.lib.index_tricks.ix_ for reference """ @@ -452,7 +455,7 @@ def ix_(*args): for k, new in enumerate(args): if new is None: out.append(slice(None)) - new = tt.as_tensor(new) + new = aet.as_tensor(new) if new.ndim != 1: raise ValueError("Cross index must be 1 dimensional") new = new.reshape((1,) * k + (new.size,) + (1,) * (nd - k - 1)) @@ -482,7 +485,7 @@ def _make_along_axis_idx(arr_shape, indices, axis): fancy_index.append(indices) else: ind_shape = shape_ones[:dim] + (-1,) + shape_ones[dim + 1 :] - fancy_index.append(tt.arange(n).reshape(ind_shape)) + fancy_index.append(aet.arange(n).reshape(ind_shape)) return tuple(fancy_index) @@ -497,8 +500,8 @@ def take_along_axis(arr, indices, axis=0): Functions returning an index along an axis, like argsort and argpartition, produce suitable indices for this function. """ - arr = tt.as_tensor_variable(arr) - indices = tt.as_tensor_variable(indices) + arr = aet.as_tensor_variable(arr) + indices = aet.as_tensor_variable(indices) # normalize inputs if axis is None: arr = arr.flatten() diff --git a/pymc3/backends/base.py b/pymc3/backends/base.py index 8b52c3e09c1..37631b656c7 100644 --- a/pymc3/backends/base.py +++ b/pymc3/backends/base.py @@ -23,8 +23,8 @@ from abc import ABC from typing import List +import aesara.tensor as aet import numpy as np -import theano.tensor as tt from pymc3.backends.report import SamplerReport, merge_reports from pymc3.model import modelcontext @@ -434,7 +434,7 @@ def add_values(self, vals, overwrite=False) -> None: for idx, chain in enumerate(chains.values()): if new_var: - dummy = tt.as_tensor_variable([], k) + dummy = aet.as_tensor_variable([], k) chain.vars.append(dummy) chain.samples[k] = v[idx] diff --git a/pymc3/blocking.py b/pymc3/blocking.py index 36696273500..4c07b4b47c2 100644 --- a/pymc3/blocking.py +++ b/pymc3/blocking.py @@ -125,13 +125,13 @@ def mapf(self, f): class ListArrayOrdering: """ - An ordering for a list to an array space. Takes also non theano.tensors. + An ordering for a list to an array space. Takes also non aesara.tensors. Modified from pymc3 blocking. 
Parameters ---------- list_arrays: list - :class:`numpy.ndarray` or :class:`theano.tensor.Tensor` + :class:`numpy.ndarray` or :class:`aesara.tensor.Tensor` intype: str defining the input type 'tensor' or 'numpy' """ diff --git a/pymc3/data.py b/pymc3/data.py index 4cdb793aa33..89760c14486 100644 --- a/pymc3/data.py +++ b/pymc3/data.py @@ -21,12 +21,14 @@ from copy import copy from typing import Any, Dict, List +import aesara +import aesara.tensor as aet import numpy as np import pandas as pd -import theano -import theano.tensor as tt -from theano.graph.basic import Apply +from aesara.graph.basic import Apply +from aesara.tensor.type import TensorType +from aesara.tensor.var import TensorVariable import pymc3 as pm @@ -61,7 +63,7 @@ def get_data(filename): return io.BytesIO(content) -class GenTensorVariable(tt.TensorVariable): +class GenTensorVariable(TensorVariable): def __init__(self, op, type, name=None): super().__init__(type=type, name=name) self.op = op @@ -96,7 +98,7 @@ def __init__(self, generator): # make pickling potentially possible self._yielded_test_value = False self.gen = generator - self.tensortype = tt.TensorType(self.test_value.dtype, ((False,) * self.test_value.ndim)) + self.tensortype = TensorType(self.test_value.dtype, ((False,) * self.test_value.ndim)) # python3 generator def __next__(self): @@ -119,7 +121,7 @@ def __hash__(self): return hash(id(self)) -class Minibatch(tt.TensorVariable): +class Minibatch(TensorVariable): """Multidimensional minibatch that is pure TensorVariable Parameters @@ -143,7 +145,7 @@ class Minibatch(tt.TensorVariable): you can use it to change source of minibatches programmatically in_memory_size: ``int`` or ``List[int|slice|Ellipsis]`` - data size for storing in ``theano.shared`` + data size for storing in ``aesara.shared`` Attributes ---------- @@ -231,11 +233,11 @@ class Minibatch(tt.TensorVariable): To be more concrete about how we create a minibatch, here is a demo: 1. create a shared variable - >>> shared = theano.shared(data) + >>> shared = aesara.shared(data) 2. 
take a random slice of size 10: - >>> ridx = pm.tt_rng().uniform(size=(10,), low=0, high=data.shape[0]-1e-10).astype('int64') + >>> ridx = pm.aet_rng().uniform(size=(10,), low=0, high=data.shape[0]-1e-10).astype('int64') 3) take the resulting slice: @@ -255,7 +257,7 @@ class Minibatch(tt.TensorVariable): Then you should create a `dict` with replacements: >>> replacements = {x: testdata} - >>> rnode = theano.clone(node, replacements) + >>> rnode = aesara.clone_replace(node, replacements) >>> assert (testdata ** 2 == rnode.eval()).all() *FIXME: In the following, what is the **reason** to replace the Minibatch variable with @@ -266,7 +268,7 @@ class Minibatch(tt.TensorVariable): For example >>> replacements = {x.minibatch: x.shared} - >>> rnode = theano.clone(node, replacements) + >>> rnode = aesara.clone_replace(node, replacements) For more complex slices some more code is needed that can seem not so clear @@ -296,7 +298,7 @@ class Minibatch(tt.TensorVariable): RNG = collections.defaultdict(list) # type: Dict[str, List[Any]] - @theano.config.change_flags(compute_test_value="raise") + @aesara.config.change_flags(compute_test_value="raise") def __init__( self, data, @@ -313,23 +315,23 @@ def __init__( else: data = np.asarray(data, dtype) in_memory_slc = self.make_static_slices(in_memory_size) - self.shared = theano.shared(data[in_memory_slc]) + self.shared = aesara.shared(data[in_memory_slc]) self.update_shared_f = update_shared_f self.random_slc = self.make_random_slices(self.shared.shape, batch_size, random_seed) minibatch = self.shared[self.random_slc] if broadcastable is None: broadcastable = (False,) * minibatch.ndim - minibatch = tt.patternbroadcast(minibatch, broadcastable) + minibatch = aet.patternbroadcast(minibatch, broadcastable) self.minibatch = minibatch super().__init__(self.minibatch.type, None, None, name=name) - Apply(theano.compile.view_op, inputs=[self.minibatch], outputs=[self]) + Apply(aesara.compile.view_op, inputs=[self.minibatch], outputs=[self]) self.tag.test_value = copy(self.minibatch.tag.test_value) def rslice(self, total, size, seed): if size is None: return slice(None) elif isinstance(size, int): - rng = pm.tt_rng(seed) + rng = pm.aet_rng(seed) Minibatch.RNG[id(self)].append(rng) return rng.uniform(size=(size,), low=0.0, high=pm.floatX(total) - 1e-16).astype("int64") else: @@ -401,7 +403,7 @@ def check(t): ) if len(end) > 0: shp_mid = shape[sep : -len(end)] - mid = [tt.arange(s) for s in shp_mid] + mid = [aet.arange(s) for s in shp_mid] else: mid = [] else: @@ -419,17 +421,17 @@ def check(t): shp_end = np.asarray([]) shp_begin = shape[: len(begin)] slc_begin = [ - self.rslice(shp_begin[i], t[0], t[1]) if t is not None else tt.arange(shp_begin[i]) + self.rslice(shp_begin[i], t[0], t[1]) if t is not None else aet.arange(shp_begin[i]) for i, t in enumerate(begin) ] slc_end = [ - self.rslice(shp_end[i], t[0], t[1]) if t is not None else tt.arange(shp_end[i]) + self.rslice(shp_end[i], t[0], t[1]) if t is not None else aet.arange(shp_end[i]) for i, t in enumerate(end) ] slc = slc_begin + mid + slc_end else: raise TypeError("Unrecognized size type, %r" % batch_size) - return pm.theanof.ix_(*slc) + return pm.aesaraf.ix_(*slc) def update_shared(self): if self.update_shared_f is None: @@ -460,7 +462,7 @@ def align_minibatches(batches=None): class Data: - """Data container class that wraps the theano ``SharedVariable`` class + """Data container class that wraps the aesara ``SharedVariable`` class and lets the model be aware of its inputs and outputs. 
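A brief, hedged usage sketch; all names below are illustrative:

>>> import numpy as np
>>> import pymc3 as pm
>>> observed = np.array([1.0, 2.0, 3.0])
>>> with pm.Model():
...     x = pm.Data("x", observed)  # stored in an aesara shared variable
...     y = pm.Normal("y", mu=x, sigma=1.0, observed=observed)

The wrapped value can later be replaced with ``pm.set_data`` without rebuilding
the model.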
Parameters @@ -524,7 +526,7 @@ def __new__(self, name, value, *, dims=None, export_index_as_coords=False): # `pm.model.pandas_to_array` takes care of parameter `value` and # transforms it to something digestible for pymc3 - shared_object = theano.shared(pm.model.pandas_to_array(value), name) + shared_object = aesara.shared(pm.model.pandas_to_array(value), name) if isinstance(dims, str): dims = (dims,) diff --git a/pymc3/distributions/bound.py b/pymc3/distributions/bound.py index 074a575ebad..6443414734d 100644 --- a/pymc3/distributions/bound.py +++ b/pymc3/distributions/bound.py @@ -14,9 +14,10 @@ from numbers import Real +import aesara.tensor as aet import numpy as np -import theano.tensor as tt +from pymc3.aesaraf import floatX from pymc3.distributions import transforms from pymc3.distributions.dist_math import bound from pymc3.distributions.distribution import ( @@ -26,7 +27,6 @@ draw_values, generate_samples, ) -from pymc3.theanof import floatX __all__ = ["Bound"] @@ -207,9 +207,9 @@ class _ContinuousBounded(_Bounded, Continuous): def __init__(self, distribution, lower, upper, transform="infer", *args, **kwargs): if lower is not None: - lower = tt.as_tensor_variable(floatX(lower)) + lower = aet.as_tensor_variable(floatX(lower)) if upper is not None: - upper = tt.as_tensor_variable(floatX(upper)) + upper = aet.as_tensor_variable(floatX(upper)) if transform == "infer": if lower is None and upper is None: diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py index 234ed935f2b..4d5310ecfe3 100644 --- a/pymc3/distributions/continuous.py +++ b/pymc3/distributions/continuous.py @@ -19,13 +19,14 @@ """ import warnings +import aesara.tensor as aet import numpy as np -import theano.tensor as tt from scipy import stats from scipy.interpolate import InterpolatedUnivariateSpline from scipy.special import expit +from pymc3.aesaraf import floatX from pymc3.distributions import transforms from pymc3.distributions.dist_math import ( SplineWrapper, @@ -44,7 +45,6 @@ from pymc3.distributions.distribution import Continuous, draw_values, generate_samples from pymc3.distributions.special import log_i0 from pymc3.math import invlogit, log1mexp, log1pexp, logdiffexp, logit -from pymc3.theanof import floatX __all__ = [ "Uniform", @@ -101,8 +101,8 @@ class BoundedContinuous(Continuous): def __init__(self, transform="auto", lower=None, upper=None, *args, **kwargs): - lower = tt.as_tensor_variable(lower) if lower is not None else None - upper = tt.as_tensor_variable(upper) if upper is not None else None + lower = aet.as_tensor_variable(lower) if lower is not None else None + upper = aet.as_tensor_variable(upper) if upper is not None else None if transform == "auto": if lower is None and upper is None: @@ -223,8 +223,8 @@ class Uniform(BoundedContinuous): """ def __init__(self, lower=0, upper=1, *args, **kwargs): - self.lower = lower = tt.as_tensor_variable(floatX(lower)) - self.upper = upper = tt.as_tensor_variable(floatX(upper)) + self.lower = lower = aet.as_tensor_variable(floatX(lower)) + self.upper = upper = aet.as_tensor_variable(floatX(upper)) self.mean = (upper + lower) / 2.0 self.median = self.mean @@ -268,7 +268,7 @@ def logp(self, value): """ lower = self.lower upper = self.upper - return bound(-tt.log(upper - lower), value >= lower, value <= upper) + return bound(-aet.log(upper - lower), value >= lower, value <= upper) def logcdf(self, value): """ @@ -277,9 +277,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: 
numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- @@ -288,12 +288,12 @@ def logcdf(self, value): lower = self.lower upper = self.upper - return tt.switch( - tt.lt(value, lower) | tt.lt(upper, lower), + return aet.switch( + aet.lt(value, lower) | aet.lt(upper, lower), -np.inf, - tt.switch( - tt.lt(value, upper), - tt.log(value - lower) - tt.log(upper - lower), + aet.switch( + aet.lt(value, upper), + aet.log(value - lower) - aet.log(upper - lower), 0, ), ) @@ -331,13 +331,13 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- TensorVariable """ - return tt.zeros_like(value) + return aet.zeros_like(value) def logcdf(self, value): """ @@ -346,16 +346,16 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- TensorVariable """ - return tt.switch( - tt.eq(value, -np.inf), -np.inf, tt.switch(tt.eq(value, np.inf), 0, tt.log(0.5)) + return aet.switch( + aet.eq(value, -np.inf), -np.inf, aet.switch(aet.eq(value, np.inf), 0, aet.log(0.5)) ) @@ -388,13 +388,13 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- TensorVariable """ - return bound(tt.zeros_like(value), value > 0) + return bound(aet.zeros_like(value), value > 0) def logcdf(self, value): """ @@ -403,15 +403,17 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. 
Returns ------- TensorVariable """ - return tt.switch(tt.lt(value, np.inf), -np.inf, tt.switch(tt.eq(value, np.inf), 0, -np.inf)) + return aet.switch( + aet.lt(value, np.inf), -np.inf, aet.switch(aet.eq(value, np.inf), 0, -np.inf) + ) class Normal(Continuous): @@ -481,10 +483,10 @@ def __init__(self, mu=0, sigma=None, tau=None, sd=None, **kwargs): if sd is not None: sigma = sd tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) - self.sigma = self.sd = tt.as_tensor_variable(sigma) - self.tau = tt.as_tensor_variable(tau) + self.sigma = self.sd = aet.as_tensor_variable(sigma) + self.tau = aet.as_tensor_variable(tau) - self.mean = self.median = self.mode = self.mu = mu = tt.as_tensor_variable(floatX(mu)) + self.mean = self.median = self.mode = self.mu = mu = aet.as_tensor_variable(floatX(mu)) self.variance = 1.0 / self.tau assert_negative_support(sigma, "sigma", "Normal") @@ -522,7 +524,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -532,7 +534,7 @@ def logp(self, value): tau = self.tau mu = self.mu - return bound((-tau * (value - mu) ** 2 + tt.log(tau / np.pi / 2.0)) / 2.0, sigma > 0) + return bound((-tau * (value - mu) ** 2 + aet.log(tau / np.pi / 2.0)) / 2.0, sigma > 0) def _distr_parameters_for_repr(self): return ["mu", "sigma"] @@ -544,9 +546,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- @@ -647,21 +649,21 @@ def __init__( if sd is not None: sigma = sd tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) - self.sigma = self.sd = tt.as_tensor_variable(sigma) - self.tau = tt.as_tensor_variable(tau) - self.lower_check = tt.as_tensor_variable(floatX(lower)) if lower is not None else lower - self.upper_check = tt.as_tensor_variable(floatX(upper)) if upper is not None else upper + self.sigma = self.sd = aet.as_tensor_variable(sigma) + self.tau = aet.as_tensor_variable(tau) + self.lower_check = aet.as_tensor_variable(floatX(lower)) if lower is not None else lower + self.upper_check = aet.as_tensor_variable(floatX(upper)) if upper is not None else upper self.lower = ( - tt.as_tensor_variable(floatX(lower)) + aet.as_tensor_variable(floatX(lower)) if lower is not None - else tt.as_tensor_variable(-np.inf) + else aet.as_tensor_variable(-np.inf) ) self.upper = ( - tt.as_tensor_variable(floatX(upper)) + aet.as_tensor_variable(floatX(upper)) if upper is not None - else tt.as_tensor_variable(np.inf) + else aet.as_tensor_variable(np.inf) ) - self.mu = tt.as_tensor_variable(floatX(mu)) + self.mu = aet.as_tensor_variable(floatX(mu)) if self.lower_check is None and self.upper_check is None: self._defaultval = mu @@ -732,7 +734,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -763,7 +765,7 @@ def _normalization(self): lsf_a = normal_lccdf(mu, sigma, self.lower) lsf_b = normal_lccdf(mu, sigma, self.upper) - return tt.switch(self.lower > 0, logdiffexp(lsf_a, lsf_b), logdiffexp(lcdf_b, lcdf_a)) + return aet.switch(self.lower > 0, logdiffexp(lsf_a, lsf_b), logdiffexp(lcdf_b, lcdf_a)) if self.lower_check is not None: return normal_lccdf(mu, sigma, self.lower) @@ -843,10 +845,10 @@ def __init__(self, sigma=None, tau=None, sd=None, *args, **kwargs): super().__init__(*args, **kwargs) tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) - self.sigma = self.sd = sigma = tt.as_tensor_variable(sigma) - self.tau = tau = tt.as_tensor_variable(tau) + self.sigma = self.sd = sigma = aet.as_tensor_variable(sigma) + self.tau = tau = aet.as_tensor_variable(tau) - self.mean = tt.sqrt(2 / (np.pi * self.tau)) + self.mean = aet.sqrt(2 / (np.pi * self.tau)) self.variance = (1.0 - 2 / np.pi) / self.tau assert_negative_support(tau, "tau", "HalfNormal") @@ -882,7 +884,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -891,7 +893,7 @@ def logp(self, value): tau = self.tau sigma = self.sigma return bound( - -0.5 * tau * value ** 2 + 0.5 * tt.log(tau * 2.0 / np.pi), + -0.5 * tau * value ** 2 + 0.5 * aet.log(tau * 2.0 / np.pi), value >= 0, tau > 0, sigma > 0, @@ -907,9 +909,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- @@ -918,7 +920,7 @@ def logcdf(self, value): sigma = self.sigma z = zvalue(value, mu=0, sigma=sigma) return bound( - tt.log1p(-tt.erfc(z / tt.sqrt(2.0))), + aet.log1p(-aet.erfc(z / aet.sqrt(2.0))), 0 <= value, 0 < sigma, ) @@ -1005,14 +1007,14 @@ class Wald(PositiveContinuous): def __init__(self, mu=None, lam=None, phi=None, alpha=0.0, *args, **kwargs): super().__init__(*args, **kwargs) mu, lam, phi = self.get_mu_lam_phi(mu, lam, phi) - self.alpha = alpha = tt.as_tensor_variable(floatX(alpha)) - self.mu = mu = tt.as_tensor_variable(floatX(mu)) - self.lam = lam = tt.as_tensor_variable(floatX(lam)) - self.phi = phi = tt.as_tensor_variable(floatX(phi)) + self.alpha = alpha = aet.as_tensor_variable(floatX(alpha)) + self.mu = mu = aet.as_tensor_variable(floatX(mu)) + self.lam = lam = aet.as_tensor_variable(floatX(lam)) + self.phi = phi = aet.as_tensor_variable(floatX(phi)) self.mean = self.mu + self.alpha self.mode = ( - self.mu * (tt.sqrt(1.0 + (1.5 * self.mu / self.lam) ** 2) - 1.5 * self.mu / self.lam) + self.mu * (aet.sqrt(1.0 + (1.5 * self.mu / self.lam) ** 2) - 1.5 * self.mu / self.lam) + self.alpha ) self.variance = (self.mu ** 3) / self.lam @@ -1080,7 +1082,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -1113,9 +1115,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- @@ -1129,29 +1131,29 @@ def logcdf(self, value): value -= alpha q = value / mu l = lam * mu - r = tt.sqrt(value * lam) + r = aet.sqrt(value * lam) a = normal_lcdf(0, 1, (q - 1.0) / r) b = 2.0 / l + normal_lcdf(0, 1, -(q + 1.0) / r) left_limit = ( - tt.lt(value, 0) - | (tt.eq(value, 0) & tt.gt(mu, 0) & tt.lt(lam, np.inf)) - | (tt.lt(value, mu) & tt.eq(lam, 0)) + aet.lt(value, 0) + | (aet.eq(value, 0) & aet.gt(mu, 0) & aet.lt(lam, np.inf)) + | (aet.lt(value, mu) & aet.eq(lam, 0)) ) right_limit = ( - tt.eq(value, np.inf) - | (tt.eq(lam, 0) & tt.gt(value, mu)) - | (tt.gt(value, 0) & tt.eq(lam, np.inf)) + aet.eq(value, np.inf) + | (aet.eq(lam, 0) & aet.gt(value, mu)) + | (aet.gt(value, 0) & aet.eq(lam, np.inf)) ) - degenerate_dist = (tt.lt(mu, np.inf) & tt.eq(mu, value) & tt.eq(lam, 0)) | ( - tt.eq(value, 0) & tt.eq(lam, np.inf) + degenerate_dist = (aet.lt(mu, np.inf) & aet.eq(mu, value) & aet.eq(lam, 0)) | ( + aet.eq(value, 0) & aet.eq(lam, np.inf) ) return bound( - tt.switch( + aet.switch( ~(right_limit | degenerate_dist), - a + tt.log1p(tt.exp(b - a)), + a + aet.log1p(aet.exp(b - a)), 0, ), ~left_limit, @@ -1229,8 +1231,8 @@ def __init__(self, alpha=None, beta=None, mu=None, sigma=None, sd=None, *args, * if sd is not None: sigma = sd alpha, beta = self.get_alpha_beta(alpha, beta, mu, sigma) - self.alpha = alpha = tt.as_tensor_variable(floatX(alpha)) - self.beta = beta = tt.as_tensor_variable(floatX(beta)) + self.alpha = alpha = aet.as_tensor_variable(floatX(alpha)) + self.beta = beta = aet.as_tensor_variable(floatX(beta)) self.mean = self.alpha / (self.alpha + self.beta) self.variance = ( @@ -1283,7 +1285,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -1292,11 +1294,11 @@ def logp(self, value): alpha = self.alpha beta = self.beta - logval = tt.log(value) - log1pval = tt.log1p(-value) + logval = aet.log(value) + log1pval = aet.log1p(-value) logp = ( - tt.switch(tt.eq(alpha, 1), 0, (alpha - 1) * logval) - + tt.switch(tt.eq(beta, 1), 0, (beta - 1) * log1pval) + aet.switch(aet.eq(alpha, 1), 0, (alpha - 1) * logval) + + aet.switch(aet.eq(beta, 1), 0, (beta - 1) * log1pval) - betaln(alpha, beta) ) @@ -1326,9 +1328,9 @@ def logcdf(self, value): b = self.beta return bound( - tt.switch( - tt.lt(value, 1), - tt.log(incomplete_beta(a, b, value)), + aet.switch( + aet.lt(value, 1), + aet.log(incomplete_beta(a, b, value)), 0, ), 0 <= value, @@ -1385,15 +1387,15 @@ class Kumaraswamy(UnitContinuous): def __init__(self, a, b, *args, **kwargs): super().__init__(*args, **kwargs) - self.a = a = tt.as_tensor_variable(floatX(a)) - self.b = b = tt.as_tensor_variable(floatX(b)) + self.a = a = aet.as_tensor_variable(floatX(a)) + self.b = b = aet.as_tensor_variable(floatX(b)) - ln_mean = tt.log(b) + tt.gammaln(1 + 1 / a) + tt.gammaln(b) - tt.gammaln(1 + 1 / a + b) - self.mean = tt.exp(ln_mean) + ln_mean = aet.log(b) + aet.gammaln(1 + 1 / a) + aet.gammaln(b) - aet.gammaln(1 + 1 / a + b) + self.mean = aet.exp(ln_mean) ln_2nd_raw_moment = ( - tt.log(b) + tt.gammaln(1 + 2 / a) + tt.gammaln(b) - tt.gammaln(1 + 2 / a + b) + aet.log(b) + aet.gammaln(1 + 2 / a) + aet.gammaln(b) - aet.gammaln(1 + 2 / a + b) ) - self.variance = tt.exp(ln_2nd_raw_moment) - self.mean ** 2 + self.variance = aet.exp(ln_2nd_raw_moment) - self.mean ** 2 assert_negative_support(a, "a", "Kumaraswamy") assert_negative_support(b, "b", "Kumaraswamy") @@ -1430,7 +1432,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -1439,7 +1441,9 @@ def logp(self, value): a = self.a b = self.b - logp = tt.log(a) + tt.log(b) + (a - 1) * tt.log(value) + (b - 1) * tt.log(1 - value ** a) + logp = ( + aet.log(a) + aet.log(b) + (a - 1) * aet.log(value) + (b - 1) * aet.log(1 - value ** a) + ) return bound(logp, value >= 0, value <= 1, a > 0, b > 0) @@ -1483,10 +1487,10 @@ class Exponential(PositiveContinuous): def __init__(self, lam, *args, **kwargs): super().__init__(*args, **kwargs) - self.lam = lam = tt.as_tensor_variable(floatX(lam)) + self.lam = lam = aet.as_tensor_variable(floatX(lam)) self.mean = 1.0 / self.lam - self.median = self.mean * tt.log(2) - self.mode = tt.zeros_like(self.lam) + self.median = self.mean * aet.log(2) + self.mode = aet.zeros_like(self.lam) self.variance = self.lam ** -2 @@ -1522,14 +1526,14 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- TensorVariable """ lam = self.lam - return bound(tt.log(lam) - lam * value, value >= 0, lam > 0) + return bound(aet.log(lam) - lam * value, value >= 0, lam > 0) def logcdf(self, value): r""" @@ -1538,15 +1542,15 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- TensorVariable """ - value = floatX(tt.as_tensor(value)) + value = floatX(aet.as_tensor(value)) lam = self.lam a = lam * value return bound( @@ -1600,8 +1604,8 @@ class Laplace(Continuous): def __init__(self, mu, b, *args, **kwargs): super().__init__(*args, **kwargs) - self.b = b = tt.as_tensor_variable(floatX(b)) - self.mean = self.median = self.mode = self.mu = mu = tt.as_tensor_variable(floatX(mu)) + self.b = b = aet.as_tensor_variable(floatX(b)) + self.mean = self.median = self.mode = self.mu = mu = aet.as_tensor_variable(floatX(mu)) self.variance = 2 * self.b ** 2 @@ -1635,7 +1639,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -1644,7 +1648,7 @@ def logp(self, value): mu = self.mu b = self.b - return -tt.log(2 * b) - abs(value - mu) / b + return -aet.log(2 * b) - abs(value - mu) / b def logcdf(self, value): """ @@ -1653,9 +1657,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- @@ -1665,13 +1669,13 @@ def logcdf(self, value): b = self.b y = (value - a) / b return bound( - tt.switch( - tt.le(value, a), - tt.log(0.5) + y, - tt.switch( - tt.gt(y, 1), - tt.log1p(-0.5 * tt.exp(-y)), - tt.log(1 - 0.5 * tt.exp(-y)), + aet.switch( + aet.le(value, a), + aet.log(0.5) + y, + aet.switch( + aet.gt(y, 1), + aet.log1p(-0.5 * aet.exp(-y)), + aet.log(1 - 0.5 * aet.exp(-y)), ), ), 0 < b, @@ -1715,9 +1719,9 @@ class AsymmetricLaplace(Continuous): """ def __init__(self, b, kappa, mu=0, *args, **kwargs): - self.b = tt.as_tensor_variable(floatX(b)) - self.kappa = tt.as_tensor_variable(floatX(kappa)) - self.mu = mu = tt.as_tensor_variable(floatX(mu)) + self.b = aet.as_tensor_variable(floatX(b)) + self.kappa = aet.as_tensor_variable(floatX(kappa)) + self.mu = mu = aet.as_tensor_variable(floatX(mu)) self.mean = self.mu - (self.kappa - 1 / self.kappa) / b self.variance = (1 + self.kappa ** 4) / (self.kappa ** 2 * self.b ** 2) @@ -1763,7 +1767,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -1771,8 +1775,8 @@ def logp(self, value): """ value = value - self.mu return bound( - tt.log(self.b / (self.kappa + (self.kappa ** -1))) - + (-value * self.b * tt.sgn(value) * (self.kappa ** tt.sgn(value))), + aet.log(self.b / (self.kappa + (self.kappa ** -1))) + + (-value * self.b * aet.sgn(value) * (self.kappa ** aet.sgn(value))), 0 < self.b, 0 < self.kappa, ) @@ -1847,14 +1851,14 @@ def __init__(self, mu=0, sigma=None, tau=None, sd=None, *args, **kwargs): tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) - self.mu = mu = tt.as_tensor_variable(floatX(mu)) - self.tau = tau = tt.as_tensor_variable(tau) - self.sigma = self.sd = sigma = tt.as_tensor_variable(sigma) + self.mu = mu = aet.as_tensor_variable(floatX(mu)) + self.tau = tau = aet.as_tensor_variable(tau) + self.sigma = self.sd = sigma = aet.as_tensor_variable(sigma) - self.mean = tt.exp(self.mu + 1.0 / (2 * self.tau)) - self.median = tt.exp(self.mu) - self.mode = tt.exp(self.mu - 1.0 / self.tau) - self.variance = (tt.exp(1.0 / self.tau) - 1) * tt.exp(2 * self.mu + 1.0 / self.tau) + self.mean = aet.exp(self.mu + 1.0 / (2 * self.tau)) + self.median = aet.exp(self.mu) + self.mode = aet.exp(self.mu - 1.0 / self.tau) + self.variance = (aet.exp(1.0 / self.tau) - 1) * aet.exp(2 * self.mu + 1.0 / self.tau) assert_negative_support(tau, "tau", "Lognormal") assert_negative_support(sigma, "sigma", "Lognormal") @@ -1891,7 +1895,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -1900,9 +1904,9 @@ def logp(self, value): mu = self.mu tau = self.tau return bound( - -0.5 * tau * (tt.log(value) - mu) ** 2 - + 0.5 * tt.log(tau / (2.0 * np.pi)) - - tt.log(value), + -0.5 * tau * (aet.log(value) - mu) ** 2 + + 0.5 * aet.log(tau / (2.0 * np.pi)) + - aet.log(value), tau > 0, ) @@ -1916,9 +1920,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. 
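For instance, as a hedged, illustrative check (parameter and input values are
arbitrary):

>>> import pymc3 as pm
>>> pm.Lognormal.dist(mu=0.0, sigma=1.0).logcdf(1.0).eval()  # log(0.5), about -0.693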
Returns ------- @@ -1929,7 +1933,7 @@ def logcdf(self, value): tau = self.tau return bound( - normal_lcdf(mu, sigma, tt.log(value)), + normal_lcdf(mu, sigma, aet.log(value)), 0 < value, 0 < tau, ) @@ -2002,13 +2006,13 @@ def __init__(self, nu, mu=0, lam=None, sigma=None, sd=None, *args, **kwargs): super().__init__(*args, **kwargs) if sd is not None: sigma = sd - self.nu = nu = tt.as_tensor_variable(floatX(nu)) + self.nu = nu = aet.as_tensor_variable(floatX(nu)) lam, sigma = get_tau_sigma(tau=lam, sigma=sigma) - self.lam = lam = tt.as_tensor_variable(lam) - self.sigma = self.sd = sigma = tt.as_tensor_variable(sigma) - self.mean = self.median = self.mode = self.mu = mu = tt.as_tensor_variable(mu) + self.lam = lam = aet.as_tensor_variable(lam) + self.sigma = self.sd = sigma = aet.as_tensor_variable(sigma) + self.mean = self.median = self.mode = self.mu = mu = aet.as_tensor_variable(mu) - self.variance = tt.switch((nu > 2) * 1, (1 / self.lam) * (nu / (nu - 2)), np.inf) + self.variance = aet.switch((nu > 2) * 1, (1 / self.lam) * (nu / (nu - 2)), np.inf) assert_negative_support(lam, "lam (sigma)", "StudentT") assert_negative_support(nu, "nu", "StudentT") @@ -2043,7 +2047,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -2056,9 +2060,9 @@ def logp(self, value): return bound( gammaln((nu + 1.0) / 2.0) - + 0.5 * tt.log(lam / (nu * np.pi)) + + 0.5 * aet.log(lam / (nu * np.pi)) - gammaln(nu / 2.0) - - (nu + 1.0) / 2.0 * tt.log1p(lam * (value - mu) ** 2 / nu), + - (nu + 1.0) / 2.0 * aet.log1p(lam * (value - mu) ** 2 / nu), lam > 0, nu > 0, sigma > 0, @@ -2092,11 +2096,11 @@ def logcdf(self, value): sigma = self.sigma lam = self.lam t = (value - mu) / sigma - sqrt_t2_nu = tt.sqrt(t ** 2 + nu) + sqrt_t2_nu = aet.sqrt(t ** 2 + nu) x = (t + sqrt_t2_nu) / (2.0 * sqrt_t2_nu) return bound( - tt.log(incomplete_beta(nu / 2.0, nu / 2.0, x)), + aet.log(incomplete_beta(nu / 2.0, nu / 2.0, x)), 0 < nu, 0 < sigma, 0 < lam, @@ -2149,13 +2153,13 @@ class Pareto(Continuous): """ def __init__(self, alpha, m, transform="lowerbound", *args, **kwargs): - self.alpha = alpha = tt.as_tensor_variable(floatX(alpha)) - self.m = m = tt.as_tensor_variable(floatX(m)) + self.alpha = alpha = aet.as_tensor_variable(floatX(alpha)) + self.m = m = aet.as_tensor_variable(floatX(m)) - self.mean = tt.switch(tt.gt(alpha, 1), alpha * m / (alpha - 1.0), np.inf) + self.mean = aet.switch(aet.gt(alpha, 1), alpha * m / (alpha - 1.0), np.inf) self.median = m * 2.0 ** (1.0 / alpha) - self.variance = tt.switch( - tt.gt(alpha, 2), (alpha * m ** 2) / ((alpha - 2.0) * (alpha - 1.0) ** 2), np.inf + self.variance = aet.switch( + aet.gt(alpha, 2), (alpha * m ** 2) / ((alpha - 2.0) * (alpha - 1.0) ** 2), np.inf ) assert_negative_support(alpha, "alpha", "Pareto") @@ -2197,7 +2201,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -2206,7 +2210,7 @@ def logp(self, value): alpha = self.alpha m = self.m return bound( - tt.log(alpha) + logpow(m, alpha) - logpow(value, alpha + 1), + aet.log(alpha) + logpow(m, alpha) - logpow(value, alpha + 1), value >= m, alpha > 0, m > 0, @@ -2222,9 +2226,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- @@ -2234,10 +2238,10 @@ def logcdf(self, value): alpha = self.alpha arg = (m / value) ** alpha return bound( - tt.switch( - tt.le(arg, 1e-5), - tt.log1p(-arg), - tt.log(1 - arg), + aet.switch( + aet.le(arg, 1e-5), + aet.log1p(-arg), + aet.log(1 - arg), ), m <= value, 0 < alpha, @@ -2292,8 +2296,8 @@ class Cauchy(Continuous): def __init__(self, alpha, beta, *args, **kwargs): super().__init__(*args, **kwargs) - self.median = self.mode = self.alpha = tt.as_tensor_variable(floatX(alpha)) - self.beta = tt.as_tensor_variable(floatX(beta)) + self.median = self.mode = self.alpha = aet.as_tensor_variable(floatX(alpha)) + self.beta = aet.as_tensor_variable(floatX(beta)) assert_negative_support(beta, "beta", "Cauchy") @@ -2329,7 +2333,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -2338,7 +2342,7 @@ def logp(self, value): alpha = self.alpha beta = self.beta return bound( - -tt.log(np.pi) - tt.log(beta) - tt.log1p(((value - alpha) / beta) ** 2), beta > 0 + -aet.log(np.pi) - aet.log(beta) - aet.log1p(((value - alpha) / beta) ** 2), beta > 0 ) def logcdf(self, value): @@ -2348,9 +2352,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- @@ -2359,7 +2363,7 @@ def logcdf(self, value): alpha = self.alpha beta = self.beta return bound( - tt.log(0.5 + tt.arctan((value - alpha) / beta) / np.pi), + aet.log(0.5 + aet.arctan((value - alpha) / beta) / np.pi), 0 < beta, ) @@ -2404,8 +2408,8 @@ class HalfCauchy(PositiveContinuous): def __init__(self, beta, *args, **kwargs): super().__init__(*args, **kwargs) - self.mode = tt.as_tensor_variable(0) - self.median = self.beta = tt.as_tensor_variable(floatX(beta)) + self.mode = aet.as_tensor_variable(0) + self.median = self.beta = aet.as_tensor_variable(floatX(beta)) assert_negative_support(beta, "beta", "HalfCauchy") @@ -2441,7 +2445,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -2449,7 +2453,7 @@ def logp(self, value): """ beta = self.beta return bound( - tt.log(2) - tt.log(np.pi) - tt.log(beta) - tt.log1p((value / beta) ** 2), + aet.log(2) - aet.log(np.pi) - aet.log(beta) - aet.log1p((value / beta) ** 2), value >= 0, beta > 0, ) @@ -2461,9 +2465,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- @@ -2471,7 +2475,7 @@ def logcdf(self, value): """ beta = self.beta return bound( - tt.log(2 * tt.arctan(value / beta) / np.pi), + aet.log(2 * aet.arctan(value / beta) / np.pi), 0 <= value, 0 < beta, ) @@ -2541,10 +2545,10 @@ def __init__(self, alpha=None, beta=None, mu=None, sigma=None, sd=None, *args, * sigma = sd alpha, beta = self.get_alpha_beta(alpha, beta, mu, sigma) - self.alpha = alpha = tt.as_tensor_variable(floatX(alpha)) - self.beta = beta = tt.as_tensor_variable(floatX(beta)) + self.alpha = alpha = aet.as_tensor_variable(floatX(alpha)) + self.beta = beta = aet.as_tensor_variable(floatX(beta)) self.mean = alpha / beta - self.mode = tt.maximum((alpha - 1) / beta, 0) + self.mode = aet.maximum((alpha - 1) / beta, 0) self.variance = alpha / beta ** 2 assert_negative_support(alpha, "alpha", "Gamma") @@ -2595,7 +2599,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -2617,9 +2621,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. 
Returns ------- @@ -2628,12 +2632,12 @@ def logcdf(self, value): alpha = self.alpha beta = self.beta # Avoid C-assertion when the gammainc function is called with invalid values (#4340) - safe_alpha = tt.switch(tt.lt(alpha, 0), 0, alpha) - safe_beta = tt.switch(tt.lt(beta, 0), 0, beta) - safe_value = tt.switch(tt.lt(value, 0), 0, value) + safe_alpha = aet.switch(aet.lt(alpha, 0), 0, alpha) + safe_beta = aet.switch(aet.lt(beta, 0), 0, beta) + safe_value = aet.switch(aet.lt(value, 0), 0, value) return bound( - tt.log(tt.gammainc(safe_alpha, safe_beta * safe_value)), + aet.log(aet.gammainc(safe_alpha, safe_beta * safe_value)), 0 <= value, 0 < alpha, 0 < beta, @@ -2698,13 +2702,13 @@ def __init__(self, alpha=None, beta=None, mu=None, sigma=None, sd=None, *args, * sigma = sd alpha, beta = InverseGamma._get_alpha_beta(alpha, beta, mu, sigma) - self.alpha = alpha = tt.as_tensor_variable(floatX(alpha)) - self.beta = beta = tt.as_tensor_variable(floatX(beta)) + self.alpha = alpha = aet.as_tensor_variable(floatX(alpha)) + self.beta = beta = aet.as_tensor_variable(floatX(beta)) self.mean = self._calculate_mean() self.mode = beta / (alpha + 1.0) - self.variance = tt.switch( - tt.gt(alpha, 2), (beta ** 2) / ((alpha - 2) * (alpha - 1.0) ** 2), np.inf + self.variance = aet.switch( + aet.gt(alpha, 2), (beta ** 2) / ((alpha - 2) * (alpha - 1.0) ** 2), np.inf ) assert_negative_support(alpha, "alpha", "InverseGamma") assert_negative_support(beta, "beta", "InverseGamma") @@ -2766,7 +2770,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -2791,9 +2795,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. 
Returns ------- @@ -2802,12 +2806,12 @@ def logcdf(self, value): alpha = self.alpha beta = self.beta # Avoid C-assertion when the gammaincc function is called with invalid values (#4340) - safe_alpha = tt.switch(tt.lt(alpha, 0), 0, alpha) - safe_beta = tt.switch(tt.lt(beta, 0), 0, beta) - safe_value = tt.switch(tt.lt(value, 0), 0, value) + safe_alpha = aet.switch(aet.lt(alpha, 0), 0, alpha) + safe_beta = aet.switch(aet.lt(beta, 0), 0, beta) + safe_value = aet.switch(aet.lt(value, 0), 0, value) return bound( - tt.log(tt.gammaincc(safe_alpha, safe_beta / safe_value)), + aet.log(aet.gammaincc(safe_alpha, safe_beta / safe_value)), 0 <= value, 0 < alpha, 0 < beta, @@ -2853,7 +2857,7 @@ class ChiSquared(Gamma): """ def __init__(self, nu, *args, **kwargs): - self.nu = nu = tt.as_tensor_variable(floatX(nu)) + self.nu = nu = aet.as_tensor_variable(floatX(nu)) super().__init__(alpha=nu / 2.0, beta=0.5, *args, **kwargs) @@ -2903,12 +2907,12 @@ class Weibull(PositiveContinuous): def __init__(self, alpha, beta, *args, **kwargs): super().__init__(*args, **kwargs) - self.alpha = alpha = tt.as_tensor_variable(floatX(alpha)) - self.beta = beta = tt.as_tensor_variable(floatX(beta)) - self.mean = beta * tt.exp(gammaln(1 + 1.0 / alpha)) - self.median = beta * tt.exp(gammaln(tt.log(2))) ** (1.0 / alpha) - self.variance = beta ** 2 * tt.exp(gammaln(1 + 2.0 / alpha)) - self.mean ** 2 - self.mode = tt.switch( + self.alpha = alpha = aet.as_tensor_variable(floatX(alpha)) + self.beta = beta = aet.as_tensor_variable(floatX(beta)) + self.mean = beta * aet.exp(gammaln(1 + 1.0 / alpha)) + self.median = beta * aet.exp(gammaln(aet.log(2))) ** (1.0 / alpha) + self.variance = beta ** 2 * aet.exp(gammaln(1 + 2.0 / alpha)) - self.mean ** 2 + self.mode = aet.switch( alpha >= 1, beta * ((alpha - 1) / alpha) ** (1 / alpha), 0 ) # Reference: https://en.wikipedia.org/wiki/Weibull_distribution @@ -2947,7 +2951,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -2956,9 +2960,9 @@ def logp(self, value): alpha = self.alpha beta = self.beta return bound( - tt.log(alpha) - - tt.log(beta) - + (alpha - 1) * tt.log(value / beta) + aet.log(alpha) + - aet.log(beta) + + (alpha - 1) * aet.log(value / beta) - (value / beta) ** alpha, value >= 0, alpha > 0, @@ -2972,9 +2976,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. 
Returns ------- @@ -3053,12 +3057,12 @@ def __init__(self, nu=1, sigma=None, lam=None, sd=None, *args, **kwargs): if sd is not None: sigma = sd - self.mode = tt.as_tensor_variable(0) + self.mode = aet.as_tensor_variable(0) lam, sigma = get_tau_sigma(lam, sigma) - self.median = tt.as_tensor_variable(sigma) - self.sigma = self.sd = tt.as_tensor_variable(sigma) - self.lam = tt.as_tensor_variable(lam) - self.nu = nu = tt.as_tensor_variable(floatX(nu)) + self.median = aet.as_tensor_variable(sigma) + self.sigma = self.sd = aet.as_tensor_variable(sigma) + self.lam = aet.as_tensor_variable(lam) + self.nu = nu = aet.as_tensor_variable(floatX(nu)) assert_negative_support(sigma, "sigma", "HalfStudentT") assert_negative_support(lam, "lam", "HalfStudentT") @@ -3094,7 +3098,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -3105,11 +3109,11 @@ def logp(self, value): lam = self.lam return bound( - tt.log(2) + aet.log(2) + gammaln((nu + 1.0) / 2.0) - gammaln(nu / 2.0) - - 0.5 * tt.log(nu * np.pi * sigma ** 2) - - (nu + 1.0) / 2.0 * tt.log1p(value ** 2 / (nu * sigma ** 2)), + - 0.5 * aet.log(nu * np.pi * sigma ** 2) + - (nu + 1.0) / 2.0 * aet.log1p(value ** 2 / (nu * sigma ** 2)), sigma > 0, lam > 0, nu > 0, @@ -3191,9 +3195,9 @@ def __init__(self, mu=0.0, sigma=None, nu=None, sd=None, *args, **kwargs): if sd is not None: sigma = sd - self.mu = mu = tt.as_tensor_variable(floatX(mu)) - self.sigma = self.sd = sigma = tt.as_tensor_variable(floatX(sigma)) - self.nu = nu = tt.as_tensor_variable(floatX(nu)) + self.mu = mu = aet.as_tensor_variable(floatX(mu)) + self.sigma = self.sd = sigma = aet.as_tensor_variable(floatX(sigma)) + self.nu = nu = aet.as_tensor_variable(floatX(nu)) self.mean = mu + nu self.variance = (sigma ** 2) + (nu ** 2) @@ -3234,7 +3238,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -3246,10 +3250,10 @@ def logp(self, value): # Alogithm is adapted from dexGAUS.R from gamlss return bound( - tt.switch( - tt.gt(nu, 0.05 * sigma), + aet.switch( + aet.gt(nu, 0.05 * sigma), ( - -tt.log(nu) + -aet.log(nu) + (mu - value) / nu + 0.5 * (sigma / nu) ** 2 + normal_lcdf(mu + (sigma ** 2) / nu, sigma, value) @@ -3273,9 +3277,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. 
Returns ------- @@ -3287,8 +3291,8 @@ def logcdf(self, value): # Alogithm is adapted from pexGAUS.R from gamlss return bound( - tt.switch( - tt.gt(nu, 0.05 * sigma), + aet.switch( + aet.gt(nu, 0.05 * sigma), logdiffexp( normal_lcdf(mu, sigma, value), ( @@ -3355,8 +3359,8 @@ def __init__(self, mu=0.0, kappa=None, transform="circular", *args, **kwargs): if transform == "circular": transform = transforms.Circular() super().__init__(transform=transform, *args, **kwargs) - self.mean = self.median = self.mode = self.mu = mu = tt.as_tensor_variable(floatX(mu)) - self.kappa = kappa = tt.as_tensor_variable(floatX(kappa)) + self.mean = self.median = self.mode = self.mu = mu = aet.as_tensor_variable(floatX(mu)) + self.kappa = kappa = aet.as_tensor_variable(floatX(kappa)) assert_negative_support(kappa, "kappa", "VonMises") @@ -3390,7 +3394,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -3399,7 +3403,7 @@ def logp(self, value): mu = self.mu kappa = self.kappa return bound( - kappa * tt.cos(mu - value) - (tt.log(2 * np.pi) + log_i0(kappa)), + kappa * aet.cos(mu - value) - (aet.log(2 * np.pi) + log_i0(kappa)), kappa > 0, value >= -np.pi, value <= np.pi, @@ -3474,11 +3478,11 @@ def __init__(self, mu=0.0, sigma=None, tau=None, alpha=1, sd=None, *args, **kwar sigma = sd tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) - self.mu = mu = tt.as_tensor_variable(floatX(mu)) - self.tau = tt.as_tensor_variable(tau) - self.sigma = self.sd = tt.as_tensor_variable(sigma) + self.mu = mu = aet.as_tensor_variable(floatX(mu)) + self.tau = aet.as_tensor_variable(tau) + self.sigma = self.sd = aet.as_tensor_variable(sigma) - self.alpha = alpha = tt.as_tensor_variable(floatX(alpha)) + self.alpha = alpha = aet.as_tensor_variable(floatX(alpha)) self.mean = mu + self.sigma * (2 / np.pi) ** 0.5 * alpha / (1 + alpha ** 2) ** 0.5 self.variance = self.sigma ** 2 * (1 - (2 * alpha ** 2) / ((1 + alpha ** 2) * np.pi)) @@ -3518,7 +3522,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -3529,8 +3533,8 @@ def logp(self, value): mu = self.mu alpha = self.alpha return bound( - tt.log(1 + tt.erf(((value - mu) * tt.sqrt(tau) * alpha) / tt.sqrt(2))) - + (-tau * (value - mu) ** 2 + tt.log(tau / np.pi / 2.0)) / 2.0, + aet.log(1 + aet.erf(((value - mu) * aet.sqrt(tau) * alpha) / aet.sqrt(2))) + + (-tau * (value - mu) ** 2 + aet.log(tau / np.pi / 2.0)) / 2.0, tau > 0, sigma > 0, ) @@ -3594,9 +3598,9 @@ class Triangular(BoundedContinuous): """ def __init__(self, lower=0, upper=1, c=0.5, *args, **kwargs): - self.median = self.mean = self.c = c = tt.as_tensor_variable(floatX(c)) - self.lower = lower = tt.as_tensor_variable(floatX(lower)) - self.upper = upper = tt.as_tensor_variable(floatX(upper)) + self.median = self.mean = self.c = c = aet.as_tensor_variable(floatX(c)) + self.lower = lower = aet.as_tensor_variable(floatX(lower)) + self.upper = upper = aet.as_tensor_variable(floatX(upper)) super().__init__(lower=lower, upper=upper, *args, **kwargs) @@ -3639,7 +3643,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -3649,10 +3653,10 @@ def logp(self, value): lower = self.lower upper = self.upper return bound( - tt.switch( - tt.lt(value, c), - tt.log(2 * (value - lower) / ((upper - lower) * (c - lower))), - tt.log(2 * (upper - value) / ((upper - lower) * (upper - c))), + aet.switch( + aet.lt(value, c), + aet.log(2 * (value - lower) / ((upper - lower) * (c - lower))), + aet.log(2 * (upper - value) / ((upper - lower) * (upper - c))), ), lower <= value, value <= upper, @@ -3665,9 +3669,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. 
Returns ------- @@ -3677,15 +3681,15 @@ def logcdf(self, value): lower = self.lower upper = self.upper return bound( - tt.switch( - tt.le(value, lower), + aet.switch( + aet.le(value, lower), -np.inf, - tt.switch( - tt.le(value, c), - tt.log(((value - lower) ** 2) / ((upper - lower) * (c - lower))), - tt.switch( - tt.lt(value, upper), - tt.log1p(-((upper - value) ** 2) / ((upper - lower) * (upper - c))), + aet.switch( + aet.le(value, c), + aet.log(((value - lower) ** 2) / ((upper - lower) * (c - lower))), + aet.switch( + aet.lt(value, upper), + aet.log1p(-((upper - value) ** 2) / ((upper - lower) * (upper - c))), 0, ), ), @@ -3743,13 +3747,13 @@ class Gumbel(Continuous): """ def __init__(self, mu=0, beta=1.0, **kwargs): - self.mu = tt.as_tensor_variable(floatX(mu)) - self.beta = tt.as_tensor_variable(floatX(beta)) + self.mu = aet.as_tensor_variable(floatX(mu)) + self.beta = aet.as_tensor_variable(floatX(beta)) assert_negative_support(beta, "beta", "Gumbel") self.mean = self.mu + self.beta * np.euler_gamma - self.median = self.mu - self.beta * tt.log(tt.log(2)) + self.median = self.mu - self.beta * aet.log(aet.log(2)) self.mode = self.mu self.variance = (np.pi ** 2 / 6.0) * self.beta ** 2 @@ -3785,7 +3789,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -3795,7 +3799,7 @@ def logp(self, value): beta = self.beta scaled = (value - mu) / beta return bound( - -scaled - tt.exp(-scaled) - tt.log(self.beta), + -scaled - aet.exp(-scaled) - aet.log(self.beta), 0 < beta, ) @@ -3806,9 +3810,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. 
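As noted above, several values can be scored at once; a hedged, illustrative
example with arbitrary parameters:

>>> import numpy as np
>>> import pymc3 as pm
>>> pm.Gumbel.dist(mu=0.0, beta=1.0).logcdf(np.array([0.0, 1.0])).eval()

which evaluates ``-exp(-(value - mu) / beta)`` elementwise, roughly
``[-1.0, -0.368]``.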
Returns ------- @@ -3818,7 +3822,7 @@ def logcdf(self, value): mu = self.mu return bound( - -tt.exp(-(value - mu) / beta), + -aet.exp(-(value - mu) / beta), 0 < beta, ) @@ -3888,18 +3892,18 @@ def __init__(self, nu=None, sigma=None, b=None, sd=None, *args, **kwargs): sigma = sd nu, b, sigma = self.get_nu_b(nu, b, sigma) - self.nu = nu = tt.as_tensor_variable(floatX(nu)) - self.sigma = self.sd = sigma = tt.as_tensor_variable(floatX(sigma)) - self.b = b = tt.as_tensor_variable(floatX(b)) + self.nu = nu = aet.as_tensor_variable(floatX(nu)) + self.sigma = self.sd = sigma = aet.as_tensor_variable(floatX(sigma)) + self.b = b = aet.as_tensor_variable(floatX(b)) nu_sigma_ratio = -(nu ** 2) / (2 * sigma ** 2) self.mean = ( sigma * np.sqrt(np.pi / 2) - * tt.exp(nu_sigma_ratio / 2) + * aet.exp(nu_sigma_ratio / 2) * ( - (1 - nu_sigma_ratio) * tt.i0(-nu_sigma_ratio / 2) - - nu_sigma_ratio * tt.i1(-nu_sigma_ratio / 2) + (1 - nu_sigma_ratio) * aet.i0(-nu_sigma_ratio / 2) + - nu_sigma_ratio * aet.i1(-nu_sigma_ratio / 2) ) ) self.variance = ( @@ -3907,10 +3911,10 @@ def __init__(self, nu=None, sigma=None, b=None, sd=None, *args, **kwargs): + nu ** 2 - (np.pi * sigma ** 2 / 2) * ( - tt.exp(nu_sigma_ratio / 2) + aet.exp(nu_sigma_ratio / 2) * ( - (1 - nu_sigma_ratio) * tt.i0(-nu_sigma_ratio / 2) - - nu_sigma_ratio * tt.i1(-nu_sigma_ratio / 2) + (1 - nu_sigma_ratio) * aet.i0(-nu_sigma_ratio / 2) + - nu_sigma_ratio * aet.i1(-nu_sigma_ratio / 2) ) ) ** 2 @@ -3963,7 +3967,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -3974,7 +3978,7 @@ def logp(self, value): b = self.b x = value / sigma return bound( - tt.log(x * tt.exp((-(x - b) * (x - b)) / 2) * i0e(x * b) / sigma), + aet.log(x * aet.exp((-(x - b) * (x - b)) / 2) * i0e(x * b) / sigma), sigma >= 0, nu >= 0, value > 0, @@ -4030,8 +4034,8 @@ class Logistic(Continuous): def __init__(self, mu=0.0, s=1.0, *args, **kwargs): super().__init__(*args, **kwargs) - self.mu = tt.as_tensor_variable(floatX(mu)) - self.s = tt.as_tensor_variable(floatX(s)) + self.mu = aet.as_tensor_variable(floatX(mu)) + self.s = aet.as_tensor_variable(floatX(s)) self.mean = self.mode = mu self.variance = s ** 2 * np.pi ** 2 / 3.0 @@ -4067,7 +4071,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -4077,7 +4081,7 @@ def logp(self, value): s = self.s return bound( - -(value - mu) / s - tt.log(s) - 2 * tt.log1p(tt.exp(-(value - mu) / s)), + -(value - mu) / s - aet.log(s) - 2 * aet.log1p(aet.exp(-(value - mu) / s)), s > 0, ) @@ -4088,9 +4092,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. 
Returns ------- @@ -4151,10 +4155,10 @@ class LogitNormal(UnitContinuous): def __init__(self, mu=0, sigma=None, tau=None, sd=None, **kwargs): if sd is not None: sigma = sd - self.mu = mu = tt.as_tensor_variable(floatX(mu)) + self.mu = mu = aet.as_tensor_variable(floatX(mu)) tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) - self.sigma = self.sd = tt.as_tensor_variable(sigma) - self.tau = tau = tt.as_tensor_variable(tau) + self.sigma = self.sd = aet.as_tensor_variable(sigma) + self.tau = tau = aet.as_tensor_variable(tau) self.median = invlogit(mu) assert_negative_support(sigma, "sigma", "LogitNormal") @@ -4192,7 +4196,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -4202,8 +4206,8 @@ def logp(self, value): tau = self.tau return bound( -0.5 * tau * (logit(value) - mu) ** 2 - + 0.5 * tt.log(tau / (2.0 * np.pi)) - - tt.log(value * (1 - value)), + + 0.5 * aet.log(tau / (2.0 * np.pi)) + - aet.log(value * (1 - value)), value > 0, value < 1, tau > 0, @@ -4242,15 +4246,15 @@ class Interpolated(BoundedContinuous): """ def __init__(self, x_points, pdf_points, *args, **kwargs): - self.lower = lower = tt.as_tensor_variable(x_points[0]) - self.upper = upper = tt.as_tensor_variable(x_points[-1]) + self.lower = lower = aet.as_tensor_variable(x_points[0]) + self.upper = upper = aet.as_tensor_variable(x_points[-1]) super().__init__(lower=lower, upper=upper, *args, **kwargs) interp = InterpolatedUnivariateSpline(x_points, pdf_points, k=1, ext="zeros") Z = interp.integral(x_points[0], x_points[-1]) - self.Z = tt.as_tensor_variable(Z) + self.Z = aet.as_tensor_variable(Z) self.interp_op = SplineWrapper(interp) self.x_points = x_points self.pdf_points = pdf_points / Z @@ -4301,13 +4305,13 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- TensorVariable """ - return tt.log(self.interp_op(value) / self.Z) + return aet.log(self.interp_op(value) / self.Z) def _distr_parameters_for_repr(self): return [] @@ -4361,13 +4365,13 @@ class Moyal(Continuous): """ def __init__(self, mu=0, sigma=1.0, *args, **kwargs): - self.mu = tt.as_tensor_variable(floatX(mu)) - self.sigma = tt.as_tensor_variable(floatX(sigma)) + self.mu = aet.as_tensor_variable(floatX(mu)) + self.sigma = aet.as_tensor_variable(floatX(sigma)) assert_negative_support(sigma, "sigma", "Moyal") - self.mean = self.mu + self.sigma * (np.euler_gamma + tt.log(2)) - self.median = self.mu - self.sigma * tt.log(2 * tt.erfcinv(1 / 2) ** 2) + self.mean = self.mu + self.sigma * (np.euler_gamma + aet.log(2)) + self.median = self.mu - self.sigma * aet.log(2 * aet.erfcinv(1 / 2) ** 2) self.mode = self.mu self.variance = (np.pi ** 2 / 2.0) * self.sigma ** 2 @@ -4403,7 +4407,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
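The Interpolated distribution touched above builds a degree-one spline, normalizes it by its integral Z, and evaluates log(pdf(x) / Z). A minimal SciPy-only sketch of that idea (illustrative; the grid and density below are invented):

import numpy as np
from scipy.interpolate import InterpolatedUnivariateSpline

x_points = np.linspace(-2.0, 2.0, 50)
pdf_points = np.exp(-x_points ** 2)                   # arbitrary unnormalized density

interp = InterpolatedUnivariateSpline(x_points, pdf_points, k=1, ext="zeros")
Z = interp.integral(x_points[0], x_points[-1])        # normalizing constant

value = 0.3
logp = np.log(interp(value) / Z)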
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -4413,7 +4417,11 @@ def logp(self, value): sigma = self.sigma scaled = (value - mu) / sigma return bound( - (-(1 / 2) * (scaled + tt.exp(-scaled)) - tt.log(sigma) - (1 / 2) * tt.log(2 * np.pi)), + ( + -(1 / 2) * (scaled + aet.exp(-scaled)) + - aet.log(sigma) + - (1 / 2) * aet.log(2 * np.pi) + ), 0 < sigma, ) @@ -4424,9 +4432,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- @@ -4437,6 +4445,6 @@ def logcdf(self, value): scaled = (value - mu) / sigma return bound( - tt.log(tt.erfc(tt.exp(-scaled / 2) * (2 ** -0.5))), + aet.log(aet.erfc(aet.exp(-scaled / 2) * (2 ** -0.5))), 0 < sigma, ) diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index 0bac6fd6b23..06cd504f403 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -14,11 +14,12 @@ import warnings +import aesara.tensor as aet import numpy as np -import theano.tensor as tt from scipy import stats +from pymc3.aesaraf import floatX, intX, take_along_axis from pymc3.distributions.dist_math import ( betaln, binomln, @@ -34,7 +35,6 @@ from pymc3.distributions.distribution import Discrete, draw_values, generate_samples from pymc3.distributions.shape_utils import broadcast_distribution_samples from pymc3.math import log1mexp, log1pexp, logaddexp, logit, logsumexp, sigmoid, tround -from pymc3.theanof import floatX, intX, take_along_axis __all__ = [ "Binomial", @@ -100,9 +100,9 @@ class Binomial(Discrete): def __init__(self, n, p, *args, **kwargs): super().__init__(*args, **kwargs) - self.n = n = tt.as_tensor_variable(intX(n)) - self.p = p = tt.as_tensor_variable(floatX(p)) - self.mode = tt.cast(tround(n * p), self.dtype) + self.n = n = aet.as_tensor_variable(intX(n)) + self.p = p = aet.as_tensor_variable(floatX(p)) + self.mode = aet.cast(tround(n * p), self.dtype) def random(self, point=None, size=None): r""" @@ -132,7 +132,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
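The Moyal log-CDF rewritten above evaluates log(erfc(exp(-z / 2) / sqrt(2))) for the standardized value z. A quick check against scipy.stats.moyal (illustrative only; mu and sigma are arbitrary, and scipy.stats.moyal requires a reasonably recent SciPy):

import numpy as np
from scipy import special, stats

mu, sigma = 0.5, 2.0                                  # arbitrary example parameters
x = np.linspace(-3.0, 8.0, 7)
z = (x - mu) / sigma

logcdf_closed_form = np.log(special.erfc(np.exp(-z / 2.0) / np.sqrt(2.0)))
logcdf_scipy = stats.moyal.logcdf(x, loc=mu, scale=sigma)

assert np.allclose(logcdf_closed_form, logcdf_scipy)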
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -171,12 +171,12 @@ def logcdf(self, value): n = self.n p = self.p - value = tt.floor(value) + value = aet.floor(value) return bound( - tt.switch( - tt.lt(value, n), - tt.log(incomplete_beta(n - value, value + 1, 1 - p)), + aet.switch( + aet.lt(value, n), + aet.log(incomplete_beta(n - value, value + 1, 1 - p)), 0, ), 0 <= value, @@ -243,10 +243,10 @@ def BetaBinom(a, b, n, x): def __init__(self, alpha, beta, n, *args, **kwargs): super().__init__(*args, **kwargs) - self.alpha = alpha = tt.as_tensor_variable(floatX(alpha)) - self.beta = beta = tt.as_tensor_variable(floatX(beta)) - self.n = n = tt.as_tensor_variable(intX(n)) - self.mode = tt.cast(tround(alpha / (alpha + beta)), "int8") + self.alpha = alpha = aet.as_tensor_variable(floatX(alpha)) + self.beta = beta = aet.as_tensor_variable(floatX(beta)) + self.n = n = aet.as_tensor_variable(intX(n)) + self.mode = aet.cast(tround(alpha / (alpha + beta)), "int8") def _random(self, alpha, beta, n, size=None): size = size or () @@ -300,7 +300,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -340,12 +340,12 @@ def logcdf(self, value): alpha = self.alpha beta = self.beta n = self.n - safe_lower = tt.switch(tt.lt(value, 0), value, 0) + safe_lower = aet.switch(aet.lt(value, 0), value, 0) return bound( - tt.switch( - tt.lt(value, n), - logsumexp(self.logp(tt.arange(safe_lower, value + 1)), keepdims=False), + aet.switch( + aet.lt(value, n), + logsumexp(self.logp(aet.arange(safe_lower, value + 1)), keepdims=False), 0, ), 0 <= value, @@ -401,14 +401,14 @@ def __init__(self, p=None, logit_p=None, *args, **kwargs): raise ValueError("Specify one of p and logit_p") if p is not None: self._is_logit = False - self.p = p = tt.as_tensor_variable(floatX(p)) + self.p = p = aet.as_tensor_variable(floatX(p)) self._logit_p = logit(p) else: self._is_logit = True - self.p = tt.nnet.sigmoid(floatX(logit_p)) - self._logit_p = tt.as_tensor_variable(logit_p) + self.p = aet.nnet.sigmoid(floatX(logit_p)) + self._logit_p = aet.as_tensor_variable(logit_p) - self.mode = tt.cast(tround(self.p), "int8") + self.mode = aet.cast(tround(self.p), "int8") def random(self, point=None, size=None): r""" @@ -438,19 +438,23 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
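The Binomial log-CDF above (for value < n) goes through incomplete_beta, relying on the identity P(X <= k) = I_{1-p}(n - k, k + 1). SciPy's regularized incomplete beta confirms the identity numerically (illustrative only; n and p are arbitrary):

import numpy as np
from scipy import special, stats

n, p = 10, 0.35                                       # arbitrary example parameters
k = np.arange(0, n)                                   # the value < n branch of the switch

cdf_identity = special.betainc(n - k, k + 1, 1 - p)
cdf_scipy = stats.binom.cdf(k, n, p)

assert np.allclose(cdf_identity, cdf_scipy)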
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- TensorVariable """ if self._is_logit: - lp = tt.switch(value, self._logit_p, -self._logit_p) + lp = aet.switch(value, self._logit_p, -self._logit_p) return -log1pexp(-lp) else: p = self.p return bound( - tt.switch(value, tt.log(p), tt.log(1 - p)), value >= 0, value <= 1, p >= 0, p <= 1 + aet.switch(value, aet.log(p), aet.log(1 - p)), + value >= 0, + value <= 1, + p >= 0, + p <= 1, ) def logcdf(self, value): @@ -460,9 +464,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- @@ -471,9 +475,9 @@ def logcdf(self, value): p = self.p return bound( - tt.switch( - tt.lt(value, 1), - tt.log1p(-p), + aet.switch( + aet.lt(value, 1), + aet.log1p(-p), 0, ), 0 <= value, @@ -527,8 +531,8 @@ def DiscreteWeibull(q, b, x): def __init__(self, q, beta, *args, **kwargs): super().__init__(*args, defaults=("median",), **kwargs) - self.q = tt.as_tensor_variable(floatX(q)) - self.beta = tt.as_tensor_variable(floatX(beta)) + self.q = aet.as_tensor_variable(floatX(q)) + self.beta = aet.as_tensor_variable(floatX(beta)) self.median = self._ppf(0.5) @@ -540,7 +544,7 @@ def _ppf(self, p): q = self.q beta = self.beta - return (tt.ceil(tt.power(tt.log(1 - p) / tt.log(q), 1.0 / beta)) - 1).astype("int64") + return (aet.ceil(aet.power(aet.log(1 - p) / aet.log(q), 1.0 / beta)) - 1).astype("int64") def _random(self, q, beta, size=None): p = np.random.uniform(size=size) @@ -576,7 +580,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -585,7 +589,9 @@ def logp(self, value): q = self.q beta = self.beta return bound( - tt.log(tt.power(q, tt.power(value, beta)) - tt.power(q, tt.power(value + 1, beta))), + aet.log( + aet.power(q, aet.power(value, beta)) - aet.power(q, aet.power(value + 1, beta)) + ), 0 <= value, 0 < q, q < 1, @@ -599,9 +605,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. 
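The logit-parametrized Bernoulli branch above computes -log1pexp(-lp), with lp equal to logit_p for successes and -logit_p for failures, i.e. log sigmoid(eta) and log(1 - sigmoid(eta)). A NumPy check (illustrative; the logits are arbitrary and small enough that the naive expressions do not overflow):

import numpy as np

eta = np.array([-3.0, -0.5, 0.0, 2.0])                # arbitrary example logits
p = 1.0 / (1.0 + np.exp(-eta))

logp_one = -np.log1p(np.exp(-eta))                    # value = 1: log sigmoid(eta)
logp_zero = -np.log1p(np.exp(eta))                    # value = 0: log(1 - sigmoid(eta))

assert np.allclose(logp_one, np.log(p))
assert np.allclose(logp_zero, np.log(1.0 - p))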
Returns ------- @@ -611,7 +617,7 @@ def logcdf(self, value): beta = self.beta return bound( - tt.log1p(-tt.power(q, tt.power(value + 1, beta))), + aet.log1p(-aet.power(q, aet.power(value + 1, beta))), 0 <= value, 0 < q, q < 1, @@ -665,8 +671,8 @@ class Poisson(Discrete): def __init__(self, mu, *args, **kwargs): super().__init__(*args, **kwargs) - self.mu = mu = tt.as_tensor_variable(floatX(mu)) - self.mode = intX(tt.floor(mu)) + self.mu = mu = aet.as_tensor_variable(floatX(mu)) + self.mode = intX(aet.floor(mu)) def random(self, point=None, size=None): r""" @@ -696,7 +702,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -705,7 +711,7 @@ def logp(self, value): mu = self.mu log_prob = bound(logpow(mu, value) - factln(value) - mu, mu >= 0, value >= 0) # Return zero when mu and value are both zero - return tt.switch(tt.eq(mu, 0) * tt.eq(value, 0), 0, log_prob) + return aet.switch(aet.eq(mu, 0) * aet.eq(value, 0), 0, log_prob) def logcdf(self, value): """ @@ -714,22 +720,22 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- TensorVariable """ mu = self.mu - value = tt.floor(value) + value = aet.floor(value) # Avoid C-assertion when the gammaincc function is called with invalid values (#4340) - safe_mu = tt.switch(tt.lt(mu, 0), 0, mu) - safe_value = tt.switch(tt.lt(value, 0), 0, value) + safe_mu = aet.switch(aet.lt(mu, 0), 0, mu) + safe_value = aet.switch(aet.lt(value, 0), 0, value) return bound( - tt.log(tt.gammaincc(safe_value + 1, safe_mu)), + aet.log(aet.gammaincc(safe_value + 1, safe_mu)), 0 <= value, 0 <= mu, ) @@ -800,16 +806,16 @@ def NegBinom(a, m, x): def __init__(self, mu=None, alpha=None, p=None, n=None, *args, **kwargs): super().__init__(*args, **kwargs) mu, alpha = self.get_mu_alpha(mu, alpha, p, n) - self.mu = mu = tt.as_tensor_variable(floatX(mu)) - self.alpha = alpha = tt.as_tensor_variable(floatX(alpha)) - self.mode = intX(tt.floor(mu)) + self.mu = mu = aet.as_tensor_variable(floatX(mu)) + self.alpha = alpha = aet.as_tensor_variable(floatX(alpha)) + self.mode = intX(aet.floor(mu)) def get_mu_alpha(self, mu=None, alpha=None, p=None, n=None): self._param_type = ["mu", "alpha"] if alpha is None: if n is not None: self._param_type[1] = "n" - self.n = tt.as_tensor_variable(intX(n)) + self.n = aet.as_tensor_variable(intX(n)) alpha = n else: raise ValueError("Incompatible parametrization. Must specify either alpha or n.") @@ -819,7 +825,7 @@ def get_mu_alpha(self, mu=None, alpha=None, p=None, n=None): if mu is None: if p is not None: self._param_type[0] = "p" - self.p = tt.as_tensor_variable(floatX(p)) + self.p = aet.as_tensor_variable(floatX(p)) mu = alpha * (1 - p) / p else: raise ValueError("Incompatible parametrization. Must specify either mu or p.") @@ -870,7 +876,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
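The Poisson log-CDF above uses the regularized upper incomplete gamma function, via the identity P(X <= k) = Q(k + 1, mu). SciPy reproduces it (illustrative only; mu is an arbitrary rate):

import numpy as np
from scipy import special, stats

mu = 3.2                                              # arbitrary example rate
k = np.arange(0, 12)

cdf_identity = special.gammaincc(k + 1, mu)
cdf_scipy = stats.poisson.cdf(k, mu)

assert np.allclose(cdf_identity, cdf_scipy)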
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -888,7 +894,7 @@ def logp(self, value): ) # Return Poisson when alpha gets very large. - return tt.switch(tt.gt(alpha, 1e10), Poisson.dist(self.mu).logp(value), negbinom) + return aet.switch(aet.gt(alpha, 1e10), Poisson.dist(self.mu).logp(value), negbinom) def logcdf(self, value): """ @@ -915,7 +921,7 @@ def logcdf(self, value): p = alpha / (self.mu + alpha) return bound( - tt.log(incomplete_beta(alpha, tt.floor(value) + 1, p)), + aet.log(incomplete_beta(alpha, aet.floor(value) + 1, p)), 0 <= value, 0 < alpha, 0 <= p, @@ -965,7 +971,7 @@ class Geometric(Discrete): def __init__(self, p, *args, **kwargs): super().__init__(*args, **kwargs) - self.p = p = tt.as_tensor_variable(floatX(p)) + self.p = p = aet.as_tensor_variable(floatX(p)) self.mode = 1 def random(self, point=None, size=None): @@ -996,14 +1002,14 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- TensorVariable """ p = self.p - return bound(tt.log(p) + logpow(1 - p, value - 1), 0 <= p, p <= 1, value >= 1) + return bound(aet.log(p) + logpow(1 - p, value - 1), 0 <= p, p <= 1, value >= 1) def logcdf(self, value): """ @@ -1012,9 +1018,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- @@ -1023,7 +1029,7 @@ def logcdf(self, value): p = self.p return bound( - log1mexp(-tt.log1p(-p) * value), + log1mexp(-aet.log1p(-p) * value), 0 <= value, 0 <= p, p <= 1, @@ -1081,7 +1087,7 @@ def __init__(self, N, k, n, *args, **kwargs): self.N = intX(N) self.k = intX(k) self.n = intX(n) - self.mode = intX(tt.floor((n + 1) * (k + 1) / (N + 2))) + self.mode = intX(aet.floor((n + 1) * (k + 1) / (N + 2))) def random(self, point=None, size=None): r""" @@ -1120,7 +1126,7 @@ def logp(self, value): ---------- value : numeric Value(s) for which log-probability is calculated. 
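The Geometric log-CDF a few hunks above evaluates log(1 - (1 - p)**k) through log1mexp. With the support starting at k = 1, this matches scipy.stats.geom (illustrative only; p is arbitrary):

import numpy as np
from scipy import stats

p = 0.25                                              # arbitrary example success probability
k = np.arange(1, 10)

cdf_closed_form = 1.0 - (1.0 - p) ** k
cdf_scipy = stats.geom.cdf(k, p)

assert np.allclose(cdf_closed_form, cdf_scipy)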
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -1140,8 +1146,8 @@ def logp(self, value): - betaln(tot + 1, 1) ) # value in [max(0, n - N + k), min(k, n)] - lower = tt.switch(tt.gt(n - N + k, 0), n - N + k, 0) - upper = tt.switch(tt.lt(k, n), k, n) + lower = aet.switch(aet.gt(n - N + k, 0), n - N + k, 0) + upper = aet.switch(aet.lt(k, n), k, n) return bound(result, lower <= value, value <= upper) def logcdf(self, value): @@ -1168,12 +1174,12 @@ def logcdf(self, value): N = self.N n = self.n k = self.k - safe_lower = tt.switch(tt.lt(value, 0), value, 0) + safe_lower = aet.switch(aet.lt(value, 0), value, 0) return bound( - tt.switch( - tt.lt(value, n), - logsumexp(self.logp(tt.arange(safe_lower, value + 1)), keepdims=False), + aet.switch( + aet.lt(value, n), + logsumexp(self.logp(aet.arange(safe_lower, value + 1)), keepdims=False), 0, ), 0 <= value, @@ -1226,9 +1232,9 @@ class DiscreteUniform(Discrete): def __init__(self, lower, upper, *args, **kwargs): super().__init__(*args, **kwargs) - self.lower = intX(tt.floor(lower)) - self.upper = intX(tt.floor(upper)) - self.mode = tt.maximum(intX(tt.floor((upper + lower) / 2.0)), self.lower) + self.lower = intX(aet.floor(lower)) + self.upper = intX(aet.floor(upper)) + self.mode = aet.maximum(intX(aet.floor((upper + lower) / 2.0)), self.lower) def _random(self, lower, upper, size=None): # This way seems to be the only to deal with lower and upper @@ -1264,7 +1270,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -1272,7 +1278,7 @@ def logp(self, value): """ upper = self.upper lower = self.lower - return bound(-tt.log(upper - lower + 1), lower <= value, value <= upper) + return bound(-aet.log(upper - lower + 1), lower <= value, value <= upper) def logcdf(self, value): """ @@ -1281,9 +1287,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. 
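The HyperGeometric log-CDF above (like the BetaBinomial one) accumulates the pmf up to `value` in log space with logsumexp rather than summing probabilities directly. The same pattern in SciPy (illustrative only; the population, success, and draw counts are arbitrary):

import numpy as np
from scipy import stats
from scipy.special import logsumexp

N, k, n = 50, 10, 5                                   # population, successes, draws (arbitrary)
value = 3

logcdf = logsumexp(stats.hypergeom.logpmf(np.arange(0, value + 1), N, k, n))
assert np.isclose(logcdf, stats.hypergeom.logcdf(value, N, k, n))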
Returns ------- @@ -1293,9 +1299,10 @@ def logcdf(self, value): lower = self.lower return bound( - tt.switch( - tt.lt(value, upper), - tt.log(tt.minimum(tt.floor(value), upper) - lower + 1) - tt.log(upper - lower + 1), + aet.switch( + aet.lt(value, upper), + aet.log(aet.minimum(aet.floor(value), upper) - lower + 1) + - aet.log(upper - lower + 1), 0, ), lower <= value, @@ -1341,17 +1348,17 @@ class Categorical(Discrete): def __init__(self, p, *args, **kwargs): super().__init__(*args, **kwargs) try: - self.k = tt.shape(p)[-1].tag.test_value + self.k = aet.shape(p)[-1].tag.test_value except AttributeError: - self.k = tt.shape(p)[-1] - p = tt.as_tensor_variable(floatX(p)) + self.k = aet.shape(p)[-1] + p = aet.as_tensor_variable(floatX(p)) # From #2082, it may be dangerous to automatically rescale p at this # point without checking for positiveness self.p = p - self.mode = tt.argmax(p, axis=-1) + self.mode = aet.argmax(p, axis=-1) if self.mode.ndim == 1: - self.mode = tt.squeeze(self.mode) + self.mode = aet.squeeze(self.mode) def random(self, point=None, size=None): r""" @@ -1389,7 +1396,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -1399,27 +1406,27 @@ def logp(self, value): k = self.k # Clip values before using them for indexing - value_clip = tt.clip(value, 0, k - 1) + value_clip = aet.clip(value, 0, k - 1) - p = p_ / tt.sum(p_, axis=-1, keepdims=True) + p = p_ / aet.sum(p_, axis=-1, keepdims=True) if p.ndim > 1: if p.ndim > value_clip.ndim: - value_clip = tt.shape_padleft(value_clip, p_.ndim - value_clip.ndim) + value_clip = aet.shape_padleft(value_clip, p_.ndim - value_clip.ndim) elif p.ndim < value_clip.ndim: - p = tt.shape_padleft(p, value_clip.ndim - p_.ndim) + p = aet.shape_padleft(p, value_clip.ndim - p_.ndim) pattern = (p.ndim - 1,) + tuple(range(p.ndim - 1)) - a = tt.log( + a = aet.log( take_along_axis( p.dimshuffle(pattern), value_clip, ) ) else: - a = tt.log(p[value_clip]) + a = aet.log(p[value_clip]) return bound( - a, value >= 0, value <= (k - 1), tt.all(p_ >= 0, axis=-1), tt.all(p <= 1, axis=-1) + a, value >= 0, value <= (k - 1), aet.all(p_ >= 0, axis=-1), aet.all(p <= 1, axis=-1) ) @@ -1439,7 +1446,7 @@ def __init__(self, c, *args, **kwargs): DeprecationWarning, ) super().__init__(*args, **kwargs) - self.mean = self.median = self.mode = self.c = c = tt.as_tensor_variable(c) + self.mean = self.median = self.mode = self.c = c = aet.as_tensor_variable(c) def random(self, point=None, size=None): r""" @@ -1474,14 +1481,14 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
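Categorical.logp above normalizes p along its last axis and then looks up log(p) at the observed category with a take_along_axis-style indexing (the diff's dimshuffle handling is more general than this). A plain NumPy sketch of the idea (illustrative; the probabilities and observations are invented):

import numpy as np

p = np.array([[0.2, 0.5, 0.3],
              [0.1, 0.1, 0.8]])                       # a batch of two categorical rows
value = np.array([1, 2])                              # observed category per row

p_norm = p / p.sum(axis=-1, keepdims=True)
logp = np.log(np.take_along_axis(p_norm, value[:, None], axis=-1)).squeeze(-1)

assert np.allclose(logp, [np.log(0.5), np.log(0.8)])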
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- TensorVariable """ c = self.c - return bound(0, tt.eq(value, c)) + return bound(0, aet.eq(value, c)) ConstantDist = Constant @@ -1539,8 +1546,8 @@ class ZeroInflatedPoisson(Discrete): def __init__(self, psi, theta, *args, **kwargs): super().__init__(*args, **kwargs) - self.theta = theta = tt.as_tensor_variable(floatX(theta)) - self.psi = tt.as_tensor_variable(floatX(psi)) + self.theta = theta = aet.as_tensor_variable(floatX(theta)) + self.psi = aet.as_tensor_variable(floatX(psi)) self.pois = Poisson.dist(theta) self.mode = self.pois.mode @@ -1574,7 +1581,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -1583,10 +1590,10 @@ def logp(self, value): psi = self.psi theta = self.theta - logp_val = tt.switch( - tt.gt(value, 0), - tt.log(psi) + self.pois.logp(value), - logaddexp(tt.log1p(-psi), tt.log(psi) - theta), + logp_val = aet.switch( + aet.gt(value, 0), + aet.log(psi) + self.pois.logp(value), + logaddexp(aet.log1p(-psi), aet.log(psi) - theta), ) return bound(logp_val, 0 <= value, 0 <= psi, psi <= 1, 0 <= theta) @@ -1598,9 +1605,9 @@ def logcdf(self, value): Parameters ---------- - value: numeric or np.ndarray or theano.tensor + value: numeric or np.ndarray or aesara.tensor Value(s) for which log CDF is calculated. If the log CDF for multiple - values are desired the values must be provided in a numpy array or theano tensor. + values are desired the values must be provided in a numpy array or aesara tensor. Returns ------- @@ -1609,7 +1616,7 @@ def logcdf(self, value): psi = self.psi return bound( - logaddexp(tt.log1p(-psi), tt.log(psi) + self.pois.logcdf(value)), + logaddexp(aet.log1p(-psi), aet.log(psi) + self.pois.logcdf(value)), 0 <= value, 0 <= psi, psi <= 1, @@ -1669,9 +1676,9 @@ class ZeroInflatedBinomial(Discrete): def __init__(self, psi, n, p, *args, **kwargs): super().__init__(*args, **kwargs) - self.n = n = tt.as_tensor_variable(intX(n)) - self.p = p = tt.as_tensor_variable(floatX(p)) - self.psi = psi = tt.as_tensor_variable(floatX(psi)) + self.n = n = aet.as_tensor_variable(intX(n)) + self.p = p = aet.as_tensor_variable(floatX(p)) + self.psi = psi = aet.as_tensor_variable(floatX(psi)) self.bin = Binomial.dist(n, p) self.mode = self.bin.mode @@ -1705,7 +1712,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
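ZeroInflatedPoisson.logp above is a two-component mixture: log(psi) plus the Poisson term for positive counts, and logaddexp(log1p(-psi), log(psi) - theta) at zero. A NumPy/SciPy check that this equals the log of the mixture pmf (illustrative only; psi and theta are arbitrary):

import numpy as np
from scipy import stats

psi, theta = 0.7, 2.5                                 # arbitrary example parameters
k = np.arange(0, 8)

pmf_mixture = (1 - psi) * (k == 0) + psi * stats.poisson.pmf(k, theta)
logp_branches = np.where(
    k > 0,
    np.log(psi) + stats.poisson.logpmf(k, theta),
    np.logaddexp(np.log1p(-psi), np.log(psi) - theta),
)

assert np.allclose(logp_branches, np.log(pmf_mixture))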
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -1715,10 +1722,10 @@ def logp(self, value): p = self.p n = self.n - logp_val = tt.switch( - tt.gt(value, 0), - tt.log(psi) + self.bin.logp(value), - logaddexp(tt.log1p(-psi), tt.log(psi) + n * tt.log1p(-p)), + logp_val = aet.switch( + aet.gt(value, 0), + aet.log(psi) + self.bin.logp(value), + logaddexp(aet.log1p(-psi), aet.log(psi) + n * aet.log1p(-p)), ) return bound(logp_val, 0 <= value, value <= n, 0 <= psi, psi <= 1, 0 <= p, p <= 1) @@ -1746,7 +1753,7 @@ def logcdf(self, value): psi = self.psi return bound( - logaddexp(tt.log1p(-psi), tt.log(psi) + self.bin.logcdf(value)), + logaddexp(aet.log1p(-psi), aet.log(psi) + self.bin.logcdf(value)), 0 <= value, 0 <= psi, psi <= 1, @@ -1823,9 +1830,9 @@ def ZeroInfNegBinom(a, m, psi, x): def __init__(self, psi, mu, alpha, *args, **kwargs): super().__init__(*args, **kwargs) - self.mu = mu = tt.as_tensor_variable(floatX(mu)) - self.alpha = alpha = tt.as_tensor_variable(floatX(alpha)) - self.psi = psi = tt.as_tensor_variable(floatX(psi)) + self.mu = mu = aet.as_tensor_variable(floatX(mu)) + self.alpha = alpha = aet.as_tensor_variable(floatX(alpha)) + self.psi = psi = aet.as_tensor_variable(floatX(psi)) self.nb = NegativeBinomial.dist(mu, alpha) self.mode = self.nb.mode @@ -1872,7 +1879,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -1882,12 +1889,12 @@ def logp(self, value): mu = self.mu psi = self.psi - logp_other = tt.log(psi) + self.nb.logp(value) + logp_other = aet.log(psi) + self.nb.logp(value) logp_0 = logaddexp( - tt.log1p(-psi), tt.log(psi) + alpha * (tt.log(alpha) - tt.log(alpha + mu)) + aet.log1p(-psi), aet.log(psi) + alpha * (aet.log(alpha) - aet.log(alpha + mu)) ) - logp_val = tt.switch(tt.gt(value, 0), logp_other, logp_0) + logp_val = aet.switch(aet.gt(value, 0), logp_other, logp_0) return bound(logp_val, 0 <= value, 0 <= psi, psi <= 1, mu > 0, alpha > 0) @@ -1913,7 +1920,7 @@ def logcdf(self, value): psi = self.psi return bound( - logaddexp(tt.log1p(-psi), tt.log(psi) + self.nb.logcdf(value)), + logaddexp(aet.log1p(-psi), aet.log(psi) + self.nb.logcdf(value)), 0 <= value, 0 <= psi, psi <= 1, @@ -1987,15 +1994,15 @@ class OrderedLogistic(Categorical): """ def __init__(self, eta, cutpoints, *args, **kwargs): - self.eta = tt.as_tensor_variable(floatX(eta)) - self.cutpoints = tt.as_tensor_variable(cutpoints) + self.eta = aet.as_tensor_variable(floatX(eta)) + self.cutpoints = aet.as_tensor_variable(cutpoints) - pa = sigmoid(self.cutpoints - tt.shape_padright(self.eta)) - p_cum = tt.concatenate( + pa = sigmoid(self.cutpoints - aet.shape_padright(self.eta)) + p_cum = aet.concatenate( [ - tt.zeros_like(tt.shape_padright(pa[..., 0])), + aet.zeros_like(aet.shape_padright(pa[..., 0])), pa, - tt.ones_like(tt.shape_padright(pa[..., 0])), + aet.ones_like(aet.shape_padright(pa[..., 0])), ], axis=-1, ) @@ -2076,23 +2083,23 @@ class OrderedProbit(Categorical): def __init__(self, eta, cutpoints, *args, **kwargs): - self.eta = tt.as_tensor_variable(floatX(eta)) - self.cutpoints = tt.as_tensor_variable(cutpoints) + self.eta = aet.as_tensor_variable(floatX(eta)) 
+ self.cutpoints = aet.as_tensor_variable(cutpoints) - probits = tt.shape_padright(self.eta) - self.cutpoints - _log_p = tt.concatenate( + probits = aet.shape_padright(self.eta) - self.cutpoints + _log_p = aet.concatenate( [ - tt.shape_padright(normal_lccdf(0, 1, probits[..., 0])), + aet.shape_padright(normal_lccdf(0, 1, probits[..., 0])), log_diff_normal_cdf(0, 1, probits[..., :-1], probits[..., 1:]), - tt.shape_padright(normal_lcdf(0, 1, probits[..., -1])), + aet.shape_padright(normal_lcdf(0, 1, probits[..., -1])), ], axis=-1, ) - _log_p = tt.as_tensor_variable(floatX(_log_p)) + _log_p = aet.as_tensor_variable(floatX(_log_p)) self._log_p = _log_p - self.mode = tt.argmax(_log_p, axis=-1) - p = tt.exp(_log_p) + self.mode = aet.argmax(_log_p, axis=-1) + p = aet.exp(_log_p) super().__init__(p=p, *args, **kwargs) @@ -2104,7 +2111,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -2114,13 +2121,13 @@ def logp(self, value): k = self.k # Clip values before using them for indexing - value_clip = tt.clip(value, 0, k - 1) + value_clip = aet.clip(value, 0, k - 1) if logp.ndim > 1: if logp.ndim > value_clip.ndim: - value_clip = tt.shape_padleft(value_clip, logp.ndim - value_clip.ndim) + value_clip = aet.shape_padleft(value_clip, logp.ndim - value_clip.ndim) elif logp.ndim < value_clip.ndim: - logp = tt.shape_padleft(logp, value_clip.ndim - logp.ndim) + logp = aet.shape_padleft(logp, value_clip.ndim - logp.ndim) pattern = (logp.ndim - 1,) + tuple(range(logp.ndim - 1)) a = take_along_axis( logp.dimshuffle(pattern), diff --git a/pymc3/distributions/dist_math.py b/pymc3/distributions/dist_math.py index 70877722271..e154e016f34 100644 --- a/pymc3/distributions/dist_math.py +++ b/pymc3/distributions/dist_math.py @@ -19,24 +19,25 @@ """ import platform +import aesara +import aesara.tensor as aet import numpy as np import scipy.linalg import scipy.stats -import theano -import theano.tensor as tt -from theano import scan -from theano.compile.builders import OpFromGraph -from theano.graph.basic import Apply -from theano.graph.op import Op -from theano.scalar import UnaryScalarOp, upgrade_to_float_no_complex -from theano.scan import until -from theano.tensor.slinalg import Cholesky +from aesara import scan +from aesara.compile.builders import OpFromGraph +from aesara.graph.basic import Apply +from aesara.graph.op import Op +from aesara.scalar import UnaryScalarOp, upgrade_to_float_no_complex +from aesara.scan import until +from aesara.tensor.elemwise import Elemwise +from aesara.tensor.slinalg import Cholesky, Solve +from pymc3.aesaraf import floatX from pymc3.distributions.shape_utils import to_tuple from pymc3.distributions.special import gammaln from pymc3.model import modelcontext -from pymc3.theanof import floatX f = floatX c = -0.5 * np.log(2.0 * np.pi) @@ -86,7 +87,7 @@ def bound(logp, *conditions, **kwargs): else: alltrue = alltrue_scalar - return tt.switch(alltrue(conditions), logp, -np.inf) + return aet.switch(alltrue(conditions), logp, -np.inf) def alltrue_elemwise(vals): @@ -97,7 +98,7 @@ def alltrue_elemwise(vals): def alltrue_scalar(vals): - return tt.all([tt.all(1 * val) for val in vals]) + return aet.all([aet.all(1 * val) for val in vals]) def logpow(x, m): @@ -105,7 +106,7 @@ def logpow(x, m): Calculates log(x**m) since 
m*log(x) will fail when m, x = 0. """ # return m * log(x) - return tt.switch(tt.eq(x, 0), tt.switch(tt.eq(m, 0), 0.0, -np.inf), m * tt.log(x)) + return aet.switch(aet.eq(x, 0), aet.switch(aet.eq(m, 0), 0.0, -np.inf), m * aet.log(x)) def factln(n): @@ -124,25 +125,25 @@ def std_cdf(x): """ Calculates the standard normal cumulative distribution function. """ - return 0.5 + 0.5 * tt.erf(x / tt.sqrt(2.0)) + return 0.5 + 0.5 * aet.erf(x / aet.sqrt(2.0)) def normal_lcdf(mu, sigma, x): """Compute the log of the cumulative density function of the normal.""" z = (x - mu) / sigma - return tt.switch( - tt.lt(z, -1.0), - tt.log(tt.erfcx(-z / tt.sqrt(2.0)) / 2.0) - tt.sqr(z) / 2.0, - tt.log1p(-tt.erfc(z / tt.sqrt(2.0)) / 2.0), + return aet.switch( + aet.lt(z, -1.0), + aet.log(aet.erfcx(-z / aet.sqrt(2.0)) / 2.0) - aet.sqr(z) / 2.0, + aet.log1p(-aet.erfc(z / aet.sqrt(2.0)) / 2.0), ) def normal_lccdf(mu, sigma, x): z = (x - mu) / sigma - return tt.switch( - tt.gt(z, 1.0), - tt.log(tt.erfcx(z / tt.sqrt(2.0)) / 2.0) - tt.sqr(z) / 2.0, - tt.log1p(-tt.erfc(-z / tt.sqrt(2.0)) / 2.0), + return aet.switch( + aet.gt(z, 1.0), + aet.log(aet.erfcx(z / aet.sqrt(2.0)) / 2.0) - aet.sqr(z) / 2.0, + aet.log1p(-aet.erfc(-z / aet.sqrt(2.0)) / 2.0), ) @@ -167,37 +168,38 @@ def log_diff_normal_cdf(mu, sigma, x, y): log (\\Phi(x) - \\Phi(y)) """ - x = (x - mu) / sigma / tt.sqrt(2.0) - y = (y - mu) / sigma / tt.sqrt(2.0) + x = (x - mu) / sigma / aet.sqrt(2.0) + y = (y - mu) / sigma / aet.sqrt(2.0) # To stabilize the computation, consider these three regions: # 1) x > y > 0 => Use erf(x) = 1 - e^{-x^2} erfcx(x) and erf(y) =1 - e^{-y^2} erfcx(y) # 2) 0 > x > y => Use erf(x) = e^{-x^2} erfcx(-x) and erf(y) = e^{-y^2} erfcx(-y) # 3) x > 0 > y => Naive formula log( (erf(x) - erf(y)) / 2 ) works fine. - return tt.log(0.5) + tt.switch( - tt.gt(y, 0), - -tt.square(y) + tt.log(tt.erfcx(y) - tt.exp(tt.square(y) - tt.square(x)) * tt.erfcx(x)), - tt.switch( - tt.lt(x, 0), # 0 > x > y - -tt.square(x) - + tt.log(tt.erfcx(-x) - tt.exp(tt.square(x) - tt.square(y)) * tt.erfcx(-y)), - tt.log(tt.erf(x) - tt.erf(y)), # x >0 > y + return aet.log(0.5) + aet.switch( + aet.gt(y, 0), + -aet.square(y) + + aet.log(aet.erfcx(y) - aet.exp(aet.square(y) - aet.square(x)) * aet.erfcx(x)), + aet.switch( + aet.lt(x, 0), # 0 > x > y + -aet.square(x) + + aet.log(aet.erfcx(-x) - aet.exp(aet.square(x) - aet.square(y)) * aet.erfcx(-y)), + aet.log(aet.erf(x) - aet.erf(y)), # x >0 > y ), ) def sigma2rho(sigma): """ - `sigma -> rho` theano converter + `sigma -> rho` aesara converter :math:`mu + sigma*e = mu + log(1+exp(rho))*e`""" - return tt.log(tt.exp(tt.abs_(sigma)) - 1.0) + return aet.log(aet.exp(aet.abs_(sigma)) - 1.0) def rho2sigma(rho): """ - `rho -> sigma` theano converter + `rho -> sigma` aesara converter :math:`mu + sigma*e = mu + log(1+exp(rho))*e`""" - return tt.nnet.softplus(rho) + return aet.nnet.softplus(rho) rho2sd = rho2sigma @@ -240,13 +242,13 @@ def log_normal(x, mean, **kwargs): if sigma is not None: std = sigma elif w is not None: - std = tt.exp(w) + std = aet.exp(w) elif rho is not None: std = rho2sigma(rho) else: std = tau ** (-1) std += f(eps) - return f(c) - tt.log(tt.abs_(std)) - (x - mean) ** 2 / (2.0 * std ** 2) + return f(c) - aet.log(aet.abs_(std)) - (x - mean) ** 2 / (2.0 * std ** 2) def MvNormalLogp(): @@ -256,34 +258,34 @@ def MvNormalLogp(): Parameters ---------- - cov: tt.matrix + cov: aet.matrix The covariance matrix. - delta: tt.matrix + delta: aet.matrix Array of deviations from the mean. 
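normal_lcdf above switches to the scaled complementary error function erfcx in the left tail so the log-CDF stays finite far below the mean. The same two-branch expression in NumPy/SciPy reproduces scipy.stats.norm.logcdf (illustrative only; the evaluation points are arbitrary):

import numpy as np
from scipy import special, stats

mu, sigma = 0.0, 1.0
x = np.array([-30.0, -8.0, -1.5, 0.0, 2.0])
z = (x - mu) / sigma

logcdf_stable = np.where(
    z < -1.0,
    np.log(special.erfcx(-z / np.sqrt(2.0)) / 2.0) - z ** 2 / 2.0,
    np.log1p(-special.erfc(z / np.sqrt(2.0)) / 2.0),
)

assert np.allclose(logcdf_stable, stats.norm.logcdf(x, loc=mu, scale=sigma))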
""" - cov = tt.matrix("cov") + cov = aet.matrix("cov") cov.tag.test_value = floatX(np.eye(3)) - delta = tt.matrix("delta") + delta = aet.matrix("delta") delta.tag.test_value = floatX(np.zeros((2, 3))) - solve_lower = tt.slinalg.Solve(A_structure="lower_triangular") - solve_upper = tt.slinalg.Solve(A_structure="upper_triangular") + solve_lower = Solve(A_structure="lower_triangular") + solve_upper = Solve(A_structure="upper_triangular") cholesky = Cholesky(lower=True, on_error="nan") n, k = delta.shape n, k = f(n), f(k) chol_cov = cholesky(cov) - diag = tt.nlinalg.diag(chol_cov) - ok = tt.all(diag > 0) + diag = aet.nlinalg.diag(chol_cov) + ok = aet.all(diag > 0) - chol_cov = tt.switch(ok, chol_cov, tt.fill(chol_cov, 1)) + chol_cov = aet.switch(ok, chol_cov, aet.fill(chol_cov, 1)) delta_trans = solve_lower(chol_cov, delta.T).T - result = n * k * tt.log(f(2) * np.pi) - result += f(2) * n * tt.sum(tt.log(diag)) + result = n * k * aet.log(f(2) * np.pi) + result += f(2) * n * aet.sum(aet.log(diag)) result += (delta_trans ** f(2)).sum() result = f(-0.5) * result - logp = tt.switch(ok, result, -np.inf) + logp = aet.switch(ok, result, -np.inf) def dlogp(inputs, gradients): (g_logp,) = gradients @@ -293,21 +295,21 @@ def dlogp(inputs, gradients): n, k = delta.shape chol_cov = cholesky(cov) - diag = tt.nlinalg.diag(chol_cov) - ok = tt.all(diag > 0) + diag = aet.nlinalg.diag(chol_cov) + ok = aet.all(diag > 0) - chol_cov = tt.switch(ok, chol_cov, tt.fill(chol_cov, 1)) + chol_cov = aet.switch(ok, chol_cov, aet.fill(chol_cov, 1)) delta_trans = solve_lower(chol_cov, delta.T).T - inner = n * tt.eye(k) - tt.dot(delta_trans.T, delta_trans) + inner = n * aet.eye(k) - aet.dot(delta_trans.T, delta_trans) g_cov = solve_upper(chol_cov.T, inner) g_cov = solve_upper(chol_cov.T, g_cov.T) tau_delta = solve_upper(chol_cov.T, delta_trans.T) g_delta = tau_delta.T - g_cov = tt.switch(ok, g_cov, -np.nan) - g_delta = tt.switch(ok, g_delta, -np.nan) + g_cov = aet.switch(ok, g_cov, -np.nan) + g_delta = aet.switch(ok, g_delta, -np.nan) return [-0.5 * g_cov * g_logp, -g_delta * g_logp] @@ -316,7 +318,7 @@ def dlogp(inputs, gradients): class SplineWrapper(Op): """ - Creates a theano operation from scipy.interpolate.UnivariateSpline + Creates a aesara operation from scipy.interpolate.UnivariateSpline """ __props__ = ("spline",) @@ -325,7 +327,7 @@ def __init__(self, spline): self.spline = spline def make_node(self, x): - x = tt.as_tensor_variable(x) + x = aet.as_tensor_variable(x) return Apply(self, [x], [x.type()]) @property @@ -363,7 +365,7 @@ def impl(self, x): i1e_scalar = I1e(upgrade_to_float_no_complex, name="i1e") -i1e = tt.Elemwise(i1e_scalar, name="Elemwise{i1e,no_inplace}") +i1e = Elemwise(i1e_scalar, name="Elemwise{i1e,no_inplace}") class I0e(UnaryScalarOp): @@ -379,11 +381,11 @@ def impl(self, x): def grad(self, inp, grads): (x,) = inp (gz,) = grads - return (gz * (i1e_scalar(x) - theano.scalar.sgn(x) * i0e_scalar(x)),) + return (gz * (i1e_scalar(x) - aesara.scalar.sgn(x) * i0e_scalar(x)),) i0e_scalar = I0e(upgrade_to_float_no_complex, name="i0e") -i0e = tt.Elemwise(i0e_scalar, name="Elemwise{i0e,no_inplace}") +i0e = Elemwise(i0e_scalar, name="Elemwise{i0e,no_inplace}") def random_choice(*args, **kwargs): @@ -437,13 +439,13 @@ def incomplete_beta_cfe(a, b, x, small): based on Cephes library by Steve Moshier (incbet.c). small: Choose element-wise which continued fraction expansion to use. 
""" - BIG = tt.constant(4.503599627370496e15, dtype="float64") - BIGINV = tt.constant(2.22044604925031308085e-16, dtype="float64") - THRESH = tt.constant(3.0 * np.MachAr().eps, dtype="float64") + BIG = aet.constant(4.503599627370496e15, dtype="float64") + BIGINV = aet.constant(2.22044604925031308085e-16, dtype="float64") + THRESH = aet.constant(3.0 * np.MachAr().eps, dtype="float64") - zero = tt.constant(0.0, dtype="float64") - one = tt.constant(1.0, dtype="float64") - two = tt.constant(2.0, dtype="float64") + zero = aet.constant(0.0, dtype="float64") + one = aet.constant(1.0, dtype="float64") + two = aet.constant(2.0, dtype="float64") r = one k1 = a @@ -452,11 +454,11 @@ def incomplete_beta_cfe(a, b, x, small): k5 = one k8 = a + two - k2 = tt.switch(small, a + b, b - one) - k6 = tt.switch(small, b - one, a + b) - k7 = tt.switch(small, k4, a + one) - k26update = tt.switch(small, one, -one) - x = tt.switch(small, x, x / (one - x)) + k2 = aet.switch(small, a + b, b - one) + k6 = aet.switch(small, b - one, a + b) + k7 = aet.switch(small, k4, a + one) + k26update = aet.switch(small, one, -one) + x = aet.switch(small, x, x / (one - x)) pkm2 = zero qkm2 = one @@ -482,7 +484,7 @@ def _step(i, pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r): qkm1 = qk old_r = r - r = tt.switch(tt.eq(qk, zero), r, pk / qk) + r = aet.switch(aet.eq(qk, zero), r, pk / qk) k1 += one k2 += k26update @@ -493,30 +495,32 @@ def _step(i, pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r): k7 += two k8 += two - big_cond = tt.gt(tt.abs_(qk) + tt.abs_(pk), BIG) - biginv_cond = tt.or_(tt.lt(tt.abs_(qk), BIGINV), tt.lt(tt.abs_(pk), BIGINV)) + big_cond = aet.gt(aet.abs_(qk) + aet.abs_(pk), BIG) + biginv_cond = aet.or_(aet.lt(aet.abs_(qk), BIGINV), aet.lt(aet.abs_(pk), BIGINV)) - pkm2 = tt.switch(big_cond, pkm2 * BIGINV, pkm2) - pkm1 = tt.switch(big_cond, pkm1 * BIGINV, pkm1) - qkm2 = tt.switch(big_cond, qkm2 * BIGINV, qkm2) - qkm1 = tt.switch(big_cond, qkm1 * BIGINV, qkm1) + pkm2 = aet.switch(big_cond, pkm2 * BIGINV, pkm2) + pkm1 = aet.switch(big_cond, pkm1 * BIGINV, pkm1) + qkm2 = aet.switch(big_cond, qkm2 * BIGINV, qkm2) + qkm1 = aet.switch(big_cond, qkm1 * BIGINV, qkm1) - pkm2 = tt.switch(biginv_cond, pkm2 * BIG, pkm2) - pkm1 = tt.switch(biginv_cond, pkm1 * BIG, pkm1) - qkm2 = tt.switch(biginv_cond, qkm2 * BIG, qkm2) - qkm1 = tt.switch(biginv_cond, qkm1 * BIG, qkm1) + pkm2 = aet.switch(biginv_cond, pkm2 * BIG, pkm2) + pkm1 = aet.switch(biginv_cond, pkm1 * BIG, pkm1) + qkm2 = aet.switch(biginv_cond, qkm2 * BIG, qkm2) + qkm1 = aet.switch(biginv_cond, qkm1 * BIG, qkm1) return ( (pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r), - until(tt.abs_(old_r - r) < (THRESH * tt.abs_(r))), + until(aet.abs_(old_r - r) < (THRESH * aet.abs_(r))), ) (pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r), _ = scan( _step, - sequences=[tt.arange(0, 300)], + sequences=[aet.arange(0, 300)], outputs_info=[ e - for e in tt.cast((pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r), "float64") + for e in aet.cast( + (pkm1, pkm2, qkm1, qkm2, k1, k2, k3, k4, k5, k6, k7, k8, r), "float64" + ) ], ) @@ -528,28 +532,28 @@ def incomplete_beta_ps(a, b, value): Use when b*x is small and value not too close to 1. 
Based on Cephes library by Steve Moshier (incbet.c) """ - one = tt.constant(1, dtype="float64") + one = aet.constant(1, dtype="float64") ai = one / a u = (one - b) * value t1 = u / (a + one) t = u threshold = np.MachAr().eps * ai - s = tt.constant(0, dtype="float64") + s = aet.constant(0, dtype="float64") def _step(i, t, s): t *= (i - b) * value / i step = t / (a + i) s += step - return ((t, s), until(tt.abs_(step) < threshold)) + return ((t, s), until(aet.abs_(step) < threshold)) (t, s), _ = scan( - _step, sequences=[tt.arange(2, 302)], outputs_info=[e for e in tt.cast((t, s), "float64")] + _step, sequences=[aet.arange(2, 302)], outputs_info=[e for e in aet.cast((t, s), "float64")] ) s = s[-1] + t1 + ai - t = gammaln(a + b) - gammaln(a) - gammaln(b) + a * tt.log(value) + tt.log(s) - return tt.exp(t) + t = gammaln(a + b) - gammaln(a) - gammaln(b) + a * aet.log(value) + aet.log(s) + return aet.exp(t) def incomplete_beta(a, b, value): @@ -557,37 +561,37 @@ def incomplete_beta(a, b, value): Power series and continued fraction expansions chosen for best numerical convergence across the board based on inputs. """ - machep = tt.constant(np.MachAr().eps, dtype="float64") - one = tt.constant(1, dtype="float64") + machep = aet.constant(np.MachAr().eps, dtype="float64") + one = aet.constant(1, dtype="float64") w = one - value ps = incomplete_beta_ps(a, b, value) - flip = tt.gt(value, (a / (a + b))) + flip = aet.gt(value, (a / (a + b))) aa, bb = a, b - a = tt.switch(flip, bb, aa) - b = tt.switch(flip, aa, bb) - xc = tt.switch(flip, value, w) - x = tt.switch(flip, w, value) + a = aet.switch(flip, bb, aa) + b = aet.switch(flip, aa, bb) + xc = aet.switch(flip, value, w) + x = aet.switch(flip, w, value) tps = incomplete_beta_ps(a, b, x) - tps = tt.switch(tt.le(tps, machep), one - machep, one - tps) + tps = aet.switch(aet.le(tps, machep), one - machep, one - tps) # Choose which continued fraction expansion for best convergence. - small = tt.lt(x * (a + b - 2.0) - (a - one), 0.0) + small = aet.lt(x * (a + b - 2.0) - (a - one), 0.0) cfe = incomplete_beta_cfe(a, b, x, small) - w = tt.switch(small, cfe, cfe / xc) + w = aet.switch(small, cfe, cfe / xc) # Direct incomplete beta accounting for flipped a, b. 
- t = tt.exp( - a * tt.log(x) + b * tt.log(xc) + gammaln(a + b) - gammaln(a) - gammaln(b) + tt.log(w / a) + t = aet.exp( + a * aet.log(x) + b * aet.log(xc) + gammaln(a + b) - gammaln(a) - gammaln(b) + aet.log(w / a) ) - t = tt.switch(flip, tt.switch(tt.le(t, machep), one - machep, one - t), t) - return tt.switch( - tt.and_(flip, tt.and_(tt.le((b * x), one), tt.le(x, 0.95))), + t = aet.switch(flip, aet.switch(aet.le(t, machep), one - machep, one - t), t) + return aet.switch( + aet.and_(flip, aet.and_(aet.le((b * x), one), aet.le(x, 0.95))), tps, - tt.switch(tt.and_(tt.le(b * value, one), tt.le(value, 0.95)), ps, t), + aet.switch(aet.and_(aet.le(b * value, one), aet.le(value, 0.95)), ps, t), ) diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index c24a9d9df6e..d0ef10b236c 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -27,12 +27,16 @@ if TYPE_CHECKING: from typing import Optional, Callable +import aesara +import aesara.graph.basic +import aesara.tensor as aet import numpy as np -import theano -import theano.graph.basic -import theano.tensor as tt -from theano import function +from aesara import function +from aesara.compile.sharedvalue import SharedVariable +from aesara.graph.basic import Constant +from aesara.tensor.type import TensorType as AesaraTensorType +from aesara.tensor.var import TensorVariable from pymc3.distributions.shape_utils import ( broadcast_dist_samples_shape, @@ -49,7 +53,7 @@ build_named_node_tree, ) from pymc3.util import get_repr_for_variable, get_var_name -from pymc3.vartypes import string_types, theano_constant +from pymc3.vartypes import string_types __all__ = [ "DensityDist", @@ -164,13 +168,13 @@ def getattr_value(self, val): if isinstance(val, string_types): val = getattr(self, val) - if isinstance(val, tt.TensorVariable): + if isinstance(val, TensorVariable): return val.tag.test_value - if isinstance(val, tt.sharedvar.SharedVariable): + if isinstance(val, SharedVariable): return val.get_value() - if isinstance(val, theano_constant): + if isinstance(val, Constant): return val.value return val @@ -264,7 +268,7 @@ def logp_sum(self, *args, **kwargs): Subclasses can use this to improve the speed of logp evaluations if only the sum of the logp values is needed. 
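incomplete_beta, assembled above from the power-series and continued-fraction helpers, targets the regularized incomplete beta function I_x(a, b); the `flip` logic relies on the reflection identity I_x(a, b) = 1 - I_{1-x}(b, a). SciPy confirms the identity numerically (illustrative only; a, b, and the grid are arbitrary):

import numpy as np
from scipy import special

a, b = 2.5, 7.0                                       # arbitrary example shape parameters
x = np.linspace(0.05, 0.95, 10)

lhs = special.betainc(a, b, x)
rhs = 1.0 - special.betainc(b, a, 1.0 - x)

assert np.allclose(lhs, rhs)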
""" - return tt.sum(self.logp(*args, **kwargs)) + return aet.sum(self.logp(*args, **kwargs)) __latex__ = _repr_latex_ @@ -272,7 +276,7 @@ def logp_sum(self, *args, **kwargs): def TensorType(dtype, shape, broadcastable=None): if broadcastable is None: broadcastable = np.atleast_1d(shape) == 1 - return tt.TensorType(str(dtype), broadcastable) + return AesaraTensorType(str(dtype), broadcastable) class NoDistribution(Distribution): @@ -311,7 +315,7 @@ def logp(self, x): ------- TensorVariable """ - return tt.zeros_like(x) + return aet.zeros_like(x) def _distr_parameters_for_repr(self): return [] @@ -322,7 +326,7 @@ class Discrete(Distribution): def __init__(self, shape=(), dtype=None, defaults=("mode",), *args, **kwargs): if dtype is None: - if theano.config.floatX == "float32": + if aesara.config.floatX == "float32": dtype = "int16" else: dtype = "int64" @@ -340,7 +344,7 @@ class Continuous(Distribution): def __init__(self, shape=(), dtype=None, defaults=("median", "mean", "mode"), *args, **kwargs): if dtype is None: - dtype = theano.config.floatX + dtype = aesara.config.floatX super().__init__(shape, dtype, defaults=defaults, *args, **kwargs) @@ -371,7 +375,7 @@ def __init__( logp: callable A callable that has the following signature ``logp(value)`` and - returns a theano tensor that represents the distribution's log + returns a aesara tensor that represents the distribution's log probability density. shape: tuple (Optional): defaults to `()` The shape of the distribution. The default value indicates a scalar. @@ -526,7 +530,7 @@ def __init__( """ if dtype is None: - dtype = theano.config.floatX + dtype = aesara.config.floatX super().__init__(shape, dtype, testval, *args, **kwargs) self.logp = logp if type(self.logp) == types.MethodType: @@ -608,7 +612,7 @@ def random(self, point=None, size=None, **kwargs): "DensityDist random method cannot " "adapt to shape changes in the distribution's " "shape, which sometimes are necessary for sampling " - "when the model uses pymc3.Data or theano shared " + "when the model uses pymc3.Data or aesara shared " "tensors, or when the DensityDist has observed " "values.\n" "This check can be disabled by passing " @@ -673,9 +677,7 @@ def __init__(self): def is_fast_drawable(var): - return isinstance( - var, (numbers.Number, np.ndarray, theano_constant, tt.sharedvar.SharedVariable) - ) + return isinstance(var, (numbers.Number, np.ndarray, Constant, SharedVariable)) def draw_values(params, point=None, size=None): @@ -690,7 +692,7 @@ def draw_values(params, point=None, size=None): c) parameter can be fixed using tag.test_value (last resort) 3) The parameter is a tensor variable/constant. Can be evaluated using - theano.function, but a variable may contain nodes which + aesara.function, but a variable may contain nodes which a) are named parameters in the point b) are RVs with a random method @@ -756,20 +758,19 @@ def draw_values(params, point=None, size=None): if (next_, size) in drawn: # If the node already has a givens value, skip it continue - elif isinstance(next_, (theano_constant, tt.sharedvar.SharedVariable)): - # If the node is a theano.tensor.TensorConstant or a - # theano.tensor.sharedvar.SharedVariable, its value will be - # available automatically in _compile_theano_function so - # we can skip it. 
Furthermore, if this node was treated as a - # TensorVariable that should be compiled by theano in - # _compile_theano_function, it would raise a `TypeError: - # ('Constants not allowed in param list', ...)` for - # TensorConstant, and a `TypeError: Cannot use a shared - # variable (...) as explicit input` for SharedVariable. - # ObservedRV and MultiObservedRV instances are ViewOPs - # of TensorConstants or SharedVariables, we must add them - # to the stack or risk evaluating deterministics with the - # wrong values (issue #3354) + elif isinstance(next_, (Constant, SharedVariable)): + # If the node is a aesara.tensor.TensorConstant or a + # SharedVariable, its value will be available automatically in + # _compile_aesara_function so we can skip it. Furthermore, if + # this node was treated as a TensorVariable that should be + # compiled by aesara in _compile_aesara_function, it would + # raise a `TypeError: ('Constants not allowed in param list', + # ...)` for TensorConstant, and a `TypeError: Cannot use a + # shared variable (...) as explicit input` for SharedVariable. + # ObservedRV and MultiObservedRV instances are ViewOPs of + # TensorConstants or SharedVariables, we must add them to the + # stack or risk evaluating deterministics with the wrong values + # (issue #3354) stack.extend( [ node @@ -791,7 +792,7 @@ def draw_values(params, point=None, size=None): value = _draw_value(next_, point=point, givens=temp_givens, size=size) givens[next_.name] = (next_, value) drawn[(next_, size)] = value - except theano.graph.fg.MissingInputError: + except aesara.graph.fg.MissingInputError: # The node failed, so we must add the node's parents to # the stack of nodes to try to draw from. We exclude the # nodes in the `params` list. @@ -834,17 +835,17 @@ def draw_values(params, point=None, size=None): value = _draw_value(param, point=point, givens=givens.values(), size=size) evaluated[param_idx] = drawn[(param, size)] = value givens[param.name] = (param, value) - except theano.graph.fg.MissingInputError: + except aesara.graph.fg.MissingInputError: missing_inputs.add(param_idx) return [evaluated[j] for j in params] # set the order back @memoize -def _compile_theano_function(param, vars, givens=None): - """Compile theano function for a given parameter and input variables. +def _compile_aesara_function(param, vars, givens=None): + """Compile aesara function for a given parameter and input variables. - This function is memoized to avoid repeating costly theano compilations + This function is memoized to avoid repeating costly aesara compilations when repeatedly drawing values, which is done when generating posterior predictive samples. @@ -852,11 +853,11 @@ def _compile_theano_function(param, vars, givens=None): ---------- param: Model variable from which to draw value vars: Children variables of `param` - givens: Variables to be replaced in the Theano graph + givens: Variables to be replaced in the Aesara graph Returns ------- - A compiled theano function that takes the values of `vars` as input + A compiled aesara function that takes the values of `vars` as input positional args """ f = function( @@ -867,32 +868,32 @@ def _compile_theano_function(param, vars, givens=None): on_unused_input="ignore", allow_input_downcast=True, ) - return vectorize_theano_function(f, inputs=vars, output=param) + return vectorize_aesara_function(f, inputs=vars, output=param) -def vectorize_theano_function(f, inputs, output): - """Takes a compiled theano function and wraps it with a vectorized version. 
- Theano compiled functions expect inputs and outputs of a fixed number of +def vectorize_aesara_function(f, inputs, output): + """Takes a compiled aesara function and wraps it with a vectorized version. + Aesara compiled functions expect inputs and outputs of a fixed number of dimensions. In our context, these usually come from deterministics which are compiled against a given RV, with its core shape. If we draw i.i.d. samples from said RV, we would not be able to compute the deterministic over the i.i.d sampled dimensions (i.e. those that are not the core - dimensions of the RV). To deal with this problem, we wrap the theano + dimensions of the RV). To deal with this problem, we wrap the aesara compiled function with numpy.vectorize, providing the correct signature for the core dimensions. The extra dimensions, will be interpreted as i.i.d. sampled axis and will be broadcast following the usual rules. Parameters ---------- - f: theano compiled function - inputs: list of theano variables used as inputs for the function - givens: theano variable which is the output of the function + f: aesara compiled function + inputs: list of aesara variables used as inputs for the function + givens: aesara variable which is the output of the function Notes ----- - If inputs is an empty list (theano function with no inputs needed), then + If inputs is an empty list (aesara function with no inputs needed), then the same `f` is returned. - Only functions that return a single theano variable's value can be + Only functions that return a single aesara variable's value can be vectorized. Returns @@ -928,27 +929,27 @@ def _draw_value(param, point=None, givens=None, size=None): Parameters ---------- - param: number, array like, theano variable or pymc3 random variable + param: number, array like, aesara variable or pymc3 random variable The value or distribution. Constants or shared variables - will be converted to an array and returned. Theano variables + will be converted to an array and returned. Aesara variables are evaluated. If `param` is a pymc3 random variables, draw a new value from it and return that, unless a value is specified in `point`. point: dict, optional A dictionary from pymc3 variable names to their values. givens: dict, optional - A dictionary from theano variables to their values. These values - are used to evaluate `param` if it is a theano variable. + A dictionary from aesara variables to their values. These values + are used to evaluate `param` if it is a aesara variable. 
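The vectorization strategy described above wraps a fixed-dimension compiled function with numpy.vectorize and an explicit core signature, so extra leading axes are treated as i.i.d. sample dimensions and broadcast by the usual rules. A minimal stand-in sketch (illustrative; core_fn merely imitates a compiled function whose core input is a length-3 vector):

import numpy as np

def core_fn(x):
    # stands in for a compiled function that expects a single length-3 core vector
    return x.sum()

vectorized = np.vectorize(core_fn, signature="(n)->()")

draws = np.random.rand(4, 5, 3)                       # two i.i.d. sample axes before the core axis
out = vectorized(draws)
assert out.shape == (4, 5)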
size: int, optional Number of samples """ if isinstance(param, (numbers.Number, np.ndarray)): return param - elif isinstance(param, theano_constant): + elif isinstance(param, Constant): return param.value - elif isinstance(param, tt.sharedvar.SharedVariable): + elif isinstance(param, SharedVariable): return param.get_value() - elif isinstance(param, (tt.TensorVariable, MultiObservedRV)): + elif isinstance(param, (TensorVariable, MultiObservedRV)): if point and hasattr(param, "model") and param.name in point: return point[param.name] elif hasattr(param, "random") and param.random is not None: @@ -971,7 +972,7 @@ def _draw_value(param, point=None, givens=None, size=None): return dist_tmp.random(point=point, size=size) except (ValueError, TypeError): # reset shape to account for shape changes - # with theano.shared inputs + # with aesara.shared inputs dist_tmp.shape = np.array([]) # We want to draw values to infer the dist_shape, # we don't want to store these drawn values to the context @@ -995,14 +996,14 @@ def _draw_value(param, point=None, givens=None, size=None): variables = values = [] # We only truly care if the ancestors of param that were given # value have the matching dshape and val.shape - param_ancestors = set(theano.graph.basic.ancestors([param], blockers=list(variables))) + param_ancestors = set(aesara.graph.basic.ancestors([param], blockers=list(variables))) inputs = [(var, val) for var, val in zip(variables, values) if var in param_ancestors] if inputs: input_vars, input_vals = list(zip(*inputs)) else: input_vars = [] input_vals = [] - func = _compile_theano_function(param, input_vars) + func = _compile_aesara_function(param, input_vars) output = func(*input_vals) return output raise ValueError("Unexpected type in draw_value: %s" % type(param)) diff --git a/pymc3/distributions/mixture.py b/pymc3/distributions/mixture.py index 756269d3306..f423f298de2 100644 --- a/pymc3/distributions/mixture.py +++ b/pymc3/distributions/mixture.py @@ -14,10 +14,11 @@ from collections.abc import Iterable +import aesara +import aesara.tensor as aet import numpy as np -import theano -import theano.tensor as tt +from pymc3.aesaraf import _conversion_map, take_along_axis from pymc3.distributions.continuous import Normal, get_tau_sigma from pymc3.distributions.dist_math import bound, random_choice from pymc3.distributions.distribution import ( @@ -34,7 +35,6 @@ to_tuple, ) from pymc3.math import logsumexp -from pymc3.theanof import _conversion_map, take_along_axis __all__ = ["Mixture", "NormalMixture", "MixtureSameFamily"] @@ -143,15 +143,15 @@ def __init__(self, w, comp_dists, *args, **kwargs): ) shape = kwargs.pop("shape", ()) - self.w = w = tt.as_tensor_variable(w) + self.w = w = aet.as_tensor_variable(w) self.comp_dists = comp_dists defaults = kwargs.pop("defaults", []) if all_discrete(comp_dists): - default_dtype = _conversion_map[theano.config.floatX] + default_dtype = _conversion_map[aesara.config.floatX] else: - default_dtype = theano.config.floatX + default_dtype = aesara.config.floatX try: self.mean = (w * self._comp_means()).sum(axis=-1) @@ -166,9 +166,9 @@ def __init__(self, w, comp_dists, *args, **kwargs): if isinstance(comp_dists, Distribution): comp_mode_logps = comp_dists.logp(comp_dists.mode) else: - comp_mode_logps = tt.stack([cd.logp(cd.mode) for cd in comp_dists]) + comp_mode_logps = aet.stack([cd.logp(cd.mode) for cd in comp_dists]) - mode_idx = tt.argmax(tt.log(w) + comp_mode_logps, axis=-1) + mode_idx = aet.argmax(aet.log(w) + comp_mode_logps, axis=-1) self.mode = 
self._comp_modes()[mode_idx] if "mode" not in defaults: @@ -253,7 +253,7 @@ def _comp_logp(self, value): val_shape = tuple(value.shape.eval()) except AttributeError: val_shape = value.shape - except theano.graph.fg.MissingInputError: + except aesara.graph.fg.MissingInputError: val_shape = None try: self_shape = tuple(self.shape) @@ -292,26 +292,30 @@ def _comp_logp(self, value): if ndim <= 1: ndim = len(comp_dists.shape) - 1 if ndim < len(comp_dists.shape): - value_ = tt.shape_padright(value, len(comp_dists.shape) - ndim) + value_ = aet.shape_padright(value, len(comp_dists.shape) - ndim) else: value_ = value return comp_dists.logp(value_) else: - return tt.squeeze( - tt.stack([comp_dist.logp(value) for comp_dist in comp_dists], axis=-1) + return aet.squeeze( + aet.stack([comp_dist.logp(value) for comp_dist in comp_dists], axis=-1) ) def _comp_means(self): try: - return tt.as_tensor_variable(self.comp_dists.mean) + return aet.as_tensor_variable(self.comp_dists.mean) except AttributeError: - return tt.squeeze(tt.stack([comp_dist.mean for comp_dist in self.comp_dists], axis=-1)) + return aet.squeeze( + aet.stack([comp_dist.mean for comp_dist in self.comp_dists], axis=-1) + ) def _comp_modes(self): try: - return tt.as_tensor_variable(self.comp_dists.mode) + return aet.as_tensor_variable(self.comp_dists.mode) except AttributeError: - return tt.squeeze(tt.stack([comp_dist.mode for comp_dist in self.comp_dists], axis=-1)) + return aet.squeeze( + aet.stack([comp_dist.mode for comp_dist in self.comp_dists], axis=-1) + ) def _comp_samples(self, point=None, size=None, comp_dist_shapes=None, broadcast_shape=None): if self.comp_is_distribution: @@ -418,7 +422,7 @@ def logp(self, value): ---------- value: numeric Value(s) for which log-probability is calculated. 
If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -427,10 +431,10 @@ def logp(self, value): w = self.w return bound( - logsumexp(tt.log(w) + self._comp_logp(value), axis=-1, keepdims=False), + logsumexp(aet.log(w) + self._comp_logp(value), axis=-1, keepdims=False), w >= 0, w <= 1, - tt.allclose(w.sum(axis=-1), 1), + aet.allclose(w.sum(axis=-1), 1), broadcast_conditions=False, ) @@ -632,8 +636,8 @@ def __init__(self, w, mu, sigma=None, tau=None, sd=None, comp_shape=(), *args, * sigma = sd _, sigma = get_tau_sigma(tau=tau, sigma=sigma) - self.mu = mu = tt.as_tensor_variable(mu) - self.sigma = self.sd = sigma = tt.as_tensor_variable(sigma) + self.mu = mu = aet.as_tensor_variable(mu) + self.sigma = self.sd = sigma = aet.as_tensor_variable(sigma) super().__init__(w, Normal.dist(mu, sigma=sigma, shape=comp_shape), *args, **kwargs) @@ -675,7 +679,7 @@ class MixtureSameFamily(Distribution): """ def __init__(self, w, comp_dists, mixture_axis=-1, *args, **kwargs): - self.w = tt.as_tensor_variable(w) + self.w = aet.as_tensor_variable(w) if not isinstance(comp_dists, Distribution): raise TypeError( "The MixtureSameFamily distribution only accepts Distribution " @@ -697,19 +701,19 @@ def __init__(self, w, comp_dists, mixture_axis=-1, *args, **kwargs): # Compute the mode so we don't always have to pass a testval defaults = kwargs.pop("defaults", []) event_shape = self.comp_dists.shape[mixture_axis + 1 :] - _w = tt.shape_padleft( - tt.shape_padright(w, len(event_shape)), + _w = aet.shape_padleft( + aet.shape_padright(w, len(event_shape)), len(self.comp_dists.shape) - w.ndim - len(event_shape), ) mode = take_along_axis( self.comp_dists.mode, - tt.argmax(_w, keepdims=True), + aet.argmax(_w, keepdims=True), axis=mixture_axis, ) self.mode = mode[(..., 0) + (slice(None),) * len(event_shape)] if not all_discrete(comp_dists): - mean = tt.as_tensor_variable(self.comp_dists.mean) + mean = aet.as_tensor_variable(self.comp_dists.mean) self.mean = (_w * mean).sum(axis=mixture_axis) if "mean" not in defaults: defaults.append("mean") @@ -725,7 +729,7 @@ def logp(self, value): ---------- value : numeric Value(s) for which log-probability is calculated. If the log probabilities for multiple - values are desired the values must be provided in a numpy array or theano tensor + values are desired the values must be provided in a numpy array or aesara tensor Returns ------- @@ -742,7 +746,7 @@ def logp(self, value): # We first have to pad the shape of w to the right with ones # so that it can broadcast with the event_shape. - w = tt.shape_padright(w, len(event_shape)) + w = aet.shape_padright(w, len(event_shape)) # Second, we have to add the mixture_axis to the value tensor # To insert the mixture axis at the correct location, we use the @@ -751,14 +755,14 @@ def logp(self, value): # than the ones present in the comp_dists. 
comp_dists_ndim = len(comp_dists.shape) - value = tt.shape_padaxis(value, axis=mixture_axis - comp_dists_ndim) + value = aet.shape_padaxis(value, axis=mixture_axis - comp_dists_ndim) comp_logp = comp_dists.logp(value) return bound( - logsumexp(tt.log(w) + comp_logp, axis=mixture_axis, keepdims=False), + logsumexp(aet.log(w) + comp_logp, axis=mixture_axis, keepdims=False), w >= 0, w <= 1, - tt.allclose(w.sum(axis=mixture_axis - comp_dists_ndim), 1), + aet.allclose(w.sum(axis=mixture_axis - comp_dists_ndim), 1), broadcast_conditions=False, ) diff --git a/pymc3/distributions/multivariate.py b/pymc3/distributions/multivariate.py index 3fcdb8dbdaf..c23b9f191b6 100755 --- a/pymc3/distributions/multivariate.py +++ b/pymc3/distributions/multivariate.py @@ -17,20 +17,27 @@ import warnings +import aesara +import aesara.tensor as aet import numpy as np import scipy -import theano -import theano.tensor as tt +from aesara.graph.basic import Apply +from aesara.graph.op import Op, get_test_value +from aesara.graph.utils import TestValueError +from aesara.tensor.nlinalg import det, eigh, matrix_inverse, trace +from aesara.tensor.slinalg import ( + Cholesky, + Solve, + solve_lower_triangular, + solve_upper_triangular, +) +from aesara.tensor.type import TensorType from scipy import linalg, stats -from theano.graph.basic import Apply -from theano.graph.op import Op, get_test_value -from theano.graph.utils import TestValueError -from theano.tensor.nlinalg import det, eigh, matrix_inverse, trace -from theano.tensor.slinalg import Cholesky import pymc3 as pm +from pymc3.aesaraf import floatX, intX from pymc3.distributions import transforms from pymc3.distributions.continuous import ChiSquared, Normal from pymc3.distributions.dist_math import bound, factln, logpow @@ -46,7 +53,6 @@ from pymc3.exceptions import ShapeError from pymc3.math import kron_diag, kron_dot, kron_solve_lower, kronecker from pymc3.model import Deterministic -from pymc3.theanof import floatX, intX __all__ = [ "MvNormal", @@ -75,8 +81,8 @@ def __init__(self, mu=None, cov=None, chol=None, tau=None, lower=True, *args, ** raise ValueError( "Incompatible parameterization. Specify exactly one of tau, cov, or chol." ) - self.mu = mu = tt.as_tensor_variable(mu) - self.solve_lower = tt.slinalg.Solve(A_structure="lower_triangular") + self.mu = mu = aet.as_tensor_variable(mu) + self.solve_lower = Solve(A_structure="lower_triangular") # Step methods and advi do not catch LinAlgErrors at the # moment. 
We work around that by using a cholesky op # that returns a nan as first entry instead of raising @@ -86,7 +92,7 @@ def __init__(self, mu=None, cov=None, chol=None, tau=None, lower=True, *args, ** if cov is not None: self.k = cov.shape[0] self._cov_type = "cov" - cov = tt.as_tensor_variable(cov) + cov = aet.as_tensor_variable(cov) if cov.ndim != 2: raise ValueError("cov must be two dimensional.") self.chol_cov = cholesky(cov) @@ -95,7 +101,7 @@ def __init__(self, mu=None, cov=None, chol=None, tau=None, lower=True, *args, ** elif tau is not None: self.k = tau.shape[0] self._cov_type = "tau" - tau = tt.as_tensor_variable(tau) + tau = aet.as_tensor_variable(tau) if tau.ndim != 2: raise ValueError("tau must be two dimensional.") self.chol_tau = cholesky(tau) @@ -106,7 +112,7 @@ def __init__(self, mu=None, cov=None, chol=None, tau=None, lower=True, *args, ** self._cov_type = "chol" if chol.ndim != 2: raise ValueError("chol must be two dimensional.") - self.chol_cov = tt.as_tensor_variable(chol) + self.chol_cov = aet.as_tensor_variable(chol) self._n = self.chol_cov.shape[-1] def _quaddist(self, value): @@ -137,16 +143,16 @@ def _quaddist(self, value): def _quaddist_chol(self, delta): chol_cov = self.chol_cov - diag = tt.nlinalg.diag(chol_cov) + diag = aet.nlinalg.diag(chol_cov) # Check if the covariance matrix is positive definite. - ok = tt.all(diag > 0) + ok = aet.all(diag > 0) # If not, replace the diagonal. We return -inf later, but # need to prevent solve_lower from throwing an exception. - chol_cov = tt.switch(ok, chol_cov, 1) + chol_cov = aet.switch(ok, chol_cov, 1) delta_trans = self.solve_lower(chol_cov, delta.T).T quaddist = (delta_trans ** 2).sum(axis=-1) - logdet = tt.sum(tt.log(diag)) + logdet = aet.sum(aet.log(diag)) return quaddist, logdet, ok def _quaddist_cov(self, delta): @@ -154,16 +160,16 @@ def _quaddist_cov(self, delta): def _quaddist_tau(self, delta): chol_tau = self.chol_tau - diag = tt.nlinalg.diag(chol_tau) + diag = aet.nlinalg.diag(chol_tau) # Check if the precision matrix is positive definite. - ok = tt.all(diag > 0) + ok = aet.all(diag > 0) # If not, replace the diagonal. We return -inf later, but # need to prevent solve_lower from throwing an exception. 
- chol_tau = tt.switch(ok, chol_tau, 1) + chol_tau = aet.switch(ok, chol_tau, 1) - delta_trans = tt.dot(delta, chol_tau) + delta_trans = aet.dot(delta, chol_tau) quaddist = (delta_trans ** 2).sum(axis=-1) - logdet = -tt.sum(tt.log(diag)) + logdet = -aet.sum(aet.log(diag)) return quaddist, logdet, ok def _cov_param_for_repr(self): @@ -235,7 +241,7 @@ class MvNormal(_QuadFormBase): chol, _, _ = pm.LKJCholeskyCov('chol_cov', n=3, eta=2, sd_dist=sd_dist, compute_corr=True) vals_raw = pm.Normal('vals_raw', mu=0, sigma=1, shape=(5, 3)) - vals = pm.Deterministic('vals', tt.dot(chol, vals_raw.T).T) + vals = pm.Deterministic('vals', aet.dot(chol, vals_raw.T).T) """ def __init__(self, mu, cov=None, tau=None, chol=None, lower=True, *args, **kwargs): @@ -362,7 +368,7 @@ def __init__( raise ValueError("Specify only one of cov and Sigma") cov = Sigma super().__init__(mu=mu, cov=cov, tau=tau, chol=chol, lower=lower, *args, **kwargs) - self.nu = nu = tt.as_tensor_variable(nu) + self.nu = nu = aet.as_tensor_variable(nu) self.mean = self.median = self.mode = self.mu = self.mu def random(self, point=None, size=None): @@ -423,7 +429,7 @@ def logp(self, value): - gammaln(self.nu / 2.0) - 0.5 * k * floatX(np.log(self.nu * np.pi)) ) - inner = -(self.nu + k) / 2.0 * tt.log1p(quaddist / self.nu) + inner = -(self.nu + k) / 2.0 * aet.log1p(quaddist / self.nu) return bound(norm + inner - logdet, ok) def _distr_parameters_for_repr(self): @@ -472,10 +478,10 @@ def __init__(self, a, transform=transforms.stick_breaking, *args, **kwargs): super().__init__(transform=transform, *args, **kwargs) - self.a = a = tt.as_tensor_variable(a) - self.mean = a / tt.sum(a) + self.a = a = aet.as_tensor_variable(a) + self.mean = a / aet.sum(a) - self.mode = tt.switch(tt.all(a > 1), (a - 1) / tt.sum(a - 1), np.nan) + self.mode = aet.switch(aet.all(a > 1), (a - 1) / aet.sum(a - 1), np.nan) def random(self, point=None, size=None): """ @@ -519,10 +525,10 @@ def logp(self, value): # only defined for sum(value) == 1 return bound( - tt.sum(logpow(value, a - 1) - gammaln(a), axis=-1) + gammaln(tt.sum(a, axis=-1)), - tt.all(value >= 0), - tt.all(value <= 1), - tt.all(a > 0), + aet.sum(logpow(value, a - 1) - gammaln(a), axis=-1) + gammaln(aet.sum(a, axis=-1)), + aet.all(value >= 0), + aet.all(value <= 1), + aet.all(a > 0), broadcast_conditions=False, ) @@ -566,21 +572,21 @@ class Multinomial(Discrete): def __init__(self, n, p, *args, **kwargs): super().__init__(*args, **kwargs) - p = p / tt.sum(p, axis=-1, keepdims=True) + p = p / aet.sum(p, axis=-1, keepdims=True) if len(self.shape) > 1: - self.n = tt.shape_padright(n) - self.p = p if p.ndim > 1 else tt.shape_padleft(p) + self.n = aet.shape_padright(n) + self.p = p if p.ndim > 1 else aet.shape_padleft(p) else: # n is a scalar, p is a 1d array - self.n = tt.as_tensor_variable(n) - self.p = tt.as_tensor_variable(p) + self.n = aet.as_tensor_variable(n) + self.p = aet.as_tensor_variable(p) self.mean = self.n * self.p - mode = tt.cast(tt.round(self.mean), "int32") - diff = self.n - tt.sum(mode, axis=-1, keepdims=True) - inc_bool_arr = tt.abs_(diff) > 0 - mode = tt.inc_subtensor(mode[inc_bool_arr.nonzero()], diff[inc_bool_arr.nonzero()]) + mode = aet.cast(aet.round(self.mean), "int32") + diff = self.n - aet.sum(mode, axis=-1, keepdims=True) + inc_bool_arr = aet.abs_(diff) > 0 + mode = aet.inc_subtensor(mode[inc_bool_arr.nonzero()], diff[inc_bool_arr.nonzero()]) self.mode = mode def _random(self, n, p, size=None, raw_size=None): @@ -663,12 +669,12 @@ def logp(self, x): p = self.p return bound( - 
factln(n) + tt.sum(-factln(x) + logpow(p, x), axis=-1, keepdims=True), - tt.all(x >= 0), - tt.all(tt.eq(tt.sum(x, axis=-1, keepdims=True), n)), - tt.all(p <= 1), - tt.all(tt.eq(tt.sum(p, axis=-1), 1)), - tt.all(tt.ge(n, 0)), + factln(n) + aet.sum(-factln(x) + logpow(p, x), axis=-1, keepdims=True), + aet.all(x >= 0), + aet.all(aet.eq(aet.sum(x, axis=-1, keepdims=True), n)), + aet.all(p <= 1), + aet.all(aet.eq(aet.sum(p, axis=-1), 1)), + aet.all(aet.ge(n, 0)), broadcast_conditions=False, ) @@ -714,22 +720,22 @@ def __init__(self, n, a, shape, *args, **kwargs): n = intX(n) a = floatX(a) if len(self.shape) > 1: - self.n = tt.shape_padright(n) - self.a = tt.as_tensor_variable(a) if a.ndim > 1 else tt.shape_padleft(a) + self.n = aet.shape_padright(n) + self.a = aet.as_tensor_variable(a) if a.ndim > 1 else aet.shape_padleft(a) else: # n is a scalar, p is a 1d array - self.n = tt.as_tensor_variable(n) - self.a = tt.as_tensor_variable(a) + self.n = aet.as_tensor_variable(n) + self.a = aet.as_tensor_variable(a) p = self.a / self.a.sum(-1, keepdims=True) self.mean = self.n * p # Mode is only an approximation. Exact computation requires a complex # iterative algorithm as described in https://doi.org/10.1016/j.spl.2009.09.013 - mode = tt.cast(tt.round(self.mean), "int32") - diff = self.n - tt.sum(mode, axis=-1, keepdims=True) - inc_bool_arr = tt.abs_(diff) > 0 - mode = tt.inc_subtensor(mode[inc_bool_arr.nonzero()], diff[inc_bool_arr.nonzero()]) + mode = aet.cast(aet.round(self.mean), "int32") + diff = self.n - aet.sum(mode, axis=-1, keepdims=True) + inc_bool_arr = aet.abs_(diff) > 0 + mode = aet.inc_subtensor(mode[inc_bool_arr.nonzero()], diff[inc_bool_arr.nonzero()]) self._defaultval = mode def _random(self, n, a, size=None): @@ -816,10 +822,10 @@ def logp(self, value): # and that each observation value_i sums to n_i. 
return bound( result, - tt.all(tt.ge(value, 0)), - tt.all(tt.gt(a, 0)), - tt.all(tt.ge(n, 0)), - tt.all(tt.eq(value.sum(axis=-1, keepdims=True), n)), + aet.all(aet.ge(value, 0)), + aet.all(aet.gt(a, 0)), + aet.all(aet.ge(n, 0)), + aet.all(aet.eq(value.sum(axis=-1, keepdims=True), n)), broadcast_conditions=False, ) @@ -847,9 +853,9 @@ class PosDefMatrix(Op): # Compulsory if itypes and otypes are not defined def make_node(self, x): - x = tt.as_tensor_variable(x) + x = aet.as_tensor_variable(x) assert x.ndim == 2 - o = tt.TensorType(dtype="int8", broadcastable=[])() + o = TensorType(dtype="int8", broadcastable=[])() return Apply(self, [x], [o]) # Python implementation: @@ -868,7 +874,7 @@ def infer_shape(self, fgraph, node, shapes): def grad(self, inp, grads): (x,) = inp - return [x.zeros_like(theano.config.floatX)] + return [x.zeros_like(aesara.config.floatX)] def __str__(self): return "MatrixIsPositiveDefinite" @@ -925,11 +931,11 @@ def __init__(self, nu, V, *args, **kwargs): "https://github.com/pymc-devs/pymc3/issues/538.", UserWarning, ) - self.nu = nu = tt.as_tensor_variable(nu) - self.p = p = tt.as_tensor_variable(V.shape[0]) - self.V = V = tt.as_tensor_variable(V) + self.nu = nu = aet.as_tensor_variable(nu) + self.p = p = aet.as_tensor_variable(V.shape[0]) + self.V = V = aet.as_tensor_variable(V) self.mean = nu * V - self.mode = tt.switch(tt.ge(nu, p + 1), (nu - p - 1) * V, np.nan) + self.mode = aet.switch(aet.ge(nu, p + 1), (nu - p - 1) * V, np.nan) def random(self, point=None, size=None): """ @@ -975,15 +981,15 @@ def logp(self, X): return bound( ( - (nu - p - 1) * tt.log(IXI) + (nu - p - 1) * aet.log(IXI) - trace(matrix_inverse(V).dot(X)) - - nu * p * tt.log(2) - - nu * tt.log(IVI) + - nu * p * aet.log(2) + - nu * aet.log(IVI) - 2 * multigammaln(nu / 2.0, p) ) / 2, matrix_pos_def(X), - tt.eq(X, X.T), + aet.eq(X, X.T), nu > (p - 1), broadcast_conditions=False, ) @@ -1053,44 +1059,44 @@ def WishartBartlett(name, S, nu, is_cholesky=False, return_cholesky=False, testv diag_testval = None tril_testval = None - c = tt.sqrt( + c = aet.sqrt( ChiSquared("%s_c" % name, nu - np.arange(2, 2 + n_diag), shape=n_diag, testval=diag_testval) ) pm._log.info("Added new variable %s_c to model diagonal of Wishart." % name) z = Normal("%s_z" % name, 0.0, 1.0, shape=n_tril, testval=tril_testval) pm._log.info("Added new variable %s_z to model off-diagonals of Wishart." 
% name) # Construct A matrix - A = tt.zeros(S.shape, dtype=np.float32) - A = tt.set_subtensor(A[diag_idx], c) - A = tt.set_subtensor(A[tril_idx], z) + A = aet.zeros(S.shape, dtype=np.float32) + A = aet.set_subtensor(A[diag_idx], c) + A = aet.set_subtensor(A[tril_idx], z) # L * A * A.T * L.T ~ Wishart(L*L.T, nu) if return_cholesky: - return Deterministic(name, tt.dot(L, A)) + return Deterministic(name, aet.dot(L, A)) else: - return Deterministic(name, tt.dot(tt.dot(tt.dot(L, A), A.T), L.T)) + return Deterministic(name, aet.dot(aet.dot(aet.dot(L, A), A.T), L.T)) def _lkj_normalizing_constant(eta, n): if eta == 1: - result = gammaln(2.0 * tt.arange(1, int((n - 1) / 2) + 1)).sum() + result = gammaln(2.0 * aet.arange(1, int((n - 1) / 2) + 1)).sum() if n % 2 == 1: result += ( - 0.25 * (n ** 2 - 1) * tt.log(np.pi) - - 0.25 * (n - 1) ** 2 * tt.log(2.0) + 0.25 * (n ** 2 - 1) * aet.log(np.pi) + - 0.25 * (n - 1) ** 2 * aet.log(2.0) - (n - 1) * gammaln(int((n + 1) / 2)) ) else: result += ( - 0.25 * n * (n - 2) * tt.log(np.pi) - + 0.25 * (3 * n ** 2 - 4 * n) * tt.log(2.0) + 0.25 * n * (n - 2) * aet.log(np.pi) + + 0.25 * (3 * n ** 2 - 4 * n) * aet.log(2.0) + n * gammaln(n / 2) - (n - 1) * gammaln(n) ) else: result = -(n - 1) * gammaln(eta + 0.5 * (n - 1)) - k = tt.arange(1, n) - result += (0.5 * k * tt.log(np.pi) + gammaln(eta + 0.5 * (n - 1 - k))).sum() + k = aet.arange(1, n) + result += (0.5 * k * aet.log(np.pi) + gammaln(eta + 0.5 * (n - 1 - k))).sum() return result @@ -1100,8 +1106,8 @@ class _LKJCholeskyCov(Continuous): """ def __init__(self, eta, n, sd_dist, *args, **kwargs): - self.n = tt.as_tensor_variable(n) - self.eta = tt.as_tensor_variable(eta) + self.n = aet.as_tensor_variable(n) + self.eta = aet.as_tensor_variable(eta) if "transform" in kwargs and kwargs["transform"] is not None: raise ValueError("Invalid parameter: transform.") @@ -1143,22 +1149,22 @@ def logp(self, x): eta = self.eta diag_idxs = self.diag_idxs - cumsum = tt.cumsum(x ** 2) - variance = tt.zeros(n) - variance = tt.inc_subtensor(variance[0], x[0] ** 2) - variance = tt.inc_subtensor(variance[1:], cumsum[diag_idxs[1:]] - cumsum[diag_idxs[:-1]]) - sd_vals = tt.sqrt(variance) + cumsum = aet.cumsum(x ** 2) + variance = aet.zeros(n) + variance = aet.inc_subtensor(variance[0], x[0] ** 2) + variance = aet.inc_subtensor(variance[1:], cumsum[diag_idxs[1:]] - cumsum[diag_idxs[:-1]]) + sd_vals = aet.sqrt(variance) logp_sd = self.sd_dist.logp(sd_vals).sum() corr_diag = x[diag_idxs] / sd_vals - logp_lkj = (2 * eta - 3 + n - tt.arange(n)) * tt.log(corr_diag) - logp_lkj = tt.sum(logp_lkj) + logp_lkj = (2 * eta - 3 + n - aet.arange(n)) * aet.log(corr_diag) + logp_lkj = aet.sum(logp_lkj) # Compute the log det jacobian of the second transformation # described in the docstring. 
- idx = tt.arange(n) - det_invjac = tt.log(corr_diag) - idx * tt.log(sd_vals) + idx = aet.arange(n) + det_invjac = aet.log(corr_diag) - idx * aet.log(sd_vals) det_invjac = det_invjac.sum() norm = _lkj_normalizing_constant(eta, n) @@ -1348,10 +1354,10 @@ def LKJCholeskyCov(name, eta, n, sd_dist, compute_corr=False, store_in_trace=Tru # Or transform an uncorrelated normal: vals_raw = pm.Normal('vals_raw', mu=0, sigma=1, shape=10) - vals = tt.dot(chol, vals_raw) + vals = aet.dot(chol, vals_raw) # Or compute the covariance matrix - cov = tt.dot(chol, chol.T) + cov = aet.dot(chol, chol.T) **Implementation** In the unconstrained space all values of the cholesky factor are stored untransformed, except for the diagonal entries, where @@ -1411,9 +1417,9 @@ def LKJCholeskyCov(name, eta, n, sd_dist, compute_corr=False, store_in_trace=Tru else: chol = pm.expand_packed_triangular(n, packed_chol, lower=True) # compute covariance matrix - cov = tt.dot(chol, chol.T) + cov = aet.dot(chol, chol.T) # extract standard deviations and rho - stds = tt.sqrt(tt.diag(cov)) + stds = aet.sqrt(aet.diag(cov)) inv_stds = 1 / stds corr = inv_stds[None, :] * cov * inv_stds[:, None] if store_in_trace: @@ -1562,14 +1568,14 @@ def logp(self, x): eta = self.eta X = x[self.tri_index] - X = tt.fill_diagonal(X, 1) + X = aet.fill_diagonal(X, 1) result = _lkj_normalizing_constant(eta, n) - result += (eta - 1.0) * tt.log(det(X)) + result += (eta - 1.0) * aet.log(det(X)) return bound( result, - tt.all(X <= 1), - tt.all(X >= -1), + aet.all(X <= 1), + aet.all(X >= -1), matrix_pos_def(X), eta > 0, broadcast_conditions=False, @@ -1662,7 +1668,7 @@ class MatrixNormal(Continuous): # Setup left covariance matrix scale = pm.Lognormal('scale', mu=np.log(true_scale), sigma=0.5) - rowcov = tt.nlinalg.diag([scale**(2*i) for i in range(m)]) + rowcov = aet.nlinalg.diag([scale**(2*i) for i in range(m)]) vals = pm.MatrixNormal('vals', mu=mu, colchol=colchol, rowcov=rowcov, observed=data, shape=(m, n)) @@ -1687,10 +1693,10 @@ def __init__( assert len(shape) == 2, "shape must have length 2: mxn" self.shape = shape super().__init__(shape=shape, *args, **kwargs) - self.mu = tt.as_tensor_variable(mu) + self.mu = aet.as_tensor_variable(mu) self.mean = self.median = self.mode = self.mu - self.solve_lower = tt.slinalg.solve_lower_triangular - self.solve_upper = tt.slinalg.solve_upper_triangular + self.solve_lower = solve_lower_triangular + self.solve_upper = solve_upper_triangular def _setup_matrices(self, colcov, colchol, coltau, rowcov, rowchol, rowtau): cholesky = Cholesky(lower=True, on_error="raise") @@ -1705,7 +1711,7 @@ def _setup_matrices(self, colcov, colchol, coltau, rowcov, rowchol, rowtau): if rowcov is not None: self.m = rowcov.shape[0] self._rowcov_type = "cov" - rowcov = tt.as_tensor_variable(rowcov) + rowcov = aet.as_tensor_variable(rowcov) if rowcov.ndim != 2: raise ValueError("rowcov must be two dimensional.") self.rowchol_cov = cholesky(rowcov) @@ -1714,7 +1720,7 @@ def _setup_matrices(self, colcov, colchol, coltau, rowcov, rowchol, rowtau): raise ValueError("rowtau not supported at this time") self.m = rowtau.shape[0] self._rowcov_type = "tau" - rowtau = tt.as_tensor_variable(rowtau) + rowtau = aet.as_tensor_variable(rowtau) if rowtau.ndim != 2: raise ValueError("rowtau must be two dimensional.") self.rowchol_tau = cholesky(rowtau) @@ -1724,7 +1730,7 @@ def _setup_matrices(self, colcov, colchol, coltau, rowcov, rowchol, rowtau): self._rowcov_type = "chol" if rowchol.ndim != 2: raise ValueError("rowchol must be two dimensional.") - 
self.rowchol_cov = tt.as_tensor_variable(rowchol) + self.rowchol_cov = aet.as_tensor_variable(rowchol) # Among-column matrices if len([i for i in [coltau, colcov, colchol] if i is not None]) != 1: @@ -1736,7 +1742,7 @@ def _setup_matrices(self, colcov, colchol, coltau, rowcov, rowchol, rowtau): if colcov is not None: self.n = colcov.shape[0] self._colcov_type = "cov" - colcov = tt.as_tensor_variable(colcov) + colcov = aet.as_tensor_variable(colcov) if colcov.ndim != 2: raise ValueError("colcov must be two dimensional.") self.colchol_cov = cholesky(colcov) @@ -1745,7 +1751,7 @@ def _setup_matrices(self, colcov, colchol, coltau, rowcov, rowchol, rowtau): raise ValueError("coltau not supported at this time") self.n = coltau.shape[0] self._colcov_type = "tau" - coltau = tt.as_tensor_variable(coltau) + coltau = aet.as_tensor_variable(coltau) if coltau.ndim != 2: raise ValueError("coltau must be two dimensional.") self.colchol_tau = cholesky(coltau) @@ -1755,7 +1761,7 @@ def _setup_matrices(self, colcov, colchol, coltau, rowcov, rowchol, rowtau): self._colcov_type = "chol" if colchol.ndim != 2: raise ValueError("colchol must be two dimensional.") - self.colchol_cov = tt.as_tensor_variable(colchol) + self.colchol_cov = aet.as_tensor_variable(colchol) def random(self, point=None, size=None): """ @@ -1802,15 +1808,15 @@ def _trquaddist(self, value): # Find exponent piece by piece right_quaddist = self.solve_lower(rowchol_cov, delta) - quaddist = tt.nlinalg.matrix_dot(right_quaddist.T, right_quaddist) + quaddist = aet.nlinalg.matrix_dot(right_quaddist.T, right_quaddist) quaddist = self.solve_lower(colchol_cov, quaddist) quaddist = self.solve_upper(colchol_cov.T, quaddist) - trquaddist = tt.nlinalg.trace(quaddist) + trquaddist = aet.nlinalg.trace(quaddist) - coldiag = tt.nlinalg.diag(colchol_cov) - rowdiag = tt.nlinalg.diag(rowchol_cov) - half_collogdet = tt.sum(tt.log(coldiag)) # logdet(M) = 2*Tr(log(L)) - half_rowlogdet = tt.sum(tt.log(rowdiag)) # Using Cholesky: M = L L^T + coldiag = aet.nlinalg.diag(colchol_cov) + rowdiag = aet.nlinalg.diag(rowchol_cov) + half_collogdet = aet.sum(aet.log(coldiag)) # logdet(M) = 2*Tr(log(L)) + half_rowlogdet = aet.sum(aet.log(rowdiag)) # Using Cholesky: M = L L^T return trquaddist, half_collogdet, half_rowlogdet def logp(self, value): @@ -1869,7 +1875,7 @@ class KroneckerNormal(Continuous): :math:`[(v_1, Q_1), (v_2, Q_2), ...]` such that :math:`K_i = Q_i \text{diag}(v_i) Q_i'`. For example:: - v_i, Q_i = tt.nlinalg.eigh(K_i) + v_i, Q_i = aet.nlinalg.eigh(K_i) sigma: scalar, variable Standard deviation of the Gaussian white noise. 
@@ -1930,7 +1936,7 @@ class KroneckerNormal(Continuous): def __init__(self, mu, covs=None, chols=None, evds=None, sigma=None, *args, **kwargs): self._setup(covs, chols, evds, sigma) super().__init__(*args, **kwargs) - self.mu = tt.as_tensor_variable(mu) + self.mu = aet.as_tensor_variable(mu) self.mean = self.median = self.mode = self.mu def _setup(self, covs, chols, evds, sigma): @@ -1952,21 +1958,21 @@ def _setup(self, covs, chols, evds, sigma): else: # Otherwise use cholesky as usual self.chols = list(map(self.cholesky, self.covs)) - self.chol_diags = list(map(tt.nlinalg.diag, self.chols)) - self.sizes = tt.as_tensor_variable([chol.shape[0] for chol in self.chols]) - self.N = tt.prod(self.sizes) + self.chol_diags = list(map(aet.nlinalg.diag, self.chols)) + self.sizes = aet.as_tensor_variable([chol.shape[0] for chol in self.chols]) + self.N = aet.prod(self.sizes) elif chols is not None: self._cov_type = "chol" if self.is_noisy: # A strange case... # Noise requires eigendecomposition - covs = [tt.dot(chol, chol.T) for chol in chols] + covs = [aet.dot(chol, chol.T) for chol in chols] eigh_map = map(eigh, covs) self._setup_evd(eigh_map) else: self.chols = chols - self.chol_diags = list(map(tt.nlinalg.diag, self.chols)) - self.sizes = tt.as_tensor_variable([chol.shape[0] for chol in self.chols]) - self.N = tt.prod(self.sizes) + self.chol_diags = list(map(aet.nlinalg.diag, self.chols)) + self.sizes = aet.as_tensor_variable([chol.shape[0] for chol in self.chols]) + self.N = aet.prod(self.sizes) else: self._cov_type = "evd" self._setup_evd(evds) @@ -1974,10 +1980,10 @@ def _setup(self, covs, chols, evds, sigma): def _setup_evd(self, eigh_iterable): self._isEVD = True eigs_sep, Qs = zip(*eigh_iterable) # Unzip - self.Qs = list(map(tt.as_tensor_variable, Qs)) - self.QTs = list(map(tt.transpose, self.Qs)) + self.Qs = list(map(aet.as_tensor_variable, Qs)) + self.QTs = list(map(aet.transpose, self.Qs)) - self.eigs_sep = list(map(tt.as_tensor_variable, eigs_sep)) + self.eigs_sep = list(map(aet.as_tensor_variable, eigs_sep)) self.eigs = kron_diag(*self.eigs_sep) # Combine separate eigs if self.is_noisy: self.eigs += self.sigma ** 2 @@ -1989,28 +1995,28 @@ def _setup_random(self): if self._cov_type == "cov": cov = kronecker(*self.covs) if self.is_noisy: - cov = cov + self.sigma ** 2 * tt.identity_like(cov) + cov = cov + self.sigma ** 2 * aet.identity_like(cov) self.mv_params["cov"] = cov elif self._cov_type == "chol": if self.is_noisy: covs = [] for eig, Q in zip(self.eigs_sep, self.Qs): - cov_i = tt.dot(Q, tt.dot(tt.diag(eig), Q.T)) + cov_i = aet.dot(Q, aet.dot(aet.diag(eig), Q.T)) covs.append(cov_i) cov = kronecker(*covs) if self.is_noisy: - cov = cov + self.sigma ** 2 * tt.identity_like(cov) + cov = cov + self.sigma ** 2 * aet.identity_like(cov) self.mv_params["chol"] = self.cholesky(cov) else: self.mv_params["chol"] = kronecker(*self.chols) elif self._cov_type == "evd": covs = [] for eig, Q in zip(self.eigs_sep, self.Qs): - cov_i = tt.dot(Q, tt.dot(tt.diag(eig), Q.T)) + cov_i = aet.dot(Q, aet.dot(aet.diag(eig), Q.T)) covs.append(cov_i) cov = kronecker(*covs) if self.is_noisy: - cov = cov + self.sigma ** 2 * tt.identity_like(cov) + cov = cov + self.sigma ** 2 * aet.identity_like(cov) self.mv_params["cov"] = cov def random(self, point=None, size=None): @@ -2050,16 +2056,16 @@ def _quaddist(self, value): delta = value - self.mu if self._isEVD: sqrt_quad = kron_dot(self.QTs, delta.T) - sqrt_quad = sqrt_quad / tt.sqrt(self.eigs[:, None]) - logdet = tt.sum(tt.log(self.eigs)) + sqrt_quad = sqrt_quad / 
aet.sqrt(self.eigs[:, None]) + logdet = aet.sum(aet.log(self.eigs)) else: sqrt_quad = kron_solve_lower(self.chols, delta.T) logdet = 0 for chol_size, chol_diag in zip(self.sizes, self.chol_diags): - logchol = tt.log(chol_diag) * self.N / chol_size - logdet += tt.sum(2 * logchol) + logchol = aet.log(chol_diag) * self.N / chol_size + logdet += aet.sum(2 * logchol) # Square each sample - quad = tt.batched_dot(sqrt_quad.T, sqrt_quad.T) + quad = aet.batched_dot(sqrt_quad.T, sqrt_quad.T) if onedim: quad = quad[0] return quad, logdet @@ -2079,7 +2085,7 @@ def logp(self, value): TensorVariable """ quad, logdet = self._quaddist(value) - return -(quad + logdet + self.N * tt.log(2 * np.pi)) / 2.0 + return -(quad + logdet + self.N * aet.log(2 * np.pi)) / 2.0 def _distr_parameters_for_repr(self): return ["mu"] diff --git a/pymc3/distributions/posterior_predictive.py b/pymc3/distributions/posterior_predictive.py index 31aa3e40f58..1125ae93577 100644 --- a/pymc3/distributions/posterior_predictive.py +++ b/pymc3/distributions/posterior_predictive.py @@ -9,18 +9,20 @@ from contextlib import AbstractContextManager from typing import TYPE_CHECKING, Any, Callable, Dict, List, cast, overload +import aesara.graph.basic +import aesara.graph.fg import numpy as np -import theano.graph.basic -import theano.graph.fg -import theano.tensor as tt +from aesara.compile.sharedvalue import SharedVariable +from aesara.graph.basic import Constant +from aesara.tensor.var import TensorVariable from arviz import InferenceData from typing_extensions import Literal, Protocol from xarray import Dataset from pymc3.backends.base import MultiTrace from pymc3.distributions.distribution import ( - _compile_theano_function, + _compile_aesara_function, _DrawValuesContext, _DrawValuesContextBlocker, is_fast_drawable, @@ -35,7 +37,6 @@ modelcontext, ) from pymc3.util import chains_and_samples, dataset_to_point_list, get_var_name -from pymc3.vartypes import theano_constant # Failing tests: # test_mixture_random_shape::test_mixture_random_shape @@ -375,13 +376,13 @@ def draw_values(self) -> list[np.ndarray]: if (next_, samples) in drawn: # If the node already has a givens value, skip it continue - elif isinstance(next_, (theano_constant, tt.sharedvar.SharedVariable)): - # If the node is a theano.tensor.TensorConstant or a - # theano.tensor.sharedvar.SharedVariable, its value will be - # available automatically in _compile_theano_function so + elif isinstance(next_, (Constant, SharedVariable)): + # If the node is an aesara.tensor.TensorConstant or an + # aesara.tensor.sharedvar.SharedVariable, its value will be + # available automatically in _compile_aesara_function so # we can skip it. Furthermore, if this node was treated as a - # TensorVariable that should be compiled by theano in - # _compile_theano_function, it would raise a `TypeError: + # TensorVariable that should be compiled by aesara in + # _compile_aesara_function, it would raise a `TypeError: # ('Constants not allowed in param list', ...)` for # TensorConstant, and a `TypeError: Cannot use a shared # variable (...) as explicit input` for SharedVariable. @@ -411,7 +412,7 @@ def draw_values(self) -> list[np.ndarray]: assert isinstance(value, np.ndarray) givens[next_.name] = (next_, value) drawn[(next_, samples)] = value - except theano.graph.fg.MissingInputError: + except aesara.graph.fg.MissingInputError: # The node failed, so we must add the node's parents to # the stack of nodes to try to draw from. We exclude the # nodes in the `params` list.
@@ -456,7 +457,7 @@ def draw_values(self) -> list[np.ndarray]: assert isinstance(value, np.ndarray) self.evaluated[param_idx] = drawn[(param, samples)] = value givens[param.name] = (param, value) - except theano.graph.fg.MissingInputError: + except aesara.graph.fg.MissingInputError: missing_inputs.add(param_idx) return [self.evaluated[j] for j in params] @@ -527,9 +528,9 @@ def draw_value(self, param, trace: _TraceDict | None = None, givens=None): Parameters ---------- - param: number, array like, theano variable or pymc3 random variable + param: number, array like, aesara variable or pymc3 random variable The value or distribution. Constants or shared variables - will be converted to an array and returned. Theano variables + will be converted to an array and returned. Aesara variables are evaluated. If `param` is a pymc3 random variable, draw values from it and return that (as ``np.ndarray``), unless a value is specified in the ``trace``. trace: _TraceDict, optional A dictionary from pymc3 variable names to samples of their values used to provide context for evaluating ``param``. givens: dict, optional - A dictionary from theano variables to their values. These values - are used to evaluate ``param`` if it is a theano variable. + A dictionary from aesara variables to their values. These values + are used to evaluate ``param`` if it is an aesara variable. """ samples = self.samples @@ -569,11 +570,11 @@ def random_sample( if isinstance(param, (numbers.Number, np.ndarray)): return param - elif isinstance(param, theano_constant): + elif isinstance(param, Constant): return param.value - elif isinstance(param, tt.sharedvar.SharedVariable): + elif isinstance(param, SharedVariable): return param.get_value() - elif isinstance(param, (tt.TensorVariable, MultiObservedRV)): + elif isinstance(param, (TensorVariable, MultiObservedRV)): if hasattr(param, "model") and trace and param.name in trace.varnames: return trace[param.name] elif hasattr(param, "random") and param.random is not None: @@ -605,7 +606,7 @@ def random_sample( ) except (ValueError, TypeError): # reset shape to account for shape changes - # with theano.shared inputs + # with aesara.shared inputs dist_tmp.shape = () # We want to draw values to infer the dist_shape, # we don't want to store these drawn values to the context @@ -651,7 +652,7 @@ def random_sample( # We only truly care if the ancestors of param that were given # value have the matching dshape and val.shape param_ancestors = set( - theano.graph.basic.ancestors([param], blockers=list(variables)) + aesara.graph.basic.ancestors([param], blockers=list(variables)) ) inputs = [ (var, val) for var, val in zip(variables, values) if var in param_ancestors ] @@ -661,7 +662,7 @@ def random_sample( else: input_vars = [] input_vals = [] - func = _compile_theano_function(param, input_vars) + func = _compile_aesara_function(param, input_vars) if not input_vars: assert input_vals == [] # AFAICT if there are now vars, there can't be vals output = func(*input_vals) @@ -685,7 +686,7 @@ def _param_shape(var_desig, model: Model) -> tuple[int, ...]: if hasattr(v, "observations"): try: # To get shape of _observed_ data container `pm.Data` - # (wrapper for theano.SharedVariable) we evaluate it. + # (wrapper for SharedVariable) we evaluate it.
shape = tuple(v.observations.shape.eval()) except AttributeError: shape = v.observations.shape diff --git a/pymc3/distributions/special.py b/pymc3/distributions/special.py index ba4662b2df1..8b218fea784 100644 --- a/pymc3/distributions/special.py +++ b/pymc3/distributions/special.py @@ -12,16 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import aesara.tensor as aet import numpy as np -import theano.tensor as tt -from theano import scalar -from theano.scalar.basic_scipy import GammaLn, Psi +from aesara import scalar +from aesara.scalar.basic_scipy import GammaLn, Psi +from aesara.tensor.elemwise import Elemwise __all__ = ["gammaln", "multigammaln", "psi", "log_i0"] scalar_gammaln = GammaLn(scalar.upgrade_to_float, name="scalar_gammaln") -gammaln = tt.Elemwise(scalar_gammaln, name="gammaln") +gammaln = Elemwise(scalar_gammaln, name="gammaln") def multigammaln(a, p): @@ -33,17 +34,17 @@ def multigammaln(a, p): p: int degrees of freedom. p > 0 """ - i = tt.arange(1, p + 1) - return p * (p - 1) * tt.log(np.pi) / 4.0 + tt.sum(gammaln(a + (1.0 - i) / 2.0), axis=0) + i = aet.arange(1, p + 1) + return p * (p - 1) * aet.log(np.pi) / 4.0 + aet.sum(gammaln(a + (1.0 - i) / 2.0), axis=0) def log_i0(x): """ Calculates the logarithm of the 0 order modified Bessel function of the first kind"" """ - return tt.switch( - tt.lt(x, 5), - tt.log1p( + return aet.switch( + aet.lt(x, 5), + aet.log1p( x ** 2.0 / 4.0 + x ** 4.0 / 64.0 + x ** 6.0 / 2304.0 @@ -52,8 +53,8 @@ def log_i0(x): + x ** 12.0 / 2123366400.0 ), x - - 0.5 * tt.log(2.0 * np.pi * x) - + tt.log1p( + - 0.5 * aet.log(2.0 * np.pi * x) + + aet.log1p( 1.0 / (8.0 * x) + 9.0 / (128.0 * x ** 2.0) + 225.0 / (3072.0 * x ** 3.0) @@ -63,4 +64,4 @@ def log_i0(x): scalar_psi = Psi(scalar.upgrade_to_float, name="scalar_psi") -psi = tt.Elemwise(scalar_psi, name="psi") +psi = Elemwise(scalar_psi, name="psi") diff --git a/pymc3/distributions/timeseries.py b/pymc3/distributions/timeseries.py index e3e1aa15bc4..ecd693df2ff 100644 --- a/pymc3/distributions/timeseries.py +++ b/pymc3/distributions/timeseries.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import aesara.tensor as aet import numpy as np -import theano.tensor as tt +from aesara import scan from scipy import stats -from theano import scan from pymc3.distributions import distribution, multivariate from pymc3.distributions.continuous import Flat, Normal, get_tau_sigma @@ -47,10 +47,10 @@ class AR1(distribution.Continuous): def __init__(self, k, tau_e, *args, **kwargs): super().__init__(*args, **kwargs) - self.k = k = tt.as_tensor_variable(k) - self.tau_e = tau_e = tt.as_tensor_variable(tau_e) + self.k = k = aet.as_tensor_variable(k) + self.tau_e = tau_e = aet.as_tensor_variable(tau_e) self.tau = tau_e * (1 - k ** 2) - self.mode = tt.as_tensor_variable(0.0) + self.mode = aet.as_tensor_variable(0.0) def logp(self, x): """ @@ -74,7 +74,7 @@ def logp(self, x): boundary = Normal.dist(0.0, tau=tau).logp innov_like = Normal.dist(k * x_im1, tau=tau_e).logp(x_i) - return boundary(x[0]) + tt.sum(innov_like) + return boundary(x[0]) + aet.sum(innov_like) class AR(distribution.Continuous): @@ -116,10 +116,10 @@ def __init__( sigma = sd tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) - self.sigma = self.sd = tt.as_tensor_variable(sigma) - self.tau = tt.as_tensor_variable(tau) + self.sigma = self.sd = aet.as_tensor_variable(sigma) + self.tau = aet.as_tensor_variable(tau) - self.mean = tt.as_tensor_variable(0.0) + self.mean = aet.as_tensor_variable(0.0) if isinstance(rho, list): p = len(rho) @@ -140,7 +140,7 @@ def __init__( self.p = p self.constant = constant - self.rho = rho = tt.as_tensor_variable(rho) + self.rho = rho = aet.as_tensor_variable(rho) self.init = init def logp(self, value): @@ -157,7 +157,7 @@ def logp(self, value): TensorVariable """ if self.constant: - x = tt.add( + x = aet.add( *[self.rho[i + 1] * value[self.p - (i + 1) : -(i + 1)] for i in range(self.p)] ) eps = value[self.p :] - self.rho[0] - x @@ -165,7 +165,7 @@ def logp(self, value): if self.p == 1: x = self.rho * value[:-1] else: - x = tt.add( + x = aet.add( *[self.rho[i] * value[self.p - (i + 1) : -(i + 1)] for i in range(self.p)] ) eps = value[self.p :] - x @@ -173,7 +173,7 @@ def logp(self, value): innov_like = Normal.dist(mu=0.0, tau=self.tau).logp(eps) init_like = self.init.logp(value[: self.p]) - return tt.sum(innov_like) + tt.sum(init_like) + return aet.sum(innov_like) + aet.sum(init_like) class GaussianRandomWalk(distribution.Continuous): @@ -181,7 +181,7 @@ class GaussianRandomWalk(distribution.Continuous): Note that this is mainly a user-friendly wrapper to enable an easier specification of GRW. You are not restricted to use only Normal innovations but can use any - distribution: just use `theano.tensor.cumsum()` to create the random walk behavior. + distribution: just use `aesara.tensor.cumsum()` to create the random walk behavior. 
Parameters ---------- @@ -209,12 +209,12 @@ def __init__(self, tau=None, init=Flat.dist(), sigma=None, mu=0.0, sd=None, *arg if sd is not None: sigma = sd tau, sigma = get_tau_sigma(tau=tau, sigma=sigma) - self.tau = tt.as_tensor_variable(tau) - sigma = tt.as_tensor_variable(sigma) + self.tau = aet.as_tensor_variable(tau) + sigma = aet.as_tensor_variable(sigma) self.sigma = self.sd = sigma - self.mu = tt.as_tensor_variable(mu) + self.mu = aet.as_tensor_variable(mu) self.init = init - self.mean = tt.as_tensor_variable(0.0) + self.mean = aet.as_tensor_variable(0.0) def _mu_and_sigma(self, mu, sigma): """Helper to get mu and sigma if they are high dimensional.""" @@ -242,7 +242,7 @@ def logp(self, x): x_i = x[1:] mu, sigma = self._mu_and_sigma(self.mu, self.sigma) innov_like = Normal.dist(mu=x_im1 + mu, sigma=sigma).logp(x_i) - return self.init.logp(x[0]) + tt.sum(innov_like) + return self.init.logp(x[0]) + aet.sum(innov_like) return self.init.logp(x) def random(self, point=None, size=None): @@ -323,17 +323,17 @@ class GARCH11(distribution.Continuous): def __init__(self, omega, alpha_1, beta_1, initial_vol, *args, **kwargs): super().__init__(*args, **kwargs) - self.omega = omega = tt.as_tensor_variable(omega) - self.alpha_1 = alpha_1 = tt.as_tensor_variable(alpha_1) - self.beta_1 = beta_1 = tt.as_tensor_variable(beta_1) - self.initial_vol = tt.as_tensor_variable(initial_vol) - self.mean = tt.as_tensor_variable(0.0) + self.omega = omega = aet.as_tensor_variable(omega) + self.alpha_1 = alpha_1 = aet.as_tensor_variable(alpha_1) + self.beta_1 = beta_1 = aet.as_tensor_variable(beta_1) + self.initial_vol = aet.as_tensor_variable(initial_vol) + self.mean = aet.as_tensor_variable(0.0) def get_volatility(self, x): x = x[:-1] def volatility_update(x, vol, w, a, b): - return tt.sqrt(w + a * tt.square(x) + b * tt.square(vol)) + return aet.sqrt(w + a * aet.square(x) + b * aet.square(vol)) vol, _ = scan( fn=volatility_update, @@ -341,7 +341,7 @@ def volatility_update(x, vol, w, a, b): outputs_info=[self.initial_vol], non_sequences=[self.omega, self.alpha_1, self.beta_1], ) - return tt.concatenate([[self.initial_vol], vol]) + return aet.concatenate([[self.initial_vol], vol]) def logp(self, x): """ @@ -357,7 +357,7 @@ def logp(self, x): TensorVariable """ vol = self.get_volatility(x) - return tt.sum(Normal.dist(0.0, sigma=vol).logp(x)) + return aet.sum(Normal.dist(0.0, sigma=vol).logp(x)) def _distr_parameters_for_repr(self): return ["omega", "alpha_1", "beta_1"] @@ -379,7 +379,7 @@ class EulerMaruyama(distribution.Continuous): def __init__(self, dt, sde_fn, sde_pars, *args, **kwds): super().__init__(*args, **kwds) - self.dt = dt = tt.as_tensor_variable(dt) + self.dt = dt = aet.as_tensor_variable(dt) self.sde_fn = sde_fn self.sde_pars = sde_pars @@ -399,8 +399,8 @@ def logp(self, x): xt = x[:-1] f, g = self.sde_fn(x[:-1], *self.sde_pars) mu = xt + self.dt * f - sd = tt.sqrt(self.dt) * g - return tt.sum(Normal.dist(mu=mu, sigma=sd).logp(x[1:])) + sd = aet.sqrt(self.dt) * g + return aet.sum(Normal.dist(mu=mu, sigma=sd).logp(x[1:])) def _distr_parameters_for_repr(self): return ["dt"] @@ -437,7 +437,7 @@ def __init__( self.init = init self.innovArgs = (mu, cov, tau, chol, lower) self.innov = multivariate.MvNormal.dist(*self.innovArgs, shape=self.shape) - self.mean = tt.as_tensor_variable(0.0) + self.mean = aet.as_tensor_variable(0.0) def logp(self, x): """ @@ -551,7 +551,7 @@ class MvStudentTRandomWalk(MvGaussianRandomWalk): def __init__(self, nu, *args, **kwargs): super().__init__(*args, **kwargs) - self.nu = 
tt.as_tensor_variable(nu) + self.nu = aet.as_tensor_variable(nu) self.innov = multivariate.MvStudentT.dist(self.nu, None, *self.innovArgs) def _distr_parameters_for_repr(self): diff --git a/pymc3/distributions/transforms.py b/pymc3/distributions/transforms.py index 880301182ce..b17e7b27f46 100644 --- a/pymc3/distributions/transforms.py +++ b/pymc3/distributions/transforms.py @@ -14,16 +14,18 @@ import warnings +import aesara.tensor as aet import numpy as np -import theano.tensor as tt +from aesara.tensor.subtensor import advanced_set_subtensor1 +from aesara.tensor.type import TensorType from scipy.special import logit as nplogit +from pymc3.aesaraf import floatX, gradient from pymc3.distributions import distribution from pymc3.distributions.distribution import draw_values from pymc3.math import invlogit, logit, logsumexp from pymc3.model import FreeRV -from pymc3.theanof import floatX, gradient __all__ = [ "Transform", @@ -131,8 +133,8 @@ def __str__(self): class ElemwiseTransform(Transform): def jacobian_det(self, x): - grad = tt.reshape(gradient(tt.sum(self.backward(x)), [x]), x.shape) - return tt.log(tt.abs_(grad)) + grad = aet.reshape(gradient(aet.sum(self.backward(x)), [x]), x.shape) + return aet.log(aet.abs_(grad)) class TransformedDistribution(distribution.Distribution): @@ -159,7 +161,7 @@ def __init__(self, dist, transform, *args, **kwargs): if transform.name == "stickbreaking": b = np.hstack(((np.atleast_1d(self.shape) == 1)[:-1], False)) # force the last dim not broadcastable - self.type = tt.TensorType(v.dtype, b) + self.type = TensorType(v.dtype, b) def logp(self, x): """ @@ -212,10 +214,10 @@ class Log(ElemwiseTransform): name = "log" def backward(self, x): - return tt.exp(x) + return aet.exp(x) def forward(self, x): - return tt.log(x) + return aet.log(x) def forward_val(self, x, point=None): return np.log(x) @@ -231,7 +233,7 @@ class LogExpM1(ElemwiseTransform): name = "log_exp_m1" def backward(self, x): - return tt.nnet.softplus(x) + return aet.nnet.softplus(x) def forward(self, x): """Inverse operation of softplus. 
@@ -239,13 +241,13 @@ def forward(self, x): y = Log(Exp(x) - 1) = Log(1 - Exp(-x)) + x """ - return tt.log(1.0 - tt.exp(-x)) + x + return aet.log(1.0 - aet.exp(-x)) + x def forward_val(self, x, point=None): return np.log(1.0 - np.exp(-x)) + x def jacobian_det(self, x): - return -tt.nnet.softplus(-x) + return -aet.nnet.softplus(-x) log_exp_m1 = LogExpM1() @@ -273,18 +275,18 @@ class Interval(ElemwiseTransform): name = "interval" def __init__(self, a, b): - self.a = tt.as_tensor_variable(a) - self.b = tt.as_tensor_variable(b) + self.a = aet.as_tensor_variable(a) + self.b = aet.as_tensor_variable(b) def backward(self, x): a, b = self.a, self.b - sigmoid_x = tt.nnet.sigmoid(x) + sigmoid_x = aet.nnet.sigmoid(x) r = sigmoid_x * b + (1 - sigmoid_x) * a return r def forward(self, x): a, b = self.a, self.b - return tt.log(x - a) - tt.log(b - x) + return aet.log(x - a) - aet.log(b - x) def forward_val(self, x, point=None): # 2017-06-19 @@ -294,8 +296,8 @@ def forward_val(self, x, point=None): return floatX(np.log(x - a) - np.log(b - x)) def jacobian_det(self, x): - s = tt.nnet.softplus(-x) - return tt.log(self.b - self.a) - 2 * s - x + s = aet.nnet.softplus(-x) + return aet.log(self.b - self.a) - 2 * s - x interval = Interval @@ -307,16 +309,16 @@ class LowerBound(ElemwiseTransform): name = "lowerbound" def __init__(self, a): - self.a = tt.as_tensor_variable(a) + self.a = aet.as_tensor_variable(a) def backward(self, x): a = self.a - r = tt.exp(x) + a + r = aet.exp(x) + a return r def forward(self, x): a = self.a - return tt.log(x - a) + return aet.log(x - a) def forward_val(self, x, point=None): # 2017-06-19 @@ -342,16 +344,16 @@ class UpperBound(ElemwiseTransform): name = "upperbound" def __init__(self, b): - self.b = tt.as_tensor_variable(b) + self.b = aet.as_tensor_variable(b) def backward(self, x): b = self.b - r = b - tt.exp(x) + r = b - aet.exp(x) return r def forward(self, x): b = self.b - return tt.log(b - x) + return aet.log(b - x) def forward_val(self, x, point=None): # 2017-06-19 @@ -375,15 +377,15 @@ class Ordered(Transform): name = "ordered" def backward(self, y): - x = tt.zeros(y.shape) - x = tt.inc_subtensor(x[..., 0], y[..., 0]) - x = tt.inc_subtensor(x[..., 1:], tt.exp(y[..., 1:])) - return tt.cumsum(x, axis=-1) + x = aet.zeros(y.shape) + x = aet.inc_subtensor(x[..., 0], y[..., 0]) + x = aet.inc_subtensor(x[..., 1:], aet.exp(y[..., 1:])) + return aet.cumsum(x, axis=-1) def forward(self, x): - y = tt.zeros(x.shape) - y = tt.inc_subtensor(y[..., 0], x[..., 0]) - y = tt.inc_subtensor(y[..., 1:], tt.log(x[..., 1:] - x[..., :-1])) + y = aet.zeros(x.shape) + y = aet.inc_subtensor(y[..., 0], x[..., 0]) + y = aet.inc_subtensor(y[..., 1:], aet.log(x[..., 1:] - x[..., :-1])) return y def forward_val(self, x, point=None): @@ -393,7 +395,7 @@ def forward_val(self, x, point=None): return y def jacobian_det(self, y): - return tt.sum(y[..., 1:], axis=-1) + return aet.sum(y[..., 1:], axis=-1) ordered = Ordered() @@ -412,8 +414,8 @@ class SumTo1(Transform): name = "sumto1" def backward(self, y): - remaining = 1 - tt.sum(y[..., :], axis=-1, keepdims=True) - return tt.concatenate([y[..., :], remaining], axis=-1) + remaining = 1 - aet.sum(y[..., :], axis=-1, keepdims=True) + return aet.concatenate([y[..., :], remaining], axis=-1) def forward(self, x): return x[..., :-1] @@ -422,8 +424,8 @@ def forward_val(self, x, point=None): return x[..., :-1] def jacobian_det(self, x): - y = tt.zeros(x.shape) - return tt.sum(y, axis=-1) + y = aet.zeros(x.shape) + return aet.sum(y, axis=-1) sum_to_1 = SumTo1() @@ 
-450,8 +452,8 @@ def __init__(self, eps=None): def forward(self, x_): x = x_.T n = x.shape[0] - lx = tt.log(x) - shift = tt.sum(lx, 0, keepdims=True) / n + lx = aet.log(x) + shift = aet.sum(lx, 0, keepdims=True) / n y = lx[:-1] - shift return floatX(y.T) @@ -465,20 +467,20 @@ def forward_val(self, x_, point=None): def backward(self, y_): y = y_.T - y = tt.concatenate([y, -tt.sum(y, 0, keepdims=True)]) + y = aet.concatenate([y, -aet.sum(y, 0, keepdims=True)]) # "softmax" with vector support and no deprication warning: - e_y = tt.exp(y - tt.max(y, 0, keepdims=True)) - x = e_y / tt.sum(e_y, 0, keepdims=True) + e_y = aet.exp(y - aet.max(y, 0, keepdims=True)) + x = e_y / aet.sum(e_y, 0, keepdims=True) return floatX(x.T) def jacobian_det(self, y_): y = y_.T Km1 = y.shape[0] + 1 - sy = tt.sum(y, 0, keepdims=True) - r = tt.concatenate([y + sy, tt.zeros(sy.shape)]) + sy = aet.sum(y, 0, keepdims=True) + r = aet.concatenate([y + sy, aet.zeros(sy.shape)]) sr = logsumexp(r, 0, keepdims=True) - d = tt.log(Km1) + (Km1 * sy) - (Km1 * sr) - return tt.sum(d, 0).T + d = aet.log(Km1) + (Km1 * sy) - (Km1 * sr) + return aet.sum(d, 0).T stick_breaking = StickBreaking() @@ -490,16 +492,16 @@ class Circular(ElemwiseTransform): name = "circular" def backward(self, y): - return tt.arctan2(tt.sin(y), tt.cos(y)) + return aet.arctan2(aet.sin(y), aet.cos(y)) def forward(self, x): - return tt.as_tensor_variable(x) + return aet.as_tensor_variable(x) def forward_val(self, x, point=None): return x def jacobian_det(self, x): - return tt.zeros(x.shape) + return aet.zeros(x.shape) circular = Circular() @@ -512,17 +514,17 @@ def __init__(self, n): self.diag_idxs = np.arange(1, n + 1).cumsum() - 1 def backward(self, x): - return tt.advanced_set_subtensor1(x, tt.exp(x[self.diag_idxs]), self.diag_idxs) + return advanced_set_subtensor1(x, aet.exp(x[self.diag_idxs]), self.diag_idxs) def forward(self, y): - return tt.advanced_set_subtensor1(y, tt.log(y[self.diag_idxs]), self.diag_idxs) + return advanced_set_subtensor1(y, aet.log(y[self.diag_idxs]), self.diag_idxs) def forward_val(self, y, point=None): y[..., self.diag_idxs] = np.log(y[..., self.diag_idxs]) return y def jacobian_det(self, y): - return tt.sum(y[self.diag_idxs]) + return aet.sum(y[self.diag_idxs]) class Chain(Transform): @@ -549,7 +551,7 @@ def backward(self, y): return x def jacobian_det(self, y): - y = tt.as_tensor_variable(y) + y = aet.as_tensor_variable(y) det_list = [] ndim0 = y.ndim for transf in reversed(self.transform_list): diff --git a/pymc3/glm/families.py b/pymc3/glm/families.py index 23ca136cf85..57232e28d1a 100644 --- a/pymc3/glm/families.py +++ b/pymc3/glm/families.py @@ -16,8 +16,8 @@ from copy import copy +import aesara.tensor as aet import numpy as np -import theano.tensor as tt from pymc3 import distributions as pm_dists from pymc3.model import modelcontext @@ -36,9 +36,9 @@ def __call__(self, x): identity = Identity() -logit = tt.nnet.sigmoid -inverse = tt.inv -exp = tt.exp +logit = aet.nnet.sigmoid +inverse = aet.inv +exp = aet.exp class Family: @@ -80,7 +80,7 @@ def create_likelihood(self, name, y_est, y_data, model=None): Parameters ---------- - y_est: theano.tensor + y_est: aesara.tensor Estimate of dependent variable y_data: array Observed dependent variable diff --git a/pymc3/glm/linear.py b/pymc3/glm/linear.py index 81c916c1185..9ec2a2b7313 100644 --- a/pymc3/glm/linear.py +++ b/pymc3/glm/linear.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import aesara.tensor as aet import numpy as np -import theano.tensor as tt from pymc3.distributions import Flat, Normal from pymc3.glm import families @@ -39,7 +39,7 @@ class LinearComponent(Model): use `Regressor` key for defining default prior for all regressors defaults to Normal.dist(mu=0, tau=1.0E-6) vars: dict - random variables instead of creating new ones - offset: scalar, or numpy/theano array with the same shape as y + offset: scalar, or numpy/aesara array with the same shape as y this can be used to specify an a priori known component to be included in the linear predictor during fitting. """ @@ -73,7 +73,7 @@ def __init__( x, labels = any_to_tensor_and_labels(x, labels) # now we have x, shape and labels if intercept: - x = tt.concatenate([tt.ones((x.shape[0], 1), x.dtype), x], axis=1) + x = aet.concatenate([aet.ones((x.shape[0], 1), x.dtype), x], axis=1) labels = ["Intercept"] + labels coeffs = list() for name in labels: @@ -94,7 +94,7 @@ def __init__( ), ) coeffs.append(v) - self.coeffs = tt.stack(coeffs, axis=0) + self.coeffs = aet.stack(coeffs, axis=0) self.y_est = x.dot(self.coeffs) + offset @classmethod @@ -149,7 +149,7 @@ class GLM(LinearComponent): init: dict - test_vals for coefficients vars: dict - random variables instead of creating new ones family: pymc3..families object - offset: scalar, or numpy/theano array with the same shape as y + offset: scalar, or numpy/aesara array with the same shape as y this can be used to specify an a priori known component to be included in the linear predictor during fitting. """ diff --git a/pymc3/glm/utils.py b/pymc3/glm/utils.py index 889284b3179..64318925818 100644 --- a/pymc3/glm/utils.py +++ b/pymc3/glm/utils.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
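Reviewer aside, example only: the GLM/LinearComponent hunks above keep the documented behaviour of ``offset`` ("scalar, or numpy/aesara array with the same shape as y"); only the tensor namespace changes. A rough usage sketch, assuming the pymc3 3.x GLM signature, with made-up data:

    import numpy as np
    import pymc3 as pm

    x = np.random.randn(100, 2)
    exposure = np.log(np.full(100, 2.0))           # a priori known component
    y = x @ np.array([1.0, -0.5]) + exposure + 0.1 * np.random.randn(100)

    with pm.Model():
        pm.GLM(x, y, labels=["x1", "x2"], offset=exposure)
        # trace = pm.sample()  # sampling omitted here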
+import aesara.tensor as aet import numpy as np import pandas as pd -import theano.tensor as tt + +from aesara.graph.basic import Variable def any_to_tensor_and_labels(x, labels=None): @@ -33,7 +35,7 @@ def any_to_tensor_and_labels(x, labels=None): Parameters ---------- - x: np.ndarray | pd.DataFrame | tt.Variable | dict | list + x: np.ndarray | pd.DataFrame | Variable | dict | list labels: list - names for columns of output tensor Returns @@ -76,13 +78,13 @@ def any_to_tensor_and_labels(x, labels=None): for k, v in x.items(): res.append(v) labels.append(k) - x = tt.stack(res, axis=1) + x = aet.stack(res, axis=1) if x.ndim == 1: x = x[:, None] # case when it can appear to be some # array like value like lists of lists # numpy deals with it - elif not isinstance(x, tt.Variable): + elif not isinstance(x, Variable): x = np.asarray(x) if x.ndim == 0: raise ValueError("Cannot use scalars") @@ -92,7 +94,7 @@ def any_to_tensor_and_labels(x, labels=None): # but user passes labels trusting seems # to be a good option elif labels is not None: - x = tt.as_tensor_variable(x) + x = aet.as_tensor_variable(x) if x.ndim == 0: raise ValueError("Cannot use scalars") elif x.ndim == 1: @@ -100,15 +102,15 @@ def any_to_tensor_and_labels(x, labels=None): else: # trust input pass # we should check that we can extract labels - if labels is None and not isinstance(x, tt.Variable): + if labels is None and not isinstance(x, Variable): labels = ["x%d" % i for i in range(x.shape[1])] - # for theano variables we should have labels from user + # for aesara variables we should have labels from user elif labels is None: raise ValueError("Please provide labels as " "we cannot infer shape of input") else: # trust labels, user knows what he is doing pass # it's time to check shapes if we can - if not isinstance(x, tt.Variable): + if not isinstance(x, Variable): if not len(labels) == x.shape[1]: raise ValueError( "Please provide full list " @@ -126,8 +128,8 @@ def any_to_tensor_and_labels(x, labels=None): elif not isinstance(labels, list): labels = list(labels) # as output we need tensor - if not isinstance(x, tt.Variable): - x = tt.as_tensor_variable(x) + if not isinstance(x, Variable): + x = aet.as_tensor_variable(x) # finally check dimensions if x.ndim == 0: raise ValueError("Cannot use scalars") diff --git a/pymc3/gp/cov.py b/pymc3/gp/cov.py index 7a01a9eec51..4a02827a5d5 100644 --- a/pymc3/gp/cov.py +++ b/pymc3/gp/cov.py @@ -18,9 +18,12 @@ from numbers import Number from operator import add, mul +import aesara +import aesara.tensor as aet import numpy as np -import theano -import theano.tensor as tt + +from aesara.tensor.sharedvar import TensorSharedVariable +from aesara.tensor.var import TensorConstant, TensorVariable __all__ = [ "Constant", @@ -96,9 +99,9 @@ def _slice(self, X, Xs): " the number of columns to use. 
Ignore otherwise.", UserWarning, ) - X = tt.as_tensor_variable(X[:, self.active_dims]) + X = aet.as_tensor_variable(X[:, self.active_dims]) if Xs is not None: - Xs = tt.as_tensor_variable(Xs[:, self.active_dims]) + Xs = aet.as_tensor_variable(Xs[:, self.active_dims]) return X, Xs def __add__(self, other): @@ -115,10 +118,10 @@ def __rmul__(self, other): def __pow__(self, other): if ( - isinstance(other, theano.compile.SharedVariable) + isinstance(other, aesara.compile.SharedVariable) and other.get_value().squeeze().shape == () ): - other = tt.squeeze(other) + other = aet.squeeze(other) return Exponentiated(self, other) elif isinstance(other, Number): return Exponentiated(self, other) @@ -179,13 +182,13 @@ def merge_factors(self, X, Xs=None, diag=False): elif isinstance( factor, ( - tt.TensorConstant, - tt.TensorVariable, - tt.sharedvar.TensorSharedVariable, + TensorConstant, + TensorVariable, + TensorSharedVariable, ), ): if factor.ndim == 2 and diag: - factor_list.append(tt.diag(factor)) + factor_list.append(aet.diag(factor)) else: factor_list.append(factor) else: @@ -264,13 +267,13 @@ def __init__(self, c): self.c = c def diag(self, X): - return tt.alloc(self.c, X.shape[0]) + return aet.alloc(self.c, X.shape[0]) def full(self, X, Xs=None): if Xs is None: - return tt.alloc(self.c, X.shape[0], X.shape[0]) + return aet.alloc(self.c, X.shape[0], X.shape[0]) else: - return tt.alloc(self.c, X.shape[0], Xs.shape[0]) + return aet.alloc(self.c, X.shape[0], Xs.shape[0]) class WhiteNoise(Covariance): @@ -287,13 +290,13 @@ def __init__(self, sigma): self.sigma = sigma def diag(self, X): - return tt.alloc(tt.square(self.sigma), X.shape[0]) + return aet.alloc(aet.square(self.sigma), X.shape[0]) def full(self, X, Xs=None): if Xs is None: - return tt.diag(self.diag(X)) + return aet.diag(self.diag(X)) else: - return tt.alloc(0.0, X.shape[0], Xs.shape[0]) + return aet.alloc(0.0, X.shape[0], Xs.shape[0]) class Circular(Covariance): @@ -330,25 +333,25 @@ class Circular(Covariance): def __init__(self, input_dim, period, tau=4, active_dims=None): super().__init__(input_dim, active_dims) - self.c = tt.as_tensor_variable(period / 2) + self.c = aet.as_tensor_variable(period / 2) self.tau = tau def dist(self, X, Xs): if Xs is None: - Xs = tt.transpose(X) + Xs = aet.transpose(X) else: - Xs = tt.transpose(Xs) - return tt.abs_((X - Xs + self.c) % (self.c * 2) - self.c) + Xs = aet.transpose(Xs) + return aet.abs_((X - Xs + self.c) % (self.c * 2) - self.c) def weinland(self, t): - return (1 + self.tau * t / self.c) * tt.clip(1 - t / self.c, 0, np.inf) ** self.tau + return (1 + self.tau * t / self.c) * aet.clip(1 - t / self.c, 0, np.inf) ** self.tau def full(self, X, Xs=None): X, Xs = self._slice(X, Xs) return self.weinland(self.dist(X, Xs)) def diag(self, X): - return tt.alloc(1.0, X.shape[0]) + return aet.alloc(1.0, X.shape[0]) class Stationary(Covariance): @@ -371,29 +374,29 @@ def __init__(self, input_dim, ls=None, ls_inv=None, active_dims=None): ls = 1.0 / np.asarray(ls_inv) else: ls = 1.0 / ls_inv - self.ls = tt.as_tensor_variable(ls) + self.ls = aet.as_tensor_variable(ls) def square_dist(self, X, Xs): - X = tt.mul(X, 1.0 / self.ls) - X2 = tt.sum(tt.square(X), 1) + X = aet.mul(X, 1.0 / self.ls) + X2 = aet.sum(aet.square(X), 1) if Xs is None: - sqd = -2.0 * tt.dot(X, tt.transpose(X)) + ( - tt.reshape(X2, (-1, 1)) + tt.reshape(X2, (1, -1)) + sqd = -2.0 * aet.dot(X, aet.transpose(X)) + ( + aet.reshape(X2, (-1, 1)) + aet.reshape(X2, (1, -1)) ) else: - Xs = tt.mul(Xs, 1.0 / self.ls) - Xs2 = tt.sum(tt.square(Xs), 1) - 
sqd = -2.0 * tt.dot(X, tt.transpose(Xs)) + ( - tt.reshape(X2, (-1, 1)) + tt.reshape(Xs2, (1, -1)) + Xs = aet.mul(Xs, 1.0 / self.ls) + Xs2 = aet.sum(aet.square(Xs), 1) + sqd = -2.0 * aet.dot(X, aet.transpose(Xs)) + ( + aet.reshape(X2, (-1, 1)) + aet.reshape(Xs2, (1, -1)) ) - return tt.clip(sqd, 0.0, np.inf) + return aet.clip(sqd, 0.0, np.inf) def euclidean_dist(self, X, Xs): r2 = self.square_dist(X, Xs) - return tt.sqrt(r2 + 1e-12) + return aet.sqrt(r2 + 1e-12) def diag(self, X): - return tt.alloc(1.0, X.shape[0]) + return aet.alloc(1.0, X.shape[0]) def full(self, X, Xs=None): raise NotImplementedError @@ -429,8 +432,8 @@ def full(self, X, Xs=None): f1 = X.dimshuffle(0, "x", 1) f2 = Xs.dimshuffle("x", 0, 1) r = np.pi * (f1 - f2) / self.period - r = tt.sum(tt.square(tt.sin(r) / self.ls), 2) - return tt.exp(-0.5 * r) + r = aet.sum(aet.square(aet.sin(r) / self.ls), 2) + return aet.exp(-0.5 * r) class ExpQuad(Stationary): @@ -445,7 +448,7 @@ class ExpQuad(Stationary): def full(self, X, Xs=None): X, Xs = self._slice(X, Xs) - return tt.exp(-0.5 * self.square_dist(X, Xs)) + return aet.exp(-0.5 * self.square_dist(X, Xs)) class RatQuad(Stationary): @@ -463,7 +466,7 @@ def __init__(self, input_dim, alpha, ls=None, ls_inv=None, active_dims=None): def full(self, X, Xs=None): X, Xs = self._slice(X, Xs) - return tt.power( + return aet.power( (1.0 + 0.5 * self.square_dist(X, Xs) * (1.0 / self.alpha)), -1.0 * self.alpha, ) @@ -483,7 +486,9 @@ class Matern52(Stationary): def full(self, X, Xs=None): X, Xs = self._slice(X, Xs) r = self.euclidean_dist(X, Xs) - return (1.0 + np.sqrt(5.0) * r + 5.0 / 3.0 * tt.square(r)) * tt.exp(-1.0 * np.sqrt(5.0) * r) + return (1.0 + np.sqrt(5.0) * r + 5.0 / 3.0 * aet.square(r)) * aet.exp( + -1.0 * np.sqrt(5.0) * r + ) class Matern32(Stationary): @@ -499,7 +504,7 @@ class Matern32(Stationary): def full(self, X, Xs=None): X, Xs = self._slice(X, Xs) r = self.euclidean_dist(X, Xs) - return (1.0 + np.sqrt(3.0) * r) * tt.exp(-np.sqrt(3.0) * r) + return (1.0 + np.sqrt(3.0) * r) * aet.exp(-np.sqrt(3.0) * r) class Matern12(Stationary): @@ -512,7 +517,7 @@ class Matern12(Stationary): def full(self, X, Xs=None): X, Xs = self._slice(X, Xs) r = self.euclidean_dist(X, Xs) - return tt.exp(-r) + return aet.exp(-r) class Exponential(Stationary): @@ -526,7 +531,7 @@ class Exponential(Stationary): def full(self, X, Xs=None): X, Xs = self._slice(X, Xs) - return tt.exp(-0.5 * self.euclidean_dist(X, Xs)) + return aet.exp(-0.5 * self.euclidean_dist(X, Xs)) class Cosine(Stationary): @@ -539,7 +544,7 @@ class Cosine(Stationary): def full(self, X, Xs=None): X, Xs = self._slice(X, Xs) - return tt.cos(2.0 * np.pi * self.euclidean_dist(X, Xs)) + return aet.cos(2.0 * np.pi * self.euclidean_dist(X, Xs)) class Linear(Covariance): @@ -556,20 +561,20 @@ def __init__(self, input_dim, c, active_dims=None): def _common(self, X, Xs=None): X, Xs = self._slice(X, Xs) - Xc = tt.sub(X, self.c) + Xc = aet.sub(X, self.c) return X, Xc, Xs def full(self, X, Xs=None): X, Xc, Xs = self._common(X, Xs) if Xs is None: - return tt.dot(Xc, tt.transpose(Xc)) + return aet.dot(Xc, aet.transpose(Xc)) else: - Xsc = tt.sub(Xs, self.c) - return tt.dot(Xc, tt.transpose(Xsc)) + Xsc = aet.sub(Xs, self.c) + return aet.dot(Xc, aet.transpose(Xsc)) def diag(self, X): X, Xc, _ = self._common(X, None) - return tt.sum(tt.square(Xc), 1) + return aet.sum(aet.square(Xc), 1) class Polynomial(Linear): @@ -587,17 +592,17 @@ def __init__(self, input_dim, c, d, offset, active_dims=None): def full(self, X, Xs=None): linear = super().full(X, Xs) - return 
tt.power(linear + self.offset, self.d) + return aet.power(linear + self.offset, self.d) def diag(self, X): linear = super().diag(X) - return tt.power(linear + self.offset, self.d) + return aet.power(linear + self.offset, self.d) class WarpedInput(Covariance): r""" Warp the inputs of any kernel using an arbitrary function - defined using Theano. + defined using Aesara. .. math:: k(x, x') = k(w(x), w(x')) @@ -606,7 +611,7 @@ class WarpedInput(Covariance): ---------- cov_func: Covariance warp_func: callable - Theano function of X and additional optional arguments. + Aesara function of X and additional optional arguments. args: optional, tuple or list of scalars or PyMC3 variables Additional inputs (besides X or Xs) to warp_func. """ @@ -636,7 +641,7 @@ def diag(self, X): class Gibbs(Covariance): r""" The Gibbs kernel. Use an arbitrary lengthscale function defined - using Theano. Only tested in one dimension. + using Aesara. Only tested in one dimension. .. math:: k(x, x') = \sqrt{\frac{2\ell(x)\ell(x')}{\ell^2(x) + \ell^2(x')}} @@ -646,7 +651,7 @@ class Gibbs(Covariance): Parameters ---------- lengthscale_func: callable - Theano function of X and additional optional arguments. + Aesara function of X and additional optional arguments. args: optional, tuple or list of scalars or PyMC3 variables Additional inputs (besides X or Xs) to lengthscale_func. """ @@ -665,39 +670,39 @@ def __init__(self, input_dim, lengthscale_func, args=None, active_dims=None): self.args = args def square_dist(self, X, Xs=None): - X2 = tt.sum(tt.square(X), 1) + X2 = aet.sum(aet.square(X), 1) if Xs is None: - sqd = -2.0 * tt.dot(X, tt.transpose(X)) + ( - tt.reshape(X2, (-1, 1)) + tt.reshape(X2, (1, -1)) + sqd = -2.0 * aet.dot(X, aet.transpose(X)) + ( + aet.reshape(X2, (-1, 1)) + aet.reshape(X2, (1, -1)) ) else: - Xs2 = tt.sum(tt.square(Xs), 1) - sqd = -2.0 * tt.dot(X, tt.transpose(Xs)) + ( - tt.reshape(X2, (-1, 1)) + tt.reshape(Xs2, (1, -1)) + Xs2 = aet.sum(aet.square(Xs), 1) + sqd = -2.0 * aet.dot(X, aet.transpose(Xs)) + ( + aet.reshape(X2, (-1, 1)) + aet.reshape(Xs2, (1, -1)) ) - return tt.clip(sqd, 0.0, np.inf) + return aet.clip(sqd, 0.0, np.inf) def full(self, X, Xs=None): X, Xs = self._slice(X, Xs) - rx = self.lfunc(tt.as_tensor_variable(X), self.args) + rx = self.lfunc(aet.as_tensor_variable(X), self.args) if Xs is None: - rz = self.lfunc(tt.as_tensor_variable(X), self.args) + rz = self.lfunc(aet.as_tensor_variable(X), self.args) r2 = self.square_dist(X, X) else: - rz = self.lfunc(tt.as_tensor_variable(Xs), self.args) + rz = self.lfunc(aet.as_tensor_variable(Xs), self.args) r2 = self.square_dist(X, Xs) - rx2 = tt.reshape(tt.square(rx), (-1, 1)) - rz2 = tt.reshape(tt.square(rz), (1, -1)) - return tt.sqrt((2.0 * tt.outer(rx, rz)) / (rx2 + rz2)) * tt.exp(-1.0 * r2 / (rx2 + rz2)) + rx2 = aet.reshape(aet.square(rx), (-1, 1)) + rz2 = aet.reshape(aet.square(rz), (1, -1)) + return aet.sqrt((2.0 * aet.outer(rx, rz)) / (rx2 + rz2)) * aet.exp(-1.0 * r2 / (rx2 + rz2)) def diag(self, X): - return tt.alloc(1.0, X.shape[0]) + return aet.alloc(1.0, X.shape[0]) class ScaledCov(Covariance): r""" Construct a kernel by multiplying a base kernel with a scaling - function defined using Theano. The scaling function is + function defined using Aesara. The scaling function is non-negative, and can be parameterized. .. math:: @@ -708,7 +713,7 @@ class ScaledCov(Covariance): cov_func: Covariance Base kernel or covariance function scaling_func: callable - Theano function of X and additional optional arguments. 
+ Aesara function of X and additional optional arguments. args: optional, tuple or list of scalars or PyMC3 variables Additional inputs (besides X or Xs) to lengthscale_func. """ @@ -726,17 +731,17 @@ def __init__(self, input_dim, cov_func, scaling_func, args=None, active_dims=Non def diag(self, X): X, _ = self._slice(X, None) cov_diag = self.cov_func(X, diag=True) - scf_diag = tt.square(tt.flatten(self.scaling_func(X, self.args))) + scf_diag = aet.square(aet.flatten(self.scaling_func(X, self.args))) return cov_diag * scf_diag def full(self, X, Xs=None): X, Xs = self._slice(X, Xs) scf_x = self.scaling_func(X, self.args) if Xs is None: - return tt.outer(scf_x, scf_x) * self.cov_func(X) + return aet.outer(scf_x, scf_x) * self.cov_func(X) else: scf_xs = self.scaling_func(Xs, self.args) - return tt.outer(scf_x, scf_xs) * self.cov_func(X, Xs) + return aet.outer(scf_x, scf_xs) * self.cov_func(X, Xs) class Coregion(Covariance): @@ -780,27 +785,27 @@ def __init__(self, input_dim, W=None, kappa=None, B=None, active_dims=None): if make_B and B is not None: raise ValueError("Exactly one of (W, kappa) and B must be provided to Coregion") if make_B: - self.W = tt.as_tensor_variable(W) - self.kappa = tt.as_tensor_variable(kappa) - self.B = tt.dot(self.W, self.W.T) + tt.diag(self.kappa) + self.W = aet.as_tensor_variable(W) + self.kappa = aet.as_tensor_variable(kappa) + self.B = aet.dot(self.W, self.W.T) + aet.diag(self.kappa) elif B is not None: - self.B = tt.as_tensor_variable(B) + self.B = aet.as_tensor_variable(B) else: raise ValueError("Exactly one of (W, kappa) and B must be provided to Coregion") def full(self, X, Xs=None): X, Xs = self._slice(X, Xs) - index = tt.cast(X, "int32") + index = aet.cast(X, "int32") if Xs is None: index2 = index.T else: - index2 = tt.cast(Xs, "int32").T + index2 = aet.cast(Xs, "int32").T return self.B[index, index2] def diag(self, X): X, _ = self._slice(X, None) - index = tt.cast(X, "int32") - return tt.diag(self.B)[index.ravel()] + index = aet.cast(X, "int32") + return aet.diag(self.B)[index.ravel()] def handle_args(func, args): diff --git a/pymc3/gp/gp.py b/pymc3/gp/gp.py index 654bf536cfa..43a52b2d168 100644 --- a/pymc3/gp/gp.py +++ b/pymc3/gp/gp.py @@ -15,10 +15,10 @@ import functools import warnings +import aesara.tensor as aet import numpy as np -import theano.tensor as tt -from theano.tensor.nlinalg import eigh +from aesara.tensor.nlinalg import eigh import pymc3 as pm @@ -195,9 +195,9 @@ def _build_conditional(self, Xnew, X, f, cov_total, mean_total): L = cholesky(stabilize(Kxx)) A = solve_lower(L, Kxs) v = solve_lower(L, f - mean_total(X)) - mu = self.mean_func(Xnew) + tt.dot(tt.transpose(A), v) + mu = self.mean_func(Xnew) + aet.dot(aet.transpose(A), v) Kss = self.cov_func(Xnew) - cov = Kss - tt.dot(tt.transpose(A), A) + cov = Kss - aet.dot(aet.transpose(A), A) return mu, cov def conditional(self, name, Xnew, given=None, **kwargs): @@ -281,7 +281,7 @@ def _build_prior(self, name, X, reparameterize=True, **kwargs): if reparameterize: chi2 = pm.ChiSquared(name + "_chi2_", self.nu) v = pm.Normal(name + "_rotated_", mu=0.0, sigma=1.0, shape=shape, **kwargs) - f = pm.Deterministic(name, (tt.sqrt(self.nu) / chi2) * (mu + cholesky(cov).dot(v))) + f = pm.Deterministic(name, (aet.sqrt(self.nu) / chi2) * (mu + cholesky(cov).dot(v))) else: f = pm.MvStudentT(name, nu=self.nu, mu=mu, cov=cov, shape=shape, **kwargs) return f @@ -318,10 +318,10 @@ def _build_conditional(self, Xnew, X, f): Kss = self.cov_func(Xnew) L = cholesky(stabilize(Kxx)) A = solve_lower(L, Kxs) - cov = 
Kss - tt.dot(tt.transpose(A), A) + cov = Kss - aet.dot(aet.transpose(A), A) v = solve_lower(L, f - self.mean_func(X)) - mu = self.mean_func(Xnew) + tt.dot(tt.transpose(A), v) - beta = tt.dot(v, v) + mu = self.mean_func(Xnew) + aet.dot(aet.transpose(A), v) + beta = aet.dot(v, v) nu2 = self.nu + X.shape[0] covT = (self.nu + beta - 2) / (nu2 - 2) * cov return nu2, mu, covT @@ -476,16 +476,16 @@ def _build_conditional(self, Xnew, pred_noise, diag, X, y, noise, cov_total, mea L = cholesky(stabilize(Kxx) + Knx) A = solve_lower(L, Kxs) v = solve_lower(L, rxx) - mu = self.mean_func(Xnew) + tt.dot(tt.transpose(A), v) + mu = self.mean_func(Xnew) + aet.dot(aet.transpose(A), v) if diag: Kss = self.cov_func(Xnew, diag=True) - var = Kss - tt.sum(tt.square(A), 0) + var = Kss - aet.sum(aet.square(A), 0) if pred_noise: var += noise(Xnew, diag=True) return mu, var else: Kss = self.cov_func(Xnew) - cov = Kss - tt.dot(tt.transpose(A), A) + cov = Kss - aet.dot(aet.transpose(A), A) if pred_noise: cov += noise(Xnew) return mu, cov if pred_noise else stabilize(cov) @@ -664,32 +664,32 @@ def __add__(self, other): # in marginal_likelihood instead of lambda. This makes pickling # possible. def _build_marginal_likelihood_logp(self, y, X, Xu, sigma): - sigma2 = tt.square(sigma) + sigma2 = aet.square(sigma) Kuu = self.cov_func(Xu) Kuf = self.cov_func(Xu, X) Luu = cholesky(stabilize(Kuu)) A = solve_lower(Luu, Kuf) - Qffd = tt.sum(A * A, 0) + Qffd = aet.sum(A * A, 0) if self.approx == "FITC": Kffd = self.cov_func(X, diag=True) - Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2 + Lamd = aet.clip(Kffd - Qffd, 0.0, np.inf) + sigma2 trace = 0.0 elif self.approx == "VFE": - Lamd = tt.ones_like(Qffd) * sigma2 + Lamd = aet.ones_like(Qffd) * sigma2 trace = (1.0 / (2.0 * sigma2)) * ( - tt.sum(self.cov_func(X, diag=True)) - tt.sum(tt.sum(A * A, 0)) + aet.sum(self.cov_func(X, diag=True)) - aet.sum(aet.sum(A * A, 0)) ) else: # DTC - Lamd = tt.ones_like(Qffd) * sigma2 + Lamd = aet.ones_like(Qffd) * sigma2 trace = 0.0 A_l = A / Lamd - L_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A_l, tt.transpose(A))) + L_B = cholesky(aet.eye(Xu.shape[0]) + aet.dot(A_l, aet.transpose(A))) r = y - self.mean_func(X) r_l = r / Lamd - c = solve_lower(L_B, tt.dot(A, r_l)) - constant = 0.5 * X.shape[0] * tt.log(2.0 * np.pi) - logdet = 0.5 * tt.sum(tt.log(Lamd)) + tt.sum(tt.log(tt.diag(L_B))) - quadratic = 0.5 * (tt.dot(r, r_l) - tt.dot(c, c)) + c = solve_lower(L_B, aet.dot(A, r_l)) + constant = 0.5 * X.shape[0] * aet.log(2.0 * np.pi) + logdet = 0.5 * aet.sum(aet.log(Lamd)) + aet.sum(aet.log(aet.diag(L_B))) + quadratic = 0.5 * (aet.dot(r, r_l) - aet.dot(c, c)) return -1.0 * (constant + logdet + quadratic + trace) def marginal_likelihood(self, name, X, Xu, y, noise=None, is_observed=True, **kwargs): @@ -743,36 +743,38 @@ def marginal_likelihood(self, name, X, Xu, y, noise=None, is_observed=True, **kw return pm.DensityDist(name, logp, shape=shape, **kwargs) def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma, cov_total, mean_total): - sigma2 = tt.square(sigma) + sigma2 = aet.square(sigma) Kuu = cov_total(Xu) Kuf = cov_total(Xu, X) Luu = cholesky(stabilize(Kuu)) A = solve_lower(Luu, Kuf) - Qffd = tt.sum(A * A, 0) + Qffd = aet.sum(A * A, 0) if self.approx == "FITC": Kffd = cov_total(X, diag=True) - Lamd = tt.clip(Kffd - Qffd, 0.0, np.inf) + sigma2 + Lamd = aet.clip(Kffd - Qffd, 0.0, np.inf) + sigma2 else: # VFE or DTC - Lamd = tt.ones_like(Qffd) * sigma2 + Lamd = aet.ones_like(Qffd) * sigma2 A_l = A / Lamd - L_B = cholesky(tt.eye(Xu.shape[0]) + 
tt.dot(A_l, tt.transpose(A))) + L_B = cholesky(aet.eye(Xu.shape[0]) + aet.dot(A_l, aet.transpose(A))) r = y - mean_total(X) r_l = r / Lamd - c = solve_lower(L_B, tt.dot(A, r_l)) + c = solve_lower(L_B, aet.dot(A, r_l)) Kus = self.cov_func(Xu, Xnew) As = solve_lower(Luu, Kus) - mu = self.mean_func(Xnew) + tt.dot(tt.transpose(As), solve_upper(tt.transpose(L_B), c)) + mu = self.mean_func(Xnew) + aet.dot(aet.transpose(As), solve_upper(aet.transpose(L_B), c)) C = solve_lower(L_B, As) if diag: Kss = self.cov_func(Xnew, diag=True) - var = Kss - tt.sum(tt.square(As), 0) + tt.sum(tt.square(C), 0) + var = Kss - aet.sum(aet.square(As), 0) + aet.sum(aet.square(C), 0) if pred_noise: var += sigma2 return mu, var else: - cov = self.cov_func(Xnew) - tt.dot(tt.transpose(As), As) + tt.dot(tt.transpose(C), C) + cov = ( + self.cov_func(Xnew) - aet.dot(aet.transpose(As), As) + aet.dot(aet.transpose(C), C) + ) if pred_noise: - cov += sigma2 * tt.identity_like(cov) + cov += sigma2 * aet.identity_like(cov) return mu, cov if pred_noise else stabilize(cov) def _get_given_vals(self, given): @@ -891,7 +893,7 @@ def _build_prior(self, name, Xs, **kwargs): chols = [cholesky(stabilize(cov(X))) for cov, X in zip(self.cov_funcs, Xs)] # remove reparameterization option v = pm.Normal(name + "_rotated_", mu=0.0, sigma=1.0, shape=self.N, **kwargs) - f = pm.Deterministic(name, mu + tt.flatten(kron_dot(chols, v))) + f = pm.Deterministic(name, mu + aet.flatten(kron_dot(chols, v))) return f def prior(self, name, Xs, **kwargs): @@ -925,15 +927,15 @@ def _build_conditional(self, Xnew): delta = f - self.mean_func(X) covs = [stabilize(cov(Xi)) for cov, Xi in zip(self.cov_funcs, Xs)] chols = [cholesky(cov) for cov in covs] - cholTs = [tt.transpose(chol) for chol in chols] + cholTs = [aet.transpose(chol) for chol in chols] Kss = self.cov_func(Xnew) Kxs = self.cov_func(X, Xnew) - Ksx = tt.transpose(Kxs) + Ksx = aet.transpose(Kxs) alpha = kron_solve_lower(chols, delta) alpha = kron_solve_upper(cholTs, alpha) - mu = tt.dot(Ksx, alpha).ravel() + self.mean_func(Xnew) + mu = aet.dot(Ksx, alpha).ravel() + self.mean_func(Xnew) A = kron_solve_lower(chols, Kxs) - cov = stabilize(Kss - tt.dot(tt.transpose(A), A)) + cov = stabilize(Kss - aet.dot(aet.transpose(A), A)) return mu, cov def conditional(self, name, Xnew, **kwargs): @@ -1103,7 +1105,7 @@ def _build_conditional(self, Xnew, pred_noise, diag): delta = y - self.mean_func(X) Kns = [f(x) for f, x in zip(self.cov_funcs, Xs)] eigs_sep, Qs = zip(*map(eigh, Kns)) # Unzip - QTs = list(map(tt.transpose, Qs)) + QTs = list(map(aet.transpose, Qs)) eigs = kron_diag(*eigs_sep) # Combine separate eigs if sigma is not None: eigs += sigma ** 2 @@ -1117,21 +1119,21 @@ def _build_conditional(self, Xnew, pred_noise, diag): alpha = kron_dot(QTs, delta) alpha = alpha / eigs[:, None] alpha = kron_dot(Qs, alpha) - mu = tt.dot(Kmn, alpha).ravel() + self.mean_func(Xnew) + mu = aet.dot(Kmn, alpha).ravel() + self.mean_func(Xnew) # Build conditional cov A = kron_dot(QTs, Knm) - A = A / tt.sqrt(eigs[:, None]) + A = A / aet.sqrt(eigs[:, None]) if diag: - Asq = tt.sum(tt.square(A), 0) + Asq = aet.sum(aet.square(A), 0) cov = Km - Asq if pred_noise: cov += sigma else: - Asq = tt.dot(A.T, A) + Asq = aet.dot(A.T, A) cov = Km - Asq if pred_noise: - cov += sigma * tt.identity_like(cov) + cov += sigma * aet.identity_like(cov) return mu, cov def conditional(self, name, Xnew, pred_noise=False, **kwargs): diff --git a/pymc3/gp/mean.py b/pymc3/gp/mean.py index d2e93fdfe5f..47d38d9897a 100644 --- a/pymc3/gp/mean.py +++ 
b/pymc3/gp/mean.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import theano.tensor as tt +import aesara.tensor as aet __all__ = ["Zero", "Constant", "Linear"] @@ -46,7 +46,7 @@ class Zero(Mean): """ def __call__(self, X): - return tt.alloc(0.0, X.shape[0]) + return aet.alloc(0.0, X.shape[0]) class Constant(Mean): @@ -64,7 +64,7 @@ def __init__(self, c=0): self.c = c def __call__(self, X): - return tt.alloc(1.0, X.shape[0]) * self.c + return aet.alloc(1.0, X.shape[0]) * self.c class Linear(Mean): @@ -85,7 +85,7 @@ def __init__(self, coeffs, intercept=0): self.A = coeffs def __call__(self, X): - return tt.squeeze(tt.dot(X, self.A) + self.b) + return aet.squeeze(aet.dot(X, self.A) + self.b) class Add(Mean): @@ -95,7 +95,7 @@ def __init__(self, first_mean, second_mean): self.m2 = second_mean def __call__(self, X): - return tt.add(self.m1(X), self.m2(X)) + return aet.add(self.m1(X), self.m2(X)) class Prod(Mean): @@ -105,4 +105,4 @@ def __init__(self, first_mean, second_mean): self.m2 = second_mean def __call__(self, X): - return tt.mul(self.m1(X), self.m2(X)) + return aet.mul(self.m1(X), self.m2(X)) diff --git a/pymc3/gp/util.py b/pymc3/gp/util.py index 698c300564b..1ad05f6eab9 100644 --- a/pymc3/gp/util.py +++ b/pymc3/gp/util.py @@ -14,16 +14,16 @@ import warnings +import aesara.tensor as aet import numpy as np -import theano.tensor as tt -import theano.tensor.slinalg # pylint: disable=unused-import +from aesara.tensor.slinalg import Solve, cholesky # pylint: disable=unused-import +from aesara.tensor.var import TensorConstant from scipy.cluster.vq import kmeans -cholesky = tt.slinalg.cholesky -solve_lower = tt.slinalg.Solve(A_structure="lower_triangular") -solve_upper = tt.slinalg.Solve(A_structure="upper_triangular") -solve = tt.slinalg.Solve(A_structure="general") +solve_lower = Solve(A_structure="lower_triangular") +solve_upper = Solve(A_structure="upper_triangular") +solve = Solve(A_structure="general") def infer_shape(X, n_points=None): @@ -37,12 +37,12 @@ def infer_shape(X, n_points=None): def stabilize(K): """ adds small diagonal to a covariance matrix """ - return K + 1e-6 * tt.identity_like(K) + return K + 1e-6 * aet.identity_like(K) def kmeans_inducing_points(n_inducing, X): # first whiten X - if isinstance(X, tt.TensorConstant): + if isinstance(X, TensorConstant): X = X.value elif isinstance(X, (np.ndarray, tuple, list)): X = np.asarray(X) diff --git a/pymc3/math.py b/pymc3/math.py index aff54d13b71..b90b85e09e7 100644 --- a/pymc3/math.py +++ b/pymc3/math.py @@ -16,20 +16,19 @@ from functools import partial, reduce +import aesara +import aesara.sparse +import aesara.tensor as aet +import aesara.tensor.slinalg # pylint: disable=unused-import import numpy as np import scipy as sp import scipy.sparse # pylint: disable=unused-import -import theano -import theano.sparse -import theano.tensor as tt -import theano.tensor.slinalg # pylint: disable=unused-import -from scipy.linalg import block_diag as scipy_block_diag -from theano.graph.basic import Apply -from theano.graph.op import Op +from aesara.graph.basic import Apply +from aesara.graph.op import Op # pylint: disable=unused-import -from theano.tensor import ( +from aesara.tensor import ( abs_, and_, ceil, @@ -71,10 +70,11 @@ where, zeros_like, ) -from theano.tensor.nlinalg import det, extract_diag, matrix_dot, matrix_inverse, trace -from theano.tensor.nnet import sigmoid +from aesara.tensor.nlinalg import det, extract_diag, matrix_dot, matrix_inverse, trace 
+from aesara.tensor.nnet import sigmoid +from scipy.linalg import block_diag as scipy_block_diag -from pymc3.theanof import floatX, ix_, largest_common_dtype +from pymc3.aesaraf import floatX, ix_, largest_common_dtype # pylint: enable=unused-import @@ -93,7 +93,7 @@ def kronecker(*Ks): np.ndarray : Block matrix Kroncker product of the argument matrices. """ - return reduce(tt.slinalg.kron, Ks) + return reduce(aet.slinalg.kron, Ks) def cartesian(*arrays): @@ -140,17 +140,17 @@ def kron_vector_op(v): raise ValueError(f"m must have ndim <= 2, not {m.ndim}") res = kron_vector_op(m) res_shape = res.shape - return tt.reshape(res, (res_shape[1], res_shape[0])).T + return aet.reshape(res, (res_shape[1], res_shape[0])).T # Define kronecker functions that work on 1D and 2D arrays -kron_dot = partial(kron_matrix_op, op=tt.dot) -kron_solve_lower = partial(kron_matrix_op, op=tt.slinalg.solve_lower_triangular) -kron_solve_upper = partial(kron_matrix_op, op=tt.slinalg.solve_upper_triangular) +kron_dot = partial(kron_matrix_op, op=aet.dot) +kron_solve_lower = partial(kron_matrix_op, op=aet.slinalg.solve_lower_triangular) +kron_solve_upper = partial(kron_matrix_op, op=aet.slinalg.solve_upper_triangular) def flat_outer(a, b): - return tt.outer(a, b).ravel() + return aet.outer(a, b).ravel() def kron_diag(*diags): @@ -166,24 +166,24 @@ def kron_diag(*diags): def tround(*args, **kwargs): """ - Temporary function to silence round warning in Theano. Please remove + Temporary function to silence round warning in Aesara. Please remove when the warning disappears. """ kwargs["mode"] = "half_to_even" - return tt.round(*args, **kwargs) + return aet.round(*args, **kwargs) def logsumexp(x, axis=None, keepdims=True): # Adapted from https://github.com/Theano/Theano/issues/1563 - x_max = tt.max(x, axis=axis, keepdims=True) - x_max = tt.switch(tt.isinf(x_max), 0, x_max) - res = tt.log(tt.sum(tt.exp(x - x_max), axis=axis, keepdims=True)) + x_max + x_max = aet.max(x, axis=axis, keepdims=True) + x_max = aet.switch(aet.isinf(x_max), 0, x_max) + res = aet.log(aet.sum(aet.exp(x - x_max), axis=axis, keepdims=True)) + x_max return res if keepdims else res.squeeze() def logaddexp(a, b): diff = b - a - return tt.switch(diff > 0, b + tt.log1p(tt.exp(-diff)), a + tt.log1p(tt.exp(diff))) + return aet.switch(diff > 0, b + aet.log1p(aet.exp(-diff)), a + aet.log1p(aet.exp(diff))) def logdiffexp(a, b): @@ -198,7 +198,7 @@ def logdiffexp_numpy(a, b): def invlogit(x, eps=sys.float_info.epsilon): """The inverse of the logit function, 1 / (1 + exp(-x)).""" - return (1.0 - 2.0 * eps) / (1.0 + tt.exp(-x)) + eps + return (1.0 - 2.0 * eps) / (1.0 + aet.exp(-x)) + eps def logbern(log_p): @@ -208,7 +208,7 @@ def logbern(log_p): def logit(p): - return tt.log(p / (floatX(1) - p)) + return aet.log(p / (floatX(1) - p)) def log1pexp(x): @@ -216,7 +216,7 @@ def log1pexp(x): This function is numerically more stable than the naive approach. 
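Quick aside, example only: the logsumexp helper above is the usual max-shift trick; a small NumPy check of why the shift matters, with made-up inputs:

    import numpy as np

    x = np.array([1000.0, 1000.5, 999.0])                 # exp(x) overflows in float64

    naive = np.log(np.sum(np.exp(x)))                     # inf, with an overflow warning
    x_max = np.max(x)
    stable = np.log(np.sum(np.exp(x - x_max))) + x_max    # ~1001.1, finite

    print(naive, stable)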
""" - return tt.nnet.softplus(x) + return aet.nnet.softplus(x) def log1mexp(x): @@ -234,7 +234,9 @@ def log1mexp(x): package" """ - return tt.switch(tt.lt(x, 0.6931471805599453), tt.log(-tt.expm1(-x)), tt.log1p(-tt.exp(-x))) + return aet.switch( + aet.lt(x, 0.6931471805599453), aet.log(-aet.expm1(-x)), aet.log1p(-aet.exp(-x)) + ) def log1mexp_numpy(x): @@ -253,7 +255,7 @@ def log1mexp_numpy(x): def flatten_list(tensors): - return tt.concatenate([var.ravel() for var in tensors]) + return aet.concatenate([var.ravel() for var in tensors]) class LogDet(Op): @@ -268,8 +270,8 @@ class LogDet(Op): """ def make_node(self, x): - x = theano.tensor.as_tensor_variable(x) - o = theano.tensor.scalar(dtype=x.dtype) + x = aesara.tensor.as_tensor_variable(x) + o = aesara.tensor.scalar(dtype=x.dtype) return Apply(self, [x], [o]) def perform(self, node, inputs, outputs, params=None): @@ -319,7 +321,7 @@ def expand_packed_triangular(n, packed, lower=True, diagonal_only=False): ---------- n: int The number of rows of the triangular matrix. - packed: theano.vector + packed: aesara.vector The matrix in packed format. lower: bool, default=True If true, assume that the matrix is lower triangular. @@ -338,13 +340,13 @@ def expand_packed_triangular(n, packed, lower=True, diagonal_only=False): diag_idxs = np.arange(2, n + 2)[::-1].cumsum() - n - 1 return packed[diag_idxs] elif lower: - out = tt.zeros((n, n), dtype=theano.config.floatX) + out = aet.zeros((n, n), dtype=aesara.config.floatX) idxs = np.tril_indices(n) - return tt.set_subtensor(out[idxs], packed) + return aet.set_subtensor(out[idxs], packed) elif not lower: - out = tt.zeros((n, n), dtype=theano.config.floatX) + out = aet.zeros((n, n), dtype=aesara.config.floatX) idxs = np.triu_indices(n) - return tt.set_subtensor(out[idxs], packed) + return aet.set_subtensor(out[idxs], packed) class BatchedDiag(Op): @@ -355,11 +357,11 @@ class BatchedDiag(Op): __props__ = () def make_node(self, diag): - diag = tt.as_tensor_variable(diag) + diag = aet.as_tensor_variable(diag) if diag.type.ndim != 2: raise TypeError("data argument must be a matrix", diag.type) - return Apply(self, [diag], [tt.tensor3(dtype=diag.dtype)]) + return Apply(self, [diag], [aet.tensor3(dtype=diag.dtype)]) def perform(self, node, ins, outs, params=None): (C,) = ins @@ -375,7 +377,7 @@ def perform(self, node, ins, outs, params=None): def grad(self, inputs, gout): (gz,) = gout - idx = tt.arange(gz.shape[-1]) + idx = aet.arange(gz.shape[-1]) return [gz[..., idx, idx]] def infer_shape(self, fgraph, nodes, shapes): @@ -383,14 +385,14 @@ def infer_shape(self, fgraph, nodes, shapes): def batched_diag(C): - C = tt.as_tensor(C) + C = aet.as_tensor(C) dim = C.shape[-1] if C.ndim == 2: # diag -> matrices return BatchedDiag()(C) elif C.ndim == 3: # matrices -> diag - idx = tt.arange(dim) + idx = aet.arange(dim) return C[..., idx, idx] else: raise ValueError("Input should be 2 or 3 dimensional") @@ -408,13 +410,13 @@ def __init__(self, sparse=False, format="csr"): def make_node(self, *matrices): if not matrices: raise ValueError("no matrices to allocate") - matrices = list(map(tt.as_tensor, matrices)) + matrices = list(map(aet.as_tensor, matrices)) if any(mat.type.ndim != 2 for mat in matrices): raise TypeError("all data arguments must be matrices") if self.sparse: - out_type = theano.sparse.matrix(self.format, dtype=largest_common_dtype(matrices)) + out_type = aesara.sparse.matrix(self.format, dtype=largest_common_dtype(matrices)) else: - out_type = theano.tensor.matrix(dtype=largest_common_dtype(matrices)) + 
out_type = aesara.tensor.matrix(dtype=largest_common_dtype(matrices)) return Apply(self, matrices, [out_type]) def perform(self, node, inputs, output_storage, params=None): @@ -425,13 +427,13 @@ def perform(self, node, inputs, output_storage, params=None): output_storage[0][0] = scipy_block_diag(*inputs).astype(dtype) def grad(self, inputs, gout): - shapes = tt.stack([i.shape for i in inputs]) + shapes = aet.stack([i.shape for i in inputs]) index_end = shapes.cumsum(0) index_begin = index_end - shapes slices = [ ix_( - tt.arange(index_begin[i, 0], index_end[i, 0]), - tt.arange(index_begin[i, 1], index_end[i, 1]), + aet.arange(index_begin[i, 0], index_end[i, 0]), + aet.arange(index_begin[i, 1], index_end[i, 1]), ) for i in range(len(inputs)) ] @@ -439,7 +441,7 @@ def grad(self, inputs, gout): def infer_shape(self, fgraph, nodes, shapes): first, second = zip(*shapes) - return [(tt.add(*first), tt.add(*second))] + return [(aet.add(*first), aet.add(*second))] def block_diagonal(matrices, sparse=False, format="csr"): diff --git a/pymc3/model.py b/pymc3/model.py index 349affcfa01..a5a0a635c8b 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -20,25 +20,27 @@ from sys import modules from typing import TYPE_CHECKING, Any, List, Optional, Type, TypeVar, Union, cast +import aesara +import aesara.graph.basic +import aesara.sparse as sparse +import aesara.tensor as aet import numpy as np import scipy.sparse as sps -import theano -import theano.graph.basic -import theano.sparse as sparse -import theano.tensor as tt +from aesara.compile.sharedvalue import SharedVariable +from aesara.gradient import grad +from aesara.graph.basic import Apply, Variable +from aesara.tensor.type import TensorType as AesaraTensorType +from aesara.tensor.var import TensorVariable from pandas import Series -from theano.compile import SharedVariable -from theano.graph.basic import Apply -from theano.tensor.var import TensorVariable import pymc3 as pm +from pymc3.aesaraf import floatX, generator, gradient, hessian, inputvars from pymc3.blocking import ArrayOrdering, DictToArrayBijection from pymc3.exceptions import ImputationWarning from pymc3.math import flatten_list from pymc3.memoize import WithMemoization, memoize -from pymc3.theanof import floatX, generator, gradient, hessian, inputvars from pymc3.util import get_transformed_name, get_var_name from pymc3.vartypes import continuous_types, discrete_types, isgenerator, typefilter @@ -59,13 +61,13 @@ class PyMC3Variable(TensorVariable): - """Class to wrap Theano TensorVariable for custom behavior.""" + """Class to wrap Aesara TensorVariable for custom behavior.""" # Implement matrix multiplication infix operator: X @ w - __matmul__ = tt.dot + __matmul__ = aet.dot def __rmatmul__(self, other): - return tt.dot(other, self) + return aet.dot(other, self) def _str_repr(self, name=None, dist=None, formatting="plain"): if getattr(self, "distribution", None) is None: @@ -143,28 +145,28 @@ def incorporate_methods(source, destination, methods, wrapper=None, override=Fal def get_named_nodes_and_relations(graph): - """Get the named nodes in a theano graph (i.e., nodes whose name + """Get the named nodes in a aesara graph (i.e., nodes whose name attribute is not None) along with their relationships (i.e., the node's named parents, and named children, while skipping unnamed intermediate nodes) Parameters ---------- - graph: a theano node + graph: a aesara node Returns: -------- leaf_dict: Dict[str, node] A dictionary of name:node pairs, of the named nodes that - have no named ancestors in 
the provided theano graph. + have no named ancestors in the provided aesara graph. descendents: Dict[node, Set[node]] - Each key is a theano named node, and the corresponding value - is the set of theano named nodes that are descendents with no + Each key is a aesara named node, and the corresponding value + is the set of aesara named nodes that are descendents with no intervening named nodes in the supplied ``graph``. ancestors: Dict[node, Set[node]] A dictionary of node:set([ancestors]) pairs. Each key - is a theano named node, and the corresponding value is the set - of theano named nodes that are ancestors with no intervening named + is a aesara named node, and the corresponding value is the set + of aesara named nodes that are ancestors with no intervening named nodes in the supplied ``graph``. """ @@ -222,28 +224,28 @@ def _get_named_nodes_and_relations(graph, descendent, descendents, ancestors): def build_named_node_tree(graphs): """Build the combined descence/ancestry tree of named nodes (i.e., nodes - whose name attribute is not None) in a list (or iterable) of theano graphs. + whose name attribute is not None) in a list (or iterable) of aesara graphs. The relationship tree does not include unnamed intermediate nodes present in the supplied graphs. Parameters ---------- - graphs - iterable of theano graphs + graphs - iterable of aesara graphs Returns: -------- leaf_dict: Dict[str, node] A dictionary of name:node pairs, of the named nodes that - have no named ancestors in the provided theano graphs. + have no named ancestors in the provided aesara graphs. descendents: Dict[node, Set[node]] A dictionary of node:set([parents]) pairs. Each key is - a theano named node, and the corresponding value is the set of - theano named nodes that are descendents with no intervening named + a aesara named node, and the corresponding value is the set of + aesara named nodes that are descendents with no intervening named nodes in the supplied ``graphs``. ancestors: Dict[node, Set[node]] A dictionary of node:set([ancestors]) pairs. Each key - is a theano named node, and the corresponding value is the set - of theano named nodes that are ancestors with no intervening named + is a aesara named node, and the corresponding value is the set + of aesara named nodes that are ancestors with no intervening named nodes in the supplied ``graphs``. 
""" @@ -282,16 +284,16 @@ def __new__(cls, name, bases, dct, **kargs): # pylint: disable=unused-argument def __enter__(self): self.__class__.context_class.get_contexts().append(self) - # self._theano_config is set in Model.__new__ + # self._aesara_config is set in Model.__new__ self._config_context = None - if hasattr(self, "_theano_config"): - self._config_context = theano.config.change_flags(**self._theano_config) + if hasattr(self, "_aesara_config"): + self._config_context = aesara.config.change_flags(**self._aesara_config) self._config_context.__enter__() return self def __exit__(self, typ, value, traceback): # pylint: disable=unused-argument self.__class__.context_class.get_contexts().pop() - # self._theano_config is set in Model.__new__ + # self._aesara_config is set in Model.__new__ if self._config_context: self._config_context.__exit__(typ, value, traceback) @@ -468,7 +470,7 @@ def fastd2logp_nojac(self, vars=None): @property def logpt(self): - """Theano scalar of log-probability of the model""" + """Aesara scalar of log-probability of the model""" if getattr(self, "total_size", None) is not None: logp = self.logp_sum_unscaledt * self.scaling else: @@ -479,11 +481,11 @@ def logpt(self): @property def logp_nojact(self): - """Theano scalar of log-probability, excluding jacobian terms.""" + """Aesara scalar of log-probability, excluding jacobian terms.""" if getattr(self, "total_size", None) is not None: - logp = tt.sum(self.logp_nojac_unscaledt) * self.scaling + logp = aet.sum(self.logp_nojac_unscaledt) * self.scaling else: - logp = tt.sum(self.logp_nojac_unscaledt) + logp = aet.sum(self.logp_nojac_unscaledt) if self.name is not None: logp.name = "__logp_%s" % self.name return logp @@ -578,20 +580,20 @@ def tree_contains(self, item): class ValueGradFunction: - """Create a theano function that computes a value and its gradient. + """Create a aesara function that computes a value and its gradient. Parameters ---------- - costs: list of theano variables - We compute the weighted sum of the specified theano values, and the gradient + costs: list of aesara variables + We compute the weighted sum of the specified aesara values, and the gradient of that sum. The weights can be specified with `ValueGradFunction.set_weights`. - grad_vars: list of named theano variables or None + grad_vars: list of named aesara variables or None The arguments with respect to which the gradient is computed. - extra_vars: list of named theano variables or None + extra_vars: list of named aesara variables or None Other arguments of the function that are assumed constant. They are stored in shared variables and can be set using `set_extra_values`. - dtype: str, default=theano.config.floatX + dtype: str, default=aesara.config.floatX The dtype of the arrays. casting: {'no', 'equiv', 'save', 'same_kind', 'unsafe'}, default='no' Casting rule for casting `grad_args` to the array dtype. @@ -601,14 +603,14 @@ class ValueGradFunction: compute_grads: bool, default=True If False, return only the logp, not the gradient. kwargs - Extra arguments are passed on to `theano.function`. + Extra arguments are passed on to `aesara.function`. Attributes ---------- size: int The number of elements in the parameter array. - profile: theano profiling object or None - The profiling object of the theano function that computes value and + profile: aesara profiling object or None + The profiling object of the aesara function that computes value and gradient. This is None unless `profile=True` was set in the kwargs. 
""" @@ -640,14 +642,14 @@ def __init__( self._extra_var_names = {var.name for var in extra_vars} if dtype is None: - dtype = theano.config.floatX + dtype = aesara.config.floatX self.dtype = dtype self._n_costs = len(costs) if self._n_costs == 0: raise ValueError("At least one cost is required.") weights = np.ones(self._n_costs - 1, dtype=self.dtype) - self._weights = theano.shared(weights, "__weights") + self._weights = aesara.shared(weights, "__weights") cost = costs[0] for i, val in enumerate(costs[1:]): @@ -674,7 +676,7 @@ def __init__( givens = [] self._extra_vars_shared = {} for var in extra_vars: - shared = theano.shared(var.tag.test_value, var.name + "_shared__") + shared = aesara.shared(var.tag.test_value, var.name + "_shared__") # test TensorType compatibility if hasattr(var.tag.test_value, "shape"): testtype = TensorType(var.dtype, var.tag.test_value.shape) @@ -689,15 +691,15 @@ def __init__( ) if compute_grads: - grad = tt.grad(self._cost_joined, self._vars_joined) - grad.name = "__grad" - outputs = [self._cost_joined, grad] + grad_out = grad(self._cost_joined, self._vars_joined) + grad_out.name = "__grad" + outputs = [self._cost_joined, grad_out] else: outputs = self._cost_joined inputs = [self._vars_joined] - self._theano_function = theano.function(inputs, outputs, givens=givens, **kwargs) + self._aesara_function = aesara.function(inputs, outputs, givens=givens, **kwargs) def set_weights(self, values): if values.shape != (self._n_costs - 1,): @@ -732,7 +734,7 @@ def __call__(self, array, grad_out=None, extra_vars=None): else: out = grad_out - output = self._theano_function(array) + output = self._aesara_function(array) if grad_out is None: return output else: @@ -741,8 +743,8 @@ def __call__(self, array, grad_out=None, extra_vars=None): @property def profile(self): - """Profiling information of the underlying theano function.""" - return self._theano_function.profile + """Profiling information of the underlying aesara function.""" + return self._aesara_function.profile def dict_to_array(self, point): """Convert a dictionary with values for grad_vars to an array.""" @@ -774,7 +776,7 @@ def array_to_full_dict(self, array): return point def _build_joined(self, cost, args, vmap): - args_joined = tt.vector("__args_joined") + args_joined = aet.vector("__args_joined") args_joined.tag.test_value = np.zeros(self.size, dtype=self.dtype) joined_slices = {} @@ -784,7 +786,7 @@ def _build_joined(self, cost, args, vmap): joined_slices[vmap.var] = sliced replace = {var: joined_slices[var.name] for var in args} - return args_joined, theano.clone(cost, replace=replace) + return args_joined, aesara.clone_replace(cost, replace=replace) class Model(Factor, WithMemoization, metaclass=ContextMeta): @@ -806,10 +808,10 @@ class Model(Factor, WithMemoization, metaclass=ContextMeta): defined within instance will be passed to the parent instance. So that 'nested' model contributes to the variables and likelihood factors of parent model. - theano_config: dict - A dictionary of theano config values that should be set + aesara_config: dict + A dictionary of aesara config values that should be set temporarily in the model context. See the documentation - of theano for a complete list. Set config key + of aesara for a complete list. Set config key ``compute_test_value`` to `raise` if it is None. 
check_bounds: bool Ensure that input parameters to distributions are in a valid @@ -854,7 +856,7 @@ def __init__(self, mean=0, sigma=1, name='', model=None): Deterministic('v3_sq', self.v3 ** 2) # Potentials too - Potential('p1', tt.constant(1)) + Potential('p1', aet.constant(1)) # After defining a class CustomModel you can use it in several # ways @@ -896,13 +898,13 @@ def __new__(cls, *args, **kwargs): instance._parent = kwargs.get("model") else: instance._parent = cls.get_context(error_if_none=False) - theano_config = kwargs.get("theano_config", None) - if theano_config is None or "compute_test_value" not in theano_config: - theano_config = {"compute_test_value": "raise"} - instance._theano_config = theano_config + aesara_config = kwargs.get("aesara_config", None) + if aesara_config is None or "compute_test_value" not in aesara_config: + aesara_config = {"compute_test_value": "raise"} + instance._aesara_config = aesara_config return instance - def __init__(self, name="", model=None, theano_config=None, coords=None, check_bounds=True): + def __init__(self, name="", model=None, aesara_config=None, coords=None, check_bounds=True): self.name = name self.coords = {} self.RV_dims = {} @@ -970,7 +972,7 @@ def dlogp_array(self): return self.bijection.mapf(self.fastdlogp(vars)) def logp_dlogp_function(self, grad_vars=None, tempered=False, **kwargs): - """Compile a theano function that computes logp and gradient. + """Compile a aesara function that computes logp and gradient. Parameters ---------- @@ -990,10 +992,10 @@ def logp_dlogp_function(self, grad_vars=None, tempered=False, **kwargs): if tempered: with self: - free_RVs_logp = tt.sum( - [tt.sum(var.logpt) for var in self.free_RVs + self.potentials] + free_RVs_logp = aet.sum( + [aet.sum(var.logpt) for var in self.free_RVs + self.potentials] ) - observed_RVs_logp = tt.sum([tt.sum(var.logpt) for var in self.observed_RVs]) + observed_RVs_logp = aet.sum([aet.sum(var.logpt) for var in self.observed_RVs]) costs = [free_RVs_logp, observed_RVs_logp] else: @@ -1004,10 +1006,10 @@ def logp_dlogp_function(self, grad_vars=None, tempered=False, **kwargs): @property def logpt(self): - """Theano scalar of log-probability of the model""" + """Aesara scalar of log-probability of the model""" with self: factors = [var.logpt for var in self.basic_RVs] + self.potentials - logp = tt.sum([tt.sum(factor) for factor in factors]) + logp = aet.sum([aet.sum(factor) for factor in factors]) if self.name: logp.name = "__logp_%s" % self.name else: @@ -1016,14 +1018,14 @@ def logpt(self): @property def logp_nojact(self): - """Theano scalar of log-probability of the model but without the jacobian + """Aesara scalar of log-probability of the model but without the jacobian if transformed Random Variable is presented. Note that If there is no transformed variable in the model, logp_nojact will be the same as logpt as there is no need for Jacobian correction. 
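Aside, example only: with this change the model-level config override is passed as ``aesara_config`` instead of ``theano_config``; at the call site that looks roughly like the sketch below ("ignore" is one of the standard compute_test_value modes):

    import pymc3 as pm

    with pm.Model(aesara_config={"compute_test_value": "ignore"}) as model:
        x = pm.Normal("x", 0.0, 1.0)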
""" with self: factors = [var.logp_nojact for var in self.basic_RVs] + self.potentials - logp = tt.sum([tt.sum(factor) for factor in factors]) + logp = aet.sum([aet.sum(factor) for factor in factors]) if self.name: logp.name = "__logp_nojac_%s" % self.name else: @@ -1032,18 +1034,18 @@ def logp_nojact(self): @property def varlogpt(self): - """Theano scalar of log-probability of the unobserved random variables + """Aesara scalar of log-probability of the unobserved random variables (excluding deterministic).""" with self: factors = [var.logpt for var in self.free_RVs] - return tt.sum(factors) + return aet.sum(factors) @property def datalogpt(self): with self: factors = [var.logpt for var in self.observed_RVs] - factors += [tt.sum(factor) for factor in self.potentials] - return tt.sum(factors) + factors += [aet.sum(factor) for factor in self.potentials] + return aet.sum(factors) @property def vars(self): @@ -1237,20 +1239,20 @@ def __getitem__(self, key): raise e def makefn(self, outs, mode=None, *args, **kwargs): - """Compiles a Theano function which returns ``outs`` and takes the variable + """Compiles a Aesara function which returns ``outs`` and takes the variable ancestors of ``outs`` as inputs. Parameters ---------- - outs: Theano variable or iterable of Theano variables - mode: Theano compilation mode + outs: Aesara variable or iterable of Aesara variables + mode: Aesara compilation mode Returns ------- - Compiled Theano function + Compiled Aesara function """ with self: - return theano.function( + return aesara.function( self.vars, outs, allow_input_downcast=True, @@ -1262,43 +1264,43 @@ def makefn(self, outs, mode=None, *args, **kwargs): ) def fn(self, outs, mode=None, *args, **kwargs): - """Compiles a Theano function which returns the values of ``outs`` + """Compiles a Aesara function which returns the values of ``outs`` and takes values of model vars as arguments. Parameters ---------- - outs: Theano variable or iterable of Theano variables - mode: Theano compilation mode + outs: Aesara variable or iterable of Aesara variables + mode: Aesara compilation mode Returns ------- - Compiled Theano function + Compiled Aesara function """ return LoosePointFunc(self.makefn(outs, mode, *args, **kwargs), self) def fastfn(self, outs, mode=None, *args, **kwargs): - """Compiles a Theano function which returns ``outs`` and takes values + """Compiles a Aesara function which returns ``outs`` and takes values of model vars as a dict as an argument. Parameters ---------- - outs: Theano variable or iterable of Theano variables - mode: Theano compilation mode + outs: Aesara variable or iterable of Aesara variables + mode: Aesara compilation mode Returns ------- - Compiled Theano function as point function. + Compiled Aesara function as point function. """ f = self.makefn(outs, mode, *args, **kwargs) return FastPointFunc(f) def profile(self, outs, n=1000, point=None, profile=True, *args, **kwargs): - """Compiles and profiles a Theano function which returns ``outs`` and + """Compiles and profiles a Aesara function which returns ``outs`` and takes values of model vars as a dict as an argument. 
Parameters ---------- - outs: Theano variable or iterable of Theano variables + outs: Aesara variable or iterable of Aesara variables n: int, default 1000 Number of iterations to run point: point @@ -1335,7 +1337,7 @@ def flatten(self, vars=None, order=None, inputvar=None): if None, then all model.free_RVs are used for flattening input order: ArrayOrdering Optional, use predefined ordering - inputvar: tt.vector + inputvar: aet.vector Optional, use predefined inputvar Returns @@ -1347,8 +1349,8 @@ def flatten(self, vars=None, order=None, inputvar=None): if order is None: order = ArrayOrdering(vars) if inputvar is None: - inputvar = tt.vector("flat_view", dtype=theano.config.floatX) - if theano.config.compute_test_value != "off": + inputvar = aet.vector("flat_view", dtype=aesara.config.floatX) + if aesara.config.compute_test_value != "off": if vars: inputvar.tag.test_value = flatten_list(vars).tag.test_value else: @@ -1482,34 +1484,34 @@ def set_data(new_data, model=None): def fn(outs, mode=None, model=None, *args, **kwargs): - """Compiles a Theano function which returns the values of ``outs`` and + """Compiles a Aesara function which returns the values of ``outs`` and takes values of model vars as arguments. Parameters ---------- - outs: Theano variable or iterable of Theano variables - mode: Theano compilation mode + outs: Aesara variable or iterable of Aesara variables + mode: Aesara compilation mode Returns ------- - Compiled Theano function + Compiled Aesara function """ model = modelcontext(model) return model.fn(outs, mode, *args, **kwargs) def fastfn(outs, mode=None, model=None): - """Compiles a Theano function which returns ``outs`` and takes values of model + """Compiles a Aesara function which returns ``outs`` and takes values of model vars as a dict as an argument. Parameters ---------- - outs: Theano variable or iterable of Theano variables - mode: Theano compilation mode + outs: Aesara variable or iterable of Aesara variables + mode: Aesara compilation mode Returns ------- - Compiled Theano function as point function. + Compiled Aesara function as point function. """ model = modelcontext(model) return model.fastfn(outs, mode) @@ -1619,12 +1621,12 @@ def _get_scaling(total_size, shape, ndim): begin_coef = [floatX(t) / shp_begin[i] for i, t in enumerate(begin) if t is not None] end_coef = [floatX(t) / shp_end[i] for i, t in enumerate(end) if t is not None] coefs = begin_coef + end_coef - coef = tt.prod(coefs) + coef = aet.prod(coefs) else: raise TypeError( "Unrecognized `total_size` type, expected int or list of ints, got %r" % total_size ) - return tt.as_tensor(floatX(coef)) + return aet.as_tensor(floatX(coef)) class FreeRV(Factor, PyMC3Variable): @@ -1648,8 +1650,8 @@ def __init__( """ Parameters ---------- - type: theano type (optional) - owner: theano owner (optional) + type: aesara type (optional) + owner: aesara owner (optional) name: str distribution: Distribution model: Model @@ -1692,7 +1694,7 @@ def init_value(self): def pandas_to_array(data): """Convert a pandas object to a NumPy array. - XXX: When `data` is a generator, this will return a Theano tensor! + XXX: When `data` is a generator, this will return a Aesara tensor! 
""" if hasattr(data, "to_numpy") and hasattr(data, "isnull"): @@ -1720,7 +1722,7 @@ def pandas_to_array(data): else: # no masking required ret = data - elif isinstance(data, theano.graph.basic.Variable): + elif isinstance(data, Variable): ret = data elif sps.issparse(data): ret = data @@ -1762,9 +1764,9 @@ def as_tensor(data, name, model, distribution): parent_dist=distribution, ) missing_values = FreeRV(name=name + "_missing", distribution=fakedist, model=model) - constant = tt.as_tensor_variable(data.filled()) + constant = aet.as_tensor_variable(data.filled()) - dataTensor = tt.set_subtensor(constant[data.mask.nonzero()], missing_values) + dataTensor = aet.set_subtensor(constant[data.mask.nonzero()], missing_values) dataTensor.missing_values = missing_values return dataTensor elif sps.issparse(data): @@ -1772,7 +1774,7 @@ def as_tensor(data, name, model, distribution): data.missing_values = None return data else: - data = tt.as_tensor_variable(data, name=name) + data = aet.as_tensor_variable(data, name=name) data.missing_values = None return data @@ -1796,8 +1798,8 @@ def __init__( """ Parameters ---------- - type: theano type (optional) - owner: theano owner (optional) + type: aesara type (optional) + owner: aesara owner (optional) name: str distribution: Distribution model: Model @@ -1806,12 +1808,12 @@ def __init__( """ from pymc3.distributions import TensorType - if hasattr(data, "type") and isinstance(data.type, tt.TensorType): + if hasattr(data, "type") and isinstance(data.type, AesaraTensorType): type = data.type if type is None: data = pandas_to_array(data) - if isinstance(data, theano.graph.basic.Variable): + if isinstance(data, Variable): type = data.type else: type = TensorType(distribution.dtype, data.shape) @@ -1834,8 +1836,8 @@ def __init__( self.distribution = distribution # make this RV a view on the combined missing/nonmissing array - Apply(theano.compile.view_op, inputs=[data], outputs=[self]) - self.tag.test_value = theano.compile.view_op(data).tag.test_value.astype(self.dtype) + Apply(aesara.compile.view_op, inputs=[data], outputs=[self]) + self.tag.test_value = aesara.compile.view_op(data).tag.test_value.astype(self.dtype) self.scaling = _get_scaling(total_size, data.shape, data.ndim) @property @@ -1853,8 +1855,8 @@ def __init__(self, name, data, distribution, total_size=None, model=None): """ Parameters ---------- - type: theano type (optional) - owner: theano owner (optional) + type: aesara type (optional) + owner: aesara owner (optional) name: str distribution: Distribution model: Model @@ -1893,7 +1895,7 @@ def __ne__(self, other): def _walk_up_rv(rv, formatting="plain"): - """Walk up theano graph to get inputs for deterministic RV.""" + """Walk up aesara graph to get inputs for deterministic RV.""" all_rvs = [] parents = list(itertools.chain(*[j.inputs for j in rv.get_parents()])) if parents: @@ -1906,7 +1908,7 @@ def _walk_up_rv(rv, formatting="plain"): return all_rvs -class DeterministicWrapper(tt.TensorVariable): +class DeterministicWrapper(TensorVariable): def _str_repr(self, formatting="plain"): if "latex" in formatting: if formatting == "latex_with_params": @@ -1935,7 +1937,7 @@ def Deterministic(name, var, model=None, dims=None): Parameters ---------- name: str - var: theano variables + var: aesara variables Returns ------- @@ -1956,7 +1958,7 @@ def Potential(name, var, model=None): Parameters ---------- name: str - var: theano variables + var: aesara variables Returns ------- @@ -1974,8 +1976,8 @@ class TransformedRV(PyMC3Variable): Parameters ---------- 
- type: theano type (optional) - owner: theano owner (optional) + type: aesara type (optional) + owner: aesara owner (optional) name: str distribution: Distribution model: Model @@ -2014,7 +2016,7 @@ def __init__( normalRV = transform.backward(self.transformed) - Apply(theano.compile.view_op, inputs=[normalRV], outputs=[self]) + Apply(aesara.compile.view_op, inputs=[normalRV], outputs=[self]) self.tag.test_value = normalRV.tag.test_value self.scaling = _get_scaling(total_size, self.shape, self.ndim) incorporate_methods( diff --git a/pymc3/model_graph.py b/pymc3/model_graph.py index cd3feb30709..433dcfa54f3 100644 --- a/pymc3/model_graph.py +++ b/pymc3/model_graph.py @@ -13,19 +13,19 @@ # limitations under the License. from collections import deque -from typing import Dict, Iterator, Optional, Set +from typing import Dict, Iterator, NewType, Optional, Set -VarName = str - -from theano.compile import SharedVariable -from theano.graph.basic import walk -from theano.tensor import Tensor +from aesara.compile import SharedVariable +from aesara.graph.basic import walk +from aesara.tensor.var import TensorVariable import pymc3 as pm from pymc3.model import ObservedRV from pymc3.util import get_default_varnames, get_var_name +VarName = NewType("VarName", str) + class ModelGraph: def __init__(self, model): @@ -46,17 +46,17 @@ def get_deterministics(self, var): deterministics.append(v) return deterministics - def _get_ancestors(self, var: Tensor, func) -> Set[Tensor]: + def _get_ancestors(self, var: TensorVariable, func) -> Set[TensorVariable]: """Get all ancestors of a function, doing some accounting for deterministics.""" # this contains all of the variables in the model EXCEPT var... vars = set(self.var_list) vars.remove(var) - blockers = set() # type: Set[Tensor] - retval = set() # type: Set[Tensor] + blockers = set() # type: Set[TensorVariable] + retval = set() # type: Set[TensorVariable] - def _expand(node) -> Optional[Iterator[Tensor]]: + def _expand(node) -> Optional[Iterator[TensorVariable]]: if node in blockers: return None elif node in vars: @@ -87,7 +87,7 @@ def _filter_parents(self, var, parents) -> Set[VarName]: raise AssertionError("Do not know what to do with {}".format(get_var_name(p))) return keep - def get_parents(self, var: Tensor) -> Set[VarName]: + def get_parents(self, var: TensorVariable) -> Set[VarName]: """Get the named nodes that are direct inputs to the var""" if hasattr(var, "transformed"): func = var.transformed.logpt @@ -167,7 +167,7 @@ def get_plates(self): if hasattr(v, "observations"): try: # To get shape of _observed_ data container `pm.Data` - # (wrapper for theano.SharedVariable) we evaluate it. + # (wrapper for aesara.SharedVariable) we evaluate it. 
shape = tuple(v.observations.shape.eval()) except AttributeError: shape = v.observations.shape diff --git a/pymc3/ode/ode.py b/pymc3/ode/ode.py index 2eba398404f..5563bf898c1 100644 --- a/pymc3/ode/ode.py +++ b/pymc3/ode/ode.py @@ -14,19 +14,20 @@ import logging +import aesara +import aesara.tensor as aet import numpy as np import scipy -import theano -import theano.tensor as tt -from theano.graph.basic import Apply -from theano.graph.op import Op, get_test_value +from aesara.graph.basic import Apply +from aesara.graph.op import Op, get_test_value +from aesara.tensor.type import TensorType from pymc3.exceptions import DtypeError, ShapeError from pymc3.ode import utils _log = logging.getLogger("pymc3") -floatX = theano.config.floatX +floatX = aesara.config.floatX class DifferentialEquation(Op): @@ -65,12 +66,12 @@ def odefunc(y, t, p): ode_model = DifferentialEquation(func=odefunc, times=times, n_states=1, n_theta=1, t0=0) """ _itypes = [ - tt.TensorType(floatX, (False,)), # y0 as 1D floatX vector - tt.TensorType(floatX, (False,)), # theta as 1D floatX vector + TensorType(floatX, (False,)), # y0 as 1D floatX vector + TensorType(floatX, (False,)), # theta as 1D floatX vector ] _otypes = [ - tt.TensorType(floatX, (False, False)), # model states as floatX of shape (T, S) - tt.TensorType( + TensorType(floatX, (False, False)), # model states as floatX of shape (T, S) + TensorType( floatX, (False, False, False) ), # sensitivities as floatX of shape (T, S, len(y0) + len(theta)) ] @@ -153,8 +154,8 @@ def __call__(self, y0, theta, return_sens=False, **kwargs): ) # convert inputs to tensors (and check their types) - y0 = tt.cast(tt.unbroadcast(tt.as_tensor_variable(y0), 0), floatX) - theta = tt.cast(tt.unbroadcast(tt.as_tensor_variable(theta), 0), floatX) + y0 = aet.cast(aet.unbroadcast(aet.as_tensor_variable(y0), 0), floatX) + theta = aet.cast(aet.unbroadcast(aet.as_tensor_variable(theta), 0), floatX) inputs = [y0, theta] for i, (input_val, itype) in enumerate(zip(inputs, self._itypes)): if not input_val.type == itype: @@ -165,7 +166,7 @@ def __call__(self, y0, theta, return_sens=False, **kwargs): # use default implementation to prepare symbolic outputs (via make_node) states, sens = super().__call__(y0, theta, **kwargs) - if theano.config.compute_test_value != "off": + if aesara.config.compute_test_value != "off": # compute test values from input test values test_states, test_sens = self._simulate( y0=get_test_value(y0), theta=get_test_value(theta) @@ -234,8 +235,8 @@ def grad(self, inputs, output_grads): # for each parameter, multiply sensitivities with the output gradient and sum the result # sens is (n_times, n_states, n_p) # ograds is (n_times, n_states) - grads = [tt.sum(sens[:, :, p] * ograds) for p in range(self.n_p)] + grads = [aet.sum(sens[:, :, p] * ograds) for p in range(self.n_p)] # return separate gradient tensors for y0 and theta inputs - result = tt.stack(grads[: self.n_states]), tt.stack(grads[self.n_states :]) + result = aet.stack(grads[: self.n_states]), aet.stack(grads[self.n_states :]) return result diff --git a/pymc3/ode/utils.py b/pymc3/ode/utils.py index 141c5503f19..474ed901baf 100644 --- a/pymc3/ode/utils.py +++ b/pymc3/ode/utils.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
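The ode.py hunk above keeps the documented `DifferentialEquation` usage intact under the renamed imports; the following is a minimal illustrative sketch of that documented API, where the decay model, time grid, and observations are assumed examples rather than part of the patch.

# Illustrative sketch, not part of the patch: exercises the DifferentialEquation
# API shown in the ode.py hunk above; odefunc, times, and the observations are
# assumed example inputs.
import numpy as np
import pymc3 as pm

from pymc3.ode import DifferentialEquation


def odefunc(y, t, p):
    # one-state linear ODE: dy/dt = p[0] * y
    return [p[0] * y[0]]


times = np.arange(0.5, 5, 0.5)
ode_model = DifferentialEquation(func=odefunc, times=times, n_states=1, n_theta=1, t0=0)

with pm.Model():
    theta = pm.HalfNormal("theta", 1.0)
    y0 = pm.HalfNormal("y0", 1.0)
    states = ode_model(y0=[y0], theta=[theta])  # symbolic (T, 1) trajectory
    pm.Normal("obs", mu=states[:, 0], sigma=0.1, observed=np.ones(len(times)))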
+import aesara +import aesara.tensor as aet import numpy as np -import theano -import theano.tensor as tt def make_sens_ic(n_states, n_theta, floatX): @@ -83,20 +83,20 @@ def augment_system(ode_func, n_states, n_theta): """ # Present state of the system - t_y = tt.vector("y", dtype="float64") + t_y = aet.vector("y", dtype="float64") t_y.tag.test_value = np.ones((n_states,), dtype="float64") # Parameter(s). Should be vector to allow for generaliztion to multiparameter # systems of ODEs. Is m dimensional because it includes all initial conditions as well as ode parameters - t_p = tt.vector("p", dtype="float64") + t_p = aet.vector("p", dtype="float64") t_p.tag.test_value = np.ones((n_states + n_theta,), dtype="float64") # Time. Allow for non-automonous systems of ODEs to be analyzed - t_t = tt.scalar("t", dtype="float64") + t_t = aet.scalar("t", dtype="float64") t_t.tag.test_value = 2.459 # Present state of the gradients: # Will always be 0 unless the parameter is the inital condition # Entry i,j is partial of y[i] wrt to p[j] - dydp_vec = tt.vector("dydp", dtype="float64") + dydp_vec = aet.vector("dydp", dtype="float64") dydp_vec.tag.test_value = make_sens_ic(n_states, n_theta, "float64") dydp = dydp_vec.reshape((n_states, n_states + n_theta)) @@ -106,19 +106,19 @@ def augment_system(ode_func, n_states, n_theta): # Stack the results of the ode_func into a single tensor variable if not isinstance(yhat, (list, tuple)): yhat = (yhat,) - t_yhat = tt.stack(yhat, axis=0) + t_yhat = aet.stack(yhat, axis=0) # Now compute gradients - J = tt.jacobian(t_yhat, t_y) + J = aet.jacobian(t_yhat, t_y) - Jdfdy = tt.dot(J, dydp) + Jdfdy = aet.dot(J, dydp) - grad_f = tt.jacobian(t_yhat, t_p) + grad_f = aet.jacobian(t_yhat, t_p) # This is the time derivative of dydp ddt_dydp = (Jdfdy + grad_f).flatten() - system = theano.function( + system = aesara.function( inputs=[t_y, t_t, t_p, dydp_vec], outputs=[t_yhat, ddt_dydp], on_unused_input="ignore" ) diff --git a/pymc3/parallel_sampling.py b/pymc3/parallel_sampling.py index bdfe1a274b7..4cd39921b24 100644 --- a/pymc3/parallel_sampling.py +++ b/pymc3/parallel_sampling.py @@ -27,7 +27,7 @@ from fastprogress.fastprogress import progress_bar -from pymc3 import theanof +from pymc3 import aesaraf from pymc3.exceptions import SamplingError logger = logging.getLogger("pymc3") @@ -99,7 +99,7 @@ def __init__( self._step_method_is_pickled = step_method_is_pickled self._shared_point = shared_point self._seed = seed - self._tt_seed = seed + 1 + self._aet_seed = seed + 1 self._draws = draws self._tune = tune self._pickle_backend = pickle_backend @@ -170,7 +170,7 @@ def _recv_msg(self): def _start_loop(self): np.random.seed(self._seed) - theanof.set_tt_rng(self._tt_seed) + aesaraf.set_aet_rng(self._aet_seed) draw = 0 tuning = True diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 481d20ff034..98a2e8f3e86 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -25,10 +25,10 @@ from copy import copy, deepcopy from typing import Any, Dict, Iterable, List, Optional, Set, Union, cast +import aesara.gradient as tg import arviz import numpy as np import packaging -import theano.gradient as tg import xarray from arviz import InferenceData diff --git a/pymc3/sampling_jax.py b/pymc3/sampling_jax.py index 522bca7b12e..4f10414caf2 100644 --- a/pymc3/sampling_jax.py +++ b/pymc3/sampling_jax.py @@ -9,13 +9,13 @@ xla_flags = re.sub(r"xla_force_host_platform_device_count=.+\s", "", xla_flags).split() os.environ["XLA_FLAGS"] = " 
".join(["--xla_force_host_platform_device_count={}".format(100)]) +import aesara.graph.fg import arviz as az import jax import numpy as np import pandas as pd -import theano.graph.fg -from theano.link.jax.jax_dispatch import jax_funcify +from aesara.link.jax.jax_dispatch import jax_funcify import pymc3 as pm @@ -24,9 +24,9 @@ warnings.warn("This module is experimental.") # Disable C compilation by default -# theano.config.cxx = "" +# aesara.config.cxx = "" # This will make the JAX Linker the default -# theano.config.mode = "JAX" +# aesara.config.mode = "JAX" def sample_tfp_nuts( @@ -47,7 +47,7 @@ def sample_tfp_nuts( seed = jax.random.PRNGKey(random_seed) - fgraph = theano.graph.fg.FunctionGraph(model.free_RVs, [model.logpt]) + fgraph = aesara.graph.fg.FunctionGraph(model.free_RVs, [model.logpt]) fns = jax_funcify(fgraph) logp_fn_jax = fns[0] @@ -133,7 +133,7 @@ def sample_numpyro_nuts( seed = jax.random.PRNGKey(random_seed) - fgraph = theano.graph.fg.FunctionGraph(model.free_RVs, [model.logpt]) + fgraph = aesara.graph.fg.FunctionGraph(model.free_RVs, [model.logpt]) fns = jax_funcify(fgraph) logp_fn_jax = fns[0] @@ -199,7 +199,7 @@ def _transform_samples(samples, model, keep_untransformed=False): ops_to_compute = [x for x in model.unobserved_RVs if x.name in names_to_compute] # Create function graph for these: - fgraph = theano.graph.fg.FunctionGraph(model.free_RVs, ops_to_compute) + fgraph = aesara.graph.fg.FunctionGraph(model.free_RVs, ops_to_compute) # Jaxify, which returns a list of functions, one for each op jax_fns = jax_funcify(fgraph) diff --git a/pymc3/smc/smc.py b/pymc3/smc/smc.py index 2e7e369ad32..25f278d1886 100644 --- a/pymc3/smc/smc.py +++ b/pymc3/smc/smc.py @@ -14,22 +14,22 @@ from collections import OrderedDict +import aesara.tensor as aet import numpy as np -import theano.tensor as tt +from aesara import function as aesara_function from scipy.special import logsumexp from scipy.stats import multivariate_normal -from theano import function as theano_function -from pymc3.backends.ndarray import NDArray -from pymc3.model import Point, modelcontext -from pymc3.sampling import sample_prior_predictive -from pymc3.theanof import ( +from pymc3.aesaraf import ( floatX, inputvars, join_nonshared_inputs, make_shared_replacements, ) +from pymc3.backends.ndarray import NDArray +from pymc3.model import Point, modelcontext +from pymc3.sampling import sample_prior_predictive class SMC: @@ -111,8 +111,8 @@ def setup_kernel(self): if self.kernel == "abc": factors = [var.logpt for var in self.model.free_RVs] - factors += [tt.sum(factor) for factor in self.model.potentials] - self.prior_logp_func = logp_forw([tt.sum(factors)], self.variables, shared) + factors += [aet.sum(factor) for factor in self.model.potentials] + self.prior_logp_func = logp_forw([aet.sum(factors)], self.variables, shared) simulator = self.model.observed_RVs[0] distance = simulator.distribution.distance sum_stat = simulator.distribution.sum_stat @@ -271,7 +271,7 @@ def posterior_to_trace(self): def logp_forw(out_vars, vars, shared): - """Compile Theano function of the model and the input and output variables. + """Compile Aesara function of the model and the input and output variables. 
Parameters ---------- @@ -280,10 +280,10 @@ def logp_forw(out_vars, vars, shared): vars: List containing :class:`pymc3.Distribution` for the input variables shared: List - containing :class:`theano.tensor.Tensor` for depended shared data + containing :class:`aesara.tensor.Tensor` for depended shared data """ out_list, inarray0 = join_nonshared_inputs(out_vars, vars, shared) - f = theano_function([inarray0], out_list[0]) + f = aesara_function([inarray0], out_list[0]) f.trust_input = True return f diff --git a/pymc3/step_methods/arraystep.py b/pymc3/step_methods/arraystep.py index c3e1cf6f8bb..7992153f710 100644 --- a/pymc3/step_methods/arraystep.py +++ b/pymc3/step_methods/arraystep.py @@ -19,10 +19,10 @@ from numpy.random import uniform +from pymc3.aesaraf import inputvars from pymc3.blocking import ArrayOrdering, DictToArrayBijection from pymc3.model import PyMC3Variable, modelcontext from pymc3.step_methods.compound import CompoundStep -from pymc3.theanof import inputvars from pymc3.util import get_var_name __all__ = ["ArrayStep", "ArrayStepShared", "metrop_select", "Competence"] @@ -137,7 +137,7 @@ class ArrayStep(BlockedStep): ---------- vars: list List of variables for sampler. - fs: list of logp theano functions + fs: list of logp aesara functions allvars: Boolean (default False) blocked: Boolean (default True) """ @@ -177,7 +177,7 @@ def __init__(self, vars, shared, blocked=True): Parameters ---------- vars: list of sampling variables - shared: dict of theano variable -> shared variable + shared: dict of aesara variable -> shared variable blocked: Boolean (default True) """ self.vars = vars @@ -212,7 +212,7 @@ def __init__(self, vars, shared, blocked=True): Parameters ---------- vars: list of sampling variables - shared: dict of theano variable -> shared variable + shared: dict of aesara variable -> shared variable blocked: Boolean (default True) """ self.population = None @@ -244,14 +244,14 @@ def link_population(self, population, chain_index): class GradientSharedStep(BlockedStep): def __init__( - self, vars, model=None, blocked=True, dtype=None, logp_dlogp_func=None, **theano_kwargs + self, vars, model=None, blocked=True, dtype=None, logp_dlogp_func=None, **aesara_kwargs ): model = modelcontext(model) self.vars = vars self.blocked = blocked if logp_dlogp_func is None: - func = model.logp_dlogp_function(vars, dtype=dtype, **theano_kwargs) + func = model.logp_dlogp_function(vars, dtype=dtype, **aesara_kwargs) else: func = logp_dlogp_func @@ -263,8 +263,8 @@ def __init__( except ValueError: if logp_dlogp_func is not None: raise - theano_kwargs.update(mode="FAST_COMPILE") - func = model.logp_dlogp_function(vars, dtype=dtype, **theano_kwargs) + aesara_kwargs.update(mode="FAST_COMPILE") + func = model.logp_dlogp_function(vars, dtype=dtype, **aesara_kwargs) self._logp_dlogp_func = func diff --git a/pymc3/step_methods/elliptical_slice.py b/pymc3/step_methods/elliptical_slice.py index f1c1bb40d33..0a8d432644f 100644 --- a/pymc3/step_methods/elliptical_slice.py +++ b/pymc3/step_methods/elliptical_slice.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
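The SMC and array-step hunks above both build a joint log-probability by summing `logpt` terms and potentials before compiling; a hedged sketch of that pattern on an assumed toy model (the model and the printout are illustrative only).

# Illustrative sketch, not part of the patch: the aet.sum-over-factors pattern
# from the SMC hunk above, on an assumed toy model.
import aesara.tensor as aet
import pymc3 as pm

from pymc3.aesaraf import inputvars

with pm.Model() as model:
    x = pm.Normal("x", 0.0, 1.0)
    pm.Potential("penalty", -aet.sum(x ** 2))

factors = [var.logpt for var in model.free_RVs]
factors += [aet.sum(factor) for factor in model.potentials]
joint_logp = aet.sum(factors)   # symbolic joint log-probability, as in logp_forw
print(inputvars([joint_logp]))  # the free variables feeding that graph, here [x]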
+import aesara.tensor as aet import numpy as np import numpy.random as nr -import theano.tensor as tt +from pymc3.aesaraf import inputvars from pymc3.distributions import draw_values from pymc3.model import modelcontext from pymc3.step_methods.arraystep import ArrayStep, Competence -from pymc3.theanof import inputvars __all__ = ["EllipticalSlice"] @@ -44,7 +44,7 @@ def get_chol(cov, chol): raise ValueError("Must pass exactly one of cov or chol") if cov is not None: - chol = tt.slinalg.cholesky(cov) + chol = aet.slinalg.cholesky(cov) return chol @@ -86,7 +86,7 @@ class EllipticalSlice(ArrayStep): def __init__(self, vars=None, prior_cov=None, prior_chol=None, model=None, **kwargs): self.model = modelcontext(model) chol = get_chol(prior_cov, prior_chol) - self.prior_chol = tt.as_tensor_variable(chol) + self.prior_chol = aet.as_tensor_variable(chol) if vars is None: vars = self.model.cont_vars diff --git a/pymc3/step_methods/gibbs.py b/pymc3/step_methods/gibbs.py index 2646a8a9e82..f109d49b263 100644 --- a/pymc3/step_methods/gibbs.py +++ b/pymc3/step_methods/gibbs.py @@ -19,6 +19,8 @@ """ from warnings import warn +from aesara.graph.basic import graph_inputs +from aesara.tensor import add from numpy import ( arange, array, @@ -31,8 +33,6 @@ searchsorted, ) from numpy.random import uniform -from theano.graph.basic import graph_inputs -from theano.tensor import add from pymc3.distributions.discrete import Categorical from pymc3.model import modelcontext diff --git a/pymc3/step_methods/hmc/base_hmc.py b/pymc3/step_methods/hmc/base_hmc.py index 323503fe491..7228b8a9c64 100644 --- a/pymc3/step_methods/hmc/base_hmc.py +++ b/pymc3/step_methods/hmc/base_hmc.py @@ -19,13 +19,13 @@ import numpy as np +from pymc3.aesaraf import floatX, inputvars from pymc3.backends.report import SamplerWarning, WarningType from pymc3.exceptions import SamplingError from pymc3.model import Point, modelcontext from pymc3.step_methods import arraystep, step_sizes from pymc3.step_methods.hmc import integration from pymc3.step_methods.hmc.quadpotential import QuadPotentialDiagAdapt, quad_potential -from pymc3.theanof import floatX, inputvars from pymc3.tuning import guess_scaling logger = logging.getLogger("pymc3") @@ -57,13 +57,13 @@ def __init__( t0=10, adapt_step_size=True, step_rand=None, - **theano_kwargs + **aesara_kwargs ): """Set up Hamiltonian samplers with common structures. Parameters ---------- - vars: list of theano variables + vars: list of aesara variables scaling: array_like, ndim = {1,2} Scaling for momentum distribution. 1d arrays interpreted matrix diagonal. @@ -77,7 +77,7 @@ def __init__( potential: Potential, optional An object that represents the Hamiltonian with methods `velocity`, `energy`, and `random` methods. 
- **theano_kwargs: passed to theano functions + **aesara_kwargs: passed to aesara functions """ self._model = modelcontext(model) @@ -85,7 +85,7 @@ def __init__( vars = self._model.cont_vars vars = inputvars(vars) - super().__init__(vars, blocked=blocked, model=model, dtype=dtype, **theano_kwargs) + super().__init__(vars, blocked=blocked, model=model, dtype=dtype, **aesara_kwargs) self.adapt_step_size = adapt_step_size self.Emax = Emax diff --git a/pymc3/step_methods/hmc/hmc.py b/pymc3/step_methods/hmc/hmc.py index 613160c27e3..522a40d94fb 100644 --- a/pymc3/step_methods/hmc/hmc.py +++ b/pymc3/step_methods/hmc/hmc.py @@ -59,7 +59,7 @@ def __init__(self, vars=None, path_length=2.0, max_steps=1024, **kwargs): Parameters ---------- - vars: list of theano variables + vars: list of aesara variables path_length: float, default=2 total length to travel step_rand: function float -> float, default=unif diff --git a/pymc3/step_methods/hmc/nuts.py b/pymc3/step_methods/hmc/nuts.py index 4a00ec98739..8d7b9a69ad8 100644 --- a/pymc3/step_methods/hmc/nuts.py +++ b/pymc3/step_methods/hmc/nuts.py @@ -16,13 +16,13 @@ import numpy as np +from pymc3.aesaraf import floatX from pymc3.backends.report import SamplerWarning, WarningType from pymc3.distributions import BART from pymc3.math import logbern, logdiffexp_numpy from pymc3.step_methods.arraystep import Competence from pymc3.step_methods.hmc.base_hmc import BaseHMC, DivergenceInfo, HMCStepData from pymc3.step_methods.hmc.integration import IntegrationError -from pymc3.theanof import floatX from pymc3.vartypes import continuous_types __all__ = ["NUTS"] @@ -114,7 +114,7 @@ def __init__(self, vars=None, max_treedepth=10, early_max_treedepth=8, **kwargs) Parameters ---------- - vars: list of Theano variables, default all continuous vars + vars: list of Aesara variables, default all continuous vars Emax: float, default 1000 Maximum energy change allowed during leapfrog steps. Larger deviations will abort the integration. 
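The HMC/NUTS hunks above only rename keyword plumbing and docstrings; as a usage-level sketch, the documented NUTS arguments read the same after the rename (the toy model and sampler settings below are assumed, not part of the patch).

# Illustrative sketch, not part of the patch: the NUTS keywords documented in the
# hunk above, applied to an assumed toy model.
import pymc3 as pm

with pm.Model():
    x = pm.Normal("x", 0.0, 1.0)
    # `vars` lists the (Aesara) variables the sampler updates; `max_treedepth`
    # is the NUTS parameter described above.
    step = pm.NUTS(vars=[x], max_treedepth=10)
    trace = pm.sample(draws=500, tune=500, step=step, cores=1)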
diff --git a/pymc3/step_methods/hmc/quadpotential.py b/pymc3/step_methods/hmc/quadpotential.py index 4c2e6acc7a3..f77f1f99883 100644 --- a/pymc3/step_methods/hmc/quadpotential.py +++ b/pymc3/step_methods/hmc/quadpotential.py @@ -14,14 +14,14 @@ import warnings +import aesara import numpy as np import scipy.linalg -import theano from numpy.random import normal from scipy.sparse import issparse -from pymc3.theanof import floatX +from pymc3.aesaraf import floatX __all__ = [ "quad_potential", @@ -170,7 +170,7 @@ def __init__( ) if dtype is None: - dtype = theano.config.floatX + dtype = aesara.config.floatX if initial_diag is None: initial_diag = np.ones(n, dtype=dtype) @@ -189,7 +189,7 @@ def __init__( def reset(self): self._var = np.array(self._initial_diag, dtype=self.dtype, copy=True) - self._var_theano = theano.shared(self._var) + self._var_aesara = aesara.shared(self._var) self._stds = np.sqrt(self._initial_diag) self._inv_stds = floatX(1.0) / self._stds self._foreground_var = _WeightedVariance( @@ -222,7 +222,7 @@ def _update_from_weightvar(self, weightvar): weightvar.current_variance(out=self._var) np.sqrt(self._var, out=self._stds) np.divide(1, self._stds, out=self._inv_stds) - self._var_theano.set_value(self._var) + self._var_aesara.set_value(self._var) def update(self, sample, grad, tune): """Inform the potential about a new sample during tuning.""" @@ -304,7 +304,7 @@ def _update(self, var): self._var[:] = var np.sqrt(self._var, out=self._stds) np.divide(1, self._stds, out=self._inv_stds) - self._var_theano.set_value(self._var) + self._var_aesara.set_value(self._var) def update(self, sample, grad, tune): """Inform the potential about a new sample during tuning.""" @@ -384,7 +384,7 @@ def __init__(self, v, dtype=None): Diagonal of covariance matrix for the potential vector """ if dtype is None: - dtype = theano.config.floatX + dtype = aesara.config.floatX self.dtype = dtype v = v.astype(self.dtype) s = v ** 0.5 @@ -428,7 +428,7 @@ def __init__(self, A, dtype=None): Inverse of covariance matrix for the potential vector """ if dtype is None: - dtype = theano.config.floatX + dtype = aesara.config.floatX self.dtype = dtype self.L = floatX(scipy.linalg.cholesky(A, lower=True)) @@ -468,7 +468,7 @@ def __init__(self, cov, dtype=None): scaling matrix for the potential vector """ if dtype is None: - dtype = theano.config.floatX + dtype = aesara.config.floatX self.dtype = dtype self._cov = np.array(cov, dtype=self.dtype, copy=True) self._chol = scipy.linalg.cholesky(self._cov, lower=True) @@ -525,7 +525,7 @@ def __init__( ) if dtype is None: - dtype = theano.config.floatX + dtype = aesara.config.floatX if initial_cov is None: initial_cov = np.eye(n, dtype=dtype) @@ -658,7 +658,7 @@ def current_mean(self): if chol_available: __all__ += ["QuadPotentialSparse"] - import theano.sparse + import aesara.sparse class QuadPotentialSparse(QuadPotential): def __init__(self, A): @@ -676,8 +676,8 @@ def __init__(self, A): def velocity(self, x): """Compute the current velocity at a position in parameter space.""" - A = theano.sparse.as_sparse(self.A) - return theano.sparse.dot(A, x) + A = aesara.sparse.as_sparse(self.A) + return aesara.sparse.dot(A, x) def random(self): """Draw random value from QuadPotential.""" diff --git a/pymc3/step_methods/metropolis.py b/pymc3/step_methods/metropolis.py index 76804db2f8f..0878b2b7728 100644 --- a/pymc3/step_methods/metropolis.py +++ b/pymc3/step_methods/metropolis.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # 
limitations under the License. +import aesara import numpy as np import numpy.random as nr import scipy.linalg -import theano import pymc3 as pm +from pymc3.aesaraf import floatX from pymc3.distributions import draw_values from pymc3.step_methods.arraystep import ( ArrayStep, @@ -27,7 +28,6 @@ PopulationArrayStepShared, metrop_select, ) -from pymc3.theanof import floatX __all__ = [ "Metropolis", @@ -142,7 +142,7 @@ def __init__( model: PyMC Model Optional model for sampling step. Defaults to None (taken from context). mode: string or `Mode` instance. - compilation mode passed to Theano functions + compilation mode passed to Aesara functions """ model = pm.modelcontext(model) @@ -571,7 +571,7 @@ class DEMetropolis(PopulationArrayStepShared): model: PyMC Model Optional model for sampling step. Defaults to None (taken from context). mode: string or `Mode` instance. - compilation mode passed to Theano functions + compilation mode passed to Aesara functions References ---------- @@ -713,7 +713,7 @@ class DEMetropolisZ(ArrayStepShared): model: PyMC Model Optional model for sampling step. Defaults to None (taken from context). mode: string or `Mode` instance. - compilation mode passed to Theano functions + compilation mode passed to Aesara functions References ---------- @@ -887,6 +887,6 @@ def delta_logp(logp, vars, shared): logp1 = pm.CallableTensor(logp0)(inarray1) - f = theano.function([inarray1, inarray0], logp1 - logp0) + f = aesara.function([inarray1, inarray0], logp1 - logp0) f.trust_input = True return f diff --git a/pymc3/step_methods/mlda.py b/pymc3/step_methods/mlda.py index 559f894f300..8edf54209b0 100644 --- a/pymc3/step_methods/mlda.py +++ b/pymc3/step_methods/mlda.py @@ -17,10 +17,11 @@ from typing import List, Optional, Type, Union +import aesara import arviz as az import numpy as np -import theano -import theano.tensor as tt + +from aesara.tensor.sharedvar import TensorSharedVariable import pymc3 as pm @@ -254,7 +255,7 @@ class MLDA(ArrayStepShared): (taken from context). This model should be the finest of all multilevel models. mode : string or `Mode` instance. - Compilation mode passed to Theano functions + Compilation mode passed to Aesara functions subsampling_rates : integer or list of integers One interger for all levels or a list with one number for each level (excluding the finest level). @@ -275,7 +276,7 @@ class MLDA(ArrayStepShared): the PyMC3 model (also demonstrated in the example notebook): - Include a `pm.Data()` variable with the name `Q` in the model description of all levels. - - Use a Theano Op to calculate the forward model (or the + - Use a Aesara Op to calculate the forward model (or the combination of a forward model and a likelihood). This Op should have a `perform()` method which (in addition to all the other calculations), calculates the quantity of interest @@ -300,7 +301,7 @@ class MLDA(ArrayStepShared): extra variables mu_B and Sigma_B, which will capture the bias between different levels. All these variables should be instantiated using the pm.Data method. - - Use a Theano Op to define the forward model (and + - Use a Aesara Op to define the forward model (and optionally the likelihood) for all levels. The Op needs to store the result of each forward model calculation to the variable model_output of the PyMC3 model, @@ -419,7 +420,7 @@ def __init__( "for storing the fine Q." "Use pm.Data() to define it." 
) - if not isinstance(self.model.Q, tt.sharedvar.TensorSharedVariable): + if not isinstance(self.model.Q, TensorSharedVariable): raise TypeError( "The variable 'Q' in the model definition is not of type " "'TensorSharedVariable'. Use pm.Data() to define the" @@ -454,8 +455,8 @@ def __init__( "Use pm.Data() to define it." ) if not ( - isinstance(self.model_below.mu_B, tt.sharedvar.TensorSharedVariable) - and isinstance(self.model_below.Sigma_B, tt.sharedvar.TensorSharedVariable) + isinstance(self.model_below.mu_B, TensorSharedVariable) + and isinstance(self.model_below.Sigma_B, TensorSharedVariable) ): raise TypeError( "At least one of the variables 'mu_B' and 'Sigma_B' " @@ -549,12 +550,12 @@ def __init__( self.accepted = 0 - # Construct theano function for current-level model likelihood + # Construct aesara function for current-level model likelihood # (for use in acceptance) shared = pm.make_shared_replacements(vars, model) self.delta_logp = delta_logp_inverse(model.logpt, vars, shared) - # Construct theano function for below-level model likelihood + # Construct aesara function for below-level model likelihood # (for use in acceptance) model_below = pm.modelcontext(self.model_below) vars_below = [var for var in model_below.vars if var.name in self.var_names] @@ -964,7 +965,7 @@ def delta_logp_inverse(logp, vars, shared): logp1 = pm.CallableTensor(logp0)(inarray1) - f = theano.function([inarray1, inarray0], -logp0 + logp1) + f = aesara.function([inarray1, inarray0], -logp0 + logp1) f.trust_input = True return f diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index c3bac3ade93..9649a9cb8fb 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -16,13 +16,13 @@ import numpy as np -from theano import function as theano_function +from aesara import function as aesara_function +from pymc3.aesaraf import inputvars, join_nonshared_inputs, make_shared_replacements from pymc3.distributions import BART from pymc3.distributions.tree import Tree from pymc3.model import modelcontext from pymc3.step_methods.arraystep import ArrayStepShared, Competence -from pymc3.theanof import inputvars, join_nonshared_inputs, make_shared_replacements _log = logging.getLogger("pymc3") @@ -274,7 +274,7 @@ def set_particle_to_step(self, t): def logp(out_vars, vars, shared): - """Compile Theano function of the model and the input and output variables. + """Compile Aesara function of the model and the input and output variables. 
Parameters ---------- @@ -283,9 +283,9 @@ def logp(out_vars, vars, shared): vars: List containing :class:`pymc3.Distribution` for the input variables shared: List - containing :class:`theano.tensor.Tensor` for depended shared data + containing :class:`aesara.tensor.Tensor` for depended shared data """ out_list, inarray0 = join_nonshared_inputs(out_vars, vars, shared) - f = theano_function([inarray0], out_list[0]) + f = aesara_function([inarray0], out_list[0]) f.trust_input = True return f diff --git a/pymc3/step_methods/sgmcmc.py b/pymc3/step_methods/sgmcmc.py index 1620f21b0e8..80246db758e 100644 --- a/pymc3/step_methods/sgmcmc.py +++ b/pymc3/step_methods/sgmcmc.py @@ -16,12 +16,12 @@ from collections import OrderedDict -import theano -import theano.tensor as tt +import aesara +import aesara.tensor as aet +from pymc3.aesaraf import aet_rng, make_shared_replacements from pymc3.model import inputvars, modelcontext from pymc3.step_methods.arraystep import ArrayStepShared -from pymc3.theanof import make_shared_replacements, tt_rng __all__ = [] @@ -45,8 +45,8 @@ def _check_minibatches(minibatch_tensors, minibatches): def prior_dlogp(vars, model, flat_view): """Returns the gradient of the prior on the parameters as a vector of size D x 1""" - terms = tt.concatenate([theano.grad(var.logpt, var).flatten() for var in vars], axis=0) - dlogp = theano.clone(terms, flat_view.replacements, strict=False) + terms = aet.concatenate([aesara.grad(var.logpt, var).flatten() for var in vars], axis=0) + dlogp = aesara.clone_replace(terms, flat_view.replacements, strict=False) return dlogp @@ -63,12 +63,14 @@ def elemwise_dlogL(vars, model, flat_view): # calculate fisher information terms = [] for var in vars: - output, _ = theano.scan( - lambda i, logX=logL, v=var: theano.grad(logX[i], v).flatten(), - sequences=[tt.arange(logL.shape[0])], + output, _ = aesara.scan( + lambda i, logX=logL, v=var: aesara.grad(logX[i], v).flatten(), + sequences=[aet.arange(logL.shape[0])], ) terms.append(output) - dlogL = theano.clone(tt.concatenate(terms, axis=1), flat_view.replacements, strict=False) + dlogL = aesara.clone_replace( + aet.concatenate(terms, axis=1), flat_view.replacements, strict=False + ) return dlogL @@ -106,7 +108,7 @@ class BaseStochasticGradient(ArrayStepShared): Defining a BaseStochasticGradient needs custom implementation of the following methods: - :code: `.mk_training_fn()` - Returns a theano function which is called for each sampling step + Returns a aesara function which is called for each sampling step - :code: `._initialize_values()` Returns None it creates class variables which are required for the training fn """ @@ -145,9 +147,9 @@ def __init__( # set random stream self.random = None if random_seed is None: - self.random = tt_rng() + self.random = aet_rng() else: - self.random = tt_rng(random_seed) + self.random = aet_rng(random_seed) self.step_size = step_size @@ -169,7 +171,7 @@ def __init__( # Replace input shared variables with tensors def is_shared(t): - return isinstance(t, theano.compile.sharedvalue.SharedVariable) + return isinstance(t, aesara.compile.sharedvalue.SharedVariable) tensors = [(t.type() if is_shared(t) else t) for t in minibatch_tensors] updates = OrderedDict( diff --git a/pymc3/step_methods/slicer.py b/pymc3/step_methods/slicer.py index ef68dec9939..b0320a9effd 100644 --- a/pymc3/step_methods/slicer.py +++ b/pymc3/step_methods/slicer.py @@ -17,9 +17,9 @@ import numpy as np import numpy.random as nr +from pymc3.aesaraf import inputvars from pymc3.model import modelcontext from 
pymc3.step_methods.arraystep import ArrayStep, Competence -from pymc3.theanof import inputvars from pymc3.vartypes import continuous_types __all__ = ["Slice"] diff --git a/pymc3/tests/backend_fixtures.py b/pymc3/tests/backend_fixtures.py index 6fd0b1318cd..9ef8d03a7d5 100644 --- a/pymc3/tests/backend_fixtures.py +++ b/pymc3/tests/backend_fixtures.py @@ -16,10 +16,10 @@ import os import shutil +import aesara import numpy as np import numpy.testing as npt import pytest -import theano from pymc3.backends import base from pymc3.tests import models @@ -250,7 +250,7 @@ def record_point(self, val): else: self.strace.record(point=point) - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_standard_close(self): for idx in range(self.draws): self.record_point(idx) @@ -293,14 +293,14 @@ class SelectionTestCase(ModelBackendSampledTestCase): - shape """ - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_get_values_default(self): for varname in self.test_point.keys(): expected = np.concatenate([self.expected[chain][varname] for chain in [0, 1]]) result = self.mtrace.get_values(varname) npt.assert_equal(result, expected) - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_get_values_nocombine_burn_keyword(self): burn = 2 for varname in self.test_point.keys(): @@ -311,7 +311,7 @@ def test_get_values_nocombine_burn_keyword(self): def test_len(self): assert len(self.mtrace) == self.draws - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_dtypes(self): for varname in self.test_point.keys(): assert ( @@ -515,7 +515,7 @@ def test_chain_length(self): assert self.mtrace0.nchains == self.mtrace1.nchains assert len(self.mtrace0) == len(self.mtrace1) - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_dtype(self): for varname in self.test_point.keys(): assert ( diff --git a/pymc3/tests/conftest.py b/pymc3/tests/conftest.py index e9d38d163ff..1be0184c0ec 100644 --- a/pymc3/tests/conftest.py +++ b/pymc3/tests/conftest.py @@ -12,31 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. 
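The backend fixtures above repeatedly guard precision-sensitive tests with the same conditional `xfail`; a small sketch of that pattern follows, with an assumed stand-in test body.

# Illustrative sketch, not part of the patch: the float32-conditional xfail
# pattern used throughout the backend fixtures above; the test body is assumed.
import aesara
import numpy as np
import pytest


@pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32")
def test_precision_sensitive_sum():
    x = np.full(10000, 0.1, dtype=aesara.config.floatX)
    np.testing.assert_allclose(x.sum(), 1000.0, rtol=1e-10)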
+import aesara import numpy as np import pytest -import theano import pymc3 as pm @pytest.fixture(scope="function", autouse=True) -def theano_config(): - config = theano.config.change_flags(compute_test_value="raise") +def aesara_config(): + config = aesara.config.change_flags(compute_test_value="raise") with config: yield @pytest.fixture(scope="function", autouse=True) def exception_verbosity(): - config = theano.config.change_flags(exception_verbosity="high") + config = aesara.config.change_flags(exception_verbosity="high") with config: yield @pytest.fixture(scope="function", autouse=False) def strict_float32(): - if theano.config.floatX == "float32": - config = theano.config.change_flags(warn_float64="raise") + if aesara.config.floatX == "float32": + config = aesara.config.change_flags(warn_float64="raise") with config: yield else: @@ -47,4 +47,4 @@ def strict_float32(): def seeded_test(): # TODO: use this instead of SeededTest np.random.seed(42) - pm.set_tt_rng(42) + pm.set_aet_rng(42) diff --git a/pymc3/tests/helpers.py b/pymc3/tests/helpers.py index 6e56fad9d02..9806fb0b8e0 100644 --- a/pymc3/tests/helpers.py +++ b/pymc3/tests/helpers.py @@ -16,13 +16,13 @@ from logging.handlers import BufferingHandler +import aesara import numpy.random as nr -import theano -from theano.gradient import verify_grad as tt_verify_grad -from theano.sandbox.rng_mrg import MRG_RandomStream as RandomStream +from aesara.gradient import verify_grad as aet_verify_grad +from aesara.sandbox.rng_mrg import MRG_RandomStream as RandomStream -from pymc3.theanof import set_tt_rng, tt_rng +from pymc3.aesaraf import aet_rng, set_aet_rng class SeededTest: @@ -34,11 +34,11 @@ def setup_class(cls): def setup_method(self): nr.seed(self.random_seed) - self.old_tt_rng = tt_rng() - set_tt_rng(RandomStream(self.random_seed)) + self.old_aet_rng = aet_rng() + set_aet_rng(RandomStream(self.random_seed)) def teardown_method(self): - set_tt_rng(self.old_tt_rng) + set_aet_rng(self.old_aet_rng) class LoggingHandler(BufferingHandler): @@ -104,7 +104,7 @@ def match_value(self, k, dv, v): def select_by_precision(float64, float32): """Helper function to choose reasonable decimal cutoffs for different floatX modes.""" - decimal = float64 if theano.config.floatX == "float64" else float32 + decimal = float64 if aesara.config.floatX == "float64" else float32 return decimal @@ -116,4 +116,4 @@ def not_raises(): def verify_grad(op, pt, n_tests=2, rng=None, *args, **kwargs): if rng is None: rng = nr.RandomState(411342) - tt_verify_grad(op, pt, n_tests, rng, *args, **kwargs) + aet_verify_grad(op, pt, n_tests, rng, *args, **kwargs) diff --git a/pymc3/tests/models.py b/pymc3/tests/models.py index 5f607348591..49f9cd1e7a3 100644 --- a/pymc3/tests/models.py +++ b/pymc3/tests/models.py @@ -14,23 +14,23 @@ from itertools import product +import aesara +import aesara.tensor as aet import numpy as np -import theano -import theano.tensor as tt -from theano.compile.ops import as_op +from aesara.compile.ops import as_op import pymc3 as pm from pymc3 import Categorical, Metropolis, Model, Normal -from pymc3.theanof import floatX_array +from pymc3.aesaraf import floatX_array def simple_model(): mu = -2.1 tau = 1.3 with Model() as model: - Normal("x", mu, tau=tau, shape=2, testval=tt.ones(2) * 0.1) + Normal("x", mu, tau=tau, shape=2, testval=aet.ones(2) * 0.1) return model.test_point, model, (mu, tau ** -0.5) @@ -50,13 +50,13 @@ def multidimensional_model(): mu = -2.1 tau = 1.3 with Model() as model: - Normal("x", mu, tau=tau, shape=(3, 2), testval=0.1 * 
tt.ones((3, 2))) + Normal("x", mu, tau=tau, shape=(3, 2), testval=0.1 * aet.ones((3, 2))) return model.test_point, model, (mu, tau ** -0.5) def simple_arbitrary_det(): - scalar_type = tt.dscalar if theano.config.floatX == "float64" else tt.fscalar + scalar_type = aet.dscalar if aesara.config.floatX == "float64" else aet.fscalar @as_op(itypes=[scalar_type], otypes=[scalar_type]) def arbitrary_det(value): @@ -82,7 +82,7 @@ def simple_2model(): p = 0.4 with Model() as model: x = pm.Normal("x", mu, tau=tau, testval=0.1) - pm.Deterministic("logx", tt.log(x)) + pm.Deterministic("logx", aet.log(x)) pm.Bernoulli("y", p) return model.test_point, model @@ -92,7 +92,7 @@ def simple_2model_continuous(): tau = 1.3 with Model() as model: x = pm.Normal("x", mu, tau=tau, testval=0.1) - pm.Deterministic("logx", tt.log(x)) + pm.Deterministic("logx", aet.log(x)) pm.Beta("y", alpha=1, beta=1, shape=2) return model.test_point, model @@ -104,8 +104,8 @@ def mv_simple(): with pm.Model() as model: pm.MvNormal( "x", - tt.constant(mu), - tau=tt.constant(tau), + aet.constant(mu), + tau=aet.constant(tau), shape=3, testval=floatX_array([0.1, 1.0, 0.8]), ) @@ -121,8 +121,8 @@ def mv_simple_coarse(): with pm.Model() as model: pm.MvNormal( "x", - tt.constant(mu), - tau=tt.constant(tau), + aet.constant(mu), + tau=aet.constant(tau), shape=3, testval=floatX_array([0.1, 1.0, 0.8]), ) @@ -138,8 +138,8 @@ def mv_simple_very_coarse(): with pm.Model() as model: pm.MvNormal( "x", - tt.constant(mu), - tau=tt.constant(tau), + aet.constant(mu), + tau=aet.constant(tau), shape=3, testval=floatX_array([0.1, 1.0, 0.8]), ) @@ -153,7 +153,7 @@ def mv_simple_discrete(): n = 5 p = floatX_array([0.15, 0.85]) with pm.Model() as model: - pm.Multinomial("x", n, tt.constant(p), shape=d, testval=np.array([1, 4])) + pm.Multinomial("x", n, aet.constant(p), shape=d, testval=np.array([1, 4])) mu = n * p # covariance matrix C = np.zeros((d, d)) diff --git a/pymc3/tests/sampler_fixtures.py b/pymc3/tests/sampler_fixtures.py index fcf66f15569..aacb3fb3aba 100644 --- a/pymc3/tests/sampler_fixtures.py +++ b/pymc3/tests/sampler_fixtures.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
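The fixture models above pin starting points with `aesara.tensor` test values while the conftest fixture enables test-value computation; a hedged standalone sketch combining the two, with assumed numbers.

# Illustrative sketch, not part of the patch: the testval pattern from the
# fixture models above; compute_test_value is enabled the same way the
# conftest fixture does it. The mu/tau/shape numbers are assumed.
import aesara
import aesara.tensor as aet
import pymc3 as pm

with aesara.config.change_flags(compute_test_value="raise"):
    with pm.Model() as model:
        pm.Normal("x", mu=-2.1, tau=1.3, shape=(3, 2), testval=0.1 * aet.ones((3, 2)))
    print(model.test_point["x"])  # (3, 2) array of 0.1 starting values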
+import aesara.tensor as aet import arviz as az import numpy as np import numpy.testing as npt -import theano.tensor as tt from scipy import stats @@ -124,9 +124,9 @@ def make_model(cls): sd_dist = pm.Lognormal.dist(mu=sd_mu, sigma=sd_mu / 10.0, shape=5) chol_packed = pm.LKJCholeskyCov("chol_packed", eta=3, n=5, sd_dist=sd_dist) chol = pm.expand_packed_triangular(5, chol_packed, lower=True) - cov = tt.dot(chol, chol.T) - stds = tt.sqrt(tt.diag(cov)) - pm.Deterministic("log_stds", tt.log(stds)) + cov = aet.dot(chol, chol.T) + stds = aet.sqrt(aet.diag(cov)) + pm.Deterministic("log_stds", aet.log(stds)) corr = cov / stds[None, :] / stds[:, None] corr_entries_unit = (corr[np.tril_indices(5, -1)] + 1) / 2 pm.Deterministic("corr_entries_unit", corr_entries_unit) diff --git a/pymc3/tests/test_theanof.py b/pymc3/tests/test_aesaraf.py similarity index 90% rename from pymc3/tests/test_theanof.py rename to pymc3/tests/test_aesaraf.py index d54aed680d8..1b591e0a859 100644 --- a/pymc3/tests/test_theanof.py +++ b/pymc3/tests/test_aesaraf.py @@ -14,15 +14,17 @@ from itertools import product +import aesara +import aesara.tensor as aet import numpy as np import pytest -import theano -import theano.tensor as tt -from pymc3.theanof import _conversion_map, take_along_axis +from aesara.tensor.type import TensorType + +from pymc3.aesaraf import _conversion_map, take_along_axis from pymc3.vartypes import int_types -FLOATX = str(theano.config.floatX) +FLOATX = str(aesara.config.floatX) INTX = str(_conversion_map[FLOATX]) @@ -78,8 +80,8 @@ def setup_class(self): def _input_tensors(self, shape): ndim = len(shape) - arr = tt.TensorType(FLOATX, [False] * ndim)("arr") - indices = tt.TensorType(INTX, [False] * ndim)("indices") + arr = TensorType(FLOATX, [False] * ndim)("arr") + indices = TensorType(INTX, [False] * ndim)("indices") arr.tag.test_value = np.zeros(shape, dtype=FLOATX) indices.tag.test_value = np.zeros(shape, dtype=INTX) return arr, indices @@ -107,7 +109,7 @@ def get_output_tensors(self, shape, axis): return out def _function(self, arr, indices, out): - return theano.function([arr, indices], [out]) + return aesara.function([arr, indices], [out]) def get_function(self, shape, axis): ndim = len(shape) @@ -181,13 +183,13 @@ def test_take_along_axis_grad(self, shape, axis, samples): _axis = len(shape) + axis else: _axis = axis - # Setup the theano function + # Setup the aesara function t_arr, t_indices = self.get_input_tensors(shape) - t_out2 = theano.grad( - tt.sum(self._output_tensor(t_arr ** 2, t_indices, axis)), + t_out2 = aesara.grad( + aet.sum(self._output_tensor(t_arr ** 2, t_indices, axis)), t_arr, ) - func = theano.function([t_arr, t_indices], [t_out2]) + func = aesara.function([t_arr, t_indices], [t_out2]) # Test that the gradient gives the same output as what is expected arr, indices = self.get_input_values(shape, axis, samples) @@ -209,16 +211,16 @@ def test_axis_failure(self, axis): take_along_axis(arr, indices, axis=axis) def test_ndim_failure(self): - arr = tt.TensorType(FLOATX, [False] * 3)("arr") - indices = tt.TensorType(INTX, [False] * 2)("indices") + arr = TensorType(FLOATX, [False] * 3)("arr") + indices = TensorType(INTX, [False] * 2)("indices") arr.tag.test_value = np.zeros((1,) * arr.ndim, dtype=FLOATX) indices.tag.test_value = np.zeros((1,) * indices.ndim, dtype=INTX) with pytest.raises(ValueError): take_along_axis(arr, indices) def test_dtype_failure(self): - arr = tt.TensorType(FLOATX, [False] * 3)("arr") - indices = tt.TensorType(FLOATX, [False] * 3)("indices") + arr = 
TensorType(FLOATX, [False] * 3)("arr") + indices = TensorType(FLOATX, [False] * 3)("indices") arr.tag.test_value = np.zeros((1,) * arr.ndim, dtype=FLOATX) indices.tag.test_value = np.zeros((1,) * indices.ndim, dtype=FLOATX) with pytest.raises(IndexError): diff --git a/pymc3/tests/test_data_container.py b/pymc3/tests/test_data_container.py index 966ce47cd6a..fb4a3557493 100644 --- a/pymc3/tests/test_data_container.py +++ b/pymc3/tests/test_data_container.py @@ -16,12 +16,12 @@ import pandas as pd import pytest -from theano import shared +from aesara import shared import pymc3 as pm +from pymc3.aesaraf import floatX from pymc3.tests.helpers import SeededTest -from pymc3.theanof import floatX class TestData(SeededTest): diff --git a/pymc3/tests/test_dist_math.py b/pymc3/tests/test_dist_math.py index de9bbd5b7e5..f3b193b8a16 100644 --- a/pymc3/tests/test_dist_math.py +++ b/pymc3/tests/test_dist_math.py @@ -13,16 +13,17 @@ # limitations under the License. import sys +import aesara +import aesara.tensor as aet import numpy as np import numpy.testing as npt import pytest -import theano -import theano.tensor as tt from scipy import interpolate, stats import pymc3 as pm +from pymc3.aesaraf import floatX from pymc3.distributions import Discrete from pymc3.distributions.dist_math import ( MvNormalLogp, @@ -34,28 +35,27 @@ i0e, ) from pymc3.tests.helpers import verify_grad -from pymc3.theanof import floatX def test_bound(): - logp = tt.ones((10, 10)) - cond = tt.ones((10, 10)) + logp = aet.ones((10, 10)) + cond = aet.ones((10, 10)) assert np.all(bound(logp, cond).eval() == logp.eval()) - logp = tt.ones((10, 10)) - cond = tt.zeros((10, 10)) + logp = aet.ones((10, 10)) + cond = aet.zeros((10, 10)) assert np.all(bound(logp, cond).eval() == (-np.inf * logp).eval()) - logp = tt.ones((10, 10)) + logp = aet.ones((10, 10)) cond = True assert np.all(bound(logp, cond).eval() == logp.eval()) - logp = tt.ones(3) + logp = aet.ones(3) cond = np.array([1, 0, 1]) assert not np.all(bound(logp, cond).eval() == 1) assert np.prod(bound(logp, cond).eval()) == -np.inf - logp = tt.ones((2, 3)) + logp = aet.ones((2, 3)) cond = np.array([[1, 1, 1], [1, 0, 1]]) assert not np.all(bound(logp, cond).eval() == 1) assert np.prod(bound(logp, cond).eval()) == -np.inf @@ -63,7 +63,7 @@ def test_bound(): def test_check_bounds_false(): with pm.Model(check_bounds=False): - logp = tt.ones(3) + logp = aet.ones(3) cond = np.array([1, 0, 1]) assert np.all(bound(logp, cond).eval() == logp.eval()) @@ -71,21 +71,21 @@ def test_check_bounds_false(): def test_alltrue_scalar(): assert alltrue_scalar([]).eval() assert alltrue_scalar([True]).eval() - assert alltrue_scalar([tt.ones(10)]).eval() - assert alltrue_scalar([tt.ones(10), 5 * tt.ones(101)]).eval() - assert alltrue_scalar([np.ones(10), 5 * tt.ones(101)]).eval() - assert alltrue_scalar([np.ones(10), True, 5 * tt.ones(101)]).eval() - assert alltrue_scalar([np.array([1, 2, 3]), True, 5 * tt.ones(101)]).eval() + assert alltrue_scalar([aet.ones(10)]).eval() + assert alltrue_scalar([aet.ones(10), 5 * aet.ones(101)]).eval() + assert alltrue_scalar([np.ones(10), 5 * aet.ones(101)]).eval() + assert alltrue_scalar([np.ones(10), True, 5 * aet.ones(101)]).eval() + assert alltrue_scalar([np.array([1, 2, 3]), True, 5 * aet.ones(101)]).eval() assert not alltrue_scalar([False]).eval() - assert not alltrue_scalar([tt.zeros(10)]).eval() + assert not alltrue_scalar([aet.zeros(10)]).eval() assert not alltrue_scalar([True, False]).eval() - assert not alltrue_scalar([np.array([0, -1]), tt.ones(60)]).eval() - 
assert not alltrue_scalar([np.ones(10), False, 5 * tt.ones(101)]).eval() + assert not alltrue_scalar([np.array([0, -1]), aet.ones(60)]).eval() + assert not alltrue_scalar([np.ones(10), False, 5 * aet.ones(101)]).eval() def test_alltrue_shape(): - vals = [True, tt.ones(10), tt.zeros(5)] + vals = [True, aet.ones(10), aet.zeros(5)] assert alltrue_scalar(vals).eval().shape == () @@ -102,11 +102,11 @@ def logp(self, value): p = self.p return bound( - factln(n) - factln(value).sum() + (value * tt.log(p)).sum(), + factln(n) - factln(value).sum() + (value * aet.log(p)).sum(), value >= 0, 0 <= p, p <= 1, - tt.isclose(p.sum(), 1), + aet.isclose(p.sum(), 1), broadcast_conditions=False, ) @@ -123,11 +123,11 @@ def logp(self, value): p = self.p return bound( - factln(n) - factln(value).sum() + (value * tt.log(p)).sum(), - tt.all(value >= 0), - tt.all(0 <= p), - tt.all(p <= 1), - tt.isclose(p.sum(), 1), + factln(n) - factln(value).sum() + (value * aet.log(p)).sum(), + aet.all(value >= 0), + aet.all(0 <= p), + aet.all(p <= 1), + aet.isclose(p.sum(), 1), broadcast_conditions=False, ) @@ -156,30 +156,30 @@ def test_logp(self): chol_val = floatX(np.array([[1, 0.9], [0, 2]])) cov_val = floatX(np.dot(chol_val, chol_val.T)) - cov = tt.matrix("cov") + cov = aet.matrix("cov") cov.tag.test_value = cov_val delta_val = floatX(np.random.randn(5, 2)) - delta = tt.matrix("delta") + delta = aet.matrix("delta") delta.tag.test_value = delta_val expect = stats.multivariate_normal(mean=np.zeros(2), cov=cov_val) expect = expect.logpdf(delta_val).sum() logp = MvNormalLogp()(cov, delta) - logp_f = theano.function([cov, delta], logp) + logp_f = aesara.function([cov, delta], logp) logp = logp_f(cov_val, delta_val) npt.assert_allclose(logp, expect) - @theano.config.change_flags(compute_test_value="ignore") + @aesara.config.change_flags(compute_test_value="ignore") def test_grad(self): np.random.seed(42) def func(chol_vec, delta): - chol = tt.stack( + chol = aet.stack( [ - tt.stack([tt.exp(0.1 * chol_vec[0]), 0]), - tt.stack([chol_vec[1], 2 * tt.exp(chol_vec[2])]), + aet.stack([aet.exp(0.1 * chol_vec[0]), 0]), + aet.stack([chol_vec[1], 2 * aet.exp(chol_vec[2])]), ] ) - cov = tt.dot(chol, chol.T) + cov = aet.dot(chol, chol.T) return MvNormalLogp()(cov, delta) chol_vec_val = floatX(np.array([0.5, 1.0, -0.1])) @@ -190,46 +190,46 @@ def func(chol_vec, delta): delta_val = floatX(np.random.randn(5, 2)) verify_grad(func, [chol_vec_val, delta_val]) - @pytest.mark.skip(reason="Fix in theano not released yet: Theano#5908") - @theano.config.change_flags(compute_test_value="ignore") + @pytest.mark.skip(reason="Fix in aesara not released yet: Theano#5908") + @aesara.config.change_flags(compute_test_value="ignore") def test_hessian(self): - chol_vec = tt.vector("chol_vec") + chol_vec = aet.vector("chol_vec") chol_vec.tag.test_value = np.array([0.1, 2, 3]) - chol = tt.stack( + chol = aet.stack( [ - tt.stack([tt.exp(0.1 * chol_vec[0]), 0]), - tt.stack([chol_vec[1], 2 * tt.exp(chol_vec[2])]), + aet.stack([aet.exp(0.1 * chol_vec[0]), 0]), + aet.stack([chol_vec[1], 2 * aet.exp(chol_vec[2])]), ] ) - cov = tt.dot(chol, chol.T) - delta = tt.matrix("delta") + cov = aet.dot(chol, chol.T) + delta = aet.matrix("delta") delta.tag.test_value = np.ones((5, 2)) logp = MvNormalLogp()(cov, delta) - g_cov, g_delta = tt.grad(logp, [cov, delta]) - tt.grad(g_delta.sum() + g_cov.sum(), [delta, cov]) + g_cov, g_delta = aet.grad(logp, [cov, delta]) + aet.grad(g_delta.sum() + g_cov.sum(), [delta, cov]) class TestSplineWrapper: - 
@theano.config.change_flags(compute_test_value="ignore") + @aesara.config.change_flags(compute_test_value="ignore") def test_grad(self): x = np.linspace(0, 1, 100) y = x * x spline = SplineWrapper(interpolate.InterpolatedUnivariateSpline(x, y, k=1)) verify_grad(spline, [0.5]) - @theano.config.change_flags(compute_test_value="ignore") + @aesara.config.change_flags(compute_test_value="ignore") def test_hessian(self): x = np.linspace(0, 1, 100) y = x * x spline = SplineWrapper(interpolate.InterpolatedUnivariateSpline(x, y, k=1)) - x_var = tt.dscalar("x") - (g_x,) = tt.grad(spline(x_var), [x_var]) + x_var = aet.dscalar("x") + (g_x,) = aet.grad(spline(x_var), [x_var]) with pytest.raises(NotImplementedError): - tt.grad(g_x, [x_var]) + aet.grad(g_x, [x_var]) class TestI0e: - @theano.config.change_flags(compute_test_value="ignore") + @aesara.config.change_flags(compute_test_value="ignore") def test_grad(self): verify_grad(i0e, [0.5]) verify_grad(i0e, [-2.0]) diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 06efc90b8d8..f26b6743b88 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -15,14 +15,15 @@ import itertools import sys +import aesara +import aesara.tensor as aet import numpy as np import numpy.random as nr import pytest import scipy.stats import scipy.stats.distributions as sp -import theano -import theano.tensor as tt +from aesara.tensor.var import TensorVariable from numpy import array, exp, inf, log from numpy.testing import assert_allclose, assert_almost_equal, assert_equal from packaging.version import parse @@ -32,6 +33,7 @@ import pymc3 as pm +from pymc3.aesaraf import floatX from pymc3.blocking import DictToVarBijection from pymc3.distributions import ( AR1, @@ -98,7 +100,6 @@ from pymc3.math import kronecker, logsumexp from pymc3.model import Deterministic, Model, Point from pymc3.tests.helpers import select_by_precision -from pymc3.theanof import floatX from pymc3.vartypes import continuous_types SCIPY_VERSION = parse(scipy_version) @@ -126,7 +127,7 @@ class Domain: def __init__(self, vals, dtype=None, edges=None, shape=None): avals = array(vals, dtype=dtype) if dtype is None and not str(avals.dtype).startswith("int"): - avals = avals.astype(theano.config.floatX) + avals = avals.astype(aesara.config.floatX) vals = [array(v, dtype=avals.dtype) for v in vals] if edges is None: @@ -941,7 +942,7 @@ def test_chi_squared(self): ) @pytest.mark.xfail( - condition=(theano.config.floatX == "float32"), + condition=(aesara.config.floatX == "float32"), reason="Poor CDF in SciPy. See scipy/scipy#869 for details.", ) def test_wald_scipy(self): @@ -1240,12 +1241,12 @@ def test_fun(value, mu, sigma): ) @pytest.mark.xfail( - condition=(theano.config.floatX == "float32"), + condition=(aesara.config.floatX == "float32"), reason="Fails on float32 due to numerical issues", ) def test_gamma_logcdf(self): - # pymc-devs/Theano-PyMC#224: skip_paramdomain_outside_edge_test has to be set - # True to avoid triggering a C-level assertion in the Theano GammaQ function + # pymc-devs/aesara#224: skip_paramdomain_outside_edge_test has to be set + # True to avoid triggering a C-level assertion in the Aesara GammaQ function # in gamma.c file. 
Can be set back to False (default) once that issue is solved self.check_logcdf( Gamma, @@ -1256,7 +1257,7 @@ def test_gamma_logcdf(self): ) @pytest.mark.xfail( - condition=(theano.config.floatX == "float32"), + condition=(aesara.config.floatX == "float32"), reason="Fails on float32 due to numerical issues", ) def test_inverse_gamma(self): @@ -1266,8 +1267,8 @@ def test_inverse_gamma(self): {"alpha": Rplus, "beta": Rplus}, lambda value, alpha, beta: sp.invgamma.logpdf(value, alpha, scale=beta), ) - # pymc-devs/Theano-PyMC#224: skip_paramdomain_outside_edge_test has to be set - # True to avoid triggering a C-level assertion in the Theano GammaQ function + # pymc-devs/aesara#224: skip_paramdomain_outside_edge_test has to be set + # True to avoid triggering a C-level assertion in the Aesara GammaQ function # in gamma.c file. Can be set back to False (default) once that issue is solved self.check_logcdf( InverseGamma, @@ -1278,7 +1279,7 @@ def test_inverse_gamma(self): ) @pytest.mark.xfail( - condition=(theano.config.floatX == "float32"), + condition=(aesara.config.floatX == "float32"), reason="Fails on float32 due to scaling issues", ) def test_inverse_gamma_alt_params(self): @@ -1309,7 +1310,7 @@ def test_pareto(self): ) @pytest.mark.xfail( - condition=(theano.config.floatX == "float32"), + condition=(aesara.config.floatX == "float32"), reason="Fails on float32 due to inf issues", ) def test_weibull(self): @@ -1366,7 +1367,7 @@ def test_binomial(self): ) # Too lazy to propagate decimal parameter through the whole chain of deps - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") @pytest.mark.xfail( condition=(SCIPY_VERSION < parse("1.4.0")), reason="betabinom is new in Scipy 1.4.0" ) @@ -1474,7 +1475,7 @@ def test_constantdist(self): self.check_logp(Constant, I, {"c": I}, lambda value, c: np.log(c == value)) # Too lazy to propagate decimal parameter through the whole chain of deps - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_zeroinflatedpoisson(self): self.checkd( ZeroInflatedPoisson, @@ -1488,7 +1489,7 @@ def test_zeroinflatedpoisson(self): ) # Too lazy to propagate decimal parameter through the whole chain of deps - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_zeroinflatednegativebinomial(self): self.checkd( ZeroInflatedNegativeBinomial, @@ -1503,7 +1504,7 @@ def test_zeroinflatednegativebinomial(self): ) # Too lazy to propagate decimal parameter through the whole chain of deps - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_zeroinflatedbinomial(self): self.checkd( ZeroInflatedBinomial, @@ -1570,28 +1571,28 @@ def MvNormalUpper(*args, **kwargs): ) @pytest.mark.xfail( - condition=(theano.config.floatX == "float32"), + condition=(aesara.config.floatX == "float32"), reason="Fails on float32 due to inf issues", ) def test_mvnormal_indef(self): cov_val = np.array([[1, 0.5], [0.5, -2]]) - cov = tt.matrix("cov") + cov = aet.matrix("cov") cov.tag.test_value = np.eye(2) mu = floatX(np.zeros(2)) - x = tt.vector("x") + 
x = aet.vector("x") x.tag.test_value = np.zeros(2) logp = MvNormal.dist(mu=mu, cov=cov).logp(x) - f_logp = theano.function([cov, x], logp) + f_logp = aesara.function([cov, x], logp) assert f_logp(cov_val, np.ones(2)) == -np.inf - dlogp = tt.grad(logp, cov) - f_dlogp = theano.function([cov, x], dlogp) + dlogp = aet.grad(logp, cov) + f_dlogp = aesara.function([cov, x], dlogp) assert not np.all(np.isfinite(f_dlogp(cov_val, np.ones(2)))) logp = MvNormal.dist(mu=mu, tau=cov).logp(x) - f_logp = theano.function([cov, x], logp) + f_logp = aesara.function([cov, x], logp) assert f_logp(cov_val, np.ones(2)) == -np.inf - dlogp = tt.grad(logp, cov) - f_dlogp = theano.function([cov, x], dlogp) + dlogp = aet.grad(logp, cov) + f_dlogp = aesara.function([cov, x], dlogp) assert not np.all(np.isfinite(f_dlogp(cov_val, np.ones(2)))) def test_mvnormal_init_fail(self): @@ -1778,13 +1779,13 @@ def test_dirichlet_with_batch_shapes(self, dist_shape): assert_almost_equal(pymc3_res[idx], scipy_res) def test_dirichlet_shape(self): - a = tt.as_tensor_variable(np.r_[1, 2]) + a = aet.as_tensor_variable(np.r_[1, 2]) with pytest.warns(DeprecationWarning): dir_rv = Dirichlet.dist(a) assert dir_rv.shape == (2,) - with pytest.warns(DeprecationWarning), theano.change_flags(compute_test_value="ignore"): - dir_rv = Dirichlet.dist(tt.vector()) + with pytest.warns(DeprecationWarning), aesara.change_flags(compute_test_value="ignore"): + dir_rv = Dirichlet.dist(aet.vector()) def test_dirichlet_2D(self): self.check_logp( @@ -1925,16 +1926,16 @@ def test_multinomial_vec_2d_p(self): def test_batch_multinomial(self): n = 10 vals = np.zeros((4, 5, 3), dtype="int32") - p = np.zeros_like(vals, dtype=theano.config.floatX) + p = np.zeros_like(vals, dtype=aesara.config.floatX) inds = np.random.randint(vals.shape[-1], size=vals.shape[:-1])[..., None] np.put_along_axis(vals, inds, n, axis=-1) np.put_along_axis(p, inds, 1, axis=-1) dist = Multinomial.dist(n=n, p=p, shape=vals.shape) - value = tt.tensor3(dtype="int32") + value = aet.tensor3(dtype="int32") value.tag.test_value = np.zeros_like(vals, dtype="int32") - logp = tt.exp(dist.logp(value)) - f = theano.function(inputs=[value], outputs=logp) + logp = aet.exp(dist.logp(value)) + f = aesara.function(inputs=[value], outputs=logp) assert_almost_equal( f(vals), np.ones(vals.shape[:-1] + (1,)), @@ -2063,7 +2064,7 @@ def test_batch_dirichlet_multinomial(self): # except for one category / dimension which is given the value of 1000 n = 5 vals = np.zeros((4, 5, 3), dtype="int32") - a = np.zeros_like(vals, dtype=theano.config.floatX) + 0.001 + a = np.zeros_like(vals, dtype=aesara.config.floatX) + 0.001 inds = np.random.randint(vals.shape[-1], size=vals.shape[:-1])[..., None] np.put_along_axis(vals, inds, n, axis=-1) np.put_along_axis(a, inds, 1000, axis=-1) @@ -2213,7 +2214,7 @@ def test_ex_gaussian_cdf_outside_edges(self): skip_paramdomain_inside_edge_test=True, # Valid values are tested above ) - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_vonmises(self): self.check_logp( VonMises, @@ -2278,7 +2279,7 @@ def test_rice(self): lambda value, b, sigma: sp.rice.logpdf(value, b=b, loc=0, scale=sigma), ) - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_moyal(self): self.check_logp( Moyal, @@ -2293,7 +2294,7 @@ 
def test_moyal(self): lambda value, mu, sigma: floatX(sp.moyal.logcdf(value, mu, sigma)), ) - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_interpolated(self): for mu in R.vals: for sigma in Rplus.vals: @@ -2352,8 +2353,8 @@ def test_bound(): a = ArrayNormal("c", shape=2) assert_equal(a.tag.test_value, np.array([1.5, 2.5])) - lower = tt.vector("lower") - lower.tag.test_value = np.array([1, 2]).astype(theano.config.floatX) + lower = aet.vector("lower") + lower.tag.test_value = np.array([1, 2]).astype(aesara.config.floatX) upper = 3 ArrayNormal = Bound(Normal, lower=lower, upper=upper) dist = ArrayNormal.dist(mu=0, sigma=1, shape=2) @@ -2421,7 +2422,7 @@ def setup_class(self): nb2 = pm.NegativeBinomial("nb_with_p_n", p=pm.Uniform("nbp"), n=10) # Expected value of outcome - mu = Deterministic("mu", floatX(alpha + tt.dot(X, b))) + mu = Deterministic("mu", floatX(alpha + aet.dot(X, b))) # add a bounded variable as well bound_var = Bound(Normal, lower=1.0)("bound_var", mu=0, sigma=10) @@ -2582,7 +2583,7 @@ def test_issue_3051(self, dims, dist_cls, kwargs): X = np.random.normal(size=(20, dims)) actual_t = d.logp(X) - assert isinstance(actual_t, tt.TensorVariable) + assert isinstance(actual_t, TensorVariable) actual_a = actual_t.eval() assert isinstance(actual_a, np.ndarray) assert actual_a.shape == (X.shape[0],) diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index a56f3f3b7b2..684f1898ac6 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -17,12 +17,12 @@ from contextlib import ExitStack as does_not_raise +import aesara import numpy as np import numpy.random as nr import numpy.testing as npt import pytest import scipy.stats as st -import theano from scipy import linalg from scipy.special import expit @@ -1127,7 +1127,7 @@ def ref_rand(size, mu, sigma): pymc3_random(pm.Moyal, {"mu": R, "sigma": Rplus}, ref_rand=ref_rand) - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_interpolated(self): for mu in R.vals: for sigma in Rplus.vals: diff --git a/pymc3/tests/test_distributions_timeseries.py b/pymc3/tests/test_distributions_timeseries.py index b1401bd90e1..8319cde6544 100644 --- a/pymc3/tests/test_distributions_timeseries.py +++ b/pymc3/tests/test_distributions_timeseries.py @@ -15,6 +15,7 @@ import numpy as np import pytest +from pymc3.aesaraf import floatX from pymc3.distributions.continuous import Flat, Normal from pymc3.distributions.timeseries import AR, AR1, GARCH11, EulerMaruyama from pymc3.model import Model @@ -24,7 +25,6 @@ sample_posterior_predictive, ) from pymc3.tests.helpers import select_by_precision -from pymc3.theanof import floatX pytestmark = pytest.mark.usefixtures("seeded_test") diff --git a/pymc3/tests/test_examples.py b/pymc3/tests/test_examples.py index d79093b3927..5cb6c9c8c20 100644 --- a/pymc3/tests/test_examples.py +++ b/pymc3/tests/test_examples.py @@ -12,20 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import aesara +import aesara.tensor as aet import arviz as az import matplotlib import numpy as np import pandas as pd import pytest -import theano -import theano.tensor as tt from packaging import version import pymc3 as pm +from pymc3.aesaraf import floatX from pymc3.tests.helpers import SeededTest -from pymc3.theanof import floatX if version.parse(matplotlib.__version__) < version.parse("3.3"): matplotlib.use("Agg", warn=False) @@ -68,7 +68,7 @@ def build_model(self): with pm.Model() as model: effects = pm.Normal("effects", mu=0, sigma=100, shape=len(P.columns)) - logit_p = tt.dot(floatX(np.array(P)), effects) + logit_p = aet.dot(floatX(np.array(P)), effects) pm.Bernoulli("s", logit_p=logit_p, observed=floatX(data.switch.values)) return model @@ -186,13 +186,13 @@ def build_disaster_model(masked=False): # Allocate appropriate Poisson rates to years before and after current # switchpoint location idx = np.arange(years) - rate = tt.switch(switchpoint >= idx, early_mean, late_mean) + rate = aet.switch(switchpoint >= idx, early_mean, late_mean) # Data likelihood pm.Poisson("disasters", rate, observed=disasters_data) return model -@pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") +@pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") class TestDisasterModel(SeededTest): # Time series of recorded coal mining disasters in the UK from 1851 to 1962 def test_disaster_model(self): @@ -294,7 +294,7 @@ def test_run(self): @pytest.mark.xfail( - condition=(theano.config.floatX == "float32"), + condition=(aesara.config.floatX == "float32"), reason="Fails on float32 due to starting inf at starting logP", ) class TestRSV(SeededTest): diff --git a/pymc3/tests/test_gp.py b/pymc3/tests/test_gp.py index 893aeeaf77c..77f4261bc43 100644 --- a/pymc3/tests/test_gp.py +++ b/pymc3/tests/test_gp.py @@ -16,11 +16,11 @@ from functools import reduce from operator import add +import aesara +import aesara.tensor as aet import numpy as np import numpy.testing as npt import pytest -import theano -import theano.tensor as tt import pymc3 as pm @@ -34,7 +34,7 @@ def test_value(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: zero_mean = pm.gp.mean.Zero() - M = theano.function([], zero_mean(X))() + M = aesara.function([], zero_mean(X))() assert np.all(M == 0) assert M.shape == (10,) @@ -44,7 +44,7 @@ def test_value(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: const_mean = pm.gp.mean.Constant(6) - M = theano.function([], const_mean(X))() + M = aesara.function([], const_mean(X))() assert np.all(M == 6) assert M.shape == (10,) @@ -54,7 +54,7 @@ def test_value(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: linear_mean = pm.gp.mean.Linear(2, 0.5) - M = theano.function([], linear_mean(X))() + M = aesara.function([], linear_mean(X))() npt.assert_allclose(M[1], 0.7222, atol=1e-3) assert M.shape == (10,) @@ -66,7 +66,7 @@ def test_add(self): mean1 = pm.gp.mean.Linear(coeffs=2, intercept=0.5) mean2 = pm.gp.mean.Constant(2) mean = mean1 + mean2 + mean2 - M = theano.function([], mean(X))() + M = aesara.function([], mean(X))() npt.assert_allclose(M[1], 0.7222 + 2 + 2, atol=1e-3) def test_prod(self): @@ -75,7 +75,7 @@ def test_prod(self): mean1 = pm.gp.mean.Linear(coeffs=2, intercept=0.5) mean2 = pm.gp.mean.Constant(2) mean = mean1 * mean2 * mean2 - M = theano.function([], mean(X))() + M = aesara.function([], mean(X))() npt.assert_allclose(M[1], 0.7222 * 2 * 2, atol=1e-3) def 
test_add_multid(self): @@ -86,7 +86,7 @@ def test_add_multid(self): mean1 = pm.gp.mean.Linear(coeffs=A, intercept=b) mean2 = pm.gp.mean.Constant(2) mean = mean1 + mean2 + mean2 - M = theano.function([], mean(X))() + M = aesara.function([], mean(X))() npt.assert_allclose(M[1], 10.8965 + 2 + 2, atol=1e-3) def test_prod_multid(self): @@ -97,7 +97,7 @@ def test_prod_multid(self): mean1 = pm.gp.mean.Linear(coeffs=A, intercept=b) mean2 = pm.gp.mean.Constant(2) mean = mean1 * mean2 * mean2 - M = theano.function([], mean(X))() + M = aesara.function([], mean(X))() npt.assert_allclose(M[1], 10.8965 * 2 * 2, atol=1e-3) @@ -108,10 +108,10 @@ def test_symadd_cov(self): cov1 = pm.gp.cov.ExpQuad(1, 0.1) cov2 = pm.gp.cov.ExpQuad(1, 0.1) cov = cov1 + cov2 - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 2 * 0.53940, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_rightadd_scalar(self): @@ -119,10 +119,10 @@ def test_rightadd_scalar(self): with pm.Model() as model: a = 1 cov = pm.gp.cov.ExpQuad(1, 0.1) + a - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 1.53940, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_leftadd_scalar(self): @@ -130,10 +130,10 @@ def test_leftadd_scalar(self): with pm.Model() as model: a = 1 cov = a + pm.gp.cov.ExpQuad(1, 0.1) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 1.53940, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_rightadd_matrix(self): @@ -141,21 +141,21 @@ def test_rightadd_matrix(self): M = 2 * np.ones((10, 10)) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(1, 0.1) + M - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 2.53940, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_leftadd_matrixt(self): X = np.linspace(0, 1, 10)[:, None] - M = 2 * tt.ones((10, 10)) + M = 2 * aet.ones((10, 10)) with pm.Model() as model: cov = M + pm.gp.cov.ExpQuad(1, 0.1) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 2.53940, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_leftprod_matrix(self): @@ -164,8 +164,8 @@ def test_leftprod_matrix(self): with pm.Model() as model: cov = M + pm.gp.cov.ExpQuad(1, 0.1) cov_true = pm.gp.cov.ExpQuad(1, 0.1) + M - K = theano.function([], cov(X))() - K_true = theano.function([], cov_true(X))() + K = aesara.function([], cov(X))() + K_true = aesara.function([], cov_true(X))() assert np.allclose(K, K_true) def test_inv_rightadd(self): @@ -181,10 +181,10 @@ def test_symprod_cov(self): cov1 = pm.gp.cov.ExpQuad(1, 0.1) cov2 = pm.gp.cov.ExpQuad(1, 0.1) cov = cov1 * cov2 - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.53940 * 0.53940, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, 
diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_rightprod_scalar(self): @@ -192,10 +192,10 @@ def test_rightprod_scalar(self): with pm.Model() as model: a = 2 cov = pm.gp.cov.ExpQuad(1, 0.1) * a - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 2 * 0.53940, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_leftprod_scalar(self): @@ -203,10 +203,10 @@ def test_leftprod_scalar(self): with pm.Model() as model: a = 2 cov = a * pm.gp.cov.ExpQuad(1, 0.1) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 2 * 0.53940, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_rightprod_matrix(self): @@ -214,10 +214,10 @@ def test_rightprod_matrix(self): M = 2 * np.ones((10, 10)) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(1, 0.1) * M - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 2 * 0.53940, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_leftprod_matrix(self): @@ -226,8 +226,8 @@ def test_leftprod_matrix(self): with pm.Model() as model: cov = M * pm.gp.cov.ExpQuad(1, 0.1) cov_true = pm.gp.cov.ExpQuad(1, 0.1) * M - K = theano.function([], cov(X))() - K_true = theano.function([], cov_true(X))() + K = aesara.function([], cov(X))() + K_true = aesara.function([], cov_true(X))() assert np.allclose(K, K_true) def test_multiops(self): @@ -244,12 +244,12 @@ def test_multiops(self): + pm.gp.cov.ExpQuad(1, 0.1) + 3 ) - K1 = theano.function([], cov1(X))() - K2 = theano.function([], cov2(X))() + K1 = aesara.function([], cov1(X))() + K2 = aesara.function([], cov2(X))() assert np.allclose(K1, K2) # check diagonal - K1d = theano.function([], cov1(X, diag=True))() - K2d = theano.function([], cov2(X, diag=True))() + K1d = aesara.function([], cov1(X, diag=True))() + K2d = aesara.function([], cov2(X, diag=True))() npt.assert_allclose(np.diag(K1), K2d, atol=1e-5) npt.assert_allclose(np.diag(K2), K1d, atol=1e-5) @@ -265,10 +265,10 @@ def test_symexp_cov(self): with pm.Model() as model: cov1 = pm.gp.cov.ExpQuad(1, 0.1) cov = cov1 ** 2 - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.53940 ** 2, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_covexp_numpy(self): @@ -276,32 +276,32 @@ def test_covexp_numpy(self): with pm.Model() as model: a = np.array([[2]]) cov = pm.gp.cov.ExpQuad(1, 0.1) ** a - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.53940 ** 2, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) - def test_covexp_theano(self): + def test_covexp_aesara(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: - a = tt.alloc(2.0, 1, 1) + a = aet.alloc(2.0, 1, 1) cov = pm.gp.cov.ExpQuad(1, 0.1) ** a - K = theano.function([], cov(X))() + K = 
aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.53940 ** 2, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_covexp_shared(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: - a = theano.shared(2.0) + a = aesara.shared(2.0) cov = pm.gp.cov.ExpQuad(1, 0.1) ** a - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.53940 ** 2, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_invalid_covexp(self): @@ -321,11 +321,11 @@ def test_symprod_cov(self): cov1 = pm.gp.cov.ExpQuad(1, 0.1) cov2 = pm.gp.cov.ExpQuad(1, 0.1) cov = pm.gp.cov.Kron([cov1, cov2]) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 1 * 0.53940, atol=1e-3) npt.assert_allclose(K[0, 11], 0.53940 * 0.53940, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_multiops(self): @@ -342,8 +342,8 @@ def test_multiops(self): ) cov2 = pm.gp.cov.ExpQuad(1, 0.1) * pm.gp.cov.ExpQuad(2, 0.1) cov = pm.gp.cov.Kron([cov1, cov2]) - K_true = kronecker(theano.function([], cov1(X1))(), theano.function([], cov2(X2))()).eval() - K = theano.function([], cov(X))() + K_true = kronecker(aesara.function([], cov1(X1))(), aesara.function([], cov2(X2))()).eval() + K = aesara.function([], cov(X))() npt.assert_allclose(K_true, K) @@ -352,30 +352,30 @@ def test_slice1(self): X = np.linspace(0, 1, 30).reshape(10, 3) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(3, 0.1, active_dims=[0, 0, 1]) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.20084298, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_slice2(self): X = np.linspace(0, 1, 30).reshape(10, 3) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(3, ls=[0.1, 0.1], active_dims=[1, 2]) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.34295549, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_slice3(self): X = np.linspace(0, 1, 30).reshape(10, 3) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(3, ls=np.array([0.1, 0.1]), active_dims=[1, 2]) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.34295549, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_diffslice(self): @@ -384,10 +384,10 @@ def test_diffslice(self): cov = pm.gp.cov.ExpQuad(3, ls=0.1, active_dims=[1, 0, 0]) + pm.gp.cov.ExpQuad( 3, ls=[0.1, 0.2, 0.3] ) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.683572, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_raises(self): @@ -402,7 +402,7 @@ def test_stable(self): X 
= np.random.uniform(low=320.0, high=400.0, size=[2000, 2]) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(2, 0.1) - dists = theano.function([], cov.square_dist(X, X))() + dists = aesara.function([], cov.square_dist(X, X))() assert not np.any(dists < 0) @@ -411,44 +411,44 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.ExpQuad(1, 0.1) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.53940, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 0.53940, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_2d(self): X = np.linspace(0, 1, 10).reshape(5, 2) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(2, 0.5) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.820754, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_2dard(self): X = np.linspace(0, 1, 10).reshape(5, 2) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(2, np.array([1, 2])) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.969607, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_inv_lengthscale(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.ExpQuad(1, ls_inv=10) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.53940, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 0.53940, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -457,14 +457,14 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.WhiteNoise(sigma=0.5) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.0, atol=1e-3) npt.assert_allclose(K[0, 0], 0.5 ** 2, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) # check predict - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 0.0, atol=1e-3) # white noise predicting should return all zeros npt.assert_allclose(K[0, 0], 0.0, atol=1e-3) @@ -475,14 +475,14 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Constant(2.5) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 2.5, atol=1e-3) npt.assert_allclose(K[0, 0], 2.5, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 2.5, atol=1e-3) npt.assert_allclose(K[0, 0], 2.5, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -491,12 +491,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with 
pm.Model() as model: cov = pm.gp.cov.RatQuad(1, ls=0.1, alpha=0.5) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.66896, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 0.66896, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -505,12 +505,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Exponential(1, 0.1) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.57375, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 0.57375, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -519,12 +519,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Matern52(1, 0.1) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.46202, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 0.46202, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -533,12 +533,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Matern32(1, 0.1) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.42682, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 0.42682, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -547,11 +547,11 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Matern12(1, 0.1) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.32919, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 0.32919, atol=1e-3) - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -560,12 +560,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Cosine(1, 0.1) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.766, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 0.766, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -574,12 +574,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Periodic(1, 0.1, 0.1) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.00288, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 0.00288, 
atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -588,12 +588,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Linear(1, 0.5) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.19444, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 0.19444, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -602,12 +602,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Polynomial(1, 0.5, 2, 0) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.03780, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 0.03780, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -616,17 +616,17 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] def warp_func(x, a, b, c): - return x + (a * tt.tanh(b * (x - c))) + return x + (a * aet.tanh(b * (x - c))) with pm.Model() as model: cov_m52 = pm.gp.cov.Matern52(1, 0.2) cov = pm.gp.cov.WarpedInput(1, warp_func=warp_func, args=(1, 10, 1), cov_func=cov_m52) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 0.79593, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 0.79593, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_raises(self): @@ -642,16 +642,16 @@ def test_1d(self): X = np.linspace(0, 2, 10)[:, None] def tanh_func(x, x1, x2, w, x0): - return (x1 + x2) / 2.0 - (x1 - x2) / 2.0 * tt.tanh((x - x0) / w) + return (x1 + x2) / 2.0 - (x1 - x2) / 2.0 * aet.tanh((x - x0) / w) with pm.Model() as model: cov = pm.gp.cov.Gibbs(1, tanh_func, args=(0.05, 0.6, 0.4, 1.0)) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[2, 3], 0.136683, atol=1e-4) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[2, 3], 0.136683, atol=1e-4) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_raises(self): @@ -673,12 +673,12 @@ def scaling_func(x, a, b): with pm.Model() as model: cov_m52 = pm.gp.cov.Matern52(1, 0.2) cov = pm.gp.cov.ScaledCov(1, scaling_func=scaling_func, args=(2, -1), cov_func=cov_m52) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], 3.00686, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], 3.00686, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_raises(self): @@ -1200,12 +1200,12 @@ def test_1d_tau1(self): etalon = 0.600881 with pm.Model(): cov = pm.gp.cov.Circular(1, 1, tau=5) - K = 
theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], etalon, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], etalon, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_1d_tau2(self): @@ -1213,10 +1213,10 @@ def test_1d_tau2(self): etalon = 0.691239 with pm.Model(): cov = pm.gp.cov.Circular(1, 1, tau=4) - K = theano.function([], cov(X))() + K = aesara.function([], cov(X))() npt.assert_allclose(K[0, 1], etalon, atol=1e-3) - K = theano.function([], cov(X, X))() + K = aesara.function([], cov(X, X))() npt.assert_allclose(K[0, 1], etalon, atol=1e-3) # check diagonal - Kd = theano.function([], cov(X, diag=True))() + Kd = aesara.function([], cov(X, diag=True))() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) diff --git a/pymc3/tests/test_hmc.py b/pymc3/tests/test_hmc.py index 057c3170750..1a113343cc8 100644 --- a/pymc3/tests/test_hmc.py +++ b/pymc3/tests/test_hmc.py @@ -19,9 +19,9 @@ import pymc3 +from pymc3.aesaraf import floatX from pymc3.step_methods.hmc.base_hmc import BaseHMC from pymc3.tests import models -from pymc3.theanof import floatX logger = logging.getLogger("pymc3") diff --git a/pymc3/tests/test_math.py b/pymc3/tests/test_math.py index b31319021fd..b82459602db 100644 --- a/pymc3/tests/test_math.py +++ b/pymc3/tests/test_math.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import aesara +import aesara.tensor as aet import numpy as np import numpy.testing as npt import pytest -import theano -import theano.tensor as tt from scipy.special import logsumexp as scipy_logsumexp +from pymc3.aesaraf import floatX from pymc3.math import ( LogDet, cartesian, @@ -36,7 +37,6 @@ probit, ) from pymc3.tests.helpers import SeededTest, verify_grad -from pymc3.theanof import floatX def test_kronecker(): @@ -45,7 +45,7 @@ def test_kronecker(): [a, b, c] = [np.random.rand(3, 3 + i) for i in range(3)] custom = kronecker(a, b, c) # Custom version - nested = tt.slinalg.kron(a, tt.slinalg.kron(b, c)) + nested = aet.slinalg.kron(a, aet.slinalg.kron(b, c)) np.testing.assert_array_almost_equal(custom.eval(), nested.eval()) # Standard nested version @@ -83,7 +83,7 @@ def test_kron_dot(): x = np.random.rand(tot_size).reshape((tot_size, 1)) # Construct entire kronecker product then multiply big = kronecker(*Ks) - slow_ans = tt.dot(big, x) + slow_ans = aet.dot(big, x) # Use tricks to avoid construction of entire kronecker product fast_ans = kron_dot(Ks, x) np.testing.assert_array_almost_equal(slow_ans.eval(), fast_ans.eval()) @@ -98,7 +98,7 @@ def test_kron_solve_lower(): x = np.random.rand(tot_size).reshape((tot_size, 1)) # Construct entire kronecker product then solve big = kronecker(*Ls) - slow_ans = tt.slinalg.solve_lower_triangular(big, x) + slow_ans = aet.slinalg.solve_lower_triangular(big, x) # Use tricks to avoid construction of entire kronecker product fast_ans = kron_solve_lower(Ls, x) np.testing.assert_array_almost_equal(slow_ans.eval(), fast_ans.eval()) @@ -170,10 +170,10 @@ def setup_method(self): self.op_class = LogDet self.op = logdet - @theano.config.change_flags(compute_test_value="ignore") + @aesara.config.change_flags(compute_test_value="ignore") def validate(self, input_mat): - x = theano.tensor.matrix() - f = theano.function([x], self.op(x)) + x = aesara.tensor.matrix() + f = 
aesara.function([x], self.op(x)) out = f(input_mat) svd_diag = np.linalg.svd(input_mat, compute_uv=False) numpy_out = np.sum(np.log(np.abs(svd_diag))) @@ -185,24 +185,24 @@ def validate(self, input_mat): verify_grad(self.op, [input_mat]) @pytest.mark.skipif( - theano.config.device in ["cuda", "gpu"], + aesara.config.device in ["cuda", "gpu"], reason="No logDet implementation on GPU.", ) def test_basic(self): # Calls validate with different params test_case_1 = np.random.randn(3, 3) / np.sqrt(3) test_case_2 = np.random.randn(10, 10) / np.sqrt(10) - self.validate(test_case_1.astype(theano.config.floatX)) - self.validate(test_case_2.astype(theano.config.floatX)) + self.validate(test_case_1.astype(aesara.config.floatX)) + self.validate(test_case_2.astype(aesara.config.floatX)) def test_expand_packed_triangular(): with pytest.raises(ValueError): - x = tt.matrix("x") - x.tag.test_value = np.array([[1.0]], dtype=theano.config.floatX) + x = aet.matrix("x") + x.tag.test_value = np.array([[1.0]], dtype=aesara.config.floatX) expand_packed_triangular(5, x) N = 5 - packed = tt.vector("packed") + packed = aet.vector("packed") packed.tag.test_value = floatX(np.zeros(N * (N + 1) // 2)) with pytest.raises(TypeError): expand_packed_triangular(packed.shape[0], packed) diff --git a/pymc3/tests/test_minibatches.py b/pymc3/tests/test_minibatches.py index 34dadaa8eba..9629a0765c7 100644 --- a/pymc3/tests/test_minibatches.py +++ b/pymc3/tests/test_minibatches.py @@ -15,18 +15,18 @@ import itertools import pickle +import aesara import numpy as np import pytest -import theano +from aesara import tensor as aet from scipy import stats as stats -from theano import tensor as tt import pymc3 as pm -from pymc3 import GeneratorAdapter, Normal, floatX, generator, tt_rng +from pymc3 import GeneratorAdapter, Normal, aet_rng, floatX, generator +from pymc3.aesaraf import GeneratorOp from pymc3.tests.helpers import select_by_precision -from pymc3.theanof import GeneratorOp class _DataSampler: @@ -35,7 +35,7 @@ class _DataSampler: """ def __init__(self, data, batchsize=50, random_seed=42, dtype="floatX"): - self.dtype = theano.config.floatX if dtype == "floatX" else dtype + self.dtype = aesara.config.floatX if dtype == "floatX" else dtype self.rng = np.random.RandomState(random_seed) self.data = data self.n = batchsize @@ -77,7 +77,7 @@ def test_basic(self): generator = GeneratorAdapter(integers()) gop = GeneratorOp(generator)() assert gop.tag.test_value == np.float32(0) - f = theano.function([], gop) + f = aesara.function([], gop) assert f() == np.float32(0) assert f() == np.float32(1) for _ in range(2, 100): @@ -89,7 +89,7 @@ def test_ndim(self): res = list(itertools.islice(integers_ndim(ndim), 0, 2)) generator = GeneratorAdapter(integers_ndim(ndim)) gop = GeneratorOp(generator)() - f = theano.function([], gop) + f = aesara.function([], gop) assert ndim == res[0].ndim np.testing.assert_equal(f(), res[0]) np.testing.assert_equal(f(), res[1]) @@ -97,9 +97,9 @@ def test_ndim(self): def test_cloning_available(self): gop = generator(integers()) res = gop ** 2 - shared = theano.shared(floatX(10)) - res1 = theano.clone(res, {gop: shared}) - f = theano.function([], res1) + shared = aesara.shared(floatX(10)) + res1 = aesara.clone_replace(res, {gop: shared}) + f = aesara.function([], res1) assert f() == np.float32(100) def test_default_value(self): @@ -108,7 +108,7 @@ def gen(): yield floatX(np.ones((10, 10)) * i) gop = generator(gen(), np.ones((10, 10)) * 10) - f = theano.function([], gop) + f = aesara.function([], gop) 
np.testing.assert_equal(np.ones((10, 10)) * 0, f()) np.testing.assert_equal(np.ones((10, 10)) * 1, f()) np.testing.assert_equal(np.ones((10, 10)) * 10, f()) @@ -121,7 +121,7 @@ def gen(): yield floatX(np.ones((10, 10)) * i) gop = generator(gen()) - f = theano.function([], gop) + f = aesara.function([], gop) np.testing.assert_equal(np.ones((10, 10)) * 0, f()) np.testing.assert_equal(np.ones((10, 10)) * 1, f()) with pytest.raises(StopIteration): @@ -139,12 +139,12 @@ def test_pickling(self, datagen): def test_gen_cloning_with_shape_change(self, datagen): gen = generator(datagen) - gen_r = tt_rng().normal(size=gen.shape).T + gen_r = aet_rng().normal(size=gen.shape).T X = gen.dot(gen_r) - res, _ = theano.scan(lambda x: x.sum(), X, n_steps=X.shape[0]) + res, _ = aesara.scan(lambda x: x.sum(), X, n_steps=X.shape[0]) assert res.eval().shape == (50,) - shared = theano.shared(datagen.data.astype(gen.dtype)) - res2 = theano.clone(res, {gen: shared ** 2}) + shared = aesara.shared(datagen.data.astype(gen.dtype)) + res2 = aesara.clone_replace(res, {gen: shared ** 2}) assert res2.eval().shape == (1000,) @@ -170,11 +170,11 @@ class TestScaling: def test_density_scaling(self): with pm.Model() as model1: Normal("n", observed=[[1]], total_size=1) - p1 = theano.function([], model1.logpt) + p1 = aesara.function([], model1.logpt) with pm.Model() as model2: Normal("n", observed=[[1]], total_size=2) - p2 = theano.function([], model2.logpt) + p2 = aesara.function([], model2.logpt) assert p1() * 2 == p2() def test_density_scaling_with_genarator(self): @@ -189,12 +189,12 @@ def true_dens(): # We have same size models with pm.Model() as model1: Normal("n", observed=gen1(), total_size=100) - p1 = theano.function([], model1.logpt) + p1 = aesara.function([], model1.logpt) with pm.Model() as model2: gen_var = generator(gen2()) Normal("n", observed=gen_var, total_size=100) - p2 = theano.function([], model2.logpt) + p2 = aesara.function([], model2.logpt) for i in range(10): _1, _2, _t = p1(), p2(), next(t) @@ -208,12 +208,12 @@ def test_gradient_with_scaling(self): genvar = generator(gen1()) m = Normal("m") Normal("n", observed=genvar, total_size=1000) - grad1 = theano.function([m], tt.grad(model1.logpt, m)) + grad1 = aesara.function([m], aet.grad(model1.logpt, m)) with pm.Model() as model2: m = Normal("m") - shavar = theano.shared(np.ones((1000, 100))) + shavar = aesara.shared(np.ones((1000, 100))) Normal("n", observed=shavar) - grad2 = theano.function([m], tt.grad(model2.logpt, m)) + grad2 = aesara.function([m], aet.grad(model2.logpt, m)) for i in range(10): shavar.set_value(np.ones((100, 100)) * i) @@ -224,27 +224,27 @@ def test_gradient_with_scaling(self): def test_multidim_scaling(self): with pm.Model() as model0: Normal("n", observed=[[1, 1], [1, 1]], total_size=[]) - p0 = theano.function([], model0.logpt) + p0 = aesara.function([], model0.logpt) with pm.Model() as model1: Normal("n", observed=[[1, 1], [1, 1]], total_size=[2, 2]) - p1 = theano.function([], model1.logpt) + p1 = aesara.function([], model1.logpt) with pm.Model() as model2: Normal("n", observed=[[1], [1]], total_size=[2, 2]) - p2 = theano.function([], model2.logpt) + p2 = aesara.function([], model2.logpt) with pm.Model() as model3: Normal("n", observed=[[1, 1]], total_size=[2, 2]) - p3 = theano.function([], model3.logpt) + p3 = aesara.function([], model3.logpt) with pm.Model() as model4: Normal("n", observed=[[1]], total_size=[2, 2]) - p4 = theano.function([], model4.logpt) + p4 = aesara.function([], model4.logpt) with pm.Model() as model5: Normal("n", 
observed=[[1]], total_size=[2, Ellipsis, 2]) - p5 = theano.function([], model5.logpt) + p5 = aesara.function([], model5.logpt) _p0 = p0() assert ( np.allclose(_p0, p1()) @@ -287,11 +287,11 @@ def test_mixed2(self): def test_free_rv(self): with pm.Model() as model4: Normal("n", observed=[[1, 1], [1, 1]], total_size=[2, 2]) - p4 = theano.function([], model4.logpt) + p4 = aesara.function([], model4.logpt) with pm.Model() as model5: Normal("n", total_size=[2, Ellipsis, 2], shape=(1, 1), broadcastable=(False, False)) - p5 = theano.function([model5.n], model5.logpt) + p5 = aesara.function([model5.n], model5.logpt) assert p4() == p5(pm.floatX([[1]])) assert p4() == p5(pm.floatX([[1, 1], [1, 1]])) @@ -327,15 +327,15 @@ def test_special4(self): def test_cloning_available(self): gop = pm.Minibatch(np.arange(100), 1) res = gop ** 2 - shared = theano.shared(np.array([10])) - res1 = theano.clone(res, {gop: shared}) - f = theano.function([], res1) + shared = aesara.shared(np.array([10])) + res1 = aesara.clone_replace(res, {gop: shared}) + f = aesara.function([], res1) assert f() == np.array([100]) def test_align(self): m = pm.Minibatch(np.arange(1000), 1, random_seed=1) n = pm.Minibatch(np.arange(1000), 1, random_seed=1) - f = theano.function([], [m, n]) + f = aesara.function([], [m, n]) n.eval() # not aligned a, b = zip(*(f() for _ in range(1000))) assert a != b diff --git a/pymc3/tests/test_mixture.py b/pymc3/tests/test_mixture.py index 94b272bf43d..a6646b812b9 100644 --- a/pymc3/tests/test_mixture.py +++ b/pymc3/tests/test_mixture.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import aesara import numpy as np import pytest import scipy.stats as st -import theano +from aesara import tensor as aet from numpy.testing import assert_allclose from scipy.special import logsumexp -from theano import tensor as tt import pymc3 as pm @@ -37,9 +37,9 @@ Poisson, sample, ) +from pymc3.aesaraf import floatX from pymc3.distributions.shape_utils import to_tuple from pymc3.tests.helpers import SeededTest -from pymc3.theanof import floatX # Generate data @@ -248,7 +248,7 @@ def test_mixture_of_mvn(self): st.multivariate_normal.logpdf(obs, mu2, cov2), ) ).T - complogp = y.distribution._comp_logp(theano.shared(obs)).eval() + complogp = y.distribution._comp_logp(aesara.shared(obs)).eval() assert_allclose(complogp, complogp_st) # check logp of mixture @@ -264,7 +264,7 @@ def test_mixture_of_mvn(self): assert_allclose(model.logp(testpoint), mixlogp_st.sum() + priorlogp) def test_mixture_of_mixture(self): - if theano.config.floatX == "float32": + if aesara.config.floatX == "float32": rtol = 1e-4 else: rtol = 1e-7 @@ -290,7 +290,7 @@ def test_mixture_of_mixture(self): test_point = model.test_point def mixmixlogp(value, point): - floatX = theano.config.floatX + floatX = aesara.config.floatX priorlogp = ( st.dirichlet.logpdf( x=point["g_w"], @@ -392,7 +392,7 @@ def setup_method(self, *args, **kwargs): super().setup_method(*args, **kwargs) self.nd = 3 self.npop = 3 - self.mus = tt.as_tensor_variable( + self.mus = aet.as_tensor_variable( np.tile( np.reshape( np.arange(self.npop), @@ -446,7 +446,7 @@ def test_2d_w(self): shape=nd, ) z = pm.Categorical("z", p=np.ones(npop) / npop, shape=nd) - mu = tt.as_tensor_variable([mus[i, z[i]] for i in range(nd)]) + mu = aet.as_tensor_variable([mus[i, z[i]] for i in range(nd)]) latent_m = pm.Normal("latent_m", mu=mu, sigma=1e-5, shape=nd) m_val = m.random(size=size) @@ -470,7 +470,7 @@ def 
samples_from_same_distribution(self, *args): assert p_marginal >= 0.05 and p_correlation >= 0.05 def logp_matches(self, mixture, latent_mix, z, npop, model): - if theano.config.floatX == "float32": + if aesara.config.floatX == "float32": rtol = 1e-4 else: rtol = 1e-7 @@ -523,7 +523,7 @@ def test_with_multinomial(self, batch_shape): assert prior["mixture"].shape == (self.n_samples, *batch_shape, 3) assert mixture.random(size=self.size).shape == (self.size, *batch_shape, 3) - if theano.config.floatX == "float32": + if aesara.config.floatX == "float32": rtol = 1e-4 else: rtol = 1e-7 @@ -558,7 +558,7 @@ def test_with_mvnormal(self): assert prior["mixture"].shape == (self.n_samples, 3) assert mixture.random(size=self.size).shape == (self.size, 3) - if theano.config.floatX == "float32": + if aesara.config.floatX == "float32": rtol = 1e-4 else: rtol = 1e-7 diff --git a/pymc3/tests/test_model.py b/pymc3/tests/test_model.py index 2e5a83c1c33..4d747e203da 100644 --- a/pymc3/tests/test_model.py +++ b/pymc3/tests/test_model.py @@ -15,12 +15,12 @@ import pickle import unittest +import aesara +import aesara.tensor as aet import numpy as np import numpy.testing as npt import pandas as pd import pytest -import theano -import theano.tensor as tt import pymc3 as pm @@ -39,8 +39,8 @@ def __init__(self, name="", model=None): self.v2 = pm.Normal("v2", mu=0, sigma=1) # 2) Potentials and Deterministic variables with method too # be sure that names will not overlap with other same models - pm.Deterministic("d", tt.constant(1)) - pm.Potential("p", tt.constant(1)) + pm.Deterministic("d", aet.constant(1)) + pm.Potential("p", aet.constant(1)) class DocstringModel(pm.Model): @@ -50,7 +50,7 @@ def __init__(self, mean=0, sigma=1, name="", model=None): Normal("v2", mu=mean, sigma=sigma) Normal("v3", mu=mean, sigma=HalfCauchy("sd", beta=10, testval=1.0)) Deterministic("v3_sq", self.v3 ** 2) - Potential("p1", tt.constant(1)) + Potential("p1", aet.constant(1)) class TestBaseModel: @@ -156,7 +156,7 @@ def test_observed_rv_fail(self): def test_observed_type(self): X_ = np.random.randn(100, 5) - X = pm.floatX(theano.shared(X_)) + X = pm.floatX(aesara.shared(X_)) with pm.Model(): x1 = pm.Normal("x1", observed=X_) x2 = pm.Normal("x2", observed=X) @@ -165,21 +165,21 @@ def test_observed_type(self): assert x2.type == X.type -class TestTheanoConfig: +class TestAesaraConfig: def test_set_testval_raise(self): - with theano.config.change_flags(compute_test_value="off"): + with aesara.config.change_flags(compute_test_value="off"): with pm.Model(): - assert theano.config.compute_test_value == "raise" - assert theano.config.compute_test_value == "off" + assert aesara.config.compute_test_value == "raise" + assert aesara.config.compute_test_value == "off" def test_nested(self): - with theano.config.change_flags(compute_test_value="off"): - with pm.Model(theano_config={"compute_test_value": "ignore"}): - assert theano.config.compute_test_value == "ignore" - with pm.Model(theano_config={"compute_test_value": "warn"}): - assert theano.config.compute_test_value == "warn" - assert theano.config.compute_test_value == "ignore" - assert theano.config.compute_test_value == "off" + with aesara.config.change_flags(compute_test_value="off"): + with pm.Model(aesara_config={"compute_test_value": "ignore"}): + assert aesara.config.compute_test_value == "ignore" + with pm.Model(aesara_config={"compute_test_value": "warn"}): + assert aesara.config.compute_test_value == "warn" + assert aesara.config.compute_test_value == "ignore" + assert 
aesara.config.compute_test_value == "off" def test_matrix_multiplication(): @@ -262,7 +262,7 @@ def test_empty_observed(): class TestValueGradFunction(unittest.TestCase): def test_no_extra(self): - a = tt.vector("a") + a = aet.vector("a") a.tag.test_value = np.zeros(3, dtype=a.dtype) a.dshape = (3,) a.dsize = 3 @@ -270,7 +270,7 @@ def test_no_extra(self): assert f_grad.size == 3 def test_invalid_type(self): - a = tt.ivector("a") + a = aet.ivector("a") a.tag.test_value = np.zeros(3, dtype=a.dtype) a.dshape = (3,) a.dsize = 3 @@ -279,19 +279,19 @@ def test_invalid_type(self): err.match("Invalid dtype") def setUp(self): - extra1 = tt.iscalar("extra1") + extra1 = aet.iscalar("extra1") extra1_ = np.array(0, dtype=extra1.dtype) extra1.tag.test_value = extra1_ extra1.dshape = tuple() extra1.dsize = 1 - val1 = tt.vector("val1") + val1 = aet.vector("val1") val1_ = np.zeros(3, dtype=val1.dtype) val1.tag.test_value = val1_ val1.dshape = (3,) val1.dsize = 3 - val2 = tt.matrix("val2") + val2 = aet.matrix("val2") val2_ = np.zeros((2, 3), dtype=val2.dtype) val2.tag.test_value = val2_ val2.dshape = (2, 3) @@ -366,8 +366,8 @@ def test_tensor_type_conversion(self): assert m["x2_missing"].type == gf._extra_vars_shared["x2_missing"].type - def test_theano_switch_broadcast_edge_cases(self): - # Tests against two subtle issues related to a previous bug in Theano where tt.switch would not + def test_aesara_switch_broadcast_edge_cases(self): + # Tests against two subtle issues related to a previous bug in Aesara where aet.switch would not # always broadcast tensors with single values https://github.com/pymc-devs/aesara/issues/270 # Known issue 1: https://github.com/pymc-devs/pymc3/issues/4389 diff --git a/pymc3/tests/test_model_graph.py b/pymc3/tests/test_model_graph.py index d68abafaabd..fe0d10955c2 100644 --- a/pymc3/tests/test_model_graph.py +++ b/pymc3/tests/test_model_graph.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import aesara as th import numpy as np -import theano as th import pymc3 as pm diff --git a/pymc3/tests/test_model_helpers.py b/pymc3/tests/test_model_helpers.py index 20745febad0..72bd1b058a7 100644 --- a/pymc3/tests/test_model_helpers.py +++ b/pymc3/tests/test_model_helpers.py @@ -12,15 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import aesara
+import aesara.sparse as sparse
+import aesara.tensor as aet
 import numpy as np
 import numpy.ma as ma
 import numpy.testing as npt
 import pandas as pd
 import pytest
 import scipy.sparse as sps
-import theano
-import theano.sparse as sparse
-import theano.tensor as tt
+
+from aesara.graph.basic import Variable
+from aesara.tensor.var import TensorConstant, TensorVariable
 
 import pymc3 as pm
 
@@ -37,7 +40,7 @@ def test_pandas_to_array(self, input_dtype):
         dense_input = np.arange(9).reshape((3, 3)).astype(input_dtype)
 
         input_name = "input_variable"
-        theano_graph_input = tt.as_tensor(dense_input, name=input_name)
+        aesara_graph_input = aet.as_tensor(dense_input, name=input_name)
         pandas_input = pd.DataFrame(dense_input)
 
         # All the even numbers are replaced with NaN
@@ -77,22 +80,22 @@ def test_pandas_to_array(self, input_dtype):
             assert func_output.shape == input_value.shape
             npt.assert_allclose(func_output, masked_array_input)
 
-        # Check function behavior with Theano graph variable
-        theano_output = func(theano_graph_input)
-        assert isinstance(theano_output, theano.graph.basic.Variable)
-        npt.assert_allclose(theano_output.eval(), theano_graph_input.eval())
-        intX = pm.theanof._conversion_map[theano.config.floatX]
-        if dense_input.dtype == intX or dense_input.dtype == theano.config.floatX:
-            assert theano_output.owner is None  # func should not have added new nodes
-            assert theano_output.name == input_name
+        # Check function behavior with Aesara graph variable
+        aesara_output = func(aesara_graph_input)
+        assert isinstance(aesara_output, Variable)
+        npt.assert_allclose(aesara_output.eval(), aesara_graph_input.eval())
+        intX = pm.aesaraf._conversion_map[aesara.config.floatX]
+        if dense_input.dtype == intX or dense_input.dtype == aesara.config.floatX:
+            assert aesara_output.owner is None  # func should not have added new nodes
+            assert aesara_output.name == input_name
         else:
-            assert theano_output.owner is not None  # func should have casted
-            assert theano_output.owner.inputs[0].name == input_name
+            assert aesara_output.owner is not None  # func should have casted
+            assert aesara_output.owner.inputs[0].name == input_name
             if "float" in input_dtype:
-                assert theano_output.dtype == theano.config.floatX
+                assert aesara_output.dtype == aesara.config.floatX
             else:
-                assert theano_output.dtype == intX
+                assert aesara_output.dtype == intX
 
         # Check function behavior with generator data
         generator_output = func(square_generator)
@@ -102,15 +105,15 @@ def test_pandas_to_array(self, input_dtype):
         # Make sure the returned object has .set_gen and .set_default methods
         assert hasattr(wrapped, "set_gen")
         assert hasattr(wrapped, "set_default")
-        # Make sure the returned object is a Theano TensorVariable
-        assert isinstance(wrapped, tt.TensorVariable)
+        # Make sure the returned object is an Aesara TensorVariable
+        assert isinstance(wrapped, TensorVariable)
 
     def test_as_tensor(self):
         """
         Check returned values for `data` given known inputs to `as_tensor()`.
 
         Note that ndarrays should return a TensorConstant and sparse inputs
-        should return a Sparse Theano object.
+        should return a Sparse Aesara object.
         """
         # Create the various inputs to the function
         input_name = "testing_inputs"
@@ -137,18 +140,18 @@ def test_as_tensor(self):
         for func_output in [dense_output, sparse_output]:
             assert func_output.missing_values is None
 
-        # Ensure that the Theano variable names are correctly set.
+        # Ensure that the Aesara variable names are correctly set.
# Note that the output for masked inputs do not have their names set # to the passed value. for func_output in [dense_output, sparse_output]: assert func_output.name == input_name # Ensure the that returned functions are all of the correct type - assert isinstance(dense_output, tt.TensorConstant) + assert isinstance(dense_output, TensorConstant) assert sparse.basic._is_sparse_variable(sparse_output) # Masked output is something weird. Just ensure it has missing values - # self.assertIsInstance(masked_output, tt.TensorConstant) + # self.assertIsInstance(masked_output, TensorConstant) assert masked_output.missing_values is not None return None diff --git a/pymc3/tests/test_models_utils.py b/pymc3/tests/test_models_utils.py index 84d25b3c2bf..c6f55f8b090 100644 --- a/pymc3/tests/test_models_utils.py +++ b/pymc3/tests/test_models_utils.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import aesara.tensor as aet import numpy as np import pandas as pd import pytest -import theano.tensor as tt from pymc3.glm import utils @@ -51,7 +51,7 @@ def test_dict_input(self): m, l = utils.any_to_tensor_and_labels(self.data.to_dict("list")) self.assertMatrixLabels(m, l, mt=self.data[l].values, lt=l) - inp = {k: tt.as_tensor_variable(v.values) for k, v in self.data.to_dict("series").items()} + inp = {k: aet.as_tensor_variable(v.values) for k, v in self.data.to_dict("series").items()} m, l = utils.any_to_tensor_and_labels(inp) self.assertMatrixLabels(m, l, mt=self.data[l].values, lt=l) @@ -63,18 +63,18 @@ def test_list_input(self): def test_tensor_input(self): m, l = utils.any_to_tensor_and_labels( - tt.as_tensor_variable(self.data.values.tolist()), labels=["x0", "x1"] + aet.as_tensor_variable(self.data.values.tolist()), labels=["x0", "x1"] ) self.assertMatrixLabels(m, l, lt=["x0", "x1"]) m, l = utils.any_to_tensor_and_labels( - tt.as_tensor_variable(self.data.values.tolist()), labels=["x2", "x3"] + aet.as_tensor_variable(self.data.values.tolist()), labels=["x2", "x3"] ) self.assertMatrixLabels(m, l, lt=["x2", "x3"]) def test_user_mistakes(self): # no labels for tensor variable with pytest.raises(ValueError): - utils.any_to_tensor_and_labels(tt.as_tensor_variable(self.data.values.tolist())) + utils.any_to_tensor_and_labels(aet.as_tensor_variable(self.data.values.tolist())) # len of labels is bad with pytest.raises(ValueError): utils.any_to_tensor_and_labels(self.data.values.tolist(), labels=["x"]) diff --git a/pymc3/tests/test_ode.py b/pymc3/tests/test_ode.py index 1d336bfba67..efdaa31812b 100644 --- a/pymc3/tests/test_ode.py +++ b/pymc3/tests/test_ode.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import aesara import numpy as np import pytest -import theano from scipy.integrate import odeint from scipy.stats import norm @@ -26,13 +26,13 @@ def test_gradients(): - """Tests the computation of the sensitivities from the theano computation graph""" + """Tests the computation of the sensitivities from the aesara computation graph""" # ODE system for which to compute gradients def ode_func(y, t, p): return np.exp(-t) - p[0] * y[0] - # Computation of graidients with Theano + # Computation of graidients with Aesara augmented_ode_func = augment_system(ode_func, 1, 1 + 1) # This is the new system, ODE + Sensitivities, which will be integrated @@ -210,22 +210,22 @@ def system(y, t, p): ode_model = DifferentialEquation(func=system, t0=0, times=times, n_states=1, n_theta=1) - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_too_many_params(self): with pytest.raises(pm.ShapeError): self.ode_model(theta=[1, 1], y0=[0]) - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_too_many_y0(self): with pytest.raises(pm.ShapeError): self.ode_model(theta=[1], y0=[0, 0]) - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_too_few_params(self): with pytest.raises(pm.ShapeError): self.ode_model(theta=[], y0=[1]) - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_too_few_y0(self): with pytest.raises(pm.ShapeError): self.ode_model(theta=[1], y0=[]) diff --git a/pymc3/tests/test_parallel_sampling.py b/pymc3/tests/test_parallel_sampling.py index e458c609a8b..bd1a37abcff 100644 --- a/pymc3/tests/test_parallel_sampling.py +++ b/pymc3/tests/test_parallel_sampling.py @@ -14,12 +14,13 @@ import multiprocessing import os +import aesara +import aesara.tensor as aet import numpy as np import pytest -import theano -import theano.tensor as tt -from theano.compile.ops import as_op +from aesara.compile.ops import as_op +from aesara.tensor.type import TensorType import pymc3 as pm import pymc3.parallel_sampling as ps @@ -60,10 +61,10 @@ def test_bad_unpickle(): assert "could not be unpickled" in str(exc_info.getrepr(style="short")) -tt_vector = tt.TensorType(theano.config.floatX, [False]) +aet_vector = TensorType(aesara.config.floatX, [False]) -@as_op([tt_vector, tt.iscalar], [tt_vector]) +@as_op([aet_vector, aet.iscalar], [aet_vector]) def _crash_remote_process(a, master_pid): if os.getpid() != master_pid: os.exit(0) @@ -80,8 +81,8 @@ def test_remote_pipe_closed(): master_pid = os.getpid() with pm.Model(): x = pm.Normal("x", shape=2, mu=0.1) - tt_pid = tt.as_tensor_variable(np.array(master_pid, dtype="int32")) - pm.Normal("y", mu=_crash_remote_process(x, tt_pid), shape=2) + aet_pid = aet.as_tensor_variable(np.array(master_pid, dtype="int32")) + pm.Normal("y", mu=_crash_remote_process(x, aet_pid), shape=2) step = pm.Metropolis() with pytest.raises(RuntimeError, match="Chain [0-9] failed"): diff --git a/pymc3/tests/test_posdef_sym.py b/pymc3/tests/test_posdef_sym.py index a7aa714357f..cfb406ca1df 100644 --- a/pymc3/tests/test_posdef_sym.py +++ 
b/pymc3/tests/test_posdef_sym.py @@ -12,19 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +import aesara import numpy as np -import theano from pymc3.distributions import multivariate as mv def test_posdef_symmetric1(): - data = np.array([[1.0, 0], [0, 1]], dtype=theano.config.floatX) + data = np.array([[1.0, 0], [0, 1]], dtype=aesara.config.floatX) assert mv.posdef(data) == 1 def test_posdef_symmetric2(): - data = np.array([[1.0, 2], [2, 1]], dtype=theano.config.floatX) + data = np.array([[1.0, 2], [2, 1]], dtype=aesara.config.floatX) assert mv.posdef(data) == 0 @@ -33,11 +33,11 @@ def test_posdef_symmetric3(): Is this correct? """ - data = np.array([[1.0, 1], [1, 1]], dtype=theano.config.floatX) + data = np.array([[1.0, 1], [1, 1]], dtype=aesara.config.floatX) assert mv.posdef(data) == 0 def test_posdef_symmetric4(): - d = np.array([[1, 0.99, 1], [0.99, 1, 0.999], [1, 0.999, 1]], theano.config.floatX) + d = np.array([[1, 0.99, 1], [0.99, 1, 0.999], [1, 0.999, 1]], aesara.config.floatX) assert mv.posdef(d) == 0 diff --git a/pymc3/tests/test_posteriors.py b/pymc3/tests/test_posteriors.py index 453ae98efd8..8ac068bd757 100644 --- a/pymc3/tests/test_posteriors.py +++ b/pymc3/tests/test_posteriors.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import aesara import pytest -import theano from pymc3.tests import sampler_fixtures as sf -@pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") +@pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") class TestNUTSUniform(sf.NutsFixture, sf.UniformFixture): n_samples = 10000 tune = 1000 diff --git a/pymc3/tests/test_quadpotential.py b/pymc3/tests/test_quadpotential.py index d91a80b5e90..aa89f37075b 100644 --- a/pymc3/tests/test_quadpotential.py +++ b/pymc3/tests/test_quadpotential.py @@ -19,8 +19,8 @@ import pymc3 +from pymc3.aesaraf import floatX from pymc3.step_methods.hmc import quadpotential -from pymc3.theanof import floatX def test_elemwise_posdef(): diff --git a/pymc3/tests/test_random.py b/pymc3/tests/test_random.py index 7a4ae42ce22..f88e6f75f96 100644 --- a/pymc3/tests/test_random.py +++ b/pymc3/tests/test_random.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import aesara +import aesara.tensor as aet import numpy as np import numpy.testing as npt import pytest -import theano -import theano.tensor as tt from numpy import random as nr @@ -30,15 +30,15 @@ def test_draw_value(): npt.assert_equal(_draw_value(np.array([5, 6])), [5, 6]) npt.assert_equal(_draw_value(np.array(5.0)), 5) - npt.assert_equal(_draw_value(tt.constant([5.0, 6.0])), [5, 6]) - assert _draw_value(tt.constant(5)) == 5 - npt.assert_equal(_draw_value(2 * tt.constant([5.0, 6.0])), [10, 12]) + npt.assert_equal(_draw_value(aet.constant([5.0, 6.0])), [5, 6]) + assert _draw_value(aet.constant(5)) == 5 + npt.assert_equal(_draw_value(2 * aet.constant([5.0, 6.0])), [10, 12]) - val = theano.shared(np.array([5.0, 6.0])) + val = aesara.shared(np.array([5.0, 6.0])) npt.assert_equal(_draw_value(val), [5, 6]) npt.assert_equal(_draw_value(2 * val), [10, 12]) - a = tt.scalar("a") + a = aet.scalar("a") a.tag.test_value = 6 npt.assert_equal(_draw_value(2 * a, givens=[(a, 1)]), 2) @@ -48,7 +48,7 @@ def test_draw_value(): assert isinstance(_draw_value(5), type(5)) with pm.Model(): - mu = 2 * tt.constant(np.array([5.0, 6.0])) + theano.shared(np.array(5)) + mu = 2 * aet.constant(np.array([5.0, 6.0])) + aesara.shared(np.array(5)) a = pm.Normal("a", mu=mu, sigma=5, shape=2) val1 = _draw_value(a) @@ -68,17 +68,17 @@ def test_vals(self): npt.assert_equal(draw_values([np.array([5, 6])])[0], [5, 6]) npt.assert_equal(draw_values([np.array(5.0)])[0], 5) - npt.assert_equal(draw_values([tt.constant([5.0, 6.0])])[0], [5, 6]) - assert draw_values([tt.constant(5)])[0] == 5 - npt.assert_equal(draw_values([2 * tt.constant([5.0, 6.0])])[0], [10, 12]) + npt.assert_equal(draw_values([aet.constant([5.0, 6.0])])[0], [5, 6]) + assert draw_values([aet.constant(5)])[0] == 5 + npt.assert_equal(draw_values([2 * aet.constant([5.0, 6.0])])[0], [10, 12]) - val = theano.shared(np.array([5.0, 6.0])) + val = aesara.shared(np.array([5.0, 6.0])) npt.assert_equal(draw_values([val])[0], [5, 6]) npt.assert_equal(draw_values([2 * val])[0], [10, 12]) def test_simple_model(self): with pm.Model(): - mu = 2 * tt.constant(np.array([5.0, 6.0])) + theano.shared(np.array(5)) + mu = 2 * aet.constant(np.array([5.0, 6.0])) + aesara.shared(np.array(5)) a = pm.Normal("a", mu=mu, sigma=5, shape=2) val1 = draw_values([a]) @@ -90,7 +90,7 @@ def test_simple_model(self): def test_dep_vars(self): with pm.Model(): - mu = 2 * tt.constant(np.array([5.0, 6.0])) + theano.shared(np.array(5)) + mu = 2 * aet.constant(np.array([5.0, 6.0])) + aesara.shared(np.array(5)) sd = pm.HalfNormal("sd", shape=2) tau = 1 / sd ** 2 a = pm.Normal("a", mu=mu, tau=tau, shape=2) @@ -116,7 +116,7 @@ def test_dep_vars(self): def test_graph_constant(self): # Issue 3595 pointed out that slice(None) can introduce - # theano.graph.basic.Constant into the compute graph, which wasn't + # aesara.graph.basic.Constant into the compute graph, which wasn't # handled correctly by draw_values n_d = 500 n_x = 2 diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index f3f2872c442..243ece046ed 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -18,15 +18,15 @@ from itertools import combinations from typing import Tuple +import aesara +import aesara.tensor as aet import arviz as az import numpy as np import numpy.testing as npt import pytest -import theano -import theano.tensor as tt +from aesara import shared from scipy import stats -from theano import shared import pymc3 as pm @@ -36,7 +36,7 @@ from pymc3.tests.models import simple_init 
-@pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") +@pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") class TestSample(SeededTest): def setup_method(self): super().setup_method() @@ -348,7 +348,7 @@ def test_choose_chains(n_points, tune, expected_length, expected_n_traces): assert expected_n_traces == len(traces) -@pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") +@pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") class TestNamedSampling(SeededTest): def test_shared_named(self): G_var = shared(value=np.atleast_2d(1.0), broadcastable=(True, False), name="G") @@ -362,7 +362,7 @@ def test_shared_named(self): testval=np.atleast_2d(0), ) theta = pm.Normal( - "theta", mu=tt.dot(G_var, theta0), tau=np.atleast_2d(1e20), shape=(1, 1) + "theta", mu=aet.dot(G_var, theta0), tau=np.atleast_2d(1e20), shape=(1, 1) ) res = theta.random() assert np.isclose(res, 0.0) @@ -378,13 +378,13 @@ def test_shared_unnamed(self): testval=np.atleast_2d(0), ) theta = pm.Normal( - "theta", mu=tt.dot(G_var, theta0), tau=np.atleast_2d(1e20), shape=(1, 1) + "theta", mu=aet.dot(G_var, theta0), tau=np.atleast_2d(1e20), shape=(1, 1) ) res = theta.random() assert np.isclose(res, 0.0) def test_constant_named(self): - G_var = tt.constant(np.atleast_2d(1.0), name="G") + G_var = aet.constant(np.atleast_2d(1.0), name="G") with pm.Model(): theta0 = pm.Normal( "theta0", @@ -394,7 +394,7 @@ def test_constant_named(self): testval=np.atleast_2d(0), ) theta = pm.Normal( - "theta", mu=tt.dot(G_var, theta0), tau=np.atleast_2d(1e20), shape=(1, 1) + "theta", mu=aet.dot(G_var, theta0), tau=np.atleast_2d(1e20), shape=(1, 1) ) res = theta.random() @@ -621,8 +621,8 @@ def test_model_not_drawable_prior(self): def test_model_shared_variable(self): x = np.random.randn(100) y = x > 0 - x_shared = theano.shared(x) - y_shared = theano.shared(y) + x_shared = aesara.shared(x) + y_shared = aesara.shared(y) with pm.Model() as model: coeff = pm.Normal("x", mu=0, sd=1) logistic = pm.Deterministic("p", pm.math.sigmoid(coeff * x_shared)) @@ -655,8 +655,8 @@ def test_model_shared_variable(self): npt.assert_allclose(post_pred["p"], expected_p) def test_deterministic_of_observed(self): - meas_in_1 = pm.theanof.floatX(2 + 4 * np.random.randn(10)) - meas_in_2 = pm.theanof.floatX(5 + 4 * np.random.randn(10)) + meas_in_1 = pm.aesaraf.floatX(2 + 4 * np.random.randn(10)) + meas_in_2 = pm.aesaraf.floatX(5 + 4 * np.random.randn(10)) nchains = 2 with pm.Model() as model: mu_in_1 = pm.Normal("mu_in_1", 0, 1) @@ -671,7 +671,7 @@ def test_deterministic_of_observed(self): trace = pm.sample(100, chains=nchains) np.random.seed(0) - rtol = 1e-5 if theano.config.floatX == "float64" else 1e-4 + rtol = 1e-5 if aesara.config.floatX == "float64" else 1e-4 np.random.seed(0) ppc = pm.sample_posterior_predictive( @@ -694,8 +694,8 @@ def test_deterministic_of_observed(self): npt.assert_allclose(ppc["in_1"] + ppc["in_2"], ppc["out"], rtol=rtol) def test_deterministic_of_observed_modified_interface(self): - meas_in_1 = pm.theanof.floatX(2 + 4 * np.random.randn(100)) - meas_in_2 = pm.theanof.floatX(5 + 4 * np.random.randn(100)) + meas_in_1 = pm.aesaraf.floatX(2 + 4 * np.random.randn(100)) + meas_in_2 = pm.aesaraf.floatX(5 + 4 * np.random.randn(100)) with pm.Model() as model: mu_in_1 = pm.Normal("mu_in_1", 0, 1) sigma_in_1 = pm.HalfNormal("sd_in_1", 1) @@ -718,7 +718,7 @@ def 
test_deterministic_of_observed_modified_interface(self): var_names=[x.name for x in (model.deterministics + model.basic_RVs)], ) - rtol = 1e-5 if theano.config.floatX == "float64" else 1e-3 + rtol = 1e-5 if aesara.config.floatX == "float64" else 1e-3 npt.assert_allclose(ppc["in_1"] + ppc["in_2"], ppc["out"], rtol=rtol) ppc = pm.fast_sample_posterior_predictive( @@ -728,7 +728,7 @@ def test_deterministic_of_observed_modified_interface(self): var_names=[x.name for x in (model.deterministics + model.basic_RVs)], ) - rtol = 1e-5 if theano.config.floatX == "float64" else 1e-3 + rtol = 1e-5 if aesara.config.floatX == "float64" else 1e-3 npt.assert_allclose(ppc["in_1"] + ppc["in_2"], ppc["out"], rtol=rtol) def test_variable_type(self): @@ -987,7 +987,7 @@ def test_transformed(self): phi = pm.Beta("phi", alpha=1.0, beta=1.0) kappa_log = pm.Exponential("logkappa", lam=5.0) - kappa = pm.Deterministic("kappa", tt.exp(kappa_log)) + kappa = pm.Deterministic("kappa", aet.exp(kappa_log)) thetas = pm.Beta("thetas", alpha=phi * kappa, beta=(1.0 - phi) * kappa, shape=n) @@ -1053,7 +1053,7 @@ def test_zeroinflatedpoisson(self): def test_bounded_dist(self): with pm.Model() as model: BoundedNormal = pm.Bound(pm.Normal, lower=0.0) - x = BoundedNormal("x", mu=tt.zeros((3, 1)), sd=1 * tt.ones((3, 1)), shape=(3, 1)) + x = BoundedNormal("x", mu=aet.zeros((3, 1)), sd=1 * aet.ones((3, 1)), shape=(3, 1)) with model: prior_trace = pm.sample_prior_predictive(5) diff --git a/pymc3/tests/test_shape_handling.py b/pymc3/tests/test_shape_handling.py index 070535969df..39cd181083a 100644 --- a/pymc3/tests/test_shape_handling.py +++ b/pymc3/tests/test_shape_handling.py @@ -15,7 +15,7 @@ import numpy as np import pytest -from theano import tensor as tt +from aesara import tensor as aet import pymc3 as pm @@ -106,7 +106,7 @@ def fixture_model(): cov = pm.InverseGamma("cov", alpha=1, beta=1) x = pm.Normal("x", mu=np.ones((dim,)), sigma=pm.math.sqrt(cov), shape=(n, dim)) eps = pm.HalfNormal("eps", np.ones((n, 1)), shape=(n, dim)) - mu = pm.Deterministic("mu", tt.sum(x + eps, axis=-1)) + mu = pm.Deterministic("mu", aet.sum(x + eps, axis=-1)) y = pm.Normal("y", mu=mu, sigma=1, shape=(n,)) return model, [cov, x, eps, y] diff --git a/pymc3/tests/test_shared.py b/pymc3/tests/test_shared.py index 723216362fb..247b5ebdb55 100644 --- a/pymc3/tests/test_shared.py +++ b/pymc3/tests/test_shared.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import aesara import numpy as np -import theano import pymc3 as pm @@ -24,7 +24,7 @@ class TestShared(SeededTest): def test_deterministic(self): with pm.Model() as model: data_values = np.array([0.5, 0.4, 5, 2]) - X = theano.shared(np.asarray(data_values, dtype=theano.config.floatX), borrow=True) + X = aesara.shared(np.asarray(data_values, dtype=aesara.config.floatX), borrow=True) pm.Normal("y", 0, 1, observed=X) model.logp(model.test_point) @@ -34,7 +34,7 @@ def test_sample(self): x_pred = np.linspace(-3, 3, 200) - x_shared = theano.shared(x) + x_shared = aesara.shared(x) with pm.Model() as model: b = pm.Normal("b", 0.0, 10.0) diff --git a/pymc3/tests/test_smc.py b/pymc3/tests/test_smc.py index 695ea461f73..b2ebdd65dde 100644 --- a/pymc3/tests/test_smc.py +++ b/pymc3/tests/test_smc.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import aesara.tensor as aet import numpy as np import pytest -import theano.tensor as tt import pymc3 as pm @@ -39,16 +39,16 @@ def setup_class(self): def two_gaussians(x): log_like1 = ( - -0.5 * n * tt.log(2 * np.pi) - - 0.5 * tt.log(dsigma) + -0.5 * n * aet.log(2 * np.pi) + - 0.5 * aet.log(dsigma) - 0.5 * (x - mu1).T.dot(isigma).dot(x - mu1) ) log_like2 = ( - -0.5 * n * tt.log(2 * np.pi) - - 0.5 * tt.log(dsigma) + -0.5 * n * aet.log(2 * np.pi) + - 0.5 * aet.log(dsigma) - 0.5 * (x - mu2).T.dot(isigma).dot(x - mu2) ) - return tt.log(w1 * tt.exp(log_like1) + w2 * tt.exp(log_like2)) + return aet.log(w1 * aet.exp(log_like1) + w2 * aet.exp(log_like2)) with pm.Model() as self.SMC_test: X = pm.Uniform("X", lower=-2, upper=2.0, shape=n) diff --git a/pymc3/tests/test_special_functions.py b/pymc3/tests/test_special_functions.py index e7e2e53cbc4..b293163ad63 100644 --- a/pymc3/tests/test_special_functions.py +++ b/pymc3/tests/test_special_functions.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import aesara.tensor as aet import numpy as np import scipy.special as ss -import theano.tensor as tt -from theano import function +from aesara import function import pymc3.distributions.special as ps @@ -26,10 +26,10 @@ def test_functions(): xvals = list(map(np.atleast_1d, [0.01, 0.1, 2, 100, 10000])) - x = tt.dvector("x") + x = aet.dvector("x") x.tag.test_value = xvals[0] - p = tt.iscalar("p") + p = aet.iscalar("p") p.tag.test_value = 1 gammaln = function([x], ps.gammaln(x)) @@ -55,10 +55,10 @@ def test_functions(): def t_multigamma(): xvals = list(map(np.atleast_1d, [0, 0.1, 2, 100])) - x = tt.dvector("x") + x = aet.dvector("x") x.tag.test_value = xvals[0] - p = tt.iscalar("p") + p = aet.iscalar("p") p.tag.test_value = 1 multigammaln = function([x, p], ps.multigammaln(x, p)) diff --git a/pymc3/tests/test_step.py b/pymc3/tests/test_step.py index 6da70f2a7ab..54b126ba0f3 100644 --- a/pymc3/tests/test_step.py +++ b/pymc3/tests/test_step.py @@ -18,17 +18,18 @@ from math import isclose +import aesara +import aesara.tensor as aet import arviz as az import numpy as np import numpy.testing as npt import pytest -import theano -import theano.tensor as tt +from aesara.compile.ops import as_op +from aesara.graph.op import Op from numpy.testing import assert_array_almost_equal -from theano.compile.ops import as_op -from theano.graph.op import Op +from pymc3.aesaraf import floatX from pymc3.data import Data from pymc3.distributions import ( Bernoulli, @@ -71,7 +72,6 @@ simple_2model_continuous, simple_categorical, ) -from pymc3.theanof import floatX class TestStepMethods: # yield test doesn't work subclassing object @@ -500,7 +500,7 @@ def setup_class(self): def teardown_class(self): shutil.rmtree(self.temp_dir) - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_sample_exact(self): for step_method in self.master_samples: self.check_trace(step_method) @@ -591,7 +591,7 @@ def test_step_continuous(self): self.check_stat(check, trace, step.__class__.__name__) def test_step_discrete(self): - if theano.config.floatX == "float32": + if aesara.config.floatX == "float32": return # Cannot use @skip because it only skips one iteration of the yield start, model, (mu, C) = mv_simple_discrete() unc = np.diag(C) ** 0.5 @@ -657,7 +657,7 @@ class TestCompoundStep: samplers = (Metropolis, Slice, HamiltonianMC, 
NUTS, DEMetropolis) @pytest.mark.skipif( - theano.config.floatX == "float32", reason="Test fails on 32 bit due to linalg issues" + aesara.config.floatX == "float32", reason="Test fails on 32 bit due to linalg issues" ) def test_non_blocked(self): """Test that samplers correctly create non-blocked compound steps.""" @@ -667,7 +667,7 @@ def test_non_blocked(self): assert isinstance(sampler(blocked=False), CompoundStep) @pytest.mark.skipif( - theano.config.floatX == "float32", reason="Test fails on 32 bit due to linalg issues" + aesara.config.floatX == "float32", reason="Test fails on 32 bit due to linalg issues" ) def test_blocked(self): _, model = simple_2model_continuous() @@ -716,17 +716,17 @@ def test_normal_nograd_op(self): with Model() as model: x = Normal("x", 0, 1) - # a custom Theano Op that does not have a grad: - is_64 = theano.config.floatX == "float64" - itypes = [tt.dscalar] if is_64 else [tt.fscalar] - otypes = [tt.dscalar] if is_64 else [tt.fscalar] + # a custom Aesara Op that does not have a grad: + is_64 = aesara.config.floatX == "float64" + itypes = [aet.dscalar] if is_64 else [aet.fscalar] + otypes = [aet.dscalar] if is_64 else [aet.fscalar] @as_op(itypes, otypes) def kill_grad(x): return x data = np.random.normal(size=(100,)) - Normal("y", mu=kill_grad(x), sigma=1, observed=data.astype(theano.config.floatX)) + Normal("y", mu=kill_grad(x), sigma=1, observed=data.astype(aesara.config.floatX)) steps = assign_step_methods(model, []) assert isinstance(steps, Slice) @@ -957,7 +957,7 @@ def test_custom_proposal_dist(self): pass -@pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") +@pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") class TestNutsCheckTrace: def test_multiple_samplers(self, caplog): with Model(): @@ -986,8 +986,8 @@ def test_bad_init_parallel(self): def test_linalg(self, caplog): with Model(): a = Normal("a", shape=2) - a = tt.switch(a > 0, np.inf, a) - b = tt.slinalg.solve(floatX(np.eye(2)), a) + a = aet.switch(a > 0, np.inf, a) + b = aet.slinalg.solve(floatX(np.eye(2)), a) Normal("c", mu=b, shape=2) caplog.clear() trace = sample(20, init=None, tune=5, chains=2) @@ -1440,7 +1440,7 @@ def test_aem_mu_sigma(self): """Test that AEM estimates mu_B and Sigma_B in the coarse models of a 3-level LR example correctly""" # create data for linear regression - if theano.config.floatX == "float32": + if aesara.config.floatX == "float32": p = "float32" else: p = "float64" @@ -1459,12 +1459,12 @@ def test_aem_mu_sigma(self): # forward model Op - here, just the regression equation class ForwardModel(Op): - if theano.config.floatX == "float32": - itypes = [tt.fvector] - otypes = [tt.fvector] + if aesara.config.floatX == "float32": + itypes = [aet.fvector] + otypes = [aet.fvector] else: - itypes = [tt.dvector] - otypes = [tt.dvector] + itypes = [aet.dvector] + otypes = [aet.dvector] def __init__(self, x, pymc3_model): self.x = x @@ -1494,7 +1494,7 @@ def perform(self, node, inputs, outputs): intercept = Normal("Intercept", 0, sigma=20) x_coeff = Normal("x", 0, sigma=20) - theta = tt.as_tensor_variable([intercept, x_coeff]) + theta = aet.as_tensor_variable([intercept, x_coeff]) mout.append(ForwardModel(x, coarse_model_0)) @@ -1514,7 +1514,7 @@ def perform(self, node, inputs, outputs): intercept = Normal("Intercept", 0, sigma=20) x_coeff = Normal("x", 0, sigma=20) - theta = tt.as_tensor_variable([intercept, x_coeff]) + theta = aet.as_tensor_variable([intercept, x_coeff]) mout.append(ForwardModel(x, 
coarse_model_1)) @@ -1533,7 +1533,7 @@ def perform(self, node, inputs, outputs): intercept = Normal("Intercept", 0, sigma=20) x_coeff = Normal("x", 0, sigma=20) - theta = tt.as_tensor_variable([intercept, x_coeff]) + theta = aet.as_tensor_variable([intercept, x_coeff]) mout.append(ForwardModel(x, model)) @@ -1569,7 +1569,7 @@ def test_variance_reduction(self): model with multiple levels where approximate levels have fewer data. """ # arithmetic precision - if theano.config.floatX == "float32": + if aesara.config.floatX == "float32": p = "float32" else: p = "float64" @@ -1601,12 +1601,12 @@ def test_variance_reduction(self): # define likelihoods with different Q class Likelihood1(Op): - if theano.config.floatX == "float32": - itypes = [tt.fvector] - otypes = [tt.fscalar] + if aesara.config.floatX == "float32": + itypes = [aet.fvector] + otypes = [aet.fscalar] else: - itypes = [tt.dvector] - otypes = [tt.dscalar] + itypes = [aet.dvector] + otypes = [aet.dscalar] def __init__(self, x, y, pymc3_model): self.x = x @@ -1624,12 +1624,12 @@ def perform(self, node, inputs, outputs): ) class Likelihood2(Op): - if theano.config.floatX == "float32": - itypes = [tt.fvector] - otypes = [tt.fscalar] + if aesara.config.floatX == "float32": + itypes = [aet.fvector] + otypes = [aet.fscalar] else: - itypes = [tt.dvector] - otypes = [tt.dscalar] + itypes = [aet.dvector] + otypes = [aet.dscalar] def __init__(self, x, y, pymc3_model): self.x = x @@ -1654,7 +1654,7 @@ def perform(self, node, inputs, outputs): coarse_models = [] with Model() as coarse_model_0: - if theano.config.floatX == "float32": + if aesara.config.floatX == "float32": Q = Data("Q", np.float32(0.0)) else: Q = Data("Q", np.float64(0.0)) @@ -1663,7 +1663,7 @@ def perform(self, node, inputs, outputs): intercept = Normal("Intercept", 0, sigma=20) x_coeff = Normal("x", 0, sigma=20) - theta = tt.as_tensor_variable([intercept, x_coeff]) + theta = aet.as_tensor_variable([intercept, x_coeff]) mout.append(f(x_coarse_0, y_coarse_0, coarse_model_0)) Potential("likelihood", mout[0](theta)) @@ -1671,7 +1671,7 @@ def perform(self, node, inputs, outputs): coarse_models.append(coarse_model_0) with Model() as coarse_model_1: - if theano.config.floatX == "float32": + if aesara.config.floatX == "float32": Q = Data("Q", np.float32(0.0)) else: Q = Data("Q", np.float64(0.0)) @@ -1680,7 +1680,7 @@ def perform(self, node, inputs, outputs): intercept = Normal("Intercept", 0, sigma=20) x_coeff = Normal("x", 0, sigma=20) - theta = tt.as_tensor_variable([intercept, x_coeff]) + theta = aet.as_tensor_variable([intercept, x_coeff]) mout.append(f(x_coarse_1, y_coarse_1, coarse_model_1)) Potential("likelihood", mout[1](theta)) @@ -1688,7 +1688,7 @@ def perform(self, node, inputs, outputs): coarse_models.append(coarse_model_1) with Model() as model: - if theano.config.floatX == "float32": + if aesara.config.floatX == "float32": Q = Data("Q", np.float32(0.0)) else: Q = Data("Q", np.float64(0.0)) @@ -1697,7 +1697,7 @@ def perform(self, node, inputs, outputs): intercept = Normal("Intercept", 0, sigma=20) x_coeff = Normal("x", 0, sigma=20) - theta = tt.as_tensor_variable([intercept, x_coeff]) + theta = aet.as_tensor_variable([intercept, x_coeff]) mout.append(f(x, y, model)) Potential("likelihood", mout[-1](theta)) diff --git a/pymc3/tests/test_transforms.py b/pymc3/tests/test_transforms.py index e9ab89938b6..844a9eb389e 100644 --- a/pymc3/tests/test_transforms.py +++ b/pymc3/tests/test_transforms.py @@ -12,14 +12,17 @@ # See the License for the specific language governing 
permissions and # limitations under the License. +import aesara +import aesara.tensor as aet import numpy as np import pytest -import theano -import theano.tensor as tt + +from aesara.tensor.var import TensorConstant import pymc3 as pm import pymc3.distributions.transforms as tr +from pymc3.aesaraf import jacobian from pymc3.tests.checks import close_to, close_to_logical from pymc3.tests.helpers import SeededTest from pymc3.tests.test_distributions import ( @@ -34,38 +37,37 @@ UnitSortedVector, Vector, ) -from pymc3.theanof import jacobian # some transforms (stick breaking) require additon of small slack in order to be numerically # stable. The minimal addable slack for float32 is higher thus we need to be less strict -tol = 1e-7 if theano.config.floatX == "float64" else 1e-6 +tol = 1e-7 if aesara.config.floatX == "float64" else 1e-6 -def check_transform(transform, domain, constructor=tt.dscalar, test=0): +def check_transform(transform, domain, constructor=aet.dscalar, test=0): x = constructor("x") x.tag.test_value = test # test forward and forward_val - forward_f = theano.function([x], transform.forward(x)) + forward_f = aesara.function([x], transform.forward(x)) # test transform identity - identity_f = theano.function([x], transform.backward(transform.forward(x))) + identity_f = aesara.function([x], transform.backward(transform.forward(x))) for val in domain.vals: close_to(val, identity_f(val), tol) close_to(transform.forward_val(val), forward_f(val), tol) def check_vector_transform(transform, domain): - return check_transform(transform, domain, tt.dvector, test=np.array([0, 0])) + return check_transform(transform, domain, aet.dvector, test=np.array([0, 0])) -def get_values(transform, domain=R, constructor=tt.dscalar, test=0): +def get_values(transform, domain=R, constructor=aet.dscalar, test=0): x = constructor("x") x.tag.test_value = test - f = theano.function([x], transform.backward(x)) + f = aesara.function([x], transform.backward(x)) return np.array([f(val) for val in domain.vals]) def check_jacobian_det( - transform, domain, constructor=tt.dscalar, test=0, make_comparable=None, elemwise=False + transform, domain, constructor=aet.dscalar, test=0, make_comparable=None, elemwise=False ): y = constructor("y") y.tag.test_value = test @@ -75,15 +77,15 @@ def check_jacobian_det( x = make_comparable(x) if not elemwise: - jac = tt.log(tt.nlinalg.det(jacobian(x, [y]))) + jac = aet.log(aet.nlinalg.det(jacobian(x, [y]))) else: - jac = tt.log(tt.abs_(tt.diag(jacobian(x, [y])))) + jac = aet.log(aet.abs_(aet.diag(jacobian(x, [y])))) # ljd = log jacobian det - actual_ljd = theano.function([y], jac) + actual_ljd = aesara.function([y], jac) - computed_ljd = theano.function( - [y], tt.as_tensor_variable(transform.jacobian_det(y)), on_unused_input="ignore" + computed_ljd = aesara.function( + [y], aet.as_tensor_variable(transform.jacobian_det(y)), on_unused_input="ignore" ) for yval in domain.vals: @@ -99,27 +101,27 @@ def test_stickbreaking(): check_vector_transform(tr.stick_breaking, Simplex(4)) check_transform( - tr.stick_breaking, MultiSimplex(3, 2), constructor=tt.dmatrix, test=np.zeros((2, 2)) + tr.stick_breaking, MultiSimplex(3, 2), constructor=aet.dmatrix, test=np.zeros((2, 2)) ) def test_stickbreaking_bounds(): - vals = get_values(tr.stick_breaking, Vector(R, 2), tt.dvector, np.array([0, 0])) + vals = get_values(tr.stick_breaking, Vector(R, 2), aet.dvector, np.array([0, 0])) close_to(vals.sum(axis=1), 1, tol) close_to_logical(vals > 0, True, tol) close_to_logical(vals < 1, True, tol) 
check_jacobian_det( - tr.stick_breaking, Vector(R, 2), tt.dvector, np.array([0, 0]), lambda x: x[:-1] + tr.stick_breaking, Vector(R, 2), aet.dvector, np.array([0, 0]), lambda x: x[:-1] ) def test_stickbreaking_accuracy(): val = np.array([-30]) - x = tt.dvector("x") + x = aet.dvector("x") x.tag.test_value = val - identity_f = theano.function([x], tr.stick_breaking.forward(tr.stick_breaking.backward(x))) + identity_f = aesara.function([x], tr.stick_breaking.forward(tr.stick_breaking.backward(x))) close_to(val, identity_f(val), tol) @@ -127,14 +129,16 @@ def test_sum_to_1(): check_vector_transform(tr.sum_to_1, Simplex(2)) check_vector_transform(tr.sum_to_1, Simplex(4)) - check_jacobian_det(tr.sum_to_1, Vector(Unit, 2), tt.dvector, np.array([0, 0]), lambda x: x[:-1]) + check_jacobian_det( + tr.sum_to_1, Vector(Unit, 2), aet.dvector, np.array([0, 0]), lambda x: x[:-1] + ) def test_log(): check_transform(tr.log, Rplusbig) check_jacobian_det(tr.log, Rplusbig, elemwise=True) - check_jacobian_det(tr.log, Vector(Rplusbig, 2), tt.dvector, [0, 0], elemwise=True) + check_jacobian_det(tr.log, Vector(Rplusbig, 2), aet.dvector, [0, 0], elemwise=True) vals = get_values(tr.log) close_to_logical(vals > 0, True, tol) @@ -144,7 +148,7 @@ def test_log_exp_m1(): check_transform(tr.log_exp_m1, Rplusbig) check_jacobian_det(tr.log_exp_m1, Rplusbig, elemwise=True) - check_jacobian_det(tr.log_exp_m1, Vector(Rplusbig, 2), tt.dvector, [0, 0], elemwise=True) + check_jacobian_det(tr.log_exp_m1, Vector(Rplusbig, 2), aet.dvector, [0, 0], elemwise=True) vals = get_values(tr.log_exp_m1) close_to_logical(vals > 0, True, tol) @@ -154,7 +158,7 @@ def test_logodds(): check_transform(tr.logodds, Unit) check_jacobian_det(tr.logodds, Unit, elemwise=True) - check_jacobian_det(tr.logodds, Vector(Unit, 2), tt.dvector, [0.5, 0.5], elemwise=True) + check_jacobian_det(tr.logodds, Vector(Unit, 2), aet.dvector, [0.5, 0.5], elemwise=True) vals = get_values(tr.logodds) close_to_logical(vals > 0, True, tol) @@ -166,7 +170,7 @@ def test_lowerbound(): check_transform(trans, Rplusbig) check_jacobian_det(trans, Rplusbig, elemwise=True) - check_jacobian_det(trans, Vector(Rplusbig, 2), tt.dvector, [0, 0], elemwise=True) + check_jacobian_det(trans, Vector(Rplusbig, 2), aet.dvector, [0, 0], elemwise=True) vals = get_values(trans) close_to_logical(vals > 0, True, tol) @@ -177,7 +181,7 @@ def test_upperbound(): check_transform(trans, Rminusbig) check_jacobian_det(trans, Rminusbig, elemwise=True) - check_jacobian_det(trans, Vector(Rminusbig, 2), tt.dvector, [-1, -1], elemwise=True) + check_jacobian_det(trans, Vector(Rminusbig, 2), aet.dvector, [-1, -1], elemwise=True) vals = get_values(trans) close_to_logical(vals < 0, True, tol) @@ -196,7 +200,7 @@ def test_interval(): close_to_logical(vals < b, True, tol) -@pytest.mark.skipif(theano.config.floatX == "float32", reason="Test fails on 32 bit") +@pytest.mark.skipif(aesara.config.floatX == "float32", reason="Test fails on 32 bit") def test_interval_near_boundary(): lb = -1.0 ub = 1e-7 @@ -219,26 +223,26 @@ def test_circular(): close_to_logical(vals > -np.pi, True, tol) close_to_logical(vals < np.pi, True, tol) - assert isinstance(trans.forward(1), tt.TensorConstant) + assert isinstance(trans.forward(1), TensorConstant) def test_ordered(): check_vector_transform(tr.ordered, SortedVector(6)) - check_jacobian_det(tr.ordered, Vector(R, 2), tt.dvector, np.array([0, 0]), elemwise=False) + check_jacobian_det(tr.ordered, Vector(R, 2), aet.dvector, np.array([0, 0]), elemwise=False) - vals = get_values(tr.ordered, 
Vector(R, 3), tt.dvector, np.zeros(3)) + vals = get_values(tr.ordered, Vector(R, 3), aet.dvector, np.zeros(3)) close_to_logical(np.diff(vals) >= 0, True, tol) -@pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") +@pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_chain(): chain_tranf = tr.Chain([tr.logodds, tr.ordered]) check_vector_transform(chain_tranf, UnitSortedVector(3)) - check_jacobian_det(chain_tranf, Vector(R, 4), tt.dvector, np.zeros(4), elemwise=False) + check_jacobian_det(chain_tranf, Vector(R, 4), aet.dvector, np.zeros(4), elemwise=False) - vals = get_values(chain_tranf, Vector(R, 5), tt.dvector, np.zeros(5)) + vals = get_values(chain_tranf, Vector(R, 5), aet.dvector, np.zeros(5)) close_to_logical(np.diff(vals) >= 0, True, tol) @@ -260,7 +264,7 @@ def check_transform_elementwise_logp(self, model): pt[x.name] = array dist = x.distribution logp_nojac = x0.distribution.logp(dist.transform_used.backward(array)) - jacob_det = dist.transform_used.jacobian_det(theano.shared(array)) + jacob_det = dist.transform_used.jacobian_det(aesara.shared(array)) assert x.logp_elemwiset.ndim == jacob_det.ndim elementwiselogp = logp_nojac + jacob_det @@ -277,7 +281,7 @@ def check_vectortransform_elementwise_logp(self, model, vect_opt=0): pt[x.name] = array dist = x.distribution logp_nojac = x0.distribution.logp(dist.transform_used.backward(array)) - jacob_det = dist.transform_used.jacobian_det(theano.shared(array)) + jacob_det = dist.transform_used.jacobian_det(aesara.shared(array)) assert x.logp_elemwiset.ndim == jacob_det.ndim if vect_opt == 0: @@ -369,7 +373,7 @@ def test_normal_ordered(self): (np.ones(3), (4, 3)), ], ) - @pytest.mark.xfail(condition=(theano.config.floatX == "float32"), reason="Fails on float32") + @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_half_normal_ordered(self, sd, shape): testval = np.sort(np.abs(np.random.randn(*shape))) model = self.build_model( diff --git a/pymc3/tests/test_types.py b/pymc3/tests/test_types.py index bd8eaa42df0..4adf8a62186 100644 --- a/pymc3/tests/test_types.py +++ b/pymc3/tests/test_types.py @@ -14,8 +14,8 @@ from copy import copy +import aesara import numpy as np -import theano from pymc3.distributions import Normal from pymc3.model import Model @@ -27,14 +27,14 @@ class TestType: samplers = (Metropolis, Slice, HamiltonianMC, NUTS) def setup_method(self): - # save theano config object - self.theano_config = copy(theano.config) + # save aesara config object + self.aesara_config = copy(aesara.config) def teardown_method(self): - # restore theano config - theano.config = self.theano_config + # restore aesara config + aesara.config = self.aesara_config - @theano.config.change_flags({"floatX": "float64", "warn_float64": "ignore"}) + @aesara.config.change_flags({"floatX": "float64", "warn_float64": "ignore"}) def test_float64(self): with Model() as model: x = Normal("x", testval=np.array(1.0, dtype="float64")) @@ -47,7 +47,7 @@ def test_float64(self): with model: sample(10, sampler()) - @theano.config.change_flags({"floatX": "float32", "warn_float64": "warn"}) + @aesara.config.change_flags({"floatX": "float32", "warn_float64": "warn"}) def test_float32(self): with Model() as model: x = Normal("x", testval=np.array(1.0, dtype="float32")) @@ -60,7 +60,7 @@ def test_float32(self): with model: sample(10, sampler()) - @theano.config.change_flags({"floatX": "float64", "warn_float64": "ignore"}) + 
@aesara.config.change_flags({"floatX": "float64", "warn_float64": "ignore"}) def test_float64_MLDA(self): data = np.random.randn(5) @@ -78,7 +78,7 @@ def test_float64_MLDA(self): with model: sample(10, MLDA(coarse_models=[coarse_model])) - @theano.config.change_flags({"floatX": "float32", "warn_float64": "warn"}) + @aesara.config.change_flags({"floatX": "float32", "warn_float64": "warn"}) def test_float32_MLDA(self): data = np.random.randn(5).astype("float32") diff --git a/pymc3/tests/test_updates.py b/pymc3/tests/test_updates.py index 9d8f644075c..77dff3f17ad 100644 --- a/pymc3/tests/test_updates.py +++ b/pymc3/tests/test_updates.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import aesara import numpy as np import pytest -import theano from pymc3.variational.updates import ( adadelta, @@ -28,12 +28,12 @@ sgd, ) -_a = theano.shared(1.0) +_a = aesara.shared(1.0) _b = _a * 2 -_m = theano.shared(np.empty((10,), theano.config.floatX)) +_m = aesara.shared(np.empty((10,), aesara.config.floatX)) _n = _m.sum() -_m2 = theano.shared(np.empty((10, 10, 10), theano.config.floatX)) +_m2 = aesara.shared(np.empty((10, 10, 10), aesara.config.floatX)) _n2 = _b + _n + _m2.sum() @@ -71,7 +71,7 @@ ids=["scalar", "matrix", "mixed"], ) def test_updates_fast(opt, loss_and_params, kwargs, getter): - with theano.config.change_flags(compute_test_value="ignore"): + with aesara.config.change_flags(compute_test_value="ignore"): loss, param = getter(loss_and_params) args = dict() args.update(**kwargs) diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py index 1ef9b616290..8e115350b49 100644 --- a/pymc3/tests/test_variational_inference.py +++ b/pymc3/tests/test_variational_inference.py @@ -16,18 +16,18 @@ import io import operator +import aesara +import aesara.tensor as aet import numpy as np import pytest -import theano -import theano.tensor as tt import pymc3 as pm import pymc3.memoize import pymc3.util +from pymc3.aesaraf import intX from pymc3.tests import models from pymc3.tests.helpers import not_raises -from pymc3.theanof import intX from pymc3.variational import flows, opvi from pymc3.variational.approximations import ( Empirical, @@ -51,7 +51,7 @@ def test_callbacks_convergence(diff, ord): cb = pm.variational.callbacks.CheckParametersConvergence(every=1, diff=diff, ord=ord) class _approx: - params = (theano.shared(np.asarray([1, 2, 3])),) + params = (aesara.shared(np.asarray([1, 2, 3])),) approx = _approx() @@ -186,7 +186,7 @@ def test_sample_simple(three_var_approx, request): @pytest.fixture def aevb_initial(): - return theano.shared(np.random.rand(3, 7).astype("float32")) + return aesara.shared(np.random.rand(3, 7).astype("float32")) @pytest.fixture( @@ -251,7 +251,7 @@ def test_sample_aevb(three_var_aevb_approx, aevb_initial): def test_replacements_in_sample_node_aevb(three_var_aevb_approx, aevb_initial): - inp = tt.matrix(dtype="float32") + inp = aet.matrix(dtype="float32") three_var_aevb_approx.sample_node( three_var_aevb_approx.model.one, 2, more_replacements={aevb_initial: inp} ).eval({inp: np.random.rand(7, 7).astype("float32")}) @@ -265,14 +265,14 @@ def test_vae(): minibatch_size = 10 data = pm.floatX(np.random.rand(100)) x_mini = pm.Minibatch(data, minibatch_size) - x_inp = tt.vector() + x_inp = aet.vector() x_inp.tag.test_value = data[:minibatch_size] - ae = theano.shared(pm.floatX([0.1, 0.1])) - be = theano.shared(pm.floatX(1.0)) + ae = aesara.shared(pm.floatX([0.1, 0.1])) + 
be = aesara.shared(pm.floatX(1.0)) - ad = theano.shared(pm.floatX(1.0)) - bd = theano.shared(pm.floatX(1.0)) + ad = aesara.shared(pm.floatX(1.0)) + bd = aesara.shared(pm.floatX(1.0)) enc = x_inp.dimshuffle(0, "x") * ae.dimshuffle("x", 0) + be mu, rho = enc[:, 0], enc[:, 1] @@ -496,8 +496,8 @@ def test_elbo(): sigma = 1.0 y_obs = np.array([1.6, 1.4]) - post_mu = np.array([1.88], dtype=theano.config.floatX) - post_sigma = np.array([1], dtype=theano.config.floatX) + post_mu = np.array([1.88], dtype=aesara.config.floatX) + post_sigma = np.array([1], dtype=aesara.config.floatX) # Create a model for test with pm.Model() as model: mu = pm.Normal("mu", mu=mu0, sigma=sigma) @@ -505,13 +505,13 @@ def test_elbo(): # Create variational gradient tensor mean_field = MeanField(model=model) - with theano.config.change_flags(compute_test_value="off"): + with aesara.config.change_flags(compute_test_value="off"): elbo = -pm.operators.KL(mean_field)()(10000) mean_field.shared_params["mu"].set_value(post_mu) mean_field.shared_params["rho"].set_value(np.log(np.exp(post_sigma) - 1)) - f = theano.function([], elbo) + f = aesara.function([], elbo) elbo_mc = f() # Exact value @@ -534,17 +534,17 @@ def test_scale_cost_to_minibatch_works(aux_total_size): y_obs = np.array([1.6, 1.4]) beta = len(y_obs) / float(aux_total_size) - # TODO: theano_config - # with pm.Model(theano_config=dict(floatX='float64')): + # TODO: aesara_config + # with pm.Model(aesara_config=dict(floatX='float64')): # did not not work as expected # there were some numeric problems, so float64 is forced - with theano.config.change_flags(floatX="float64", warn_float64="ignore"): + with aesara.config.change_flags(floatX="float64", warn_float64="ignore"): - assert theano.config.floatX == "float64" - assert theano.config.warn_float64 == "ignore" + assert aesara.config.floatX == "float64" + assert aesara.config.warn_float64 == "ignore" - post_mu = np.array([1.88], dtype=theano.config.floatX) - post_sigma = np.array([1], dtype=theano.config.floatX) + post_mu = np.array([1.88], dtype=aesara.config.floatX) + post_sigma = np.array([1], dtype=aesara.config.floatX) with pm.Model(): mu = pm.Normal("mu", mu=mu0, sigma=sigma) @@ -555,7 +555,7 @@ def test_scale_cost_to_minibatch_works(aux_total_size): mean_field_1.shared_params["mu"].set_value(post_mu) mean_field_1.shared_params["rho"].set_value(np.log(np.exp(post_sigma) - 1)) - with theano.config.change_flags(compute_test_value="off"): + with aesara.config.change_flags(compute_test_value="off"): elbo_via_total_size_scaled = -pm.operators.KL(mean_field_1)()(10000) with pm.Model(): @@ -569,7 +569,7 @@ def test_scale_cost_to_minibatch_works(aux_total_size): mean_field_2.shared_params["mu"].set_value(post_mu) mean_field_2.shared_params["rho"].set_value(np.log(np.exp(post_sigma) - 1)) - with theano.config.change_flags(compute_test_value="off"): + with aesara.config.change_flags(compute_test_value="off"): elbo_via_total_size_unscaled = -pm.operators.KL(mean_field_2)()(10000) np.testing.assert_allclose( @@ -587,10 +587,10 @@ def test_elbo_beta_kl(aux_total_size): y_obs = np.array([1.6, 1.4]) beta = len(y_obs) / float(aux_total_size) - with theano.config.change_flags(floatX="float64", warn_float64="ignore"): + with aesara.config.change_flags(floatX="float64", warn_float64="ignore"): - post_mu = np.array([1.88], dtype=theano.config.floatX) - post_sigma = np.array([1], dtype=theano.config.floatX) + post_mu = np.array([1.88], dtype=aesara.config.floatX) + post_sigma = np.array([1], dtype=aesara.config.floatX) with pm.Model(): 
mu = pm.Normal("mu", mu=mu0, sigma=sigma) @@ -601,7 +601,7 @@ def test_elbo_beta_kl(aux_total_size): mean_field_1.shared_params["mu"].set_value(post_mu) mean_field_1.shared_params["rho"].set_value(np.log(np.exp(post_sigma) - 1)) - with theano.config.change_flags(compute_test_value="off"): + with aesara.config.change_flags(compute_test_value="off"): elbo_via_total_size_scaled = -pm.operators.KL(mean_field_1)()(10000) with pm.Model(): @@ -612,7 +612,7 @@ def test_elbo_beta_kl(aux_total_size): mean_field_3.shared_params["mu"].set_value(post_mu) mean_field_3.shared_params["rho"].set_value(np.log(np.exp(post_sigma) - 1)) - with theano.config.change_flags(compute_test_value="off"): + with aesara.config.change_flags(compute_test_value="off"): elbo_via_beta_kl = -pm.operators.KL(mean_field_3, beta=beta)()(10000) np.testing.assert_allclose( @@ -750,7 +750,7 @@ def test_remove_scan_op(): inference = ADVI() buff = io.StringIO() inference.run_profiling(n=10).summary(buff) - assert "theano.scan.op.Scan" not in buff.getvalue() + assert "aesara.scan.op.Scan" not in buff.getvalue() buff.close() @@ -780,7 +780,7 @@ def test_clear_cache(): def another_simple_model(): _model = models.simple_model()[1] with _model: - pm.Potential("pot", tt.ones((10, 10))) + pm.Potential("pot", aet.ones((10, 10))) return _model @@ -831,8 +831,8 @@ def aevb_model(): pm.Normal("y", shape=(2,)) x = model.x y = model.y - mu = theano.shared(x.init_value) - rho = theano.shared(np.zeros_like(x.init_value)) + mu = aesara.shared(x.init_value) + rho = aesara.shared(np.zeros_like(x.init_value)) return {"model": model, "y": y, "x": x, "replace": dict(mu=mu, rho=rho)} @@ -911,13 +911,13 @@ def binomial_model_inference(binomial_model, inference_spec): def test_replacements(binomial_model_inference): - d = tt.bscalar() + d = aet.bscalar() d.tag.test_value = 1 approx = binomial_model_inference.approx p = approx.model.p p_t = p ** 3 p_s = approx.sample_node(p_t) - if theano.config.compute_test_value != "off": + if aesara.config.compute_test_value != "off": assert p_s.tag.test_value.shape == p_t.tag.test_value.shape sampled = [p_s.eval() for _ in range(100)] assert any(map(operator.ne, sampled[1:], sampled[:-1])) # stochastic @@ -934,13 +934,13 @@ def test_replacements(binomial_model_inference): def test_sample_replacements(binomial_model_inference): - i = tt.iscalar() + i = aet.iscalar() i.tag.test_value = 1 approx = binomial_model_inference.approx p = approx.model.p p_t = p ** 3 p_s = approx.sample_node(p_t, size=100) - if theano.config.compute_test_value != "off": + if aesara.config.compute_test_value != "off": assert p_s.tag.test_value.shape == (100,) + p_t.tag.test_value.shape sampled = p_s.eval() assert any(map(operator.ne, sampled[1:], sampled[:-1])) # stochastic @@ -961,7 +961,7 @@ def test_discrete_not_allowed(): with pm.Model(): mu = pm.Normal("mu", mu=0, sigma=10, shape=3) - z = pm.Categorical("z", p=tt.ones(3) / 3, shape=len(y)) + z = pm.Categorical("z", p=aet.ones(3) / 3, shape=len(y)) pm.Normal("y_obs", mu=mu[z], sigma=1.0, observed=y) with pytest.raises(opvi.ParametrizationError): pm.fit(n=1) # fails @@ -1016,34 +1016,34 @@ def init_(**kw): def test_flow_det(flow_spec): - z0 = tt.arange(0, 20).astype("float32") + z0 = aet.arange(0, 20).astype("float32") flow = flow_spec(dim=20, z0=z0.dimshuffle("x", 0)) - with theano.config.change_flags(compute_test_value="off"): + with aesara.config.change_flags(compute_test_value="off"): z1 = flow.forward.flatten() - J = tt.jacobian(z1, z0) - logJdet = tt.log(tt.abs_(tt.nlinalg.det(J))) + J = 
aet.jacobian(z1, z0) + logJdet = aet.log(aet.abs_(aet.nlinalg.det(J))) det = flow.logdet[0] np.testing.assert_allclose(logJdet.eval(), det.eval(), atol=0.0001) def test_flow_det_local(flow_spec): - z0 = tt.arange(0, 12).astype("float32") + z0 = aet.arange(0, 12).astype("float32") spec = flow_spec.cls.get_param_spec_for(d=12) params = dict() for k, shp in spec.items(): params[k] = np.random.randn(1, *shp).astype("float32") flow = flow_spec(dim=12, z0=z0.reshape((1, 1, 12)), **params) assert flow.batched - with theano.config.change_flags(compute_test_value="off"): + with aesara.config.change_flags(compute_test_value="off"): z1 = flow.forward.flatten() - J = tt.jacobian(z1, z0) - logJdet = tt.log(tt.abs_(tt.nlinalg.det(J))) + J = aet.jacobian(z1, z0) + logJdet = aet.log(aet.abs_(aet.nlinalg.det(J))) det = flow.logdet[0] np.testing.assert_allclose(logJdet.eval(), det.eval(), atol=0.0001) def test_flows_collect_chain(): - initial = tt.ones((3, 2)) + initial = aet.ones((3, 2)) flow1 = flows.PlanarFlow(dim=2, z0=initial) flow2 = flows.PlanarFlow(dim=2, z0=flow1) assert len(flow2.params) == 3 @@ -1067,4 +1067,4 @@ def test_flow_formula(formula, length, order): assert len(flows_list) == length if order is not None: assert flows_list == order - spec(dim=2, jitter=1)(tt.ones((3, 2))).eval() # should work + spec(dim=2, jitter=1)(aet.ones((3, 2))).eval() # should work diff --git a/pymc3/tuning/scaling.py b/pymc3/tuning/scaling.py index 49a59ff0d74..41d2af28203 100644 --- a/pymc3/tuning/scaling.py +++ b/pymc3/tuning/scaling.py @@ -16,9 +16,9 @@ from numpy import exp, log, sqrt +from pymc3.aesaraf import hessian_diag, inputvars from pymc3.blocking import ArrayOrdering, DictToArrayBijection from pymc3.model import Point, modelcontext -from pymc3.theanof import hessian_diag, inputvars from pymc3.util import get_var_name __all__ = ["find_hessian", "trace_cov", "guess_scaling"] diff --git a/pymc3/tuning/starting.py b/pymc3/tuning/starting.py index 2a800b2b4dd..fcdd4fe8c4d 100644 --- a/pymc3/tuning/starting.py +++ b/pymc3/tuning/starting.py @@ -19,8 +19,8 @@ """ import copy +import aesara.gradient as tg import numpy as np -import theano.gradient as tg from fastprogress.fastprogress import ProgressBar, progress_bar from numpy import isfinite, nan_to_num @@ -28,9 +28,9 @@ import pymc3 as pm +from pymc3.aesaraf import inputvars from pymc3.blocking import ArrayOrdering, DictToArrayBijection from pymc3.model import Point, modelcontext -from pymc3.theanof import inputvars from pymc3.util import ( check_start_vals, get_default_varnames, diff --git a/pymc3/util.py b/pymc3/util.py index 84b4f6c3e5f..f0429901f8e 100644 --- a/pymc3/util.py +++ b/pymc3/util.py @@ -22,7 +22,7 @@ import numpy as np import xarray -from theano.tensor import TensorVariable +from aesara.tensor.var import TensorVariable from pymc3.exceptions import SamplingError @@ -169,7 +169,7 @@ def get_repr_for_variable(variable, formatting="plain"): def get_var_name(var): """Get an appropriate, plain variable name for a variable. Necessary - because we override theano.tensor.TensorVariable.__str__ to give informative + because we override aesara.tensor.var.TensorVariable.__str__ to give informative string representations to our pymc3.PyMC3Variables, yet we want to use the plain name as e.g. keys in dicts. 
""" diff --git a/pymc3/variational/approximations.py b/pymc3/variational/approximations.py index 896f7422c3d..4b6784f2efd 100644 --- a/pymc3/variational/approximations.py +++ b/pymc3/variational/approximations.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import aesara import numpy as np -import theano -from theano import tensor as tt +from aesara import tensor as aet +from aesara.graph.basic import Variable +from aesara.tensor.var import TensorVariable import pymc3 as pm @@ -53,13 +55,13 @@ def cov(self): if self.batched: return batched_diag(var) else: - return tt.diag(var) + return aet.diag(var) @node_property def std(self): return rho2sigma(self.rho) - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def __init_group__(self, group): super().__init_group__(group) if not self._check_user_params(): @@ -82,8 +84,8 @@ def create_shared_params(self, start=None): start = np.tile(start, (self.bdim, 1)) rho = np.tile(rho, (self.bdim, 1)) return { - "mu": theano.shared(pm.floatX(start), "mu"), - "rho": theano.shared(pm.floatX(rho), "rho"), + "mu": aesara.shared(pm.floatX(start), "mu"), + "rho": aesara.shared(pm.floatX(rho), "rho"), } @node_property @@ -97,7 +99,7 @@ def symbolic_random(self): def symbolic_logq_not_scaled(self): z0 = self.symbolic_initial std = rho2sigma(self.rho) - logdet = tt.log(std) + logdet = aet.log(std) logq = pm.Normal.dist().logp(z0) - logdet return logq.sum(range(1, logq.ndim)) @@ -114,7 +116,7 @@ class FullRankGroup(Group): short_name = "full_rank" alias_names = frozenset(["fr"]) - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def __init_group__(self, group): super().__init_group__(group) if not self._check_user_params(): @@ -133,21 +135,21 @@ def create_shared_params(self, start=None): else: start = self.bij.map(start) n = self.ddim - L_tril = np.eye(n)[np.tril_indices(n)].astype(theano.config.floatX) + L_tril = np.eye(n)[np.tril_indices(n)].astype(aesara.config.floatX) if self.batched: start = np.tile(start, (self.bdim, 1)) L_tril = np.tile(L_tril, (self.bdim, 1)) - return {"mu": theano.shared(start, "mu"), "L_tril": theano.shared(L_tril, "L_tril")} + return {"mu": aesara.shared(start, "mu"), "L_tril": aesara.shared(L_tril, "L_tril")} @node_property def L(self): if self.batched: - L = tt.zeros((self.ddim, self.ddim, self.bdim)) - L = tt.set_subtensor(L[self.tril_indices], self.params_dict["L_tril"].T) + L = aet.zeros((self.ddim, self.ddim, self.bdim)) + L = aet.set_subtensor(L[self.tril_indices], self.params_dict["L_tril"].T) L = L.dimshuffle(2, 0, 1) else: - L = tt.zeros((self.ddim, self.ddim)) - L = tt.set_subtensor(L[self.tril_indices], self.params_dict["L_tril"]) + L = aet.zeros((self.ddim, self.ddim)) + L = aet.set_subtensor(L[self.tril_indices], self.params_dict["L_tril"]) return L @node_property @@ -158,16 +160,16 @@ def mean(self): def cov(self): L = self.L if self.batched: - return tt.batched_dot(L, L.swapaxes(-1, -2)) + return aet.batched_dot(L, L.swapaxes(-1, -2)) else: return L.dot(L.T) @node_property def std(self): if self.batched: - return tt.sqrt(batched_diag(self.cov)) + return aet.sqrt(batched_diag(self.cov)) else: - return tt.sqrt(tt.diag(self.cov)) + return aet.sqrt(aet.diag(self.cov)) @property def num_tril_entries(self): @@ -189,7 +191,7 @@ def logq(z_b, mu_b, L_b): # it's gonna be so slow # scan is computed over batch and then summed up # 
output shape is (batch, samples) - return theano.scan(logq, [z.swapaxes(0, 1), self.mean, self.L])[0].sum(0) + return aesara.scan(logq, [z.swapaxes(0, 1), self.mean, self.L])[0].sum(0) else: return pm.MvNormal.dist(mu=self.mean, chol=self.L).logp(z) @@ -202,7 +204,7 @@ def symbolic_random(self): # initial: bxsxd # L: bxdxd initial = initial.swapaxes(0, 1) - return tt.batched_dot(initial, L.swapaxes(1, 2)).swapaxes(0, 1) + mu + return aet.batched_dot(initial, L.swapaxes(1, 2)).swapaxes(0, 1) + mu else: return initial.dot(L.T) + mu @@ -218,7 +220,7 @@ class EmpiricalGroup(Group): __param_spec__ = dict(histogram=("s", "d")) short_name = "empirical" - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def __init_group__(self, group): super().__init_group__(group) self._check_trace() @@ -254,7 +256,7 @@ def create_shared_params(self, trace=None, size=None, jitter=1, start=None): for j in range(len(trace)): histogram[i] = self.bij.map(trace.point(j, t)) i += 1 - return dict(histogram=theano.shared(pm.floatX(histogram), "histogram")) + return dict(histogram=aesara.shared(pm.floatX(histogram), "histogram")) def _check_trace(self): trace = self._kwargs.get("trace", None) @@ -264,7 +266,7 @@ def _check_trace(self): def randidx(self, size=None): if size is None: size = (1,) - elif isinstance(size, tt.TensorVariable): + elif isinstance(size, TensorVariable): if size.ndim < 1: size = size[None] elif size.ndim > 1: @@ -278,16 +280,16 @@ def randidx(self, size=None): ).astype("int32") def _new_initial(self, size, deterministic, more_replacements=None): - theano_condition_is_here = isinstance(deterministic, tt.Variable) - if theano_condition_is_here: - return tt.switch( + aesara_condition_is_here = isinstance(deterministic, Variable) + if aesara_condition_is_here: + return aet.switch( deterministic, - tt.repeat(self.mean.dimshuffle("x", 0), size if size is not None else 1, -1), + aet.repeat(self.mean.dimshuffle("x", 0), size if size is not None else 1, -1), self.histogram[self.randidx(size)], ) else: if deterministic: - return tt.repeat(self.mean.dimshuffle("x", 0), size if size is not None else 1, -1) + return aet.repeat(self.mean.dimshuffle("x", 0), size if size is not None else 1, -1) else: return self.histogram[self.randidx(size)] @@ -310,10 +312,10 @@ def cov(self): @node_property def std(self): - return tt.sqrt(tt.diag(self.cov)) + return aet.sqrt(aet.diag(self.cov)) def __str__(self): - if isinstance(self.histogram, theano.compile.SharedVariable): + if isinstance(self.histogram, aesara.compile.SharedVariable): shp = ", ".join(map(str, self.histogram.shape.eval())) else: shp = "None, " + str(self.ddim) @@ -370,7 +372,7 @@ class NormalizingFlowGroup(Group): """ default_flow = "scale-loc" - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def __init_group__(self, group): super().__init_group__(group) # objects to be resolved @@ -584,7 +586,7 @@ def evaluate_over_trace(self, node): Parameters ---------- - node: Theano Variables (or Theano expressions) + node: Aesara Variables (or Aesara expressions) Returns ------- @@ -593,9 +595,9 @@ def evaluate_over_trace(self, node): node = self.to_flat_input(node) def sample(post): - return theano.clone(node, {self.input: post}) + return aesara.clone_replace(node, {self.input: post}) - nodes, _ = theano.scan(sample, self.histogram) + nodes, _ = aesara.scan(sample, self.histogram) return nodes diff --git a/pymc3/variational/flows.py 
b/pymc3/variational/flows.py index 601c7351fa7..f78c32e69bb 100644 --- a/pymc3/variational/flows.py +++ b/pymc3/variational/flows.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import aesara import numpy as np -import theano -from theano import tensor as tt +from aesara import tensor as aet from pymc3.distributions.dist_math import rho2sigma from pymc3.memoize import WithMemoization @@ -161,14 +161,14 @@ def __init__(self, z0=None, dim=None, jitter=0.001, batch_size=None, local=False "Cannot infer dimension of flow, " "please provide dim or Flow instance as z0" ) if z0 is None: - self.z0 = tt.matrix() # type: tt.TensorVariable + self.z0 = aet.matrix() # type: TensorVariable else: - self.z0 = tt.as_tensor(z0) + self.z0 = aet.as_tensor(z0) self.parent = parent def add_param(self, user=None, name=None, ref=0.0, dtype="floatX"): if dtype == "floatX": - dtype = theano.config.floatX + dtype = aesara.config.floatX spec = self.__param_spec__[name] shape = tuple(eval(s, {"d": self.dim}) for s in spec) if user is None: @@ -178,7 +178,7 @@ def add_param(self, user=None, name=None, ref=0.0, dtype="floatX"): if self.batch_size is None: raise opvi.BatchedGroupError("Need batch size to infer parameter shape") shape = (self.batch_size,) + shape - return theano.shared( + return aesara.shared( np.asarray(np.random.normal(size=shape) * self.__jitter + ref).astype(dtype), name=name, ) @@ -189,7 +189,7 @@ def add_param(self, user=None, name=None, ref=0.0, dtype="floatX"): shape = (-1,) + shape else: shape = (self.batch_size,) + shape - return tt.as_tensor(user).reshape(shape) + return aet.as_tensor(user).reshape(shape) @property def params(self): @@ -205,14 +205,14 @@ def all_params(self): return params @property - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def sum_logdets(self): dets = [self.logdet] current = self while not current.isroot: current = current.parent dets.append(current.logdet) - return tt.add(*dets) + return aet.add(*dets) @node_property def forward(self): @@ -222,9 +222,9 @@ def forward(self): def logdet(self): raise NotImplementedError - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def forward_pass(self, z0): - ret = theano.clone(self.forward, {self.root.z0: z0}) + ret = aesara.clone_replace(self.forward, {self.root.z0: z0}) try: ret.tag.test_value = np.random.normal(size=z0.tag.test_value.shape).astype( self.z0.dtype @@ -297,7 +297,7 @@ def __call__(self, *args): class LinearFlow(AbstractFlow): __param_spec__ = dict(u=("d",), w=("d",), b=()) - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def __init__(self, h, u=None, w=None, b=None, **kwargs): self.h = h super().__init__(**kwargs) @@ -325,7 +325,7 @@ def forward(self): if not self.batched: hwz = h(z.dot(w) + b) # s # sxd + (s \outer d) = sxd - z1 = z + tt.outer(hwz, u) # sxd + z1 = z + aet.outer(hwz, u) # sxd return z1 else: z = z.swapaxes(0, 1) @@ -334,7 +334,7 @@ def forward(self): # w bxd b = b.dimshuffle(0, "x") # b bx- - hwz = h(tt.batched_dot(z, w) + b) # bxs + hwz = h(aet.batched_dot(z, w) + b) # bxs # bxsxd + (bxsx- * bx-xd) = bxsxd hwz = hwz.dimshuffle(0, 1, "x") # bxsx- u = u.dimshuffle(0, "x", 1) # bx-xd @@ -352,8 +352,8 @@ def logdet(self): # f'(sxd \dot d + .) * -xd = sxd phi = deriv(z.dot(w) + b).dimshuffle(0, "x") * w.dimshuffle("x", 0) # \abs(. 
+ sxd \dot d) = s - det = tt.abs_(1.0 + phi.dot(u)) - return tt.log(det) + det = aet.abs_(1.0 + phi.dot(u)) + return aet.log(det) else: z = z.swapaxes(0, 1) b = b.dimshuffle(0, "x") @@ -362,20 +362,20 @@ def logdet(self): # w bxd # b bx-x- # f'(bxsxd \bdot bxd + bx-x-) * bx-xd = bxsxd - phi = deriv(tt.batched_dot(z, w) + b).dimshuffle(0, 1, "x") * w.dimshuffle(0, "x", 1) + phi = deriv(aet.batched_dot(z, w) + b).dimshuffle(0, 1, "x") * w.dimshuffle(0, "x", 1) # \abs(. + bxsxd \bdot bxd) = bxs - det = tt.abs_(1.0 + tt.batched_dot(phi, u)) # bxs - return tt.log(det).sum(0) # s + det = aet.abs_(1.0 + aet.batched_dot(phi, u)) # bxs + return aet.log(det).sum(0) # s class Tanh(FlowFn): - fn = tt.tanh - inv = tt.arctanh + fn = aet.tanh + inv = aet.arctanh @staticmethod def deriv(*args): (x,) = args - return 1.0 - tt.tanh(x) ** 2 + return 1.0 - aet.tanh(x) ** 2 @AbstractFlow.register @@ -390,7 +390,7 @@ def make_uw(self, u, w): # u_: d # w_: d wu = u.dot(w) # . - mwu = -1.0 + tt.nnet.softplus(wu) # . + mwu = -1.0 + aet.nnet.softplus(wu) # . # d + (. - .) * d / . u_h = u + (mwu - wu) * w / ((w ** 2).sum() + 1e-10) return u_h, w @@ -398,7 +398,7 @@ def make_uw(self, u, w): # u_: bxd # w_: bxd wu = (u * w).sum(-1, keepdims=True) # bx- - mwu = -1.0 + tt.nnet.softplus(wu) # bx- + mwu = -1.0 + aet.nnet.softplus(wu) # bx- # bxd + (bx- - bx-) * bxd / bx- = bxd u_h = u + (mwu - wu) * w / ((w ** 2).sum(-1, keepdims=True) + 1e-10) return u_h, w @@ -407,7 +407,7 @@ def make_uw(self, u, w): class ReferencePointFlow(AbstractFlow): __param_spec__ = dict(a=(), b=(), z_ref=("d",)) - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def __init__(self, h, a=None, b=None, z_ref=None, **kwargs): super().__init__(**kwargs) a = self.add_param(a, "a") @@ -474,7 +474,7 @@ def logdet(self): r = (z - z_ref).norm(2, axis=-1, keepdims=True) # s har = h(a, r) dar = deriv(a, r) - logdet = tt.log((1.0 + b * har) ** (d - 1.0) * (1.0 + b * har + b * dar * r)) + logdet = aet.log((1.0 + b * har) ** (d - 1.0) * (1.0 + b * har + b * dar * r)) if self.batched: return logdet.sum([0, -1]) else: @@ -506,8 +506,8 @@ def __init__(self, **kwargs): super().__init__(Radial(), **kwargs) def make_ab(self, a, b): - a = tt.exp(a) - b = -a + tt.nnet.softplus(b) + a = aet.exp(a) + b = -a + aet.nnet.softplus(b) return a, b @@ -531,7 +531,7 @@ def forward(self): @node_property def logdet(self): - return tt.zeros((self.z0.shape[0],)) + return aet.zeros((self.z0.shape[0],)) @AbstractFlow.register @@ -539,7 +539,7 @@ class ScaleFlow(AbstractFlow): __param_spec__ = dict(rho=("d",)) short_name = "scale" - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def __init__(self, rho=None, **kwargs): super().__init__(**kwargs) rho = self.add_param(rho, "rho") @@ -556,7 +556,7 @@ def forward(self): @node_property def logdet(self): - return tt.repeat(tt.sum(tt.log(self.scale)), self.z0.shape[0]) + return aet.repeat(aet.sum(aet.log(self.scale)), self.z0.shape[0]) @AbstractFlow.register @@ -564,18 +564,18 @@ class HouseholderFlow(AbstractFlow): __param_spec__ = dict(v=("d",)) short_name = "hh" - @theano.config.change_flags(compute_test_value="raise") + @aesara.config.change_flags(compute_test_value="raise") def __init__(self, v=None, **kwargs): super().__init__(**kwargs) v = self.add_param(v, "v") self.shared_params = dict(v=v) if self.batched: vv = v.dimshuffle(0, 1, "x") * v.dimshuffle(0, "x", 1) - I = tt.eye(self.dim).dimshuffle("x", 0, 1) + I = 
aet.eye(self.dim).dimshuffle("x", 0, 1) vvn = (1e-10 + (v ** 2).sum(-1)).dimshuffle(0, "x", "x") else: - vv = tt.outer(v, v) - I = tt.eye(self.dim) + vv = aet.outer(v, v) + I = aet.eye(self.dim) vvn = (v ** 2).sum(-1) + 1e-10 self.H = I - 2.0 * vv / vvn @@ -584,10 +584,10 @@ def forward(self): z = self.z0 # sxd H = self.H # dxd if self.batched: - return tt.batched_dot(z.swapaxes(0, 1), H).swapaxes(0, 1) + return aet.batched_dot(z.swapaxes(0, 1), H).swapaxes(0, 1) else: return z.dot(H) @node_property def logdet(self): - return tt.zeros((self.z0.shape[0],)) + return aet.zeros((self.z0.shape[0],)) diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index 85eb08e65c0..1b77104c60a 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -130,7 +130,7 @@ def fit(self, n=10000, score=None, callbacks=None, progressbar=True, **kwargs): total_grad_norm_constraint: `float` Bounds gradient norm, prevents exploding gradient problem fn_kwargs: `dict` - Add kwargs to theano.function (e.g. `{'profile': True}`) + Add kwargs to aesara.function (e.g. `{'profile': True}`) more_replacements: `dict` Apply custom replacements before calculating gradients @@ -423,7 +423,7 @@ class ADVI(KLqp): The tensors to which mini-bathced samples are supplied are handled separately by using callbacks in :func:`Inference.fit` method - that change storage of shared theano variable or by :func:`pymc3.generator` + that change storage of shared aesara variable or by :func:`pymc3.generator` that automatically iterates over minibatches and defined beforehand. - (optional) Parameters of deterministic mappings @@ -794,7 +794,7 @@ def fit( total_grad_norm_constraint: `float` Bounds gradient norm, prevents exploding gradient problem fn_kwargs: `dict` - Add kwargs to theano.function (e.g. `{'profile': True}`) + Add kwargs to aesara.function (e.g. `{'profile': True}`) more_replacements: `dict` Apply custom replacements before calculating gradients diff --git a/pymc3/variational/operators.py b/pymc3/variational/operators.py index 9a5c2fdc200..e69d9c447e4 100644 --- a/pymc3/variational/operators.py +++ b/pymc3/variational/operators.py @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import theano +import aesara -from theano import tensor as tt +from aesara import tensor as aet import pymc3 as pm @@ -75,7 +75,7 @@ def __init__(self, op, tf): raise opvi.ParametrizationError("Op should be KSD") ObjectiveFunction.__init__(self, op, tf) - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def __call__(self, nmc, **kwargs): op = self.op # type: KSD grad = op.apply(self.tf) @@ -88,7 +88,7 @@ def __call__(self, nmc, **kwargs): else: params = self.test_params + kwargs["more_tf_params"] grad *= pm.floatX(-1) - grads = tt.grad(None, params, known_grads={z: grad}) + grads = aet.grad(None, params, known_grads={z: grad}) return self.approx.set_size_and_deterministic( grads, nmc, 0, kwargs.get("more_replacements") ) diff --git a/pymc3/variational/opvi.py b/pymc3/variational/opvi.py index ebf4a9cda84..115c0abcaef 100644 --- a/pymc3/variational/opvi.py +++ b/pymc3/variational/opvi.py @@ -49,17 +49,19 @@ import itertools import warnings +import aesara +import aesara.tensor as aet import numpy as np -import theano -import theano.tensor as tt + +from aesara.graph.basic import Variable import pymc3 as pm +from pymc3.aesaraf import aet_rng, identity from pymc3.backends import NDArray from pymc3.blocking import ArrayOrdering, DictToArrayBijection, VarMap from pymc3.memoize import WithMemoization, memoize from pymc3.model import modelcontext -from pymc3.theanof import identity, tt_rng from pymc3.util import get_default_varnames, get_transformed from pymc3.variational.updates import adagrad_window @@ -116,7 +118,7 @@ def node_property(f): def wrapper(fn): return property( memoize( - theano.config.change_flags(compute_test_value="off")(append_name(f)(fn)), + aesara.config.change_flags(compute_test_value="off")(append_name(f)(fn)), bound=True, ) ) @@ -124,16 +126,16 @@ def wrapper(fn): return wrapper else: return property( - memoize(theano.config.change_flags(compute_test_value="off")(f), bound=True) + memoize(aesara.config.change_flags(compute_test_value="off")(f), bound=True) ) -@theano.config.change_flags(compute_test_value="ignore") +@aesara.config.change_flags(compute_test_value="ignore") def try_to_set_test_value(node_in, node_out, s): _s = s if s is None: s = 1 - s = theano.compile.view_op(tt.as_tensor(s)) + s = aesara.compile.view_op(aet.as_tensor(s)) if not isinstance(node_in, (list, tuple)): node_in = [node_in] if not isinstance(node_out, (list, tuple)): @@ -150,7 +152,7 @@ def try_to_set_test_value(node_in, node_out, s): o.tag.test_value = tv -class ObjectiveUpdates(theano.OrderedUpdates): +class ObjectiveUpdates(aesara.OrderedUpdates): """OrderedUpdates extension for storing loss""" loss = None @@ -291,7 +293,7 @@ def add_obj_updates( if self.op.returns_loss: updates.loss = obj_target - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def step_function( self, obj_n_mc=None, @@ -335,13 +337,13 @@ def step_function( score: `bool` calculate loss on each step? Defaults to False for speed fn_kwargs: `dict` - Add kwargs to theano.function (e.g. `{'profile': True}`) + Add kwargs to aesara.function (e.g. 
`{'profile': True}`) more_replacements: `dict` Apply custom replacements before calculating gradients Returns ------- - `theano.function` + `aesara.function` """ if fn_kwargs is None: fn_kwargs = {} @@ -359,12 +361,12 @@ def step_function( total_grad_norm_constraint=total_grad_norm_constraint, ) if score: - step_fn = theano.function([], updates.loss, updates=updates, **fn_kwargs) + step_fn = aesara.function([], updates.loss, updates=updates, **fn_kwargs) else: - step_fn = theano.function([], None, updates=updates, **fn_kwargs) + step_fn = aesara.function([], None, updates=updates, **fn_kwargs) return step_fn - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def score_function( self, sc_n_mc=None, more_replacements=None, fn_kwargs=None ): # pragma: no cover @@ -377,11 +379,11 @@ def score_function( more_replacements: Apply custom replacements before compiling a function fn_kwargs: `dict` - arbitrary kwargs passed to `theano.function` + arbitrary kwargs passed to `aesara.function` Returns ------- - theano.function + aesara.function """ if fn_kwargs is None: fn_kwargs = {} @@ -390,9 +392,9 @@ def score_function( if more_replacements is None: more_replacements = {} loss = self(sc_n_mc, more_replacements=more_replacements) - return theano.function([], loss, **fn_kwargs) + return aesara.function([], loss, **fn_kwargs) - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def __call__(self, nmc, **kwargs): if "more_tf_params" in kwargs: m = -1.0 @@ -504,7 +506,7 @@ def collect_shared_to_list(params): return list( t[1] for t in sorted(params.items(), key=lambda t: t[0]) - if isinstance(t[1], theano.compile.SharedVariable) + if isinstance(t[1], aesara.compile.SharedVariable) ) elif params is None: return [] @@ -842,7 +844,7 @@ def __init__( self._vfam = vfam self._local = local self._batched = rowwise - self._rng = tt_rng(random_seed) + self._rng = aet_rng(random_seed) model = modelcontext(model) self.model = model self.group = group @@ -895,7 +897,7 @@ def _check_user_params(self, **kwargs): shape = (-1,) + shape elif self.batched: shape = (self.bdim,) + shape - self._user_params[name] = tt.as_tensor(param).reshape(shape) + self._user_params[name] = aet.as_tensor(param).reshape(shape) return True def _initial_type(self, name): @@ -910,9 +912,9 @@ def _initial_type(self, name): tensor """ if self.batched: - return tt.tensor3(name) + return aet.tensor3(name) else: - return tt.matrix(name) + return aet.matrix(name) def _input_type(self, name): R"""*Dev* - input type with given name. 
The correct type depends on `self.batched` @@ -926,11 +928,11 @@ def _input_type(self, name): tensor """ if self.batched: - return tt.matrix(name) + return aet.matrix(name) else: - return tt.vector(name) + return aet.vector(name) - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def __init_group__(self, group): if not group: raise GroupError("Got empty group") @@ -1020,11 +1022,11 @@ def _new_initial_shape(self, size, dim, more_replacements=None): shape vector """ if self.batched: - bdim = tt.as_tensor(self.bdim) - bdim = theano.clone(bdim, more_replacements) - return tt.stack([size, bdim, dim]) + bdim = aet.as_tensor(self.bdim) + bdim = aesara.clone_replace(bdim, more_replacements) + return aet.stack([size, bdim, dim]) else: - return tt.stack([size, dim]) + return aet.stack([size, dim]) @node_property def bdim(self): @@ -1071,22 +1073,22 @@ def _new_initial(self, size, deterministic, more_replacements=None): """ if size is None: size = 1 - if not isinstance(deterministic, tt.Variable): + if not isinstance(deterministic, Variable): deterministic = np.int8(deterministic) dim, dist_name, dist_map = (self.ddim, self.initial_dist_name, self.initial_dist_map) dtype = self.symbolic_initial.dtype - dim = tt.as_tensor(dim) - size = tt.as_tensor(size) + dim = aet.as_tensor(dim) + size = aet.as_tensor(size) shape = self._new_initial_shape(size, dim, more_replacements) # apply optimizations if possible - if not isinstance(deterministic, tt.Variable): + if not isinstance(deterministic, Variable): if deterministic: - return tt.ones(shape, dtype) * dist_map + return aet.ones(shape, dtype) * dist_map else: return getattr(self._rng, dist_name)(size=shape) else: sample = getattr(self._rng, dist_name)(size=shape) - initial = tt.switch(deterministic, tt.ones(shape, dtype) * dist_map, sample) + initial = aet.switch(deterministic, aet.ones(shape, dtype) * dist_map, sample) return initial @node_property @@ -1111,7 +1113,7 @@ def symbolic_random2d(self): else: return self.symbolic_random - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def set_size_and_deterministic(self, node, s, d, more_replacements=None): """*Dev* - after node is sampled via :func:`symbolic_sample_over_posterior` or :func:`symbolic_single_sample` new random generator can be allocated and applied to node @@ -1119,7 +1121,7 @@ def set_size_and_deterministic(self, node, s, d, more_replacements=None): Parameters ---------- node: :class:`Variable` - Theano node with symbolically applied VI replacements + Aesara node with symbolically applied VI replacements s: scalar desired number of samples d: bool or int @@ -1132,13 +1134,13 @@ def set_size_and_deterministic(self, node, s, d, more_replacements=None): :class:`Variable` with applied replacements, ready to use """ flat2rand = self.make_size_and_deterministic_replacements(s, d, more_replacements) - node_out = theano.clone(node, flat2rand) + node_out = aesara.clone_replace(node, flat2rand) try_to_set_test_value(node, node_out, s) return node_out def to_flat_input(self, node): """*Dev* - replace vars with flattened view stored in `self.inputs`""" - return theano.clone(node, self.replacements) + return aesara.clone_replace(node, self.replacements) def symbolic_sample_over_posterior(self, node): """*Dev* - performs sampling of node applying independent samples from posterior each time. 
@@ -1146,12 +1148,12 @@ def symbolic_sample_over_posterior(self, node): """ node = self.to_flat_input(node) random = self.symbolic_random.astype(self.symbolic_initial.dtype) - random = tt.patternbroadcast(random, self.symbolic_initial.broadcastable) + random = aet.patternbroadcast(random, self.symbolic_initial.broadcastable) def sample(post): - return theano.clone(node, {self.input: post}) + return aesara.clone_replace(node, {self.input: post}) - nodes, _ = theano.scan(sample, random) + nodes, _ = aesara.scan(sample, random) return nodes def symbolic_single_sample(self, node): @@ -1161,8 +1163,8 @@ def symbolic_single_sample(self, node): """ node = self.to_flat_input(node) random = self.symbolic_random.astype(self.symbolic_initial.dtype) - random = tt.patternbroadcast(random, self.symbolic_initial.broadcastable) - return theano.clone(node, {self.input: random[0]}) + random = aet.patternbroadcast(random, self.symbolic_initial.broadcastable) + return aesara.clone_replace(node, {self.input: random[0]}) def make_size_and_deterministic_replacements(self, s, d, more_replacements=None): """*Dev* - creates correct replacements for initial depending on @@ -1182,15 +1184,15 @@ def make_size_and_deterministic_replacements(self, s, d, more_replacements=None) dict with replacements for initial """ initial = self._new_initial(s, d, more_replacements) - initial = tt.patternbroadcast(initial, self.symbolic_initial.broadcastable) + initial = aet.patternbroadcast(initial, self.symbolic_initial.broadcastable) if more_replacements: - initial = theano.clone(initial, more_replacements) + initial = aesara.clone_replace(initial, more_replacements) return {self.symbolic_initial: initial} @node_property def symbolic_normalizing_constant(self): """*Dev* - normalizing constant for `self.logq`, scales it to `minibatch_size` instead of `total_size`""" - t = self.to_flat_input(tt.max([v.scaling for v in self.group])) + t = self.to_flat_input(aet.max([v.scaling for v in self.group])) t = self.symbolic_single_sample(t) return pm.floatX(t) @@ -1282,7 +1284,7 @@ class Approximation(WithMemoization): """ def __init__(self, groups, model=None): - self._scale_cost_to_minibatch = theano.shared(np.int8(1)) + self._scale_cost_to_minibatch = aesara.shared(np.int8(1)) model = modelcontext(model) if not model.free_RVs: raise TypeError("Model does not have FreeRVs") @@ -1341,22 +1343,22 @@ def symbolic_normalizing_constant(self): """*Dev* - normalizing constant for `self.logq`, scales it to `minibatch_size` instead of `total_size`. 
Here the effect is controlled by `self.scale_cost_to_minibatch` """ - t = tt.max( + t = aet.max( self.collect("symbolic_normalizing_constant") + [var.scaling for var in self.model.observed_RVs] ) - t = tt.switch(self._scale_cost_to_minibatch, t, tt.constant(1, dtype=t.dtype)) + t = aet.switch(self._scale_cost_to_minibatch, t, aet.constant(1, dtype=t.dtype)) return pm.floatX(t) @node_property def symbolic_logq(self): """*Dev* - collects `symbolic_logq` for all groups""" - return tt.add(*self.collect("symbolic_logq")) + return aet.add(*self.collect("symbolic_logq")) @node_property def logq(self): """*Dev* - collects `logQ` for all groups""" - return tt.add(*self.collect("logq")) + return aet.add(*self.collect("logq")) @node_property def logq_norm(self): @@ -1365,7 +1367,7 @@ def logq_norm(self): @node_property def _sized_symbolic_varlogp_and_datalogp(self): - """*Dev* - computes sampled prior term from model via `theano.scan`""" + """*Dev* - computes sampled prior term from model via `aesara.scan`""" varlogp_s, datalogp_s = self.symbolic_sample_over_posterior( [self.model.varlogpt, self.model.datalogpt] ) @@ -1373,55 +1375,55 @@ def _sized_symbolic_varlogp_and_datalogp(self): @node_property def sized_symbolic_varlogp(self): - """*Dev* - computes sampled prior term from model via `theano.scan`""" + """*Dev* - computes sampled prior term from model via `aesara.scan`""" return self._sized_symbolic_varlogp_and_datalogp[0] # shape (s,) @node_property def sized_symbolic_datalogp(self): - """*Dev* - computes sampled data term from model via `theano.scan`""" + """*Dev* - computes sampled data term from model via `aesara.scan`""" return self._sized_symbolic_varlogp_and_datalogp[1] # shape (s,) @node_property def sized_symbolic_logp(self): - """*Dev* - computes sampled logP from model via `theano.scan`""" + """*Dev* - computes sampled logP from model via `aesara.scan`""" return self.sized_symbolic_varlogp + self.sized_symbolic_datalogp # shape (s,) @node_property def logp(self): - """*Dev* - computes :math:`E_{q}(logP)` from model via `theano.scan` that can be optimized later""" + """*Dev* - computes :math:`E_{q}(logP)` from model via `aesara.scan` that can be optimized later""" return self.varlogp + self.datalogp @node_property def varlogp(self): - """*Dev* - computes :math:`E_{q}(prior term)` from model via `theano.scan` that can be optimized later""" + """*Dev* - computes :math:`E_{q}(prior term)` from model via `aesara.scan` that can be optimized later""" return self.sized_symbolic_varlogp.mean(0) @node_property def datalogp(self): - """*Dev* - computes :math:`E_{q}(data term)` from model via `theano.scan` that can be optimized later""" + """*Dev* - computes :math:`E_{q}(data term)` from model via `aesara.scan` that can be optimized later""" return self.sized_symbolic_datalogp.mean(0) @node_property def _single_symbolic_varlogp_and_datalogp(self): - """*Dev* - computes sampled prior term from model via `theano.scan`""" + """*Dev* - computes sampled prior term from model via `aesara.scan`""" varlogp, datalogp = self.symbolic_single_sample([self.model.varlogpt, self.model.datalogpt]) return varlogp, datalogp @node_property def single_symbolic_varlogp(self): - """*Dev* - for single MC sample estimate of :math:`E_{q}(prior term)` `theano.scan` + """*Dev* - for single MC sample estimate of :math:`E_{q}(prior term)` `aesara.scan` is not needed and code can be optimized""" return self._single_symbolic_varlogp_and_datalogp[0] @node_property def single_symbolic_datalogp(self): - """*Dev* - for single MC 
sample estimate of :math:`E_{q}(data term)` `theano.scan` + """*Dev* - for single MC sample estimate of :math:`E_{q}(data term)` `aesara.scan` is not needed and code can be optimized""" return self._single_symbolic_varlogp_and_datalogp[1] @node_property def single_symbolic_logp(self): - """*Dev* - for single MC sample estimate of :math:`E_{q}(logP)` `theano.scan` + """*Dev* - for single MC sample estimate of :math:`E_{q}(logP)` `aesara.scan` is not needed and code can be optimized""" return self.single_symbolic_datalogp + self.single_symbolic_varlogp @@ -1472,7 +1474,7 @@ def make_size_and_deterministic_replacements(self, s, d, more_replacements=None) flat2rand.update(more_replacements) return flat2rand - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def set_size_and_deterministic(self, node, s, d, more_replacements=None): """*Dev* - after node is sampled via :func:`symbolic_sample_over_posterior` or :func:`symbolic_single_sample` new random generator can be allocated and applied to node @@ -1480,7 +1482,7 @@ def set_size_and_deterministic(self, node, s, d, more_replacements=None): Parameters ---------- node: :class:`Variable` - Theano node with symbolically applied VI replacements + Aesara node with symbolically applied VI replacements s: scalar desired number of samples d: bool or int @@ -1495,14 +1497,14 @@ def set_size_and_deterministic(self, node, s, d, more_replacements=None): _node = node optimizations = self.get_optimization_replacements(s, d) flat2rand = self.make_size_and_deterministic_replacements(s, d, more_replacements) - node = theano.clone(node, optimizations) - node = theano.clone(node, flat2rand) + node = aesara.clone_replace(node, optimizations) + node = aesara.clone_replace(node, flat2rand) try_to_set_test_value(_node, node, s) return node def to_flat_input(self, node): """*Dev* - replace vars with flattened view stored in `self.inputs`""" - return theano.clone(node, self.replacements) + return aesara.clone_replace(node, self.replacements) def symbolic_sample_over_posterior(self, node): """*Dev* - performs sampling of node applying independent samples from posterior each time. @@ -1511,9 +1513,9 @@ def symbolic_sample_over_posterior(self, node): node = self.to_flat_input(node) def sample(*post): - return theano.clone(node, dict(zip(self.inputs, post))) + return aesara.clone_replace(node, dict(zip(self.inputs, post))) - nodes, _ = theano.scan(sample, self.symbolic_randoms) + nodes, _ = aesara.scan(sample, self.symbolic_randoms) return nodes def symbolic_single_sample(self, node): @@ -1524,11 +1526,11 @@ def symbolic_single_sample(self, node): node = self.to_flat_input(node) post = [v[0] for v in self.symbolic_randoms] inp = self.inputs - return theano.clone(node, dict(zip(inp, post))) + return aesara.clone_replace(node, dict(zip(inp, post))) def get_optimization_replacements(self, s, d): """*Dev* - optimizations for logP. If sample size is static and equal to 1: - then `theano.scan` MC estimate is replaced with single sample without call to `theano.scan`. + then `aesara.scan` MC estimate is replaced with single sample without call to `aesara.scan`. 
""" repl = collections.OrderedDict() # avoid scan if size is constant and equal to one @@ -1537,13 +1539,13 @@ def get_optimization_replacements(self, s, d): repl[self.datalogp] = self.single_symbolic_datalogp return repl - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def sample_node(self, node, size=None, deterministic=False, more_replacements=None): """Samples given node or nodes over shared posterior Parameters ---------- - node: Theano Variables (or Theano expressions) + node: Aesara Variables (or Aesara expressions) size: None or scalar number of samples more_replacements: `dict` @@ -1557,7 +1559,7 @@ def sample_node(self, node, size=None, deterministic=False, more_replacements=No sampled node(s) with replacements """ node_in = node - node = theano.clone(node, more_replacements) + node = aesara.clone_replace(node, more_replacements) if size is None: node_out = self.symbolic_single_sample(node) else: @@ -1567,7 +1569,7 @@ def sample_node(self, node, size=None, deterministic=False, more_replacements=No return node_out def rslice(self, name): - """*Dev* - vectorized sampling for named random variable without call to `theano.scan`. + """*Dev* - vectorized sampling for named random variable without call to `aesara.scan`. This node still needs :func:`set_size_and_deterministic` to be evaluated """ @@ -1588,13 +1590,13 @@ def vars_names(vs): @property @memoize(bound=True) - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def sample_dict_fn(self): - s = tt.iscalar() + s = aet.iscalar() names = [v.name for v in self.model.free_RVs] sampled = [self.rslice(name) for name in names] sampled = self.set_size_and_deterministic(sampled, s, 0) - sample_fn = theano.function([s], sampled) + sample_fn = aesara.function([s], sampled) def inner(draws=100): _samples = sample_fn(draws) @@ -1658,7 +1660,7 @@ def has_batched(self): @node_property def symbolic_random(self): - return tt.concatenate(self.collect("symbolic_random2d"), axis=-1) + return aet.concatenate(self.collect("symbolic_random2d"), axis=-1) def __str__(self): if len(self.groups) < 5: @@ -1679,7 +1681,7 @@ def any_histograms(self): def joint_histogram(self): if not self.all_histograms: raise VariationalInferenceError("%s does not consist of all Empirical approximations") - return tt.concatenate(self.collect("histogram"), axis=-1) + return aet.concatenate(self.collect("histogram"), axis=-1) @property def params(self): diff --git a/pymc3/variational/stein.py b/pymc3/variational/stein.py index ca9a9249106..79a7d78183c 100644 --- a/pymc3/variational/stein.py +++ b/pymc3/variational/stein.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import theano -import theano.tensor as tt +import aesara +import aesara.tensor as aet +from pymc3.aesaraf import floatX from pymc3.memoize import WithMemoization, memoize -from pymc3.theanof import floatX from pymc3.variational.opvi import node_property from pymc3.variational.test_functions import rbf @@ -46,12 +46,12 @@ def approx_symbolic_matrices(self): @node_property def dlogp(self): - grad = tt.grad(self.logp_norm.sum(), self.approx_symbolic_matrices) + grad = aet.grad(self.logp_norm.sum(), self.approx_symbolic_matrices) def flatten2(tensor): return tensor.flatten(2) - return tt.concatenate(list(map(flatten2, grad)), -1) + return aet.concatenate(list(map(flatten2, grad)), -1) @node_property def grad(self): @@ -64,7 +64,7 @@ def grad(self): def density_part_grad(self): Kxy = self.Kxy dlogpdx = self.dlogp - return tt.dot(Kxy, dlogpdx) + return aet.dot(Kxy, dlogpdx) @node_property def repulsive_part_grad(self): @@ -84,13 +84,13 @@ def dxkxy(self): def logp_norm(self): sized_symbolic_logp = self.approx.sized_symbolic_logp if self.use_histogram: - sized_symbolic_logp = theano.clone( + sized_symbolic_logp = aesara.clone_replace( sized_symbolic_logp, dict(zip(self.approx.symbolic_randoms, self.approx.collect("histogram"))), ) return sized_symbolic_logp / self.approx.symbolic_normalizing_constant @memoize - @theano.config.change_flags(compute_test_value="off") + @aesara.config.change_flags(compute_test_value="off") def _kernel(self): return self._kernel_f(self.input_joint_matrix) diff --git a/pymc3/variational/test_functions.py b/pymc3/variational/test_functions.py index 8f95abd4e18..3380ed27b85 100644 --- a/pymc3/variational/test_functions.py +++ b/pymc3/variational/test_functions.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from theano import tensor as tt +from aesara import tensor as aet -from pymc3.theanof import floatX +from pymc3.aesaraf import floatX from pymc3.variational.opvi import TestFunction __all__ = ["rbf"] @@ -34,30 +34,30 @@ class Kernel(TestFunction): class RBF(Kernel): def __call__(self, X): XY = X.dot(X.T) - x2 = tt.sum(X ** 2, axis=1).dimshuffle(0, "x") - X2e = tt.repeat(x2, X.shape[0], axis=1) + x2 = aet.sum(X ** 2, axis=1).dimshuffle(0, "x") + X2e = aet.repeat(x2, X.shape[0], axis=1) H = X2e + X2e.T - 2.0 * XY - V = tt.sort(H.flatten()) + V = aet.sort(H.flatten()) length = V.shape[0] # median distance - m = tt.switch( - tt.eq((length % 2), 0), + m = aet.switch( + aet.eq((length % 2), 0), # if even vector - tt.mean(V[((length // 2) - 1) : ((length // 2) + 1)]), + aet.mean(V[((length // 2) - 1) : ((length // 2) + 1)]), # if odd vector V[length // 2], ) - h = 0.5 * m / tt.log(floatX(H.shape[0]) + floatX(1)) + h = 0.5 * m / aet.log(floatX(H.shape[0]) + floatX(1)) # RBF - Kxy = tt.exp(-H / h / 2.0) + Kxy = aet.exp(-H / h / 2.0) # Derivative - dxkxy = -tt.dot(Kxy, X) - sumkxy = tt.sum(Kxy, axis=-1, keepdims=True) - dxkxy = tt.add(dxkxy, tt.mul(X, sumkxy)) / h + dxkxy = -aet.dot(Kxy, X) + sumkxy = aet.sum(Kxy, axis=-1, keepdims=True) + dxkxy = aet.add(dxkxy, aet.mul(X, sumkxy)) / h return Kxy, dxkxy diff --git a/pymc3/variational/updates.py b/pymc3/variational/updates.py index a2baa462c5e..62776f48ad1 100755 --- a/pymc3/variational/updates.py +++ b/pymc3/variational/updates.py @@ -44,7 +44,7 @@ # SOFTWARE. """ -Functions to generate Theano update dictionaries for training. +Functions to generate Aesara update dictionaries for training. 
The update functions implement different methods to control the learning rate for use with stochastic gradient descent. @@ -88,21 +88,20 @@ Examples -------- >>> import lasagne ->>> import theano.tensor as T ->>> import theano +>>> import aesara >>> from lasagne.nonlinearities import softmax >>> from lasagne.layers import InputLayer, DenseLayer, get_output >>> from lasagne.updates import sgd, apply_momentum >>> l_in = InputLayer((100, 20)) >>> l1 = DenseLayer(l_in, num_units=3, nonlinearity=softmax) ->>> x = tt.matrix('x') # shp: num_batch x num_features ->>> y = tt.ivector('y') # shp: num_batch +>>> x = aet.matrix('x') # shp: num_batch x num_features +>>> y = aet.ivector('y') # shp: num_batch >>> l_out = get_output(l1, x) >>> params = lasagne.layers.get_all_params(l1) ->>> loss = tt.mean(tt.nnet.categorical_crossentropy(l_out, y)) +>>> loss = aet.mean(aet.nnet.categorical_crossentropy(l_out, y)) >>> updates_sgd = sgd(loss, params, learning_rate=0.0001) >>> updates = apply_momentum(updates_sgd, params, momentum=0.9) ->>> train_function = theano.function([x, y], updates=updates) +>>> train_function = aesara.function([x, y], updates=updates) Notes ----- @@ -112,9 +111,9 @@ from collections import OrderedDict from functools import partial +import aesara +import aesara.tensor as aet import numpy as np -import theano -import theano.tensor as tt import pymc3 as pm @@ -152,7 +151,7 @@ def get_or_compute_grads(loss_or_grads, params): gradients and returned as is, unless it does not match the length of `params`, in which case a `ValueError` is raised. Otherwise, `loss_or_grads` is assumed to be a cost expression and - the function returns `theano.grad(loss_or_grads, params)`. + the function returns `aesara.grad(loss_or_grads, params)`. Raises ------ @@ -161,7 +160,7 @@ def get_or_compute_grads(loss_or_grads, params): any element of `params` is not a shared variable (while we could still compute its gradient, we can never update it and want to fail early). """ - if any(not isinstance(p, theano.compile.SharedVariable) for p in params): + if any(not isinstance(p, aesara.compile.SharedVariable) for p in params): raise ValueError( "params must contain shared variables only. If it " "contains arbitrary parameter expressions, then " @@ -174,7 +173,7 @@ def get_or_compute_grads(loss_or_grads, params): ) return loss_or_grads else: - return theano.grad(loss_or_grads, params) + return aesara.grad(loss_or_grads, params) def _get_call_kwargs(_locals_): @@ -212,7 +211,7 @@ def sgd(loss_or_grads=None, params=None, learning_rate=1e-3): Examples -------- - >>> a = theano.shared(1.) + >>> a = aesara.shared(1.) >>> b = a*2 >>> updates = sgd(b, [a], learning_rate=.01) >>> isinstance(updates, dict) @@ -276,7 +275,7 @@ def apply_momentum(updates, params=None, momentum=0.9): for param in params: value = param.get_value(borrow=True) - velocity = theano.shared( + velocity = aesara.shared( np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable ) x = momentum * velocity + updates[param] @@ -326,7 +325,7 @@ def momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momentum=0.9): Examples -------- - >>> a = theano.shared(1.) + >>> a = aesara.shared(1.) 
>>> b = a*2 >>> updates = momentum(b, [a], learning_rate=.01) >>> isinstance(updates, dict) @@ -391,7 +390,7 @@ def apply_nesterov_momentum(updates, params=None, momentum=0.9): for param in params: value = param.get_value(borrow=True) - velocity = theano.shared( + velocity = aesara.shared( np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable ) x = momentum * velocity + updates[param] - param @@ -446,7 +445,7 @@ def nesterov_momentum(loss_or_grads=None, params=None, learning_rate=1e-3, momen Examples -------- - >>> a = theano.shared(1.) + >>> a = aesara.shared(1.) >>> b = a*2 >>> updates = nesterov_momentum(b, [a], learning_rate=.01) >>> isinstance(updates, dict) @@ -514,7 +513,7 @@ def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6): Examples -------- - >>> a = theano.shared(1.) + >>> a = aesara.shared(1.) >>> b = a*2 >>> updates = adagrad(b, [a], learning_rate=.01) >>> isinstance(updates, dict) @@ -535,12 +534,12 @@ def adagrad(loss_or_grads=None, params=None, learning_rate=1.0, epsilon=1e-6): for param, grad in zip(params, grads): value = param.get_value(borrow=True) - accu = theano.shared( + accu = aesara.shared( np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable ) accu_new = accu + grad ** 2 updates[accu] = accu_new - updates[param] = param - (learning_rate * grad / tt.sqrt(accu_new + epsilon)) + updates[param] = param - (learning_rate * grad / aet.sqrt(accu_new + epsilon)) return updates @@ -574,19 +573,19 @@ def adagrad_window(loss_or_grads=None, params=None, learning_rate=0.001, epsilon grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() for param, grad in zip(params, grads): - i = theano.shared(pm.floatX(0)) + i = aesara.shared(pm.floatX(0)) i_int = i.astype("int32") value = param.get_value(borrow=True) - accu = theano.shared(np.zeros(value.shape + (n_win,), dtype=value.dtype)) + accu = aesara.shared(np.zeros(value.shape + (n_win,), dtype=value.dtype)) # Append squared gradient vector to accu_new - accu_new = tt.set_subtensor(accu[..., i_int], grad ** 2) - i_new = tt.switch((i + 1) < n_win, i + 1, 0) + accu_new = aet.set_subtensor(accu[..., i_int], grad ** 2) + i_new = aet.switch((i + 1) < n_win, i + 1, 0) updates[accu] = accu_new updates[i] = i_new accu_sum = accu_new.sum(axis=-1) - updates[param] = param - (learning_rate * grad / tt.sqrt(accu_sum + epsilon)) + updates[param] = param - (learning_rate * grad / aet.sqrt(accu_sum + epsilon)) return updates @@ -633,13 +632,13 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon References ---------- - .. [1] Tieleman, tt. and Hinton, G. (2012): + .. [1] Tieleman, aet. and Hinton, G. (2012): Neural Networks for Machine Learning, Lecture 6.5 - rmsprop. Coursera. http://www.youtube.com/watch?v=O3sxAc4hxZU (formula @5:20) Examples -------- - >>> a = theano.shared(1.) + >>> a = aesara.shared(1.) 
>>> b = a*2 >>> updates = rmsprop(b, [a], learning_rate=.01) >>> isinstance(updates, dict) @@ -658,17 +657,17 @@ def rmsprop(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.9, epsilon grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() - # Using theano constant to prevent upcasting of float32 - one = tt.constant(1) + # Using aesara constant to prevent upcasting of float32 + one = aet.constant(1) for param, grad in zip(params, grads): value = param.get_value(borrow=True) - accu = theano.shared( + accu = aesara.shared( np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable ) accu_new = rho * accu + (one - rho) * grad ** 2 updates[accu] = accu_new - updates[param] = param - (learning_rate * grad / tt.sqrt(accu_new + epsilon)) + updates[param] = param - (learning_rate * grad / aet.sqrt(accu_new + epsilon)) return updates @@ -731,7 +730,7 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil Examples -------- - >>> a = theano.shared(1.) + >>> a = aesara.shared(1.) >>> b = a*2 >>> updates = adadelta(b, [a], learning_rate=.01) >>> isinstance(updates, dict) @@ -750,17 +749,17 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil grads = get_or_compute_grads(loss_or_grads, params) updates = OrderedDict() - # Using theano constant to prevent upcasting of float32 - one = tt.constant(1) + # Using aesara constant to prevent upcasting of float32 + one = aet.constant(1) for param, grad in zip(params, grads): value = param.get_value(borrow=True) # accu: accumulate gradient magnitudes - accu = theano.shared( + accu = aesara.shared( np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable ) # delta_accu: accumulate update magnitudes (recursively!) - delta_accu = theano.shared( + delta_accu = aesara.shared( np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable ) @@ -769,7 +768,7 @@ def adadelta(loss_or_grads=None, params=None, learning_rate=1.0, rho=0.95, epsil updates[accu] = accu_new # compute parameter update, using the 'old' delta_accu - update = grad * tt.sqrt(delta_accu + epsilon) / tt.sqrt(accu_new + epsilon) + update = grad * aet.sqrt(delta_accu + epsilon) / aet.sqrt(accu_new + epsilon) updates[param] = param - learning_rate * update # update delta_accu (as accu, but accumulating updates) @@ -823,7 +822,7 @@ def adam( Examples -------- - >>> a = theano.shared(1.) + >>> a = aesara.shared(1.) 
>>> b = a*2 >>> updates = adam(b, [a], learning_rate=.01) >>> isinstance(updates, dict) @@ -840,27 +839,27 @@ def adam( elif loss_or_grads is None or params is None: raise ValueError("Please provide both `loss_or_grads` and `params` to get updates") all_grads = get_or_compute_grads(loss_or_grads, params) - t_prev = theano.shared(pm.theanof.floatX(0.0)) + t_prev = aesara.shared(pm.aesaraf.floatX(0.0)) updates = OrderedDict() - # Using theano constant to prevent upcasting of float32 - one = tt.constant(1) + # Using aesara constant to prevent upcasting of float32 + one = aet.constant(1) t = t_prev + 1 - a_t = learning_rate * tt.sqrt(one - beta2 ** t) / (one - beta1 ** t) + a_t = learning_rate * aet.sqrt(one - beta2 ** t) / (one - beta1 ** t) for param, g_t in zip(params, all_grads): value = param.get_value(borrow=True) - m_prev = theano.shared( + m_prev = aesara.shared( np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable ) - v_prev = theano.shared( + v_prev = aesara.shared( np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable ) m_t = beta1 * m_prev + (one - beta1) * g_t v_t = beta2 * v_prev + (one - beta2) * g_t ** 2 - step = a_t * m_t / (tt.sqrt(v_t) + epsilon) + step = a_t * m_t / (aet.sqrt(v_t) + epsilon) updates[m_prev] = m_t updates[v_prev] = v_t @@ -911,7 +910,7 @@ def adamax( Examples -------- - >>> a = theano.shared(1.) + >>> a = aesara.shared(1.) >>> b = a*2 >>> updates = adamax(b, [a], learning_rate=.01) >>> isinstance(updates, dict) @@ -928,26 +927,26 @@ def adamax( elif loss_or_grads is None or params is None: raise ValueError("Please provide both `loss_or_grads` and `params` to get updates") all_grads = get_or_compute_grads(loss_or_grads, params) - t_prev = theano.shared(pm.theanof.floatX(0.0)) + t_prev = aesara.shared(pm.aesaraf.floatX(0.0)) updates = OrderedDict() - # Using theano constant to prevent upcasting of float32 - one = tt.constant(1) + # Using aesara constant to prevent upcasting of float32 + one = aet.constant(1) t = t_prev + 1 a_t = learning_rate / (one - beta1 ** t) for param, g_t in zip(params, all_grads): value = param.get_value(borrow=True) - m_prev = theano.shared( + m_prev = aesara.shared( np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable ) - u_prev = theano.shared( + u_prev = aesara.shared( np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable ) m_t = beta1 * m_prev + (one - beta1) * g_t - u_t = tt.maximum(beta2 * u_prev, abs(g_t)) + u_t = aet.maximum(beta2 * u_prev, abs(g_t)) step = a_t * m_t / (u_t + epsilon) updates[m_prev] = m_t @@ -968,7 +967,7 @@ def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7): Parameters ---------- tensor_var: TensorVariable - Theano expression for update, gradient, or other quantity. + Aesara expression for update, gradient, or other quantity. max_norm: scalar This value sets the maximum allowed value of any norm in `tensor_var`. @@ -993,11 +992,11 @@ def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7): Examples -------- - >>> param = theano.shared( - ... np.random.randn(100, 200).astype(theano.config.floatX)) + >>> param = aesara.shared( + ... 
np.random.randn(100, 200).astype(aesara.config.floatX)) >>> update = param + 100 >>> update = norm_constraint(update, 10) - >>> func = theano.function([], [], updates=[(param, update)]) + >>> func = aesara.function([], [], updates=[(param, update)]) >>> # Apply constrained update >>> _ = func() >>> from lasagne.utils import compute_norms @@ -1028,9 +1027,9 @@ def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7): "Unsupported tensor dimensionality {}." "Must specify `norm_axes`".format(ndim) ) - dtype = np.dtype(theano.config.floatX).type - norms = tt.sqrt(tt.sum(tt.sqr(tensor_var), axis=sum_over, keepdims=True)) - target_norms = tt.clip(norms, 0, dtype(max_norm)) + dtype = np.dtype(aesara.config.floatX).type + norms = aet.sqrt(aet.sum(aet.sqr(tensor_var), axis=sum_over, keepdims=True)) + target_norms = aet.clip(norms, 0, dtype(max_norm)) constrained_output = tensor_var * (target_norms / (dtype(epsilon) + norms)) return constrained_output @@ -1061,7 +1060,7 @@ def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7, return_norm=False ------- tensor_vars_scaled: list of TensorVariables The scaled tensor variables. - norm: Theano scalar + norm: Aesara scalar The combined norms of the input variables prior to rescaling, only returned if ``return_norms=True``. @@ -1070,14 +1069,14 @@ def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7, return_norm=False >>> from lasagne.layers import InputLayer, DenseLayer >>> import lasagne >>> from lasagne.updates import sgd, total_norm_constraint - >>> x = tt.matrix() - >>> y = tt.ivector() + >>> x = aet.matrix() + >>> y = aet.ivector() >>> l_in = InputLayer((5, 10)) - >>> l1 = DenseLayer(l_in, num_units=7, nonlinearity=tt.nnet.softmax) + >>> l1 = DenseLayer(l_in, num_units=7, nonlinearity=aet.nnet.softmax) >>> output = lasagne.layers.get_output(l1, x) - >>> cost = tt.mean(tt.nnet.categorical_crossentropy(output, y)) + >>> cost = aet.mean(aet.nnet.categorical_crossentropy(output, y)) >>> all_params = lasagne.layers.get_all_params(l1) - >>> all_grads = tt.grad(cost, all_params) + >>> all_grads = aet.grad(cost, all_params) >>> scaled_grads = total_norm_constraint(all_grads, 5) >>> updates = sgd(scaled_grads, all_params, learning_rate=0.1) @@ -1091,9 +1090,9 @@ def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7, return_norm=False learning with neural networks. In Advances in Neural Information Processing Systems (pp. 3104-3112). """ - norm = tt.sqrt(sum(tt.sum(tensor ** 2) for tensor in tensor_vars)) - dtype = np.dtype(theano.config.floatX).type - target_norm = tt.clip(norm, 0, dtype(max_norm)) + norm = aet.sqrt(sum(aet.sum(tensor ** 2) for tensor in tensor_vars)) + dtype = np.dtype(aesara.config.floatX).type + target_norm = aet.clip(norm, 0, dtype(max_norm)) multiplier = target_norm / (dtype(epsilon) + norm) tensor_vars_scaled = [step * multiplier for step in tensor_vars] diff --git a/pymc3/vartypes.py b/pymc3/vartypes.py index 2469036f312..8cb61333aba 100644 --- a/pymc3/vartypes.py +++ b/pymc3/vartypes.py @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from theano.graph.basic import Constant as graph_constant -from theano.tensor import Constant as tensor_constant - __all__ = [ "bool_types", "int_types", @@ -24,7 +21,6 @@ "discrete_types", "typefilter", "isgenerator", - "theano_constant", ] bool_types = {"int8"} @@ -45,6 +41,3 @@ def typefilter(vars, types): def isgenerator(obj): return hasattr(obj, "__next__") - - -theano_constant = (tensor_constant, graph_constant) diff --git a/requirements.txt b/requirements.txt index 93cb80ebc13..9ec84e75387 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ -arviz>=0.11.0 +aesara>=2.0.1 +arviz>=0.11.1 dill fastprogress>=0.2.0 numpy>=1.15.0 pandas>=0.24.0 patsy>=0.5.1 scipy>=1.2.0 -theano-pymc==1.1.2 typing-extensions>=3.7.4 diff --git a/scripts/test.sh b/scripts/test.sh index f9ae8111f7b..9045f8df509 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -3,4 +3,4 @@ set -e _FLOATX=${FLOATX:=float64} -THEANO_FLAGS="floatX=${_FLOATX},gcc__cxxflags='-march=core2'" pytest -v --cov=pymc3 --cov-report=xml "$@" --cov-report term +AESARA_FLAGS="floatX=${_FLOATX},gcc__cxxflags='-march=core2'" pytest -v --cov=pymc3 --cov-report=xml "$@" --cov-report term diff --git a/setup.py b/setup.py index 9b8091ba1bb..c7ccef4c071 100755 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ from setuptools import find_packages, setup DISTNAME = "pymc3" -DESCRIPTION = "Probabilistic Programming in Python: Bayesian Modeling and Probabilistic Machine Learning with Theano" +DESCRIPTION = "Probabilistic Programming in Python: Bayesian Modeling and Probabilistic Machine Learning with Aesara" AUTHOR = "PyMC Developers" AUTHOR_EMAIL = "pymc.devs@gmail.com" URL = "http://github.com/pymc-devs/pymc3"
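For reference, the hunks above apply one consistent mapping: theano -> aesara, "from theano import tensor as tt" -> "from aesara import tensor as aet", theano.clone -> aesara.clone_replace, pymc3.theanof -> pymc3.aesaraf, and the THEANO_FLAGS environment variable -> AESARA_FLAGS. The short sketch below only illustrates that convention for downstream code; it is not part of the patch itself, and it assumes aesara>=2.0.1 (the pin added to requirements.txt above) plus numpy are installed.

    # Illustrative only: the renamed imports and calls used throughout this patch.
    import numpy as np

    import aesara
    import aesara.tensor as aet              # was: import theano.tensor as tt

    floatX = aesara.config.floatX            # was: theano.config.floatX

    a = aesara.shared(np.asarray(1.0, dtype=floatX), name="a")   # was: theano.shared(...)
    b = aet.log(aet.abs_(a) + 1.0)
    f = aesara.function([], b)               # was: theano.function(...)
    print(f())                               # log(2.0)

    # theano.clone(...) is renamed to aesara.clone_replace(...)
    b2 = aesara.clone_replace(b, {a: aet.constant(2.0, dtype=floatX)})
    print(b2.eval())                         # log(3.0)

    # In shell/CI configuration the flags variable is renamed accordingly, e.g.
    #   AESARA_FLAGS="floatX=float64" pytest -v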