diff --git a/.cargo/config b/.cargo/config
new file mode 100644
index 000000000000..d47f983e474f
--- /dev/null
+++ b/.cargo/config
@@ -0,0 +1,11 @@
+[target.x86_64-apple-darwin]
+rustflags = [
+  "-C", "link-arg=-undefined",
+  "-C", "link-arg=dynamic_lookup",
+]
+
+[target.aarch64-apple-darwin]
+rustflags = [
+  "-C", "link-arg=-undefined",
+  "-C", "link-arg=dynamic_lookup",
+]
diff --git a/.gitignore b/.gitignore
index b395e6aeef10..74b9026af629 100644
--- a/.gitignore
+++ b/.gitignore
@@ -151,3 +151,8 @@ test/ipynb/mpl/circuit/result_test.json
 test/ipynb/mpl/graph/*.png
 test/ipynb/mpl/graph/*.zip
 test/ipynb/mpl/graph/result_test.json
+
+# Added by cargo
+
+/target
+Cargo.lock
diff --git a/.pylintrc b/.pylintrc
index c45325296758..b2f744ba968f 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -33,7 +33,7 @@ unsafe-load-any-extension=no
 # A comma-separated list of package or module names from where C extensions may
 # be loaded. Extensions are loading into the active Python interpreter and may
 # run arbitrary code
-extension-pkg-allow-list=retworkx, numpy, tweedledum
+extension-pkg-allow-list=retworkx, numpy, tweedledum, qiskit._accelerate
 
 
 [MESSAGES CONTROL]
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 48486e66373d..22ebd6127080 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -225,9 +225,32 @@ build all the documentation into `docs/_build/html` and the release notes in
 particular will be located at `docs/_build/html/release_notes.html`
 
 ## Installing Qiskit Terra from source
-Please see the [Installing Qiskit Terra from
-Source](https://qiskit.org/documentation/contributing_to_qiskit.html#installing-terra-from-source)
-section of the Qiskit documentation.
+
+Qiskit Terra is primarily written in Python but there are some core routines
+that are written in the [Rust](https://www.rust-lang.org/) programming
+language to improve the runtime performance. For the released versions of
+qiskit-terra we publish precompiled binaries on the
+[Python Package Index](https://pypi.org/) for all the supported platforms
+which only require a functional Python environment to install. However, when
+building and installing from source you will need a rust compiler installed. You can do this very easily
+using rustup: https://rustup.rs/ which provides a single tool to install and
+configure the latest version of the rust compiler.
+[Other installation methods](https://forge.rust-lang.org/infra/other-installation-methods.html)
+exist too. For Windows users, besides rustup you will also need to install
+the Visual C++ build tools so that rust can link against the system c/c++
+libraries. You can see more details on this in the
+[rustup documentation](https://rust-lang.github.io/rustup/installation/windows.html).
+
+Once you have a rust compiler installed you can rely on the normal Python
+build/install steps to install Qiskit Terra. This means you just run
+`pip install .` in your local git clone to build and install Qiskit Terra.
+
+Do note that if you do use develop mode/editable install (via `python setup.py develop` or `pip install -e .`) the Rust extension will be built in debug mode
+without any optimizations enabled. This will result in poor runtime performance.
+If you'd like to use an editable install with an optimized binary you can
+run `python setup.py build_rust --release --inplace` after you install in
+editable mode to recompile the rust extensions in release mode.
+
 
 ## Test
 
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 000000000000..1c88d5bceb28
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,33 @@
+[package]
+name = "qiskit-terra"
+version = "0.20.0"
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[lib]
+name = "qiskit_accelerate"
+crate-type = ["cdylib"]
+
+[dependencies]
+rayon = "1.5"
+numpy = "0.15.1"
+rand = "0.8"
+rand_pcg = "0.3"
+rand_distr = "0.4.3"
+
+[dependencies.pyo3]
+version = "0.15.1"
+features = ["extension-module", "hashbrown"]
+
+[dependencies.ndarray]
+version = "^0.15.0"
+features = ["rayon"]
+
+[dependencies.hashbrown]
+version = "0.11.2"
+features = ["rayon"]
+
+[profile.release]
+lto = 'fat'
+codegen-units = 1
diff --git a/MANIFEST.in b/MANIFEST.in
index 9eb57c925b42..13898b9b59e7 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -15,3 +15,6 @@ include test/python/pickles/*.pickle
 include test/python/qasm/*.qasm
 include test/python/visualization/references/*.png
 include test/python/notebooks/*.ipynb
+
+include Cargo.toml
+recursive-include src *
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 359bae6fc30f..c71f8b8e11ee 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -180,6 +180,7 @@ stages:
             pip install -U "cplex" "qiskit-aer" "z3-solver" -c constraints.txt
             mkdir -p /tmp/terra-tests
             cp -r test /tmp/terra-tests/.
+            cp tools/verify_parallel_map.py /tmp/terra-tests/.
             cp .stestr.conf /tmp/terra-tests/.
             cp -r .stestr /tmp/terra-tests/. || :
             sudo apt-get update
@@ -193,8 +194,11 @@ stages:
             export PYTHONHASHSEED=$(python -S -c "import random; print(random.randint(1, 4294967295))")
             echo "PYTHONHASHSEED=$PYTHONHASHSEED"
             stestr run
+            python ./verify_parallel_map.py
             popd
           displayName: 'Run tests'
+          env:
+            QISKIT_PARALLEL: FALSE
         - task: CopyFiles@2
           condition: failed()
           displayName: 'Copy images'
@@ -239,7 +243,6 @@ stages:
             virtualenv image_tests
             image_tests/bin/pip install -U -r requirements.txt -c constraints.txt
             image_tests/bin/pip install -U -c constraints.txt -e ".[visualization]"
-            image_tests/bin/python setup.py build_ext --inplace
             sudo apt-get update
             sudo apt-get install -y graphviz pandoc
             image_tests/bin/pip check
@@ -286,6 +289,8 @@ stages:
             tools/verify_headers.py qiskit test
             python tools/find_optional_imports.py
             reno lint
+            cargo fmt --check
+            cargo clippy -- -D warnings
           displayName: 'Style and lint'
     - job: 'Docs'
       pool: {vmImage: 'ubuntu-latest'}
@@ -314,7 +319,6 @@ stages:
             set -e
             python -m pip install --upgrade pip setuptools wheel
             pip install -U tox
-            python setup.py build_ext --inplace
             sudo apt-get update
             sudo apt-get install -y graphviz
           displayName: 'Install dependencies'
@@ -384,7 +388,10 @@ stages:
             export PYTHONHASHSEED=$(python -S -c "import random; print(random.randint(1, 4294967295))")
             echo "PYTHONHASHSEED=$PYTHONHASHSEED"
             stestr run
+            python ./tools/verify_parallel_map.py
           displayName: 'Run tests'
+          env:
+            QISKIT_PARALLEL: FALSE
         - task: CopyFiles@2
           condition: failed()
           displayName: 'Copy images'
@@ -454,10 +461,12 @@ stages:
             export PYTHONHASHSEED=$(python -S -c "import random; print(random.randint(1, 1024))")
             echo "PYTHONHASHSEED=$PYTHONHASHSEED"
             stestr run
+            python ./tools/verify_parallel_map.py
           displayName: 'Run tests'
           env:
             LANG: 'C.UTF-8'
             PYTHONIOENCODING: 'utf-8:backslashreplace'
+            QISKIT_PARALLEL: FALSE
         - task: CopyFiles@2
           condition: failed()
           displayName: 'Copy images'
@@ -538,6 +547,7 @@ stages:
             export PYTHONHASHSEED=$(python -S -c "import random; print(random.randint(1, 1024))")
             echo "PYTHONHASHSEED=$PYTHONHASHSEED"
             stestr run
+            python ./tools/verify_parallel_map.py
           env:
             LANG: 'C.UTF-8'
             PYTHONIOENCODING: 'utf-8:backslashreplace'
@@ -630,7 +640,10 @@ stages:
             export PYTHONHASHSEED=$(python -S -c "import random; print(random.randint(1, 4294967295))")
             echo "PYTHONHASHSEED=$PYTHONHASHSEED"
             stestr run
+            python ./tools/verify_parallel_map.py
           displayName: 'Run tests'
+          env:
+            QISKIT_PARALLEL: FALSE
         - task: CopyFiles@2
           condition: failed()
           displayName: 'Copy images'
@@ -712,6 +725,7 @@ stages:
             export PYTHONHASHSEED=$(python -S -c "import random; print(random.randint(1, 4294967295))")
             echo "PYTHONHASHSEED=$PYTHONHASHSEED"
             stestr run
+            python ./tools/verify_parallel_map.py
           displayName: 'Run tests'
         - task: CopyFiles@2
           condition: failed()
diff --git a/examples/python/stochastic_swap.py b/examples/python/stochastic_swap.py
index c07cc0731a8b..7625cba3b73a 100644
--- a/examples/python/stochastic_swap.py
+++ b/examples/python/stochastic_swap.py
@@ -73,23 +73,22 @@
 # Build the expected output to verify the pass worked
 expected = QuantumCircuit(qr, cr)
 expected.cx(qr[1], qr[2])
+expected.h(qr[2])
 expected.swap(qr[0], qr[1])
+expected.h(qr[0])
 expected.cx(qr[1], qr[3])
 expected.h(qr[3])
-expected.h(qr[2])
 expected.measure(qr[1], cr[0])
-expected.h(qr[0])
 expected.swap(qr[1], qr[3])
-expected.h(qr[3])
 expected.cx(qr[2], qr[1])
+expected.h(qr[3])
+expected.swap(qr[0], qr[1])
 expected.measure(qr[2], cr[2])
-expected.swap(qr[1], qr[3])
-expected.measure(qr[3], cr[3])
-expected.cx(qr[1], qr[0])
-expected.measure(qr[1], cr[0])
-expected.measure(qr[0], cr[1])
+expected.cx(qr[3], qr[1])
+expected.measure(qr[0], cr[3])
+expected.measure(qr[3], cr[0])
+expected.measure(qr[1], cr[1])
 expected_dag = circuit_to_dag(expected)
-
 # Run the pass on the dag from the input circuit
 pass_ = StochasticSwap(coupling, 20, 999)
 after = pass_.run(dag)
diff --git a/pyproject.toml b/pyproject.toml
index 8e5a5fd0b539..b2787bc5edfa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,6 @@
 [build-system]
-requires = ["Cython>=0.27.1", "setuptools", "wheel"]
+requires = ["Cython>=0.27.1", "setuptools", "wheel", "setuptools-rust"]
+build-backend = "setuptools.build_meta"
 
 [tool.black]
 line-length = 100
@@ -16,3 +17,7 @@ test-command = "python {project}/examples/python/stochastic_swap.py"
 # Numpy 1.22 there are no i686 wheels, so we force pip to use older ones without
 # restricting any dependencies that Numpy and Scipy might have.
 before-test = "pip install --only-binary=numpy,scipy numpy scipy"
+
+[tool.cibuildwheel.linux]
+before-all = "yum install -y wget && {package}/tools/install_rust.sh"
+environment = 'PATH="$PATH:$HOME/.cargo/bin"'
diff --git a/qiskit/__init__.py b/qiskit/__init__.py
index 24aa8d4ed806..45bf6fa49858 100644
--- a/qiskit/__init__.py
+++ b/qiskit/__init__.py
@@ -18,6 +18,15 @@
 import sys
 import warnings
 
+import qiskit._accelerate
+
+# Globally define compiled modules. The normal import mechanism will not
+# find compiled submodules in _accelerate because it relies on file paths,
+# so we manually define them on import so that people can directly import
+# qiskit._accelerate.* submodules and not have to rely on attribute access.
+sys.modules["qiskit._accelerate.stochastic_swap"] = qiskit._accelerate.stochastic_swap
+
+
 # qiskit errors operator
 from qiskit.exceptions import QiskitError, MissingOptionalLibraryError
 
diff --git a/qiskit/transpiler/passes/routing/cython/__init__.py b/qiskit/transpiler/passes/routing/cython/__init__.py
deleted file mode 100644
index 29d444f08100..000000000000
--- a/qiskit/transpiler/passes/routing/cython/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# This code is part of Qiskit.
-#
-# (C) Copyright IBM 2017, 2018.
-#
-# This code is licensed under the Apache License, Version 2.0. You may
-# obtain a copy of this license in the LICENSE.txt file in the root directory
-# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
-#
-# Any modifications or derivative works of this code must retain this
-# copyright notice, and modified files need to carry a notice indicating
-# that they have been altered from the originals.
-
-"""Module containing transpiler Cython code."""
diff --git a/qiskit/transpiler/passes/routing/cython/stochastic_swap/__init__.py b/qiskit/transpiler/passes/routing/cython/stochastic_swap/__init__.py
deleted file mode 100644
index 5a4bde09e943..000000000000
--- a/qiskit/transpiler/passes/routing/cython/stochastic_swap/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# This code is part of Qiskit.
-#
-# (C) Copyright IBM 2017, 2018.
-#
-# This code is licensed under the Apache License, Version 2.0. You may
-# obtain a copy of this license in the LICENSE.txt file in the root directory
-# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
-#
-# Any modifications or derivative works of this code must retain this
-# copyright notice, and modified files need to carry a notice indicating
-# that they have been altered from the originals.
-
-"""Module containing Cython code for StochasticSwap mapper."""
diff --git a/qiskit/transpiler/passes/routing/cython/stochastic_swap/swap_trial.pyx b/qiskit/transpiler/passes/routing/cython/stochastic_swap/swap_trial.pyx
deleted file mode 100644
index 0647cfbd094d..000000000000
--- a/qiskit/transpiler/passes/routing/cython/stochastic_swap/swap_trial.pyx
+++ /dev/null
@@ -1,194 +0,0 @@
-#!python
-#cython: language_level = 3
-#distutils: language = c++
-
-# This code is part of Qiskit.
-#
-# (C) Copyright IBM 2017, 2018.
-#
-# This code is licensed under the Apache License, Version 2.0. You may
-# obtain a copy of this license in the LICENSE.txt file in the root directory
-# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
-#
-# Any modifications or derivative works of this code must retain this
-# copyright notice, and modified files need to carry a notice indicating
-# that they have been altered from the originals.
-
-cimport cython
-from libcpp.unordered_set cimport unordered_set as cset
-from .utils cimport NLayout, EdgeCollection
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-cdef double compute_cost(const double[:, ::1] dist,
-                         unsigned int * logic_to_phys,
-                         int[::1] gates, unsigned int num_gates) nogil:
-    """ Computes the cost (distance) of a logical to physical mapping.
-    
-    Args:
-        dist (ndarray): An array of doubles that specifies the distance.
-        logic_to_phys (int *): Pointer to logical to physical array.
-        gates (ndarray): Array of ints giving gates in layer.
-        num_gates (int): The number of gates (length of gates//2).
-    
-    Returns:
-        double: The distance calculated.
-    """
-    cdef unsigned int ii, jj, kk
-    cdef double cost = 0.0
-    for kk in range(num_gates):
-        ii = logic_to_phys[gates[2*kk]]
-        jj = logic_to_phys[gates[2*kk+1]]
-        cost += dist[ii,jj]
-    return cost
-
-@cython.nonecheck(False)
-@cython.boundscheck(False)
-@cython.wraparound(False)
-cdef compute_random_scaling(double[:, ::1] scale, const double[:, ::1] cdist2,
-                            double * rand, unsigned int num_qubits):
-    """ Computes the symmetric random scaling (perturbation) matrix, 
-    and places the values in the 'scale' array.
-
-    Args:
-        scale (ndarray): An array of doubles where the values are to be stored.
-        cdist2 (ndarray): Array representing the coupling map distance squared.
-        rand (double *): Array of rands of length num_qubits*(num_qubits+1)//2.
-        num_qubits (int): Number of physical qubits.
-    """
-    cdef size_t ii, jj, idx=0
-    for ii in range(num_qubits):
-        for jj in range(ii):
-            scale[ii,jj] = rand[idx]*cdist2[ii,jj]
-            scale[jj,ii] = scale[ii,jj]
-            idx += 1
-
-
-@cython.nonecheck(False)
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def swap_trial(int num_qubits, NLayout int_layout, int[::1] int_qubit_subset,
-               int[::1] gates, const double[:, ::1] cdist2,
-               const double[:, ::1] cdist, 
-               int[::1] edges, double[:, ::1] scale, object rng):
-    """ A single iteration of the tchastic swap mapping routine.
-
-    Args:
-        num_qubits (int): The number of physical qubits.
-        int_layout (NLayout): The numeric (integer) representation of 
-                              the initial_layout.
-        int_qubit_subset (ndarray): Int ndarray listing qubits in set.
-        gates (ndarray): Int array with integers giving qubits on which
-                         two-qubits gates act on.
-        cdist2 (ndarray): Array of doubles that gives the square of the 
-                          distance graph.
-        cdist (ndarray): Array of doubles that gives the distance graph.
-        edges (ndarray): Int array of edges in coupling map.
-        scale (ndarray): A double array that holds the perturbed cdist2 array.
-        rng (default_rng): An instance of the NumPy default_rng.
-
-    Returns:
-        double: Best distance achieved in this trial.
-        EdgeCollection: Collection of optimal edges found.
-        NLayout: The optimal layout found.
-        int: The number of depth steps required in mapping.
-    """
-    cdef EdgeCollection opt_edges = EdgeCollection()
-    cdef NLayout optimal_layout, new_layout, trial_layout = int_layout.copy()
-    
-    cdef unsigned int num_gates = gates.shape[0]//2
-    cdef unsigned int num_edges = edges.shape[0]//2
-    
-    cdef unsigned int need_copy, cost_reduced
-    cdef unsigned int depth_step = 1
-    cdef unsigned int depth_max = 2 * num_qubits + 1
-    cdef double min_cost, new_cost, dist
-    
-    cdef unsigned int start_edge, end_edge, start_qubit, end_qubit
-    cdef unsigned int optimal_start, optimal_end, optimal_start_qubit, optimal_end_qubit
-    
-    cdef size_t idx
-    
-    # Compute randomized distance
-    cdef double[::1] rand = 1.0 + rng.normal(0.0, 1.0/num_qubits,
-                                             size=num_qubits*(num_qubits+1)//2)
-    
-    compute_random_scaling(scale, cdist2, &rand[0], num_qubits)
-    
-    # Convert int qubit array to c++ set
-    cdef cset[unsigned int] qubit_set
-    cdef cset[unsigned int] input_qubit_set
-    
-    for idx in range(<unsigned int>int_qubit_subset.shape[0]):
-        input_qubit_set.insert(int_qubit_subset[idx])
-    
-    # Loop over depths from 1 up to a maximum depth
-    while depth_step < depth_max:
-        qubit_set = input_qubit_set
-        # While there are still qubits available
-        while not qubit_set.empty():
-            # Compute the objective function
-            min_cost = compute_cost(scale, trial_layout.logic_to_phys,
-                                   gates, num_gates)
-            # Try to decrease objective function
-            cost_reduced = 0
-
-            # Loop over edges of coupling graph
-            need_copy = 1
-            for idx in range(num_edges):
-                start_edge = edges[2*idx]
-                end_edge = edges[2*idx+1]
-                start_qubit = trial_layout.phys_to_logic[start_edge]
-                end_qubit =  trial_layout.phys_to_logic[end_edge]
-                # Are the qubits available?
-                if  qubit_set.count(start_qubit) and qubit_set.count(end_qubit):
-                    # Try this edge to reduce the cost
-                    if need_copy:
-                        new_layout = trial_layout.copy()
-                        need_copy = 0
-                    new_layout.swap(start_edge, end_edge)
-                    # Compute the objective function
-                    new_cost = compute_cost(scale, new_layout.logic_to_phys,
-                                   gates, num_gates)
-                    # Record progress if we succeed
-                    if new_cost < min_cost:
-                        cost_reduced = True
-                        min_cost = new_cost
-                        optimal_layout = new_layout
-                        optimal_start = start_edge
-                        optimal_end = end_edge
-                        optimal_start_qubit = start_qubit
-                        optimal_end_qubit = end_qubit
-                        need_copy = 1
-                    else:
-                        new_layout.swap(start_edge, end_edge)
-
-            # After going over all edges
-            # Were there any good swap choices?
-            if cost_reduced:
-                qubit_set.erase(optimal_start_qubit)
-                qubit_set.erase(optimal_end_qubit)
-                trial_layout = optimal_layout
-                opt_edges.add(optimal_start, optimal_end)
-            else:
-                break
-
-        # We have either run out of swap pairs to try or
-        # failed to improve the cost.
-
-        # Compute the coupling graph distance
-        dist = compute_cost(cdist, trial_layout.logic_to_phys,
-                                   gates, num_gates)
-        # If all gates can be applied now, we are finished.
-        # Otherwise we need to consider a deeper swap circuit
-        if dist == num_gates:
-            break
-
-        # Increment the depth
-        depth_step += 1
-
-    # Either we have succeeded at some depth d < dmax or failed
-    dist = compute_cost(cdist, trial_layout.logic_to_phys,
-                                   gates, num_gates)
-    
-    return dist, opt_edges, trial_layout, depth_step
diff --git a/qiskit/transpiler/passes/routing/cython/stochastic_swap/utils.pxd b/qiskit/transpiler/passes/routing/cython/stochastic_swap/utils.pxd
deleted file mode 100644
index f41e16fcea63..000000000000
--- a/qiskit/transpiler/passes/routing/cython/stochastic_swap/utils.pxd
+++ /dev/null
@@ -1,43 +0,0 @@
-#!python
-#cython: language_level = 3, cdivision = True, nonecheck = False
-#distutils: language = c++
-
-# This code is part of Qiskit.
-#
-# (C) Copyright IBM 2017, 2018.
-#
-# This code is licensed under the Apache License, Version 2.0. You may
-# obtain a copy of this license in the LICENSE.txt file in the root directory
-# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
-#
-# Any modifications or derivative works of this code must retain this
-# copyright notice, and modified files need to carry a notice indicating
-# that they have been altered from the originals.
-
-from libcpp.vector cimport vector
-
-# Numeric layout --------------------------------------------------------------
-cdef class NLayout:
-    cdef:
-        unsigned int l2p_len
-        unsigned int p2l_len
-        unsigned int * logic_to_phys
-        unsigned int * phys_to_logic
-
-    # Methods
-    cdef NLayout copy(self)
-    cdef void swap(self, unsigned int idx1, unsigned int idx2)
-    cpdef object to_layout(self, object dag)
-
-
-cpdef NLayout nlayout_from_layout(object layout,
-                                  dict qubit_indices,
-                                  unsigned int logical_qubits,
-                                  unsigned int physical_qubits)
-
-
-# Edge collection -------------------------------------------------------------
-cdef class EdgeCollection:
-    cdef vector[unsigned int] _edges
-
-    cpdef void add(self, unsigned int edge_start, unsigned int edge_end)
diff --git a/qiskit/transpiler/passes/routing/cython/stochastic_swap/utils.pyx b/qiskit/transpiler/passes/routing/cython/stochastic_swap/utils.pyx
deleted file mode 100644
index f60574f10f1a..000000000000
--- a/qiskit/transpiler/passes/routing/cython/stochastic_swap/utils.pyx
+++ /dev/null
@@ -1,188 +0,0 @@
-#!python
-#cython: language_level = 3
-#distutils: language = c++
-
-# This code is part of Qiskit.
-#
-# (C) Copyright IBM 2017, 2018.
-#
-# This code is licensed under the Apache License, Version 2.0. You may
-# obtain a copy of this license in the LICENSE.txt file in the root directory
-# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
-#
-# Any modifications or derivative works of this code must retain this
-# copyright notice, and modified files need to carry a notice indicating
-# that they have been altered from the originals.
-
-cimport cython
-import numpy as np
-from libc.stdlib cimport calloc, free
-from libcpp.vector cimport vector
-
-from qiskit.transpiler.layout import Layout
-from qiskit.circuit import Qubit
-
-cdef class EdgeCollection:
-    """ A simple contain that contains a C++ vector
-    representing edges in the coupling map that are
-    found to be optimal by the swap mapper.  This allows
-    us to keep the vector alive.
-    """
-    cpdef void add(self, unsigned int edge_start, unsigned int edge_end):
-        """ Add two edges, in order, to the collection.
-
-        Args:
-            edge_start (int): The beginning edge.
-            edge_end (int): The end of the edge.
-        """
-        self._edges.push_back(edge_start)
-        self._edges.push_back(edge_end)
-    
-    @property
-    def size(self):
-        """ The size of the edge collection.
-        Returns:
-            int: Size of the edge collection.
-        """
-        return self._edges.size()
-
-    @cython.boundscheck(False)
-    def edges(self):
-        """ Returns the vector of edges as a NumPy array.
-        Returns:
-            ndarray: Int array of edges.
-        """
-        cdef size_t kk
-        out = np.zeros(self._edges.size(), dtype=np.uint32)
-        for kk in range(self._edges.size()):
-            out[kk] = self._edges[kk]
-        return out
-
-
-cdef class NLayout:
-    """ A Numeric representation of a Qiskit Layout object.
-    Here all qubit layouts are stored as int arrays.
-    """
-    def __cinit__(self, unsigned int num_logical,
-                  unsigned int num_physical):
-        """ Init object.
-        Args:
-            num_logical (int): Number of logical qubits.
-            num_physical (int): Number of physical qubits.
-        """
-        self.l2p_len = num_logical
-        self.p2l_len = num_physical
-        self.logic_to_phys = <unsigned int *>calloc(num_logical,
-                                                    sizeof(unsigned int))
-        self.phys_to_logic = <unsigned int *>calloc(num_physical,
-                                                    sizeof(unsigned int))
-    
-    def __dealloc__(self):
-        """ Clears the pointers when finished.
-        """
-        if self.logic_to_phys is not NULL:
-            free(self.logic_to_phys)
-            self.logic_to_phys = NULL
-        if self.phys_to_logic is not NULL:
-            free(self.phys_to_logic)
-            self.phys_to_logic = NULL
-            
-    @property
-    def logic_to_phys(self):
-        """ The array mapping logical to physical qubits.
-        Returns:
-            ndarray: Int array of logical to physical mappings.
-        """
-        cdef size_t kk
-        out = np.zeros(self.l2p_len, dtype=np.int32)
-        for kk in range(self.l2p_len):
-            out[kk] = self.logic_to_phys[kk]
-        return out
-    
-    @property
-    def phys_to_logic(self):
-        """ The array mapping physical to logical qubits.
-        Returns:
-            ndarray: Int array of physical to logical mappings.
-        """
-        cdef size_t kk
-        out = np.zeros(self.p2l_len, dtype=np.int32)
-        for kk in range(<unsigned int>self.p2l_len):
-            out[kk] = self.phys_to_logic[kk]
-        return out
-    
-    @cython.boundscheck(False)
-    cdef NLayout copy(self):
-        """ Returns a copy of the layout.
-
-        Returns:
-            NLayout: A copy of the layout.
-        """
-        cdef NLayout out = NLayout(self.l2p_len, self.p2l_len)
-        cdef size_t kk
-        for kk in range(<unsigned int>self.l2p_len):
-            out.logic_to_phys[kk] = self.logic_to_phys[kk]
-        for kk in range(<unsigned int>self.p2l_len):
-            out.phys_to_logic[kk] = self.phys_to_logic[kk]
-        return out
-            
-    @cython.boundscheck(False)
-    cdef void swap(self, unsigned int idx1, unsigned int idx2):
-        """ Swaps two indices in the Layout
-
-        Args:
-            idx1 (int): Index 1.
-            idx2 (int): Index 2.
-        """
-        cdef unsigned int temp1, temp2
-        temp1 = self.phys_to_logic[idx1]
-        temp2 = self.phys_to_logic[idx2]
-        self.phys_to_logic[idx1] = temp2
-        self.phys_to_logic[idx2] = temp1
-        self.logic_to_phys[self.phys_to_logic[idx1]] = idx1
-        self.logic_to_phys[self.phys_to_logic[idx2]] = idx2
-        
-    @cython.boundscheck(False)
-    cpdef object to_layout(self, object qregs):
-        """ Converts numeric layout back to Qiskit Layout object.
-
-        Args:
-            qregs (OrderedDict): An ordered dict of Qubit instances.
-        
-        Returns:
-            Layout: The corresponding Qiskit Layout object.
-        """
-        out = Layout()
-        cdef unsigned int main_idx = 0
-        cdef size_t idx
-        for qreg in qregs.values():
-            for idx in range(<unsigned int>qreg.size):
-                out[qreg[idx]] = self.logic_to_phys[main_idx]
-                main_idx += 1
-        return out
-    
-    
-cpdef NLayout nlayout_from_layout(object layout,
-                                  dict qubit_indices,
-                                  unsigned int logical_qubits,
-                                  unsigned int physical_qubits):
-    """ Converts Qiskit Layout object to numerical NLayout.
-
-    Args:
-        layout (Layout): A Qiskit Layout instance.
-        qubit_indices (dict): Dict of Qubit instances to an integer index.
-        logical_qubits (int): Number of logical qubits.
-        physical_qubits (int): Number of physical qubits.
-    Returns:
-        NLayout: The corresponding numerical layout.
-    """
-
-    cdef NLayout out = NLayout(logical_qubits, physical_qubits)
-    cdef object key, val
-    cdef dict merged_dict = {**layout._p2v, **layout._v2p}
-    for key, val in merged_dict.items():
-        if isinstance(key, Qubit):
-            out.logic_to_phys[qubit_indices[key]] = val
-        else:
-            out.phys_to_logic[key] = qubit_indices[val]
-    return out
diff --git a/qiskit/transpiler/passes/routing/stochastic_swap.py b/qiskit/transpiler/passes/routing/stochastic_swap.py
index b1ffe0cd6e72..f9c534ce1ceb 100644
--- a/qiskit/transpiler/passes/routing/stochastic_swap.py
+++ b/qiskit/transpiler/passes/routing/stochastic_swap.py
@@ -14,7 +14,6 @@
 
 import logging
 from math import inf
-from collections import OrderedDict
 import numpy as np
 
 from qiskit.circuit.quantumregister import QuantumRegister
@@ -24,12 +23,7 @@
 from qiskit.circuit.library.standard_gates import SwapGate
 from qiskit.transpiler.layout import Layout
 
-# pylint: disable=no-name-in-module
-from .cython.stochastic_swap.utils import nlayout_from_layout
-
-# pylint: disable=no-name-in-module
-from .cython.stochastic_swap.swap_trial import swap_trial
-
+from qiskit._accelerate import stochastic_swap as stochastic_swap_rs
 
 logger = logging.getLogger(__name__)
 
@@ -99,10 +93,7 @@ def run(self, dag):
         self._qubit_indices = {bit: idx for idx, bit in enumerate(dag.qubits)}
 
         self.qregs = dag.qregs
-        if self.seed is None:
-            self.seed = np.random.randint(0, np.iinfo(np.int32).max)
-        self.rng = np.random.default_rng(self.seed)
-        logger.debug("StochasticSwap default_rng seeded with seed=%s", self.seed)
+        logger.debug("StochasticSwap rng seeded with seed=%s", self.seed)
         self.coupling_map.compute_distance_matrix()
         new_dag = self._mapper(dag, self.coupling_map, trials=self.trials)
         return new_dag
@@ -146,9 +137,7 @@ def _layer_permutation(self, layer_partition, layout, qubit_subset, coupling, tr
         logger.debug("layer_permutation: trials = %s", trials)
 
         # The input dag is on a flat canonical register
-        # TODO: cleanup the code that is general for multiple qregs below
         canonical_register = QuantumRegister(len(layout), "q")
-        qregs = OrderedDict({canonical_register.name: canonical_register})
 
         gates = []  # list of lists of tuples [[(register, index), ...], ...]
         for gate_args in layer_partition:
@@ -177,55 +166,37 @@ def _layer_permutation(self, layer_partition, layout, qubit_subset, coupling, tr
         best_layout = None  # initialize best final layout
 
         cdist2 = coupling._dist_matrix**2
-        # Scaling matrix
-        scale = np.zeros((num_qubits, num_qubits))
-
         int_qubit_subset = np.fromiter(
             (self._qubit_indices[bit] for bit in qubit_subset),
-            dtype=np.int32,
+            dtype=np.uint64,
             count=len(qubit_subset),
         )
 
         int_gates = np.fromiter(
             (self._qubit_indices[bit] for gate in gates for bit in gate),
-            dtype=np.int32,
+            dtype=np.uint64,
             count=2 * len(gates),
         )
 
-        int_layout = nlayout_from_layout(layout, self._qubit_indices, num_qubits, coupling.size())
+        layout_mapping = {self._qubit_indices[k]: v for k, v in layout.get_virtual_bits().items()}
+        int_layout = stochastic_swap_rs.NLayout(layout_mapping, num_qubits, coupling.size())
 
         trial_circuit = DAGCircuit()  # SWAP circuit for slice of swaps in this trial
         trial_circuit.add_qubits(layout.get_virtual_bits())
 
-        edges = np.asarray(coupling.get_edges(), dtype=np.int32).ravel()
+        edges = np.asarray(coupling.get_edges(), dtype=np.uint64).ravel()
         cdist = coupling._dist_matrix
-        for trial in range(trials):
-            logger.debug("layer_permutation: trial %s", trial)
-            # This is one Trial --------------------------------------
-            dist, optim_edges, trial_layout, depth_step = swap_trial(
-                num_qubits,
-                int_layout,
-                int_qubit_subset,
-                int_gates,
-                cdist2,
-                cdist,
-                edges,
-                scale,
-                self.rng,
-            )
-
-            logger.debug("layer_permutation: final distance for this trial = %s", dist)
-            if dist == len(gates) and depth_step < best_depth:
-                logger.debug("layer_permutation: got circuit with improved depth %s", depth_step)
-                best_edges = optim_edges
-                best_layout = trial_layout
-                best_depth = min(best_depth, depth_step)
-
-            # Break out of trial loop if we found a depth 1 circuit
-            # since we can't improve it further
-            if best_depth == 1:
-                break
-
+        best_edges, best_layout, best_depth = stochastic_swap_rs.swap_trials(
+            trials,
+            num_qubits,
+            int_layout,
+            int_qubit_subset,
+            int_gates,
+            cdist,
+            cdist2,
+            edges,
+            seed=self.seed,
+        )
         # If we have no best circuit for this layer, all of the
         # trials have failed
         if best_layout is None:
@@ -233,7 +204,7 @@ def _layer_permutation(self, layer_partition, layout, qubit_subset, coupling, tr
             return False, None, None, None
 
         edges = best_edges.edges()
-        for idx in range(best_edges.size // 2):
+        for idx in range(len(edges) // 2):
             swap_src = self.trivial_layout._p2v[edges[2 * idx]]
             swap_tgt = self.trivial_layout._p2v[edges[2 * idx + 1]]
             trial_circuit.apply_operation_back(SwapGate(), [swap_src, swap_tgt], [])
@@ -241,7 +212,9 @@ def _layer_permutation(self, layer_partition, layout, qubit_subset, coupling, tr
 
         # Otherwise, we return our result for this layer
         logger.debug("layer_permutation: success!")
-        best_lay = best_layout.to_layout(qregs)
+        layout_mapping = best_layout.layout_mapping()
+
+        best_lay = Layout({best_circuit.qubits[k]: v for (k, v) in layout_mapping})
         return True, best_circuit, best_depth, best_lay
 
     def _layer_update(self, dag, layer, best_layout, best_depth, best_circuit):
diff --git a/releasenotes/notes/multithreaded-stochastic-swap-6c2f13d7bd566284.yaml b/releasenotes/notes/multithreaded-stochastic-swap-6c2f13d7bd566284.yaml
new file mode 100644
index 000000000000..a2fd09dc7e82
--- /dev/null
+++ b/releasenotes/notes/multithreaded-stochastic-swap-6c2f13d7bd566284.yaml
@@ -0,0 +1,58 @@
+---
+features:
+  - |
+    The internals of the :class:`.StochasticSwap` algorithm have been reimplemented
+    to be multithreaded and are now written in the
+    `Rust <https://www.rust-lang.org/>`__ programming language instead of Cython.
+    This significantly increases the run time performance of the compiler pass
+    and by extension :func:`~.transpile` when run with ``optimization_level`` 0,
+    1, and 2. By default the pass will use up to the number of logical CPUs on your
+    local system but you can control the number of threads used by the pass by setting
+    the ``RAYON_NUM_THREADS`` environment variable to an integer value. For example,
+    setting ``RAYON_NUM_THREADS=4`` will run the :class:`.StochasticSwap` pass with 4
+    threads.
+  - |
+    A new environment variable ``QISKIT_FORCE_THREADS`` is available for users to
+    directly control whether potentially multithreaded portions of qiskit's code
+    will run in multiple threads. Currently this is only used by the
+    :class:`~.StochasticSwap` transpiler pass but it will likely be used by other
+    parts of Qiskit in the future. When this env variable is set to ``TRUE`` any
+    multithreaded code in Qiskit Terra will always use multiple threads regardless
+    of any other runtime conditions that might have otherwise caused the function
+    to use a single threaded variant. For example, in :class:`~.StochasticSwap` if
+    the pass is being run as part of a :func:`~.transpile` call with > 1 circuit
+    that is being executed in parallel with ``multiprocessing`` via
+    :func:`~.parallel_map` the :class:`~.StochasticSwap` will not use multiple
+    threads to avoid potentially oversubscribing CPU resources. However, if you'd
+    like to use multiple threads in the pass along with multiple processes you
+    can set ``QISKIT_FORCE_THREADS=TRUE``.
+upgrade:
+  - |
+    The :class:`.StochasticSwap` transpiler pass may return different results with
+    the same seed value set. This is due to the internal rewrite of the transpiler
+    pass to improve runtime performance. However, this means that if you ran
+    :func:`~.transpile` with ``optimization_level`` 0, 1 (the default), or 2 with a
+    value set for ``seed_transpiler`` you may get an output with different swap
+    mapping present after upgrading to Qiskit Terra 0.20.0.
+  - |
+    To build Qiskit Terra from source a `Rust <https://www.rust-lang.org/>`__
+    compiler is now needed. This is due to the internal rewrite of the
+    :class:`.StochasticSwap` transpiler pass which greatly improves the runtime
+    performance of the transpiler. The Rust compiler can easily be installed
+    using rustup, which can be found here: https://rustup.rs/
+issues:
+  - |
+    When running :func:`.parallel_map` (which is done internally by
+    performance sensitive functions such as :func:`.transpile` and
+    :func:`.assemble`) in a subprocess launched outside of
+    :func:`.parallel_map` it is possible that the parallel dispatch performed
+    inside :func:`.parallel_map` will hang and never return.
+    This is due to upstream issues in CPython (see:
+    https://bugs.python.org/issue40379 for more details) around the default
+    method to launch subprocesses on Linux and macOS (with Python 3.7). If you
+    encounter this you have two options: you can either remove the nested
+    parallel processes, as calling :func:`.parallel_map` from a main process
+    should work fine, or you can manually call the CPython standard library
+    ``multiprocessing`` module to perform similar parallel dispatch from a
+    subprocess but use the ``"spawn"`` or ``"forkserver"`` launch methods to
+    avoid the potential to have things get stuck and never return.
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 7d0d367d5938..a83d45b91736 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,3 +1,4 @@
+setuptools-rust
 coverage>=4.4.0
 hypothesis>=4.24.3
 ipython<7.22.0
diff --git a/setup.py b/setup.py
index 5bf3a0ebade4..b40376fbc29f 100755
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,7 @@
 import re
 import sys
 from setuptools import setup, find_packages, Extension
+from setuptools_rust import Binding, RustExtension
 
 try:
     from Cython.Build import cythonize
@@ -25,17 +26,12 @@
     subprocess.call([sys.executable, "-m", "pip", "install", "Cython>=0.27.1"])
     from Cython.Build import cythonize
 
+
 with open("requirements.txt") as f:
     REQUIREMENTS = f.read().splitlines()
 
 # Add Cython extensions here
 CYTHON_EXTS = {
-    "qiskit/transpiler/passes/routing/cython/stochastic_swap/utils": (
-        "qiskit.transpiler.passes.routing.cython.stochastic_swap.utils"
-    ),
-    "qiskit/transpiler/passes/routing/cython/stochastic_swap/swap_trial": (
-        "qiskit.transpiler.passes.routing.cython.stochastic_swap.swap_trial"
-    ),
     "qiskit/quantum_info/states/cython/exp_value": "qiskit.quantum_info.states.cython.exp_value",
 }
 
@@ -139,6 +135,7 @@
         "Source Code": "https://github.com/Qiskit/qiskit-terra",
     },
     ext_modules=cythonize(EXT_MODULES),
+    rust_extensions=[RustExtension("qiskit._accelerate", "Cargo.toml", binding=Binding.PyO3)],
     zip_safe=False,
     entry_points={
         "qiskit.unitary_synthesis": [
diff --git a/src/edge_collections.rs b/src/edge_collections.rs
new file mode 100644
index 000000000000..103d0db5d4cf
--- /dev/null
+++ b/src/edge_collections.rs
@@ -0,0 +1,67 @@
+// This code is part of Qiskit.
+//
+// (C) Copyright IBM 2022
+//
+// This code is licensed under the Apache License, Version 2.0. You may
+// obtain a copy of this license in the LICENSE.txt file in the root directory
+// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+//
+// Any modifications or derivative works of this code must retain this
+// copyright notice, and modified files need to carry a notice indicating
+// that they have been altered from the originals.
+
+use numpy::IntoPyArray;
+use pyo3::prelude::*;
+use pyo3::Python;
+
+/// A simple container that contains a vector representing edges in the
+/// coupling map that are found to be optimal by the swap mapper.
+#[pyclass(module = "qiskit._accelerate.stochastic_swap")]
+#[pyo3(text_signature = "(/)")]
+#[derive(Clone, Debug)]
+pub struct EdgeCollection {
+    pub edges: Vec<usize>,
+}
+
+impl Default for EdgeCollection {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[pymethods]
+impl EdgeCollection {
+    #[new]
+    pub fn new() -> Self {
+        EdgeCollection { edges: Vec::new() }
+    }
+
+    /// Add one edge, given as its two endpoints in order, to the collection.
+    ///
+    /// Args:
+    ///     edge_start (int): The start node of the edge.
+    ///     edge_end (int): The end node of the edge.
+    #[pyo3(text_signature = "(self, edge_start, edge_end, /)")]
+    pub fn add(&mut self, edge_start: usize, edge_end: usize) {
+        self.edges.push(edge_start);
+        self.edges.push(edge_end);
+    }
+
+    /// Return the numpy array of edges
+    ///
+    /// The out array is the flattened edge list from the coupling graph.
+    /// For example, if the edge list were ``[(0, 1), (1, 2), (2, 3)]`` the
+    /// output array here would be ``[0, 1, 1, 2, 2, 3]``.
+    #[pyo3(text_signature = "(self, /)")]
+    pub fn edges(&self, py: Python) -> PyObject {
+        self.edges.clone().into_pyarray(py).into()
+    }
+
+    fn __getstate__(&self) -> Vec<usize> {
+        self.edges.clone()
+    }
+
+    fn __setstate__(&mut self, state: Vec<usize>) {
+        self.edges = state
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 000000000000..ff13bea71783
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,27 @@
+// This code is part of Qiskit.
+//
+// (C) Copyright IBM 2022
+//
+// This code is licensed under the Apache License, Version 2.0. You may
+// obtain a copy of this license in the LICENSE.txt file in the root directory
+// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+//
+// Any modifications or derivative works of this code must retain this
+// copyright notice, and modified files need to carry a notice indicating
+// that they have been altered from the originals.
+
+use pyo3::prelude::*;
+use pyo3::wrap_pymodule;
+use pyo3::Python;
+
+mod edge_collections;
+mod nlayout;
+mod stochastic_swap;
+
+use crate::stochastic_swap::PyInit_stochastic_swap;
+
+#[pymodule]
+fn _accelerate(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
+    m.add_wrapped(wrap_pymodule!(stochastic_swap))?;
+    Ok(())
+}
diff --git a/src/nlayout.rs b/src/nlayout.rs
new file mode 100644
index 000000000000..53675d07f521
--- /dev/null
+++ b/src/nlayout.rs
@@ -0,0 +1,89 @@
+// This code is part of Qiskit.
+//
+// (C) Copyright IBM 2022
+//
+// This code is licensed under the Apache License, Version 2.0. You may
+// obtain a copy of this license in the LICENSE.txt file in the root directory
+// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+//
+// Any modifications or derivative works of this code must retain this
+// copyright notice, and modified files need to carry a notice indicating
+// that they have been altered from the originals.
+
+use pyo3::prelude::*;
+
+use hashbrown::HashMap;
+
+/// An unsigned integer Vector based layout class
+///
+/// This class tracks the layout (or mapping between virtual qubits in the
+/// circuit and physical qubits on the physical device) efficiently.
+///
+/// Args:
+///     qubit_indices (dict): A dictionary mapping the virtual qubit index in the circuit to the
+///         physical qubit index on the coupling graph.
+///     logical_qubits (int): The number of logical qubits in the layout
+///     physical_qubits (int): The number of physical qubits in the layout
+#[pyclass(module = "qiskit._accelerate.stochastic_swap")]
+#[pyo3(text_signature = "(qubit_indices, logical_qubits, physical_qubits, /)")]
+#[derive(Clone, Debug)]
+pub struct NLayout {
+    pub logic_to_phys: Vec<usize>,
+    pub phys_to_logic: Vec<usize>,
+}
+
+impl NLayout {
+    pub fn swap(&mut self, idx1: usize, idx2: usize) {
+        self.phys_to_logic.swap(idx1, idx2);
+        self.logic_to_phys[self.phys_to_logic[idx1]] = idx1;
+        self.logic_to_phys[self.phys_to_logic[idx2]] = idx2;
+    }
+}
+
+#[pymethods]
+impl NLayout {
+    #[new]
+    fn new(
+        qubit_indices: HashMap<usize, usize>,
+        logical_qubits: usize,
+        physical_qubits: usize,
+    ) -> Self {
+        let mut res = NLayout {
+            logic_to_phys: vec![std::usize::MAX; logical_qubits],
+            phys_to_logic: vec![std::usize::MAX; physical_qubits],
+        };
+        for (key, value) in qubit_indices {
+            res.logic_to_phys[key] = value;
+            res.phys_to_logic[value] = key;
+        }
+        res
+    }
+
+    fn __getstate__(&self) -> [Vec<usize>; 2] {
+        [self.logic_to_phys.clone(), self.phys_to_logic.clone()]
+    }
+
+    fn __setstate__(&mut self, state: [Vec<usize>; 2]) {
+        self.logic_to_phys = state[0].clone();
+        self.phys_to_logic = state[1].clone();
+    }
+
+    /// Return the layout mapping
+    ///
+    /// .. note::
+    ///
+    ///     this copies the data from Rust to Python and has linear
+    ///     overhead based on the number of qubits.
+    ///
+    /// Returns:
+    ///     list: A list of 2 element lists in the form:
+    ///     ``[[logical_qubit, physical_qubit], ...]``, where the logical qubit
+    ///     is the index of the qubit in the circuit.
+    ///
+    #[pyo3(text_signature = "(self, /)")]
+    fn layout_mapping(&self) -> Vec<[usize; 2]> {
+        (0..self.logic_to_phys.len())
+            .map(|i| [i, self.logic_to_phys[i]])
+            .collect()
+    }
+}
diff --git a/src/stochastic_swap.rs b/src/stochastic_swap.rs
new file mode 100644
index 000000000000..033335024831
--- /dev/null
+++ b/src/stochastic_swap.rs
@@ -0,0 +1,352 @@
+// This code is part of Qiskit.
+//
+// (C) Copyright IBM 2022
+//
+// This code is licensed under the Apache License, Version 2.0. You may
+// obtain a copy of this license in the LICENSE.txt file in the root directory
+// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+//
+// Any modifications or derivative works of this code must retain this
+// copyright notice, and modified files need to carry a notice indicating
+// that they have been altered from the originals.
+
+// Needed to pass shared state between functions;
+// closures don't work because of recursion
+#![allow(clippy::too_many_arguments)]
+#![allow(clippy::type_complexity)]
+
+use std::env;
+use std::sync::RwLock;
+
+use hashbrown::HashSet;
+
+use ndarray::prelude::*;
+use numpy::{PyReadonlyArray1, PyReadonlyArray2};
+use rayon::prelude::*;
+
+use pyo3::prelude::*;
+use pyo3::wrap_pyfunction;
+use pyo3::Python;
+
+use rand::prelude::*;
+use rand_distr::{Distribution, Normal};
+use rand_pcg::Pcg64Mcg;
+
+use crate::edge_collections::EdgeCollection;
+use crate::nlayout::NLayout;
+
+#[inline]
+fn compute_cost(
+    dist: &ArrayView2<f64>,
+    layout: &NLayout,
+    gates: &[usize],
+    num_gates: usize,
+) -> f64 {
+    (0..num_gates)
+        .map(|kk| {
+            let ii = layout.logic_to_phys[gates[2 * kk]];
+            let jj = layout.logic_to_phys[gates[2 * kk + 1]];
+            dist[[ii, jj]]
+        })
+        .sum()
+}
+
+/// Computes the symmetric random scaling (perturbation) matrix,
+/// and places the values in the 'scale' array.
+///
+/// Args:
+///     scale (ndarray): An array of doubles where the values are to be stored.
+///     cdist2 (ndarray): Array representing the coupling map distance squared.
+///     rand (slice of f64): Random values of length num_qubits*(num_qubits+1)/2.
+///     num_qubits (int): Number of physical qubits.
+#[inline]
+fn compute_random_scaling(
+    scale: &mut Array2<f64>,
+    cdist2: &ArrayView2<f64>,
+    rand: &[f64],
+    num_qubits: usize,
+) {
+    let mut idx: usize = 0;
+    for ii in 0..num_qubits {
+        for jj in 0..ii {
+            scale[[ii, jj]] = rand[idx] * cdist2[[ii, jj]];
+            scale[[jj, ii]] = scale[[ii, jj]];
+            idx += 1
+        }
+    }
+}
+
+fn swap_trial(
+    num_qubits: usize,
+    int_layout: &NLayout,
+    int_qubit_subset: &[usize],
+    gates: &[usize],
+    cdist: ArrayView2<f64>,
+    cdist2: ArrayView2<f64>,
+    edges: &[usize],
+    seed: u64,
+    trial_num: u64,
+    locked_best_possible: Option<&RwLock<&mut Option<(u64, f64, EdgeCollection, NLayout)>>>,
+) -> Option<(f64, EdgeCollection, NLayout, usize)> {
+    if let Some(locked_best_possible) = locked_best_possible {
+        // Return fast if a depth == 1 solution was already found in another parallel
+        // trial. However for deterministic results in cases of multiple depth == 1
+        // solutions still search for a solution if this trial number is less than
+        // the found solution (this mirrors the previous behavior of a serial loop).
+        let best_possible = locked_best_possible.read().unwrap();
+        if best_possible.is_some() && best_possible.as_ref().unwrap().0 < trial_num {
+            return None;
+        }
+    }
+    let mut opt_edges = EdgeCollection::new();
+    let mut trial_layout = int_layout.clone();
+    let mut optimal_layout = int_layout.clone();
+
+    let num_gates: usize = gates.len() / 2;
+    let num_edges: usize = edges.len() / 2;
+
+    let mut cost_reduced;
+    let mut depth_step: usize = 1;
+    let depth_max: usize = 2 * num_qubits + 1;
+    let mut min_cost: f64;
+    let mut new_cost: f64;
+    let mut dist: f64;
+
+    let mut optimal_start: usize = std::usize::MAX;
+    let mut optimal_end: usize = std::usize::MAX;
+    let mut optimal_start_qubit = std::usize::MAX;
+    let mut optimal_end_qubit = std::usize::MAX;
+
+    let mut scale = Array2::zeros((num_qubits, num_qubits));
+
+    let distribution = Normal::new(1.0, 1.0 / num_qubits as f64).unwrap();
+    let mut rng: Pcg64Mcg = Pcg64Mcg::seed_from_u64(seed);
+    let rand_arr: Vec<f64> = distribution
+        .sample_iter(&mut rng)
+        .take(num_qubits * (num_qubits + 1) / 2)
+        .collect();
+
+    compute_random_scaling(&mut scale, &cdist2, &rand_arr, num_qubits);
+
+    let input_qubit_set: HashSet<usize> = int_qubit_subset.iter().copied().collect();
+
+    while depth_step < depth_max {
+        let mut qubit_set = input_qubit_set.clone();
+        while !qubit_set.is_empty() {
+            min_cost = compute_cost(&scale.view(), &trial_layout, gates, num_gates);
+            // Try to decrease the objective function
+            cost_reduced = false;
+            for idx in 0..num_edges {
+                let start_edge = edges[2 * idx];
+                let end_edge = edges[2 * idx + 1];
+                let start_qubit = trial_layout.phys_to_logic[start_edge];
+                let end_qubit = trial_layout.phys_to_logic[end_edge];
+                if qubit_set.contains(&start_qubit) && qubit_set.contains(&end_qubit) {
+                    // Try this edge to reduce cost
+                    trial_layout.swap(start_edge, end_edge);
+                    // compute objective function
+                    new_cost = compute_cost(&scale.view(), &trial_layout, gates, num_gates);
+                    // record progress if we succeed
+                    if new_cost < min_cost {
+                        cost_reduced = true;
+                        min_cost = new_cost;
+                        optimal_layout = trial_layout.clone();
+                        optimal_start = start_edge;
+                        optimal_end = end_edge;
+                        optimal_start_qubit = start_qubit;
+                        optimal_end_qubit = end_qubit;
+                    }
+                    trial_layout.swap(start_edge, end_edge);
+                }
+            }
+            // After going over all edges
+            // Were there any good swap choices?
+            if cost_reduced {
+                qubit_set.remove(&optimal_start_qubit);
+                qubit_set.remove(&optimal_end_qubit);
+                trial_layout = optimal_layout.clone();
+                opt_edges.add(optimal_start, optimal_end);
+            } else {
+                break;
+            }
+        }
+        // We have either run out of swap pairs to try or failed to improve
+        // the cost
+
+        // Compute the coupling graph distance
+        dist = compute_cost(&cdist, &trial_layout, gates, num_gates);
+        // If all gates can be applied now we're finished.
+        // Otherwise we need to consider a deeper swap circuit
+        if dist as usize == num_gates {
+            break;
+        }
+        // increment the depth
+        depth_step += 1;
+    }
+    // Either we have succeeded at some depth d < d_max or failed
+    dist = compute_cost(&cdist, &trial_layout, gates, num_gates);
+    if let Some(locked_best_possible) = locked_best_possible {
+        if dist as usize == num_gates && depth_step == 1 {
+            let mut best_possible = locked_best_possible.write().unwrap();
+            // In the case an ideal solution has already been found to preserve
+            // behavior consistent with the single threaded predecessor to this function
+            // we defer to the earlier trial
+            if best_possible.is_none() || best_possible.as_ref().unwrap().0 > trial_num {
+                **best_possible = Some((trial_num, dist, opt_edges, trial_layout));
+            }
+            return None;
+        }
+    }
+    Some((dist, opt_edges, trial_layout, depth_step))
+}
+
+/// Run the random trials as part of the layer permutation used internally for
+/// the stochastic swap algorithm.
+///
+/// This function is multithreaded and will spawn a thread pool as part of its
+/// execution. By default the number of threads will be equal to the number of
+/// CPUs. You can tune the number of threads with the RAYON_NUM_THREADS
+/// environment variable. For example, setting RAYON_NUM_THREADS=4 would limit
+/// the thread pool to 4 threads.
+///
+/// Args:
+///     num_trials (int): The number of random trials to attempt
+///     num_qubits (int): The number of qubits
+///     int_layout (NLayout): The initial layout for the layer. The layout is a mapping
+///         of virtual qubits to physical qubits in the coupling graph
+///     int_qubit_subset (ndarray): A 1D array of qubit indices for the set of qubits in the
+///         coupling map that we've chosen to map into.
+///     int_gates (ndarray): A 1D array of qubit pairs that each 2 qubit gate operates on.
+///         The pairs are flattened on the array so that each pair in the list of 2q gates
+///         are adjacent in the array. For example, if the 2q interaction list was
+///         ``[(0, 1), (2, 1), (3, 2)]``, the input here would be ``[0, 1, 2, 1, 3, 2]``.
+///     cdist (ndarray): The distance matrix for the coupling graph of the target
+///         backend
+///     cdist2 (ndarray): The distance matrix squared for the coupling graph of the
+///         target backend
+///     edges (ndarray): A flattened 1d array of the edge list of the coupling graph.
+///         The pairs are flattened on the array so that each node pair in the edge are
+///         adjacent in the array. For example, if the edge list were ``[(0, 1), (1, 2), (2, 3)]``
+///         the input array here would be ``[0, 1, 1, 2, 2, 3]``.
+///     seed (int): An optional seed for the rng used to generate the random perturbation
+///         matrix used in each trial
+/// Returns:
+///     tuple: If a valid layout permutation is found a tuple of the form:
+///         ``(edges, layout, depth)`` is returned. If a solution is not found the output
+///         will be ``(None, None, max int)``.
+#[pyfunction]
+#[pyo3(
+    text_signature = "(num_trials, num_qubits, int_layout, int_qubit_subset, int_gates, cdist, cdist2, edges, /, seed=None)"
+)]
+pub fn swap_trials(
+    num_trials: u64,
+    num_qubits: usize,
+    int_layout: &NLayout,
+    int_qubit_subset: PyReadonlyArray1<usize>,
+    int_gates: PyReadonlyArray1<usize>,
+    cdist: PyReadonlyArray2<f64>,
+    cdist2: PyReadonlyArray2<f64>,
+    edges: PyReadonlyArray1<usize>,
+    seed: Option<u64>,
+) -> PyResult<(Option<EdgeCollection>, Option<NLayout>, usize)> {
+    let int_qubit_subset_arr = int_qubit_subset.as_slice()?;
+    let int_gates_arr = int_gates.as_slice()?;
+    let cdist_arr = cdist.as_array();
+    let cdist2_arr = cdist2.as_array();
+    let edges_arr = edges.as_slice()?;
+    let num_gates: usize = int_gates.len() / 2;
+    let mut best_possible: Option<(u64, f64, EdgeCollection, NLayout)> = None;
+    let locked_best_possible: RwLock<&mut Option<(u64, f64, EdgeCollection, NLayout)>> =
+        RwLock::new(&mut best_possible);
+    let outer_rng: Pcg64Mcg = match seed {
+        Some(seed) => Pcg64Mcg::seed_from_u64(seed),
+        None => Pcg64Mcg::from_entropy(),
+    };
+    let seed_vec: Vec<u64> = outer_rng
+        .sample_iter(&rand::distributions::Standard)
+        .take(num_trials as usize)
+        .collect();
+    // Run in parallel only if we're not already in a multiprocessing context
+    // unless force threads is set.
+    let parallel_context = env::var("QISKIT_IN_PARALLEL")
+        .unwrap_or_else(|_| "FALSE".to_string())
+        .to_uppercase()
+        == "TRUE";
+    let force_threads = env::var("QISKIT_FORCE_THREADS")
+        .unwrap_or_else(|_| "FALSE".to_string())
+        .to_uppercase()
+        == "TRUE";
+    let run_in_parallel = !parallel_context || force_threads;
+
+    let mut best_depth = std::usize::MAX;
+    let mut best_edges: Option<EdgeCollection> = None;
+    let mut best_layout: Option<NLayout> = None;
+    if run_in_parallel {
+        let result: Vec<Option<(f64, EdgeCollection, NLayout, usize)>> = (0..num_trials)
+            .into_par_iter()
+            .map(|trial_num| {
+                swap_trial(
+                    num_qubits,
+                    int_layout,
+                    int_qubit_subset_arr,
+                    int_gates_arr,
+                    cdist_arr,
+                    cdist2_arr,
+                    edges_arr,
+                    seed_vec[trial_num as usize],
+                    trial_num,
+                    Some(&locked_best_possible),
+                )
+            })
+            .collect();
+        match best_possible {
+            Some((_trial_num, _dist, edges, layout)) => {
+                best_edges = Some(edges);
+                best_layout = Some(layout);
+                best_depth = 1;
+            }
+            None => {
+                for (dist, edges, layout, depth) in result.into_iter().flatten() {
+                    if dist as usize == num_gates && depth < best_depth {
+                        best_edges = Some(edges);
+                        best_layout = Some(layout);
+                        best_depth = depth;
+                    }
+                }
+            }
+        };
+    } else {
+        for trial_num in 0..num_trials {
+            let (dist, edges, layout, depth) = swap_trial(
+                num_qubits,
+                int_layout,
+                int_qubit_subset_arr,
+                int_gates_arr,
+                cdist_arr,
+                cdist2_arr,
+                edges_arr,
+                seed_vec[trial_num as usize],
+                trial_num,
+                None,
+            )
+            .unwrap();
+            if dist as usize == num_gates && depth < best_depth {
+                best_edges = Some(edges);
+                best_layout = Some(layout);
+                best_depth = depth;
+                if depth == 1 {
+                    return Ok((best_edges, best_layout, best_depth));
+                }
+            }
+        }
+    }
+    Ok((best_edges, best_layout, best_depth))
+}
+
+#[pymodule]
+pub fn stochastic_swap(_py: Python, m: &PyModule) -> PyResult<()> {
+    m.add_wrapped(wrap_pyfunction!(swap_trials))?;
+    m.add_class::<NLayout>()?;
+    m.add_class::<EdgeCollection>()?;
+    Ok(())
+}
diff --git a/test/python/qasm/TestsStochasticSwap_handle_measurement.qasm b/test/python/qasm/TestsStochasticSwap_handle_measurement.qasm
index bc161f609758..a14feda051e2 100644
--- a/test/python/qasm/TestsStochasticSwap_handle_measurement.qasm
+++ b/test/python/qasm/TestsStochasticSwap_handle_measurement.qasm
@@ -5,11 +5,10 @@ creg c[4];
 cx q[0],q[1];
 h q[3];
 measure q[2] -> c[2];
-swap q[1],q[2];
-swap q[0],q[1];
-cx q[3],q[2];
 swap q[2],q[3];
 cx q[2],q[1];
+swap q[0],q[1];
+measure q[0] -> c[1];
+cx q[2],q[1];
 measure q[1] -> c[0];
-measure q[3] -> c[1];
 measure q[2] -> c[3];
diff --git a/test/python/transpiler/test_stochastic_swap.py b/test/python/transpiler/test_stochastic_swap.py
index d1e2a233bde9..8aed99218c6f 100644
--- a/test/python/transpiler/test_stochastic_swap.py
+++ b/test/python/transpiler/test_stochastic_swap.py
@@ -318,36 +318,30 @@ def test_overoptimization_case(self):
         expected.z(qr[2])
         expected.y(qr[1])
         expected.x(qr[0])
-        expected.swap(qr[1], qr[2])
-        expected.cx(qr[0], qr[2])
-        expected.swap(qr[2], qr[3])
-        expected.cx(qr[1], qr[2])
-        expected.s(qr[3])
-        expected.t(qr[1])
-        expected.h(qr[2])
+        expected.swap(qr[0], qr[2])
+        expected.cx(qr[2], qr[1])
+        expected.swap(qr[0], qr[2])
+        expected.cx(qr[2], qr[3])
+        expected.s(qr[1])
+        expected.t(qr[2])
+        expected.h(qr[3])
         expected.measure(qr[0], cr[0])
-        expected.swap(qr[1], qr[2])
-        expected.cx(qr[3], qr[2])
-        expected.measure(qr[1], cr[3])
-        expected.measure(qr[3], cr[1])
+        expected.cx(qr[1], qr[2])
+        expected.measure(qr[3], cr[3])
+        expected.measure(qr[1], cr[1])
         expected.measure(qr[2], cr[2])
         expected_dag = circuit_to_dag(expected)
-        #                      ┌───┐     ┌─┐
-        # q_0: |0>─────────────┤ X ├──■──┤M├────────────────────────────────────────
-        #              ┌───┐   └───┘  │  └╥┘             ┌───┐        ┌───┐┌─┐
-        # q_1: |0>─────┤ Y ├─X────────┼───╫───────────■──┤ T ├────────┤ X ├┤M├──────
-        #         ┌───┐└───┘ │      ┌─┴─┐ ║         ┌─┴─┐└───┘┌───┐   └─┬─┘└╥┘┌─┐
-        # q_2: |0>┤ Z ├──────X──────┤ X ├─╫──X──────┤ X ├─────┤ H ├─X───■───╫─┤M├───
-        #         └───┘             └───┘ ║  │ ┌───┐└───┘     └───┘ │       ║ └╥┘┌─┐
-        # q_3: |0>────────────────────────╫──X─┤ S ├────────────────X───────╫──╫─┤M├
-        #                                 ║    └───┘                        ║  ║ └╥┘
-        #  c_0: 0 ════════════════════════╩═════════════════════════════════╬══╬══╬═
-        #                                                                   ║  ║  ║
-        #  c_1: 0 ══════════════════════════════════════════════════════════╬══╩══╬═
-        #                                                                   ║     ║
-        #  c_2: 0 ══════════════════════════════════════════════════════════╩═════╬═
-        #                                                                         ║
-        #  c_3: 0 ════════════════════════════════════════════════════════════════╩═
+        #      ┌───┐                ┌─┐
+        # q_0: ┤ X ├─X───────X──────┤M├────────────────
+        #      ├───┤ │ ┌───┐ │ ┌───┐└╥┘          ┌─┐
+        # q_1: ┤ Y ├─┼─┤ X ├─┼─┤ S ├─╫────────■──┤M├───
+        #      ├───┤ │ └─┬─┘ │ └───┘ ║ ┌───┐┌─┴─┐└╥┘┌─┐
+        # q_2: ┤ Z ├─X───■───X───■───╫─┤ T ├┤ X ├─╫─┤M├
+        #      └───┘           ┌─┴─┐ ║ ├───┤└┬─┬┘ ║ └╥┘
+        # q_3: ────────────────┤ X ├─╫─┤ H ├─┤M├──╫──╫─
+        #                      └───┘ ║ └───┘ └╥┘  ║  ║
+        # c: 4/══════════════════════╩════════╩═══╩══╩═
+        #                            0        3   1  2
 
         #
         # Layout --
@@ -428,6 +422,7 @@ def test_congestion(self):
         circ.measure(qr[2], cr[2])
         circ.measure(qr[3], cr[3])
         dag = circuit_to_dag(circ)
+        # Input:
         #                                             ┌─┐┌───┐        ┌─┐
         # q_0: |0>─────────────────■──────────────────┤M├┤ H ├──■─────┤M├
         #                   ┌───┐  │                  └╥┘└───┘┌─┴─┐┌─┐└╥┘
@@ -445,23 +440,20 @@ def test_congestion(self):
         #                                        ║
         #  c_3: 0 ═══════════════════════════════╩═══════════════════════
         #
-        #                    ┌───┐                      ┌───┐   ┌─┐
-        #  q_0: |0>───────X──┤ H ├──────────────────────┤ X ├───┤M├
-        #                 │  └───┘┌─┐        ┌───┐      └─┬─┘┌─┐└╥┘
-        #  q_1: |0>──■────X────■──┤M├──────X─┤ X ├─X──────■──┤M├─╫─
-        #          ┌─┴─┐┌───┐  │  └╥┘      │ └─┬─┘ │ ┌─┐     └╥┘ ║
-        #  q_2: |0>┤ X ├┤ H ├──┼───╫───────┼───■───┼─┤M├──────╫──╫─
-        #          └───┘└───┘┌─┴─┐ ║ ┌───┐ │ ┌───┐ │ └╥┘ ┌─┐  ║  ║
-        #  q_3: |0>──────────┤ X ├─╫─┤ H ├─X─┤ H ├─X──╫──┤M├──╫──╫─
-        #                    └───┘ ║ └───┘   └───┘    ║  └╥┘  ║  ║
-        #   c_0: 0 ════════════════╩══════════════════╬═══╬═══╩══╬═
-        #                                             ║   ║      ║
-        #   c_1: 0 ═══════════════════════════════════╬═══╬══════╩═
-        #                                             ║   ║
-        #   c_2: 0 ═══════════════════════════════════╩═══╬════════
-        #                                                 ║
-        #   c_3: 0 ═══════════════════════════════════════╩════════
+        # Expected output (with seed 999):
+        #                ┌───┐                        ┌─┐
+        # q_0: ───────X──┤ H ├─────────────────X──────┤M├──────
+        #             │  └───┘     ┌─┐   ┌───┐ │ ┌───┐└╥┘   ┌─┐
+        # q_1: ──■────X────■───────┤M├─X─┤ X ├─X─┤ X ├─╫────┤M├
+        #      ┌─┴─┐┌───┐  │       └╥┘ │ └─┬─┘┌─┐└─┬─┘ ║    └╥┘
+        # q_2: ┤ X ├┤ H ├──┼────────╫──┼───■──┤M├──┼───╫─────╫─
+        #      └───┘└───┘┌─┴─┐┌───┐ ║  │ ┌───┐└╥┘  │   ║ ┌─┐ ║
+        # q_3: ──────────┤ X ├┤ H ├─╫──X─┤ H ├─╫───■───╫─┤M├─╫─
+        #                └───┘└───┘ ║    └───┘ ║       ║ └╥┘ ║
+        # c: 4/═════════════════════╩══════════╩═══════╩══╩══╩═
+        #                           0          2       3  0  1
         #
+        # Target coupling graph:
         #     2
         #     |
         # 0 - 1 - 3
@@ -472,22 +464,21 @@ def test_congestion(self):
         expected.swap(qr[0], qr[1])
         expected.h(qr[0])
         expected.cx(qr[1], qr[3])
-        expected.measure(qr[1], cr[0])
         expected.h(qr[3])
+        expected.measure(qr[1], cr[0])
         expected.swap(qr[1], qr[3])
         expected.cx(qr[2], qr[1])
         expected.h(qr[3])
-        expected.swap(qr[1], qr[3])
+        expected.swap(qr[0], qr[1])
         expected.measure(qr[2], cr[2])
-        expected.measure(qr[3], cr[3])
-        expected.cx(qr[1], qr[0])
-        expected.measure(qr[1], cr[0])
-        expected.measure(qr[0], cr[1])
+        expected.cx(qr[3], qr[1])
+        expected.measure(qr[0], cr[3])
+        expected.measure(qr[3], cr[0])
+        expected.measure(qr[1], cr[1])
         expected_dag = circuit_to_dag(expected)
 
         pass_ = StochasticSwap(coupling, 20, 999)
         after = pass_.run(dag)
-
         self.assertEqual(expected_dag, after)
 
     def test_only_output_cx_and_swaps_in_coupling_map(self):
diff --git a/tools/install_rust.sh b/tools/install_rust.sh
new file mode 100755
index 000000000000..d86416207240
--- /dev/null
+++ b/tools/install_rust.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+# Install Rust via rustup once; rust-installer/ is dropped on failure so a rerun retries.
+if [ ! -d rust-installer ]; then
+    mkdir rust-installer && wget https://sh.rustup.rs -O rust-installer/rustup.sh \
+        && sh rust-installer/rustup.sh -y || { rm -rf rust-installer; false; }
+fi
diff --git a/tools/verify_parallel_map.py b/tools/verify_parallel_map.py
new file mode 100755
index 000000000000..87fc8830abf7
--- /dev/null
+++ b/tools/verify_parallel_map.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+# This code is part of Qiskit.
+#
+# (C) Copyright IBM 2022.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+
+# pylint: disable=wrong-import-position
+
+"""Test script to verify parallel dispatch via parallel_map() works as expected."""
+
+
+import math
+import os
+
+
+ORIG_ENV_VAR = os.getenv("QISKIT_PARALLEL", None)
+if ORIG_ENV_VAR is not None:
+    print("Removing QISKIT_PARALLEL env var to verify defaults")
+    del os.environ["QISKIT_PARALLEL"]
+
+
+from qiskit.compiler import transpile
+from qiskit.circuit import QuantumCircuit, QuantumRegister, ClassicalRegister
+from qiskit.test.mock import FakeRueschlikon
+
+
+def run_test():
+    """Run tests."""
+    backend = FakeRueschlikon()
+    qr = QuantumRegister(16)
+    cr = ClassicalRegister(16)
+    qc = QuantumCircuit(qr, cr)
+    qc.h(qr[0])
+    for k in range(1, 15):
+        qc.cx(qr[0], qr[k])
+    qc.measure(qr, cr)
+    qlist = [qc for k in range(15)]
+    for opt_level in [0, 1, 2, 3]:
+        tqc = transpile(
+            qlist, backend=backend, optimization_level=opt_level, seed_transpiler=424242
+        )
+        result = backend.run(tqc, seed_simulator=4242424242, shots=1000).result()
+        counts = result.get_counts()
+        for count in counts:
+            assert math.isclose(count["0000000000000000"], 500, rel_tol=0.1)
+            assert math.isclose(count["0111111111111111"], 500, rel_tol=0.1)
+
+
+if __name__ == "__main__":
+    run_test()
+    if ORIG_ENV_VAR is not None:
+        print(f"Restoring QISKIT_PARALLEL env var to {ORIG_ENV_VAR}")
+        os.environ["QISKIT_PARALLEL"] = ORIG_ENV_VAR
diff --git a/tox.ini b/tox.ini
index 0ca05f10807a..219bc954d5f0 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,7 +1,7 @@
 [tox]
-minversion = 2.1
+minversion = 3.3.0
 envlist = py37, py38, py39, py310, lint-incr
-skipsdist = True
+isolated_build = true
 
 [testenv]
 usedevelop = True
@@ -13,10 +13,12 @@ setenv =
   ARGS="-V"
   QISKIT_SUPRESS_PACKAGING_WARNINGS=Y
   QISKIT_TEST_CAPTURE_STREAMS=1
+  QISKIT_PARALLEL=FALSE
 deps = -r{toxinidir}/requirements.txt
        -r{toxinidir}/requirements-dev.txt
 commands =
   stestr run {posargs}
+  python {toxinidir}/tools/verify_parallel_map.py
 
 [testenv:lint]
 envdir = .tox/lint
@@ -58,6 +60,7 @@ deps = -r{toxinidir}/requirements.txt
        qiskit-aer
 commands =
   stestr run {posargs}
+  coverage3 run --source qiskit --parallel-mode {toxinidir}/tools/verify_parallel_map.py
   coverage3 combine
   coverage3 report