From ad06d1af8b4158eb6d73262c51f04e0035e96a75 Mon Sep 17 00:00:00 2001
From: Matthew Treinish <mtreinish@kortar.org>
Date: Sun, 10 Nov 2024 10:38:20 -0500
Subject: [PATCH] Add new multithreaded TwoQubitPeepholeOptimization pass

This commit adds a new transpiler pass for physical optimization,
TwoQubitPeepholeOptimization. This replaces the use of Collect2qBlocks,
ConsolidateBlocks, and UnitarySynthesis in the optimization stage for
a default pass manager setup. The pass logically works the same way
where it analyzes the dag to get a list of 2q runs, calculates the matrix
of each run, and then synthesizes the matrix and substitutes it inplace.
The distinction this pass makes though is it does this all in a single
pass and also parallelizes the matrix calculation and synthesis steps
because there is no data dependency there.

This new pass is not meant to fully replace the Collect2qBlocks,
ConsolidateBlocks, or UnitarySynthesis passes as those also run in
contexts where we don't have a physical circuit. This is meant instead
to replace their usage in the optimization stage only. Accordingly this
new pass also changes the logic on how we select the synthesis to use
and when to make a substituion. Previously this logic was primarily done
via the ConsolidateBlocks pass by only consolidating to a UnitaryGate if
the number of basis gates needed based on the weyl chamber coordinates
was less than the number of 2q gates in the block (see #11659 for
discussion on this). Since this new pass skips the explicit
consolidation stage we go ahead and try all the available synthesizers

Right now this commit has a number of limitations, the largest are:

- Only supports the target
- It doesn't support any synthesizers besides the TwoQubitBasisDecomposer,
  because it's the only one in rust currently.

For plugin handling I left the logic as running the three pass series,
but I'm not sure this is the behavior we want. We could say keep the
synthesis plugins for `UnitarySynthesis` only and then rely on our
built-in methods for physical optimiztion only. But this also seems less
than ideal because the plugin mechanism is how we support synthesizing
to custom basis gates, and also more advanced approximate synthesis
methods. Both of those are things we need to do as part of the synthesis
here.

Additionally, this is currently missing tests and documentation and while
running it manually "works" as in it returns a circuit that looks valid,
I've not done any validation yet. This also likely will need several
rounds of performance optimization and tuning. t this point this is
just a rough proof of concept and will need a lof refinement along with
larger changes to Qiskit's rust code before this is ready to merge.

Fixes #12007
Fixes #11659
---
 crates/accelerate/Cargo.toml                  |   7 +-
 crates/accelerate/src/consolidate_blocks.rs   |   8 +-
 .../accelerate/src/convert_2q_block_matrix.rs |  21 +-
 crates/accelerate/src/lib.rs                  |   1 +
 crates/accelerate/src/two_qubit_decompose.rs  |  19 +-
 crates/accelerate/src/two_qubit_peephole.rs   | 408 ++++++++++++++++++
 crates/accelerate/src/unitary_synthesis.rs    |   3 +-
 crates/pyext/src/lib.rs                       |   5 +
 qiskit/__init__.py                            |   1 +
 .../passes/optimization/two_qubit_peephole.py |  59 +++
 10 files changed, 507 insertions(+), 25 deletions(-)
 create mode 100644 crates/accelerate/src/two_qubit_peephole.rs
 create mode 100644 qiskit/transpiler/passes/optimization/two_qubit_peephole.py

diff --git a/crates/accelerate/Cargo.toml b/crates/accelerate/Cargo.toml
index cc624e7750d6..0a3c295474a7 100644
--- a/crates/accelerate/Cargo.toml
+++ b/crates/accelerate/Cargo.toml
@@ -9,6 +9,10 @@ license.workspace = true
 name = "qiskit_accelerate"
 doctest = false
 
+
+[features]
+cache_pygates = ["qiskit-circuit/cache_pygates"]
+
 [dependencies]
 rayon.workspace = true
 numpy.workspace = true
@@ -60,6 +64,3 @@ features = ["ndarray"]
 [dependencies.pulp]
 version = "0.18.22"
 features = ["macro"]
-
-[features]
-cache_pygates = ["qiskit-circuit/cache_pygates"]
diff --git a/crates/accelerate/src/consolidate_blocks.rs b/crates/accelerate/src/consolidate_blocks.rs
index 1edd592ce877..46c648af75c1 100644
--- a/crates/accelerate/src/consolidate_blocks.rs
+++ b/crates/accelerate/src/consolidate_blocks.rs
@@ -107,7 +107,7 @@ pub(crate) fn consolidate_blocks(
                 dag.get_qargs(inst.qubits),
             ) {
                 all_block_gates.insert(inst_node);
-                let matrix = match get_matrix_from_inst(py, inst) {
+                let matrix = match get_matrix_from_inst(inst) {
                     Ok(mat) => mat,
                     Err(_) => continue,
                 };
@@ -198,7 +198,7 @@ pub(crate) fn consolidate_blocks(
                 *block_qargs.iter().min().unwrap(),
                 *block_qargs.iter().max().unwrap(),
             ];
-            let matrix = blocks_to_matrix(py, dag, &block, block_index_map).ok();
+            let matrix = blocks_to_matrix(dag, &block, block_index_map).ok();
             if let Some(matrix) = matrix {
                 if force_consolidate
                     || decomposer.num_basis_gates_inner(matrix.view()) < basis_count
@@ -252,7 +252,7 @@ pub(crate) fn consolidate_blocks(
                     first_qubits,
                 )
             {
-                let matrix = match get_matrix_from_inst(py, first_inst) {
+                let matrix = match get_matrix_from_inst(first_inst) {
                     Ok(mat) => mat,
                     Err(_) => continue,
                 };
@@ -272,7 +272,7 @@ pub(crate) fn consolidate_blocks(
                     already_in_block = true;
                 }
                 let gate = dag.dag()[*node].unwrap_operation();
-                let operator = match get_matrix_from_inst(py, gate) {
+                let operator = match get_matrix_from_inst(gate) {
                     Ok(mat) => mat,
                     Err(_) => {
                         // Set this to skip this run because we can't compute the matrix of the
diff --git a/crates/accelerate/src/convert_2q_block_matrix.rs b/crates/accelerate/src/convert_2q_block_matrix.rs
index aefc5976e82f..c79bd655dce8 100644
--- a/crates/accelerate/src/convert_2q_block_matrix.rs
+++ b/crates/accelerate/src/convert_2q_block_matrix.rs
@@ -31,7 +31,7 @@ use crate::euler_one_qubit_decomposer::matmul_1q;
 use crate::QiskitError;
 
 #[inline]
-pub fn get_matrix_from_inst(py: Python, inst: &PackedInstruction) -> PyResult<Array2<Complex64>> {
+pub fn get_matrix_from_inst(inst: &PackedInstruction) -> PyResult<Array2<Complex64>> {
     if let Some(mat) = inst.op.matrix(inst.params_view()) {
         Ok(mat)
     } else if inst.op.try_standard_gate().is_some() {
@@ -39,13 +39,15 @@ pub fn get_matrix_from_inst(py: Python, inst: &PackedInstruction) -> PyResult<Ar
             "Parameterized gates can't be consolidated",
         ))
     } else if let OperationRef::Gate(gate) = inst.op.view() {
-        Ok(QI_OPERATOR
-            .get_bound(py)
-            .call1((gate.gate.clone_ref(py),))?
-            .getattr(intern!(py, "data"))?
-            .extract::<PyReadonlyArray2<Complex64>>()?
-            .as_array()
-            .to_owned())
+        Python::with_gil(|py| {
+            Ok(QI_OPERATOR
+                .get_bound(py)
+                .call1((gate.gate.clone_ref(py),))?
+                .getattr(intern!(py, "data"))?
+                .extract::<PyReadonlyArray2<Complex64>>()?
+                .as_array()
+                .to_owned())
+        })
     } else {
         Err(QiskitError::new_err(
             "Can't compute matrix of non-unitary op",
@@ -55,7 +57,6 @@ pub fn get_matrix_from_inst(py: Python, inst: &PackedInstruction) -> PyResult<Ar
 
 /// Return the matrix Operator resulting from a block of Instructions.
 pub fn blocks_to_matrix(
-    py: Python,
     dag: &DAGCircuit,
     op_list: &[NodeIndex],
     block_index_map: [Qubit; 2],
@@ -73,7 +74,7 @@ pub fn blocks_to_matrix(
     let mut output_matrix: Option<Array2<Complex64>> = None;
     for node in op_list {
         let inst = dag.dag()[*node].unwrap_operation();
-        let op_matrix = get_matrix_from_inst(py, inst)?;
+        let op_matrix = get_matrix_from_inst(inst)?;
         match dag
             .get_qargs(inst.qubits)
             .iter()
diff --git a/crates/accelerate/src/lib.rs b/crates/accelerate/src/lib.rs
index 45cf047a6808..eab307859c4b 100644
--- a/crates/accelerate/src/lib.rs
+++ b/crates/accelerate/src/lib.rs
@@ -56,6 +56,7 @@ pub mod synthesis;
 pub mod target_transpiler;
 pub mod twirling;
 pub mod two_qubit_decompose;
+pub mod two_qubit_peephole;
 pub mod uc_gate;
 pub mod unitary_synthesis;
 pub mod utils;
diff --git a/crates/accelerate/src/two_qubit_decompose.rs b/crates/accelerate/src/two_qubit_decompose.rs
index 48ba1fd0ad59..f24d8940be8c 100644
--- a/crates/accelerate/src/two_qubit_decompose.rs
+++ b/crates/accelerate/src/two_qubit_decompose.rs
@@ -1243,9 +1243,9 @@ type TwoQubitSequenceVec = Vec<(Option<StandardGate>, SmallVec<[f64; 3]>, SmallV
 #[derive(Clone, Debug)]
 #[pyclass(sequence)]
 pub struct TwoQubitGateSequence {
-    gates: TwoQubitSequenceVec,
+    pub gates: TwoQubitSequenceVec,
     #[pyo3(get)]
-    global_phase: f64,
+    pub global_phase: f64,
 }
 
 impl TwoQubitGateSequence {
@@ -1708,7 +1708,7 @@ impl TwoQubitBasisDecomposer {
         gate: String,
         gate_matrix: ArrayView2<Complex64>,
         basis_fidelity: f64,
-        euler_basis: &str,
+        euler_basis: EulerBasis,
         pulse_optimize: Option<bool>,
     ) -> PyResult<Self> {
         let ipz: ArrayView2<Complex64> = aview2(&IPZ);
@@ -1816,7 +1816,7 @@ impl TwoQubitBasisDecomposer {
         Ok(TwoQubitBasisDecomposer {
             gate,
             basis_fidelity,
-            euler_basis: EulerBasis::__new__(euler_basis)?,
+            euler_basis,
             pulse_optimize,
             basis_decomposer,
             super_controlled,
@@ -1985,7 +1985,7 @@ impl TwoQubitBasisDecomposer {
             gate,
             gate_matrix.as_array(),
             basis_fidelity,
-            euler_basis,
+            EulerBasis::__new__(euler_basis)?,
             pulse_optimize,
         )
     }
@@ -2283,8 +2283,13 @@ fn two_qubit_decompose_up_to_diagonal(
     let (su4, phase) = u4_to_su4(mat_arr);
     let mut real_map = real_trace_transform(su4.view());
     let mapped_su4 = real_map.dot(&su4.view());
-    let decomp =
-        TwoQubitBasisDecomposer::new_inner("cx".to_string(), aview2(&CX_GATE), 1.0, "U", None)?;
+    let decomp = TwoQubitBasisDecomposer::new_inner(
+        "cx".to_string(),
+        aview2(&CX_GATE),
+        1.0,
+        EulerBasis::__new__("U")?,
+        None,
+    )?;
 
     let circ_seq = decomp.call_inner(mapped_su4.view(), None, true, None)?;
     let circ = CircuitData::from_standard_gates(
diff --git a/crates/accelerate/src/two_qubit_peephole.rs b/crates/accelerate/src/two_qubit_peephole.rs
new file mode 100644
index 000000000000..e4c282743741
--- /dev/null
+++ b/crates/accelerate/src/two_qubit_peephole.rs
@@ -0,0 +1,408 @@
+// This code is part of Qiskit.
+//
+// (C) Copyright IBM 2024
+//
+// This code is licensed under the Apache License, Version 2.0. You may
+// obtain a copy of this license in the LICENSE.txt file in the root directory
+// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+//
+// Any modifications or derivative works of this code must retain this
+// copyright notice, and modified files need to carry a notice indicating
+// that they have been altered from the originals.
+
+use std::cmp::Ordering;
+use std::sync::Mutex;
+
+use hashbrown::{HashMap, HashSet};
+use pyo3::prelude::*;
+use rayon::prelude::*;
+use rustworkx_core::petgraph::stable_graph::NodeIndex;
+use smallvec::{smallvec, SmallVec};
+
+use qiskit_circuit::circuit_instruction::ExtraInstructionAttributes;
+use qiskit_circuit::dag_circuit::{DAGCircuit, NodeType};
+use qiskit_circuit::operations::{Operation, OperationRef, Param, StandardGate};
+use qiskit_circuit::packed_instruction::PackedOperation;
+use qiskit_circuit::Qubit;
+
+use crate::convert_2q_block_matrix::blocks_to_matrix;
+use crate::euler_one_qubit_decomposer::{
+    EulerBasis, EulerBasisSet, EULER_BASES, EULER_BASIS_NAMES,
+};
+use crate::nlayout::PhysicalQubit;
+use crate::target_transpiler::Target;
+use crate::two_qubit_decompose::{TwoQubitBasisDecomposer, TwoQubitGateSequence};
+
+fn get_decomposers_from_target(
+    target: &Target,
+    qubits: &[Qubit],
+    fidelity: f64,
+) -> PyResult<Vec<TwoQubitBasisDecomposer>> {
+    let physical_qubits = smallvec![PhysicalQubit(qubits[0].0), PhysicalQubit(qubits[1].0)];
+    let gate_names = match target.operation_names_for_qargs(Some(&physical_qubits)) {
+        Ok(names) => names,
+        Err(_) => {
+            let reverse_qubits = physical_qubits.iter().rev().copied().collect();
+            target
+                .operation_names_for_qargs(Some(&reverse_qubits))
+                .unwrap()
+        }
+    };
+
+    let available_kak_gate: Vec<(&str, &PackedOperation, &[Param])> = gate_names
+        .iter()
+        .filter_map(|name| match target.operation_from_name(name) {
+            Ok(raw_op) => match raw_op.operation.view() {
+                OperationRef::Standard(_) | OperationRef::Gate(_) => {
+                    Some((*name, &raw_op.operation, raw_op.params.as_slice()))
+                }
+                _ => None,
+            },
+            Err(_) => None,
+        })
+        .collect();
+
+    let single_qubit_basis_list =
+        target.operation_names_for_qargs(Some(&smallvec![physical_qubits[0]]));
+    let mut target_basis_set = EulerBasisSet::new();
+    match single_qubit_basis_list {
+        Ok(basis_list) => {
+            EULER_BASES
+                .iter()
+                .enumerate()
+                .filter_map(|(idx, gates)| {
+                    if !gates.iter().all(|gate| basis_list.contains(gate)) {
+                        return None;
+                    }
+                    let basis = EULER_BASIS_NAMES[idx];
+                    Some(basis)
+                })
+                .for_each(|basis| target_basis_set.add_basis(basis));
+        }
+        Err(_) => target_basis_set.support_all(),
+    }
+    if target_basis_set.basis_supported(EulerBasis::U3)
+        && target_basis_set.basis_supported(EulerBasis::U321)
+    {
+        target_basis_set.remove(EulerBasis::U3);
+    }
+    if target_basis_set.basis_supported(EulerBasis::ZSX)
+        && target_basis_set.basis_supported(EulerBasis::ZSXX)
+    {
+        target_basis_set.remove(EulerBasis::ZSX);
+    }
+
+    let euler_bases: Vec<EulerBasis> = target_basis_set.get_bases().collect();
+
+    available_kak_gate
+        .iter()
+        .filter_map(|(two_qubit_name, two_qubit_gate, params)| {
+            let matrix = two_qubit_gate.matrix(params);
+            matrix.map(|matrix| {
+                euler_bases.iter().map(move |euler_basis| {
+                    TwoQubitBasisDecomposer::new_inner(
+                        two_qubit_name.to_string(),
+                        matrix.view(),
+                        fidelity,
+                        *euler_basis,
+                        None,
+                    )
+                })
+            })
+        })
+        .flatten()
+        .collect()
+}
+
+#[inline]
+fn score_sequence<'a>(
+    target: &'a Target,
+    kak_gate_name: &str,
+    sequence: impl Iterator<Item = (Option<StandardGate>, SmallVec<[Qubit; 2]>)> + 'a,
+) -> f64 {
+    1. - sequence
+        .map(|(gate, local_qubits)| {
+            let qubits = local_qubits
+                .iter()
+                .map(|qubit| PhysicalQubit(qubit.0))
+                .collect::<Vec<_>>();
+            let name = match gate.as_ref() {
+                Some(g) => g.name(),
+                None => kak_gate_name,
+            };
+            1. - target.get_error(name, qubits.as_slice()).unwrap_or(0.)
+        })
+        .product::<f64>()
+}
+
+type MappingIterItem = Option<((TwoQubitGateSequence, String), [Qubit; 2])>;
+
+/// This transpiler pass can only run in a context where we've translated the circuit gates (or
+/// where we know all gates have a matrix). If any gate identified in the run fails to have a
+/// matrix defined (either in rust or python) it will be skipped
+#[pyfunction]
+pub(crate) fn two_qubit_unitary_peephole_optimize(
+    py: Python,
+    dag: &DAGCircuit,
+    target: &Target,
+    fidelity: f64,
+) -> PyResult<DAGCircuit> {
+    let runs: Vec<Vec<NodeIndex>> = dag.collect_2q_runs().unwrap();
+    let node_mapping: HashMap<NodeIndex, usize> =
+        HashMap::with_capacity(runs.iter().map(|run| run.len()).sum());
+    let locked_node_mapping = Mutex::new(node_mapping);
+
+    // Build a vec of all the best synthesized two qubit gate sequences from the collected runs.
+    // This is done in parallel
+    let run_mapping: PyResult<Vec<MappingIterItem>> = runs
+        .par_iter()
+        .enumerate()
+        .map(|(run_index, node_indices)| {
+            let block_qubit_map = node_indices
+                .iter()
+                .find_map(|node_index| {
+                    let inst = dag.dag()[*node_index].unwrap_operation();
+                    let qubits = dag.get_qargs(inst.qubits);
+                    if qubits.len() == 2 {
+                        if qubits[0] > qubits[1] {
+                            Some([qubits[1], qubits[0]])
+                        } else {
+                            Some([qubits[0], qubits[1]])
+                        }
+                    } else {
+                        None
+                    }
+                })
+                .unwrap();
+            let matrix = blocks_to_matrix(dag, node_indices, block_qubit_map)?;
+            let decomposers = get_decomposers_from_target(target, &block_qubit_map, fidelity)?;
+            let mut decomposer_scores: Vec<Option<f64>> = vec![None; decomposers.len()];
+
+            let order_sequence =
+                |(index_a, sequence_a): &(usize, (TwoQubitGateSequence, String)),
+                 (index_b, sequence_b): &(usize, (TwoQubitGateSequence, String))| {
+                    let score_a = match decomposer_scores[*index_a] {
+                        Some(score) => score,
+                        None => {
+                            let score: f64 =
+                                score_sequence(
+                                    target,
+                                    sequence_a.1.as_str(),
+                                    sequence_a.0.gates.iter().map(
+                                        |(gate, _params, local_qubits)| {
+                                            let qubits = local_qubits
+                                                .iter()
+                                                .map(|qubit| block_qubit_map[*qubit as usize])
+                                                .collect();
+                                            (*gate, qubits)
+                                        },
+                                    ),
+                                );
+                            decomposer_scores[*index_a] = Some(score);
+                            score
+                        }
+                    };
+
+                    let score_b = match decomposer_scores[*index_b] {
+                        Some(score) => score,
+                        None => {
+                            let score: f64 =
+                                score_sequence(
+                                    target,
+                                    sequence_b.1.as_str(),
+                                    sequence_b.0.gates.iter().map(
+                                        |(gate, _params, local_qubits)| {
+                                            let qubits = local_qubits
+                                                .iter()
+                                                .map(|qubit| block_qubit_map[*qubit as usize])
+                                                .collect();
+                                            (*gate, qubits)
+                                        },
+                                    ),
+                                );
+                            decomposer_scores[*index_b] = Some(score);
+                            score
+                        }
+                    };
+                    score_a.partial_cmp(&score_b).unwrap_or(Ordering::Equal)
+                };
+
+            let sequence = decomposers
+                .iter()
+                .map(|decomposer| {
+                    (
+                        decomposer
+                            .call_inner(matrix.view(), None, true, None)
+                            .unwrap(),
+                        decomposer.gate_name().to_string(),
+                    )
+                })
+                .enumerate()
+                .min_by(order_sequence)
+                .unwrap()
+                .1;
+            let original_score = 1.
+                - node_indices
+                    .iter()
+                    .map(|node_index| {
+                        let NodeType::Operation(ref inst) = dag.dag()[*node_index] else {
+                            unreachable!("All run nodes will be ops")
+                        };
+                        let qubits = dag
+                            .get_qargs(inst.qubits)
+                            .iter()
+                            .map(|qubit| PhysicalQubit(qubit.0))
+                            .collect::<Vec<_>>();
+                        let name = inst.op.name();
+                        1. - target.get_error(name, qubits.as_slice()).unwrap_or(0.)
+                    })
+                    .product::<f64>();
+            let new_score = score_sequence(
+                target,
+                sequence.1.as_str(),
+                sequence
+                    .0
+                    .gates
+                    .iter()
+                    .map(|(gate, _params, local_qubits)| {
+                        let qubits = local_qubits
+                            .iter()
+                            .map(|qubit| block_qubit_map[*qubit as usize])
+                            .collect();
+                        (*gate, qubits)
+                    }),
+            );
+
+            if new_score > original_score
+                || (new_score == original_score
+                    && sequence
+                        .0
+                        .gates
+                        .iter()
+                        .filter(|(_, __, qubits)| qubits.len() == 2)
+                        .count()
+                        >= node_indices
+                            .iter()
+                            .filter(|node_index| {
+                                let NodeType::Operation(ref inst) = dag.dag()[**node_index] else {
+                                    unreachable!("All run nodes will be ops")
+                                };
+                                let qubits = dag.get_qargs(inst.qubits);
+                                qubits.len() == 2
+                            })
+                            .count())
+            {
+                return Ok(None);
+            }
+            // This is done at the end of the map in some attempt to minimize
+            // lock contention. If this were serial code it'd make more sense
+            // to do this as part of the iteration building the
+            let mut node_mapping = locked_node_mapping.lock().unwrap();
+            for node in node_indices {
+                node_mapping.insert(*node, run_index);
+            }
+            Ok(Some((sequence, block_qubit_map)))
+        })
+        .collect();
+
+    let run_mapping = run_mapping?;
+    // After we've computed all the sequences to execute now serially build up a new dag.
+    let mut processed_runs: HashSet<usize> = HashSet::with_capacity(run_mapping.len());
+    let mut out_dag = dag.copy_empty_like(py, "alike")?;
+    let node_mapping = locked_node_mapping.into_inner().unwrap();
+    for node in dag.topological_op_nodes()? {
+        match node_mapping.get(&node) {
+            Some(run_index) => {
+                if processed_runs.contains(run_index) {
+                    continue;
+                }
+                if run_mapping[*run_index].is_none() {
+                    let NodeType::Operation(ref instr) = dag.dag()[node] else {
+                        unreachable!("Must be an op node")
+                    };
+                    out_dag.push_back(py, instr.clone())?;
+                    continue;
+                }
+                let (sequence, qubit_map) = &run_mapping[*run_index].as_ref().unwrap();
+                for (gate, params, local_qubits) in &sequence.0.gates {
+                    let qubits: Vec<Qubit> = local_qubits
+                        .iter()
+                        .map(|index| qubit_map[*index as usize])
+                        .collect();
+                    let out_params = if params.is_empty() {
+                        None
+                    } else {
+                        Some(params.iter().map(|val| Param::Float(*val)).collect())
+                    };
+                    match gate {
+                        Some(gate) => {
+                            #[cfg(feature = "cache_pygates")]
+                            {
+                                out_dag.apply_operation_back(
+                                    py,
+                                    PackedOperation::from_standard(*gate),
+                                    qubits.as_slice(),
+                                    &[],
+                                    out_params,
+                                    ExtraInstructionAttributes::default(),
+                                    None,
+                                )
+                            }
+                            #[cfg(not(feature = "cache_pygates"))]
+                            {
+                                out_dag.apply_operation_back(
+                                    py,
+                                    PackedOperation::from_standard(*gate),
+                                    qubits.as_slice(),
+                                    &[],
+                                    out_params,
+                                    ExtraInstructionAttributes::default(),
+                                )
+                            }
+                        }
+                        None => {
+                            let gate = target.operation_from_name(sequence.1.as_str()).unwrap();
+                            #[cfg(feature = "cache_pygates")]
+                            {
+                                out_dag.apply_operation_back(
+                                    py,
+                                    gate.operation.clone(),
+                                    qubits.as_slice(),
+                                    &[],
+                                    out_params,
+                                    ExtraInstructionAttributes::default(),
+                                    None,
+                                )
+                            }
+                            #[cfg(not(feature = "cache_pygates"))]
+                            {
+                                out_dag.apply_operation_back(
+                                    py,
+                                    gate.operation.clone(),
+                                    qubits.as_slice(),
+                                    &[],
+                                    out_params,
+                                    ExtraInstructionAttributes::default(),
+                                )
+                            }
+                        }
+                    }?;
+                }
+                out_dag.add_global_phase(py, &Param::Float(sequence.0.global_phase))?;
+                processed_runs.insert(*run_index);
+            }
+            None => {
+                let NodeType::Operation(ref instr) = dag.dag()[node] else {
+                    unreachable!("Must be an op node")
+                };
+                out_dag.push_back(py, instr.clone())?;
+            }
+        }
+    }
+    Ok(out_dag)
+}
+
+pub fn two_qubit_peephole_mod(m: &Bound<PyModule>) -> PyResult<()> {
+    m.add_wrapped(wrap_pyfunction!(two_qubit_unitary_peephole_optimize))?;
+    Ok(())
+}
diff --git a/crates/accelerate/src/unitary_synthesis.rs b/crates/accelerate/src/unitary_synthesis.rs
index 62f41c78084c..338e9481f05d 100644
--- a/crates/accelerate/src/unitary_synthesis.rs
+++ b/crates/accelerate/src/unitary_synthesis.rs
@@ -666,6 +666,7 @@ fn get_2q_decomposers_from_target(
             if let Some(approx_degree) = approximation_degree {
                 basis_2q_fidelity *= approx_degree;
             }
+            let basis_1q = EulerBasis::__new__(basis_1q)?;
             let decomposer = TwoQubitBasisDecomposer::new_inner(
                 gate.operation.name().to_string(),
                 gate.operation.matrix(&gate.params).unwrap().view(),
@@ -761,7 +762,7 @@ fn get_2q_decomposers_from_target(
                         pi_2_basis.to_string(),
                         StandardGate::CXGate.matrix(&[]).unwrap().view(),
                         fidelity,
-                        basis_1q,
+                        EulerBasis::__new__(basis_1q)?,
                         Some(true),
                     )?)
                 } else {
diff --git a/crates/pyext/src/lib.rs b/crates/pyext/src/lib.rs
index d8e59e04e51e..890cf486df88 100644
--- a/crates/pyext/src/lib.rs
+++ b/crates/pyext/src/lib.rs
@@ -28,6 +28,11 @@ where
 #[rustfmt::skip]
 #[pymodule]
 fn _accelerate(m: &Bound<PyModule>) -> PyResult<()> {
+    add_submodule(
+        m,
+        ::qiskit_accelerate::two_qubit_peephole::two_qubit_peephole_mod,
+        "two_qubit_peephole",
+    )?;
     add_submodule(m, ::qiskit_accelerate::barrier_before_final_measurement::barrier_before_final_measurements_mod, "barrier_before_final_measurement")?;
     add_submodule(m, ::qiskit_accelerate::basis::basis, "basis")?;
     add_submodule(m, ::qiskit_accelerate::check_map::check_map_mod, "check_map")?;
diff --git a/qiskit/__init__.py b/qiskit/__init__.py
index af543bea80c7..9105e39fed91 100644
--- a/qiskit/__init__.py
+++ b/qiskit/__init__.py
@@ -107,6 +107,7 @@
 sys.modules["qiskit._accelerate.inverse_cancellation"] = _accelerate.inverse_cancellation
 sys.modules["qiskit._accelerate.check_map"] = _accelerate.check_map
 sys.modules["qiskit._accelerate.filter_op_nodes"] = _accelerate.filter_op_nodes
+sys.modules["qiskit._accelerate.two_qubit_peephole"] = _accelerate.two_qubit_peephole
 sys.modules["qiskit._accelerate.twirling"] = _accelerate.twirling
 sys.modules["qiskit._accelerate.high_level_synthesis"] = _accelerate.high_level_synthesis
 sys.modules["qiskit._accelerate.remove_identity_equiv"] = _accelerate.remove_identity_equiv
diff --git a/qiskit/transpiler/passes/optimization/two_qubit_peephole.py b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
new file mode 100644
index 000000000000..60536d09aa71
--- /dev/null
+++ b/qiskit/transpiler/passes/optimization/two_qubit_peephole.py
@@ -0,0 +1,59 @@
+# This code is part of Qiskit.
+#
+# (C) Copyright IBM 2017, 2024.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+
+"""Splits each two-qubit gate in the `dag` into two single-qubit gates, if possible without error."""
+
+from __future__ import annotations
+
+from qiskit.transpiler.basepasses import TransformationPass
+from qiskit.transpiler.passmanager import PassManager
+from qiskit.transpiler.passes.optimization import Collect2qBlocks, ConsolidateBlocks
+from qiskit.transpiler.passes.synthesis import UnitarySynthesis
+from qiskit.transpiler.target import Target
+from qiskit.dagcircuit.dagcircuit import DAGCircuit
+from qiskit._accelerate.two_qubit_peephole import two_qubit_unitary_peephole_optimize
+
+
+class TwoQubitPeepholeOptimization(TransformationPass):
+    """Unified two qubit unitary peephole optimization"""
+
+    def __init__(
+        self,
+        target: Target,
+        approximation_degree: float | None = 1.0,
+        method: str = "default",
+        plugin_config: dict = None,
+    ):
+        super().__init__()
+        self._target = target
+        self._approximation_degree = approximation_degree
+        self._pm = None
+        if method != "default":
+            self._pm = PassManager(
+                [
+                    Collect2qBlocks(),
+                    ConsolidateBlocks(
+                        target=self._target, approximation_degree=self._approximation_degree
+                    ),
+                    UnitarySynthesis(
+                        target=target,
+                        approximation_degree=approximation_degree,
+                        method=method,
+                        plugin_config=plugin_config,
+                    ),
+                ]
+            )
+
+    def run(self, dag: DAGCircuit) -> DAGCircuit:
+        if self._pm is not None:
+            return self._pm.run(dag)
+        return two_qubit_unitary_peephole_optimize(dag, self._target, self._approximation_degree)