diff --git a/WORKSPACE b/WORKSPACE
index 6a29d598b..e28452a8d 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -24,19 +24,11 @@ cc_library(
         ],
 )
 
-# http_archive(
-#     name = "qsim",
-#     sha256 = "b9c1eba09a885a938b5e73dfc2e02f5231cf3b01d899415caa24769346a731d5",
-#     strip_prefix = "qsim-0.13.3",
-#     urls = ["https://github.com/quantumlib/qsim/archive/refs/tags/v0.13.3.zip"],
-# )
-
-# TODO: After merging this patch later into qsim mainstream, remove this and uncomment the above.
 http_archive(
     name = "qsim",
-    sha256 = "",
-    strip_prefix = "qsim-0.15.0-dev20230327_v3",
-    urls = ["https://github.com/jaeyoo/qsim/archive/refs/tags/v0.15.0+dev20230327_v3.tar.gz"],
+    sha256 = "f7f410a07543a51b254f7a5810b5153e196a4c7b4ec89dc8faf86f9c77eec97b",
+    strip_prefix = "qsim-0.16.1",
+    urls = ["https://github.com/quantumlib/qsim/archive/refs/tags/v0.16.1.zip"],
 )
 
 http_archive(
@@ -81,21 +73,6 @@ bind(
     actual = "@six_archive//:six",
 )
 
-new_local_repository(
-    name = "cuquantum_libs",
-    path = "/usr/local/google/home/jaeyoo/workspace/cuquantum-linux-x86_64-22.11.0.13-archive",
-    build_file_content = """
-cc_library(
-    name = "custatevec_headers",
-    srcs = ["include/custatevec.h"],
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "custatevec",
-    srcs = ["lib/libcustatevec.so"],
-    visibility = ["//visibility:public"],
-)
-""",
-)
+load("//third_party/cuquantum:cuquantum_configure.bzl", "cuquantum_configure")
 
+cuquantum_configure(name = "local_config_cuquantum")
diff --git a/benchmarks/scripts/BUILD b/benchmarks/scripts/BUILD
index 095a10d74..8edf4e49c 100644
--- a/benchmarks/scripts/BUILD
+++ b/benchmarks/scripts/BUILD
@@ -1,3 +1,4 @@
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])
@@ -27,6 +28,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "benchmark_cuquantum_ops",
+    srcs = ["benchmark_cuquantum_ops.py"],
+    python_version = "PY3",
+    deps = [
+        "//tensorflow_quantum/core/ops:tfq_simulate_ops_cuquantum_py",
+        "//tensorflow_quantum/core/ops:tfq_simulate_ops_py",
+        "//tensorflow_quantum/core/serialize:serializer",
+        "@local_config_tf//:test_log_pb2",
+        "//tensorflow_quantum/python:util",
+    ],
+)
 py_test(
     name = "benchmark_op_gradients",
     srcs = ["benchmark_op_gradients.py"],
diff --git a/benchmarks/scripts/benchmark_cuquantum_ops.py b/benchmarks/scripts/benchmark_cuquantum_ops.py
new file mode 100644
index 000000000..d141d4999
--- /dev/null
+++ b/benchmarks/scripts/benchmark_cuquantum_ops.py
@@ -0,0 +1,580 @@
+import os
+import time
+import numpy as np
+from absl.testing import parameterized
+import tensorflow as tf
+import cirq
+
+from tensorflow_quantum.core.ops import tfq_simulate_ops
+from tensorflow_quantum.core.ops import tfq_simulate_ops_cuquantum
+from tensorflow_quantum.python import util
+import flags
+from dataclasses import dataclass
+
+SRC = os.path.dirname(os.path.realpath(__file__))
+os.environ['TEST_REPORT_FILE_PREFIX'] = os.path.join(SRC, 'reports/')
+
+
+@dataclass(frozen=True)
+class BenchmarkParams:
+    """Immutable set of parameters describing one benchmark configuration."""
+    n_qubits: int  # qubit count, used as cirq.GridQubit.rect(1, n_qubits)
+    n_moments: int  # circuit depth, passed as `n_moments` to the generator
+    batch_size: int  # number of random circuits generated per batch
+    n_iters: int = 100  # number of timed calls the median is taken over
+
+
+_test_params_1 = BenchmarkParams(n_qubits=20, n_moments=15, batch_size=5)
+_test_params_2 = BenchmarkParams(n_qubits=21, n_moments=25,
+                                 batch_size=5)  # one more qubit, more depth
+_test_params_3 = BenchmarkParams(n_qubits=22,
+                                 n_moments=15,
+                                 batch_size=5,
+                                 n_iters=10)
+
+TEST_PARAMS_EXPECTATION = [
+    _test_params_1,
+    _test_params_2,  # deeper circuits; comment out to skip the depth run
+]
+TEST_PARAMS_SAMPLED_EXPECTATION = [
+    _test_params_1,
+    _test_params_2,  # deeper circuits; comment out to skip the depth run
+]
+TEST_PARAMS_SAMPLES = [
+    _test_params_1,
+    _test_params_2,  # deeper circuits; comment out to skip the depth run
+]
+TEST_PARAMS_STATE = [
+    _test_params_3,
+]
+
+
+def _measure_median_runtime(fn, tag, num_samples=10, result_avg=False):
+    """Measures the median wall-clock runtime of repeated calls to `fn`.
+
+    Args:
+        fn: Zero-argument callable to time.
+        tag: Label printed in front of the measured time.
+        num_samples: Number of timed calls; must be at least 1.
+        result_avg: If True, return the average over axis 0 of all results
+            instead of the result of the last call.
+
+    Returns:
+        A tuple `(median_time, result)`: the median duration in seconds and
+        either the last result or, if `result_avg`, the averaged results.
+
+    Raises:
+        ValueError: If `num_samples` is smaller than 1.
+    """
+    if num_samples < 1:
+        raise ValueError("num_samples must be at least 1.")
+    durations = []
+    results = []
+    for _ in range(num_samples):
+        # perf_counter is monotonic and has the highest available resolution.
+        begin_time = time.perf_counter()
+        result = fn()
+        durations.append(time.perf_counter() - begin_time)
+        if result_avg:
+            results.append(result)
+    median_time = np.median(durations)
+    print(f"\n\t{tag} time: {median_time}\n")
+    return median_time, (np.average(results, axis=0) if result_avg else result)
+
+
+class RandomCircuitBenchmark(tf.test.Benchmark):
+    """Benchmark cuquantum simulations against cpu."""
+
+    def __init__(self, params: BenchmarkParams):
+        """Stores the benchmark parameters used by the benchmark methods."""
+        super(RandomCircuitBenchmark, self).__init__()
+        # Read by the benchmark_* methods, directly or as their default.
+        self.params = params
+
+    def benchmark_expectation_cpu(self):
+        """Benchmark the CPU expectation op and report its median runtime."""
+
+        n_qubits = self.params.n_qubits
+        batch_size = self.params.batch_size
+        circuit_depth = self.params.n_moments
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size, n_moments=circuit_depth)
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        pauli_sums_tensor = util.convert_to_tensor([[x] for x in pauli_sums])
+
+        cpu_avg_time, _ = _measure_median_runtime(
+            lambda: tfq_simulate_ops.tfq_simulate_expectation(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), pauli_sums_tensor),
+            "Expectation CPU",
+            num_samples=self.params.n_iters,
+        )
+
+        extras = {
+            'n_qubits': self.params.n_qubits,
+            'batch_size': self.params.batch_size,
+            'num_samples': self.params.n_iters,
+            # NOTE: despite its name, cpu_avg_time is a median, not a mean.
+            'median_time': cpu_avg_time,
+        }
+
+        name = "benchmark_expectation_cpu"
+        full_path = os.path.join(os.environ['TEST_REPORT_FILE_PREFIX'],
+                                 "{}.{}".format(self.__class__.__name__, name))
+        if os.path.exists(full_path):
+            os.remove(full_path)
+
+        benchmark_values = {
+            "iters": 1,
+            "wall_time": cpu_avg_time,
+            "extras": extras,
+            "name": name,
+        }
+        self.report_benchmark(**benchmark_values)
+
+        return benchmark_values
+
+    def benchmark_expectation_cuquantum(self):
+        """Benchmark the cuQuantum (GPU) expectation op."""
+
+        n_qubits = self.params.n_qubits
+        batch_size = self.params.batch_size
+        circuit_depth = self.params.n_moments
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size, n_moments=circuit_depth)
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        pauli_sums_tensor = util.convert_to_tensor([[x] for x in pauli_sums])
+
+        # Time the cuQuantum op; the returned time is a median, not a mean.
+        cuquantum_avg_time, _ = _measure_median_runtime(
+            lambda: tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), pauli_sums_tensor),
+            "Expectation cuQuantum",
+            num_samples=self.params.n_iters,
+        )
+
+        extras = {
+            'n_qubits': self.params.n_qubits,
+            'batch_size': self.params.batch_size,
+            'num_samples': self.params.n_iters,
+            'median_time': cuquantum_avg_time,
+        }
+
+        name = "benchmark_expectation_cuquantum"
+        full_path = os.path.join(os.environ['TEST_REPORT_FILE_PREFIX'],
+                                 "{}.{}".format(self.__class__.__name__, name))
+        if os.path.exists(full_path):
+            os.remove(full_path)
+
+        benchmark_values = {
+            "iters": 1,
+            "wall_time": cuquantum_avg_time,
+            "extras": extras,
+            "name": name,
+        }
+        self.report_benchmark(**benchmark_values)
+
+        return benchmark_values
+
+    def benchmark_sampled_expectation_cpu(self, params=None):
+        """Benchmark the CPU sampled expectation op; report its median time."""
+        params = params if params else self.params
+        n_qubits = params.n_qubits
+        batch_size = params.batch_size
+        circuit_depth = params.n_moments
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size, n_moments=circuit_depth)
+        n_samples = [[10000]] * batch_size
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        pauli_sums_tensor = util.convert_to_tensor([[x] for x in pauli_sums])
+
+        cpu_avg_time, _ = _measure_median_runtime(
+            lambda: tfq_simulate_ops.tfq_simulate_sampled_expectation(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), pauli_sums_tensor,
+                n_samples),
+            "SampledExpectation CPU",
+            num_samples=params.n_iters,
+            result_avg=False,
+        )
+
+        extras = {
+            'n_qubits': params.n_qubits,
+            'batch_size': params.batch_size,
+            'num_samples': params.n_iters,
+            'median_time': cpu_avg_time,
+        }
+
+        name = "benchmark_sampled_expectation_cpu"
+        full_path = os.path.join(os.environ['TEST_REPORT_FILE_PREFIX'],
+                                 "{}.{}".format(self.__class__.__name__, name))
+        if os.path.exists(full_path):
+            os.remove(full_path)
+
+        benchmark_values = {
+            "iters": 1,
+            "wall_time": cpu_avg_time,
+            "extras": extras,
+            "name": name,
+        }
+        self.report_benchmark(**benchmark_values)
+
+        return benchmark_values
+
+    def benchmark_sampled_expectation_cuquantum(self, params=None):
+        """Benchmark the cuQuantum (GPU) sampled expectation op."""
+        params = params if params else self.params
+        n_qubits = params.n_qubits
+        batch_size = params.batch_size
+        circuit_depth = params.n_moments
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size, n_moments=circuit_depth)
+        n_samples = [[10000]] * batch_size
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        pauli_sums_tensor = util.convert_to_tensor([[x] for x in pauli_sums])
+
+        # Only the timing is used here; the op result is discarded.
+        cuquantum_avg_time, _ = _measure_median_runtime(
+            lambda: tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), pauli_sums_tensor,
+                n_samples),
+            "SampledExpectation cuQuantum",
+            num_samples=params.n_iters,
+            result_avg=False,
+        )
+
+        extras = {
+            'n_qubits': params.n_qubits,
+            'batch_size': params.batch_size,
+            'num_samples': params.n_iters,
+            'median_time': cuquantum_avg_time,
+        }
+
+        name = "benchmark_sampled_expectation_cuquantum"
+        full_path = os.path.join(os.environ['TEST_REPORT_FILE_PREFIX'],
+                                 "{}.{}".format(self.__class__.__name__, name))
+        if os.path.exists(full_path):
+            os.remove(full_path)
+
+        benchmark_values = {
+            "iters": 1,
+            "wall_time": cuquantum_avg_time,
+            "extras": extras,
+            "name": name,
+        }
+        self.report_benchmark(**benchmark_values)
+
+        return benchmark_values
+
+    def benchmark_samples_cpu(self, params=None):
+        """Benchmark the CPU samples op and report its median runtime."""
+        params = params if params else self.params
+        n_qubits = params.n_qubits
+        batch_size = params.batch_size
+        circuit_depth = params.n_moments
+        symbol_names = ['alpha']
+        n_samples = [100]
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size, n_moments=circuit_depth)
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        cpu_avg_time, _ = _measure_median_runtime(
+            lambda: tfq_simulate_ops.tfq_simulate_samples(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), n_samples),
+            "Samples CPU",
+            num_samples=params.n_iters,
+            result_avg=False,
+        )
+
+        extras = {
+            'n_qubits': params.n_qubits,
+            'batch_size': params.batch_size,
+            'num_samples': params.n_iters,
+            'median_time': cpu_avg_time,
+        }
+
+        name = "benchmark_simulate_samples_cpu"
+        full_path = os.path.join(os.environ['TEST_REPORT_FILE_PREFIX'],
+                                 "{}.{}".format(self.__class__.__name__, name))
+        if os.path.exists(full_path):
+            os.remove(full_path)
+
+        benchmark_values = {
+            "iters": 1,
+            "wall_time": cpu_avg_time,
+            "extras": extras,
+            "name": name,
+        }
+        self.report_benchmark(**benchmark_values)
+
+        return benchmark_values
+
+    def benchmark_samples_cuquantum(self, params=None):
+        """Benchmark the cuQuantum (GPU) samples op."""
+        params = params if params else self.params
+        n_qubits = params.n_qubits
+        batch_size = params.batch_size
+        circuit_depth = params.n_moments
+        symbol_names = ['alpha']
+        n_samples = [100]
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size, n_moments=circuit_depth)
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        cuquantum_avg_time, _ = _measure_median_runtime(
+            lambda: tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), n_samples),
+            "Samples cuQuantum",
+            num_samples=params.n_iters,
+            result_avg=False,
+        )
+
+        extras = {
+            'n_qubits': params.n_qubits,
+            'batch_size': params.batch_size,
+            'num_samples': params.n_iters,
+            'median_time': cuquantum_avg_time,
+        }
+
+        name = "benchmark_simulate_samples_cuquantum"
+        full_path = os.path.join(os.environ['TEST_REPORT_FILE_PREFIX'],
+                                 "{}.{}".format(self.__class__.__name__, name))
+        if os.path.exists(full_path):
+            os.remove(full_path)
+
+        benchmark_values = {
+            "iters": 1,
+            "wall_time": cuquantum_avg_time,
+            "extras": extras,
+            "name": name,
+        }
+        self.report_benchmark(**benchmark_values)
+
+        return benchmark_values
+
+    def benchmark_state_cpu(self, params=None):
+        """Benchmark the CPU state op and report its median runtime."""
+        params = params if params else self.params
+        n_qubits = params.n_qubits
+        batch_size = params.batch_size
+        circuit_depth = params.n_moments
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size, n_moments=circuit_depth)
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        cpu_avg_time, _ = _measure_median_runtime(
+            lambda: tfq_simulate_ops.tfq_simulate_state(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64)),
+            "State CPU",
+            num_samples=params.n_iters,
+        )
+
+        extras = {
+            'n_qubits': params.n_qubits,
+            'batch_size': params.batch_size,
+            'num_samples': params.n_iters,
+            'median_time': cpu_avg_time,
+        }
+
+        name = "benchmark_simulate_state_cpu"
+        full_path = os.path.join(os.environ['TEST_REPORT_FILE_PREFIX'],
+                                 "{}.{}".format(self.__class__.__name__, name))
+        if os.path.exists(full_path):
+            os.remove(full_path)
+
+        benchmark_values = {
+            "iters": 1,
+            "wall_time": cpu_avg_time,
+            "extras": extras,
+            "name": name,
+        }
+        self.report_benchmark(**benchmark_values)
+
+        return benchmark_values
+
+    def benchmark_state_cuquantum(self, params=None):
+        """Benchmark the cuQuantum (GPU) state op."""
+        params = params if params else self.params
+        n_qubits = params.n_qubits
+        batch_size = params.batch_size
+        circuit_depth = params.n_moments
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size, n_moments=circuit_depth)
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        cuquantum_avg_time, _ = _measure_median_runtime(
+            lambda: tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64)),
+            "State cuQuantum",
+            num_samples=params.n_iters,
+        )
+
+        extras = {
+            'n_qubits': params.n_qubits,
+            'batch_size': params.batch_size,
+            'num_samples': params.n_iters,
+            'median_time': cuquantum_avg_time,
+        }
+
+        name = "benchmark_simulate_state_cuquantum"
+        full_path = os.path.join(os.environ['TEST_REPORT_FILE_PREFIX'],
+                                 "{}.{}".format(self.__class__.__name__, name))
+        if os.path.exists(full_path):
+            os.remove(full_path)
+
+        benchmark_values = {
+            "iters": 1,
+            "wall_time": cuquantum_avg_time,
+            "extras": extras,
+            "name": name,
+        }
+        self.report_benchmark(**benchmark_values)
+
+        return benchmark_values
+
+
+class SimulateExpectationCuquantumTest(tf.test.TestCase,
+                                       parameterized.TestCase):
+    """Checks that each cuQuantum op runs faster than its CPU counterpart."""
+
+    @parameterized.parameters(TEST_PARAMS_EXPECTATION)
+    def test_simulate_expectation_cpu_vs_cuquantum(self, params):
+        """Make sure that the cuquantum expectation op is faster."""
+        bench = RandomCircuitBenchmark(params)
+
+        benchmark_cpu = bench.benchmark_expectation_cpu()
+        benchmark_gpu = bench.benchmark_expectation_cuquantum()
+
+        cpu_median_time = benchmark_cpu['extras']['median_time']
+        gpu_median_time = benchmark_gpu['extras']['median_time']
+
+        # cuQuantum op should be faster than CPU op.
+        self.assertGreater(cpu_median_time, gpu_median_time)
+
+    @parameterized.parameters(TEST_PARAMS_SAMPLED_EXPECTATION)
+    def test_simulate_sampled_expectation_cpu_vs_cuquantum(self, params):
+        """Make sure that the cuquantum sampled expectation op is faster."""
+        bench = RandomCircuitBenchmark(params)
+
+        benchmark_cpu = bench.benchmark_sampled_expectation_cpu()
+        benchmark_gpu = bench.benchmark_sampled_expectation_cuquantum()
+
+        cpu_median_time = benchmark_cpu['extras']['median_time']
+        gpu_median_time = benchmark_gpu['extras']['median_time']
+
+        # cuQuantum op should be faster than CPU op.
+        self.assertGreater(cpu_median_time, gpu_median_time)
+
+    @parameterized.parameters(TEST_PARAMS_SAMPLES)
+    def test_simulate_samples_cpu_vs_cuquantum(self, params):
+        """Make sure that the cuquantum samples op is faster."""
+        bench = RandomCircuitBenchmark(params)
+
+        benchmark_cpu = bench.benchmark_samples_cpu()
+        benchmark_gpu = bench.benchmark_samples_cuquantum()
+
+        cpu_median_time = benchmark_cpu['extras']['median_time']
+        gpu_median_time = benchmark_gpu['extras']['median_time']
+
+        # cuQuantum op should be faster than CPU op.
+        self.assertGreater(cpu_median_time, gpu_median_time)
+
+    @parameterized.parameters(TEST_PARAMS_STATE)
+    def test_simulate_state_cpu_vs_cuquantum(self, params):
+        """Make sure that the cuquantum state op is faster."""
+        bench = RandomCircuitBenchmark(params)
+
+        benchmark_cpu = bench.benchmark_state_cpu()
+        benchmark_gpu = bench.benchmark_state_cuquantum()
+
+        cpu_median_time = benchmark_cpu['extras']['median_time']
+        gpu_median_time = benchmark_gpu['extras']['median_time']
+        # cuQuantum op should be faster than CPU op.
+        self.assertGreater(cpu_median_time, gpu_median_time)
+
+
+if __name__ == "__main__":
+    tf.test.main()
diff --git a/configure.sh b/configure.sh
index 0ca4a0ae4..c73d40bc0 100755
--- a/configure.sh
+++ b/configure.sh
@@ -20,11 +20,11 @@ function write_to_bazelrc() {
 }
 
 function write_action_env_to_bazelrc() {
-  write_to_bazelrc "build --action_env $1=\"$2\""
+  write_to_bazelrc "$1 --action_env $2=\"$3\""  # $1: command/config (e.g. build:cuda), $2: variable name, $3: value
 }
 
 function write_linkopt_dir_to_bazelrc() {
-  write_to_bazelrc "build --linkopt -Wl,-rpath,$1" >> .bazelrc
+  write_to_bazelrc "$1 --linkopt -Wl,-rpath,$2" >> .bazelrc  # $1: command/config (e.g. build:cuda), $2: rpath directory
 }
 
 
@@ -49,48 +49,81 @@ function is_ppc64le() {
 # Remove .bazelrc if it already exist
 [ -e .bazelrc ] && rm .bazelrc
 
-# Check if we are building GPU or CPU ops, default CPU
-while [[ "$TF_NEED_CUDA" == "" ]]; do
-  read -p "Do you want to build ops again TensorFlow CPU pip package?"\
-" Y or enter for CPU (tensorflow-cpu), N for GPU (tensorflow). [Y/n] " INPUT
+# Ask whether to build TFQ for CPU only (default) or with GPU support (TODO)
+while [[ "$TFQ_NEED_CUDA" == "" ]]; do
+  read -p "Do you want to build TFQ against CPU?"\
+" Y or enter for CPU, N for GPU. [Y/n] " INPUT
   case $INPUT in
-    [Yy]* ) echo "Build with CPU pip package."; TF_NEED_CUDA=0;;
-    [Nn]* ) echo "Build with GPU pip package."; TF_NEED_CUDA=1;;
-    "" ) echo "Build with CPU pip package."; TF_NEED_CUDA=0;;
+    [Yy]* ) echo "Build with CPU ops only."; TFQ_NEED_CUDA=0;;
+    [Nn]* ) echo "Build with cuQuantum support."; TFQ_NEED_CUDA=1;;
+    "" ) echo "Build with CPU ops only."; TFQ_NEED_CUDA=0;;
     * ) echo "Invalid selection: " $INPUT;;
   esac
 done
 
-while [[ "$TF_CUDA_VERSION" == "" ]]; do
-  read -p "Are you building against TensorFlow 2.11(including RCs) or newer?[Y/n] " INPUT
-  case $INPUT in
-    [Yy]* ) echo "Build against TensorFlow 2.11 or newer."; TF_CUDA_VERSION=11;;
-    [Nn]* ) echo "Build against TensorFlow <2.11."; TF_CUDA_VERSION=10.0;;
-    "" ) echo "Build against TensorFlow 2.11 or newer."; TF_CUDA_VERSION=11;;
-    * ) echo "Invalid selection: " $INPUT;;
-  esac
-done
+# Set the CUDA SDK version for TF
+if [[ "$TFQ_NEED_CUDA" == "1" ]]; then
+  _DEFAULT_CUDA_VERSION=11
+  while [[ "$TF_CUDA_VERSION" == "" ]]; do
+    read -p "Please specify the CUDA SDK major version you want to use. [Leave empty to default to CUDA $_DEFAULT_CUDA_VERSION]: " INPUT
+    case $INPUT in
+      "" ) echo "Build against CUDA $_DEFAULT_CUDA_VERSION."; TF_CUDA_VERSION=$_DEFAULT_CUDA_VERSION;;
+      # check if the input is a number
+      *[!0-9]* ) echo "Invalid selection: $INPUT";;
+      * ) echo "Build against CUDA $INPUT."; TF_CUDA_VERSION=$INPUT;;
+    esac
+  done
+fi
 
+# Require TensorFlow >= 2.11 for every build; compare (major, minor) as ints, since as a float 2.9 > 2.11.
+IS_VALID_TF_VERSION=$(python -c "import tensorflow as tf; v = tf.__version__.split('.'); print((int(v[0]), int(v[1])) >= (2, 11))")
+TF_VERSION=$(python -c "import tensorflow as tf; print(tf.__version__)")
+if [[ $IS_VALID_TF_VERSION == "False" ]]; then
+  echo "Building against TensorFlow 2.11 or newer is required."
+  echo "Please upgrade your TensorFlow version."
+  exit 1
+elif [[ $IS_VALID_TF_VERSION == "True" ]]; then
+  echo "Using TensorFlow $TF_VERSION"
+else
+  echo "Unable to determine TensorFlow version."
+  exit 1
+fi
+
+# Check if we are building cuQuantum ops on top of CUDA.
+if [[ "$TFQ_NEED_CUDA" == "1" ]]; then
+  if [[ "$CUQUANTUM_ROOT" != "" ]]; then
+    echo "  [*] cuQuantum library is detected here: CUQUANTUM_ROOT=$CUQUANTUM_ROOT."
+  else
+    # Prompt until a non-empty absolute path is entered. Dots are accepted because the
+    # cuQuantum archive directory name carries the version (e.g. ...-22.11.0.13-archive).
+    while true; do
+      read -p "Please specify the cuQuantum root directory: " INPUT
+      if [[ -z "$INPUT" ]]; then
+        echo "Input cannot be empty. Please enter a valid path."
+      elif [[ "$INPUT" =~ ^(/[A-Za-z0-9._-]+)+$ ]]; then
+        echo "Path pattern is valid: $INPUT"
+        CUQUANTUM_ROOT=$INPUT
+        break
+      else
+        echo "Invalid path pattern: $INPUT. Please enter a valid path."
+      fi
+    done
+  fi
+  write_action_env_to_bazelrc "build:cuda" "CUQUANTUM_ROOT" "${CUQUANTUM_ROOT}"
+  write_linkopt_dir_to_bazelrc "build:cuda" "${CUQUANTUM_ROOT}/lib"
+fi
 
 # Check if it's installed
 if [[ $(pip show tensorflow) == *tensorflow* ]] || [[ $(pip show tf-nightly) == *tf-nightly* ]]; then
-  echo 'Using installed tensorflow'
+  echo "Using installed tensorflow-($TF_VERSION)"
 else
-  # Uninstall CPU version if it is installed.
-  if [[ $(pip show tensorflow-cpu) == *tensorflow-cpu* ]]; then
-    echo 'Already have tensorflow non-gpu installed. Uninstalling......\n'
-    pip uninstall tensorflow
-  elif [[ $(pip show tf-nightly-cpu) == *tf-nightly-cpu* ]]; then
-    echo 'Already have tensorflow non-gpu installed. Uninstalling......\n'
-    pip uninstall tf-nightly
-  fi
-  # Install GPU version
-  echo 'Installing tensorflow .....\n'
-  pip install tensorflow
+  echo 'Installing tensorflow 2.11 .....\n'
+  pip install tensorflow==2.11.0
 fi
 
 
 
+
 TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
 TF_LFLAGS="$(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))')"
 
@@ -101,7 +134,8 @@ write_to_bazelrc "build --strategy=Genrule=standalone"
 write_to_bazelrc "build -c opt"
 write_to_bazelrc "build --cxxopt=\"-D_GLIBCXX_USE_CXX11_ABI=1\""
 write_to_bazelrc "build --cxxopt=\"-std=c++17\""
-
+write_to_bazelrc "build --cxxopt=\"-O3\""
+write_to_bazelrc "build --cxxopt=\"-march=native\""
 
 if is_windows; then
   # Use pywrap_tensorflow instead of tensorflow_framework on Windows
@@ -127,29 +161,38 @@ if is_windows; then
   SHARED_LIBRARY_NAME=${SHARED_LIBRARY_NAME//\\//}
   HEADER_DIR=${HEADER_DIR//\\//}
 fi
-write_action_env_to_bazelrc "TF_HEADER_DIR" ${HEADER_DIR}
-write_action_env_to_bazelrc "TF_SHARED_LIBRARY_DIR" ${SHARED_LIBRARY_DIR}
-write_action_env_to_bazelrc "TF_SHARED_LIBRARY_NAME" ${SHARED_LIBRARY_NAME}
-write_action_env_to_bazelrc "TF_NEED_CUDA" ${TF_NEED_CUDA}
+
+TF_NEED_CUDA=${TFQ_NEED_CUDA}  # the CUDA section below is gated on TF_NEED_CUDA
+write_action_env_to_bazelrc "build" "TF_HEADER_DIR" "${HEADER_DIR}"
+write_action_env_to_bazelrc "build" "TF_SHARED_LIBRARY_DIR" "${SHARED_LIBRARY_DIR}"
+write_action_env_to_bazelrc "build" "TF_SHARED_LIBRARY_NAME" "${SHARED_LIBRARY_NAME}"
+write_action_env_to_bazelrc "build" "TF_NEED_CUDA" "${TF_NEED_CUDA}"
 
 if ! is_windows; then
-  write_linkopt_dir_to_bazelrc ${SHARED_LIBRARY_DIR}
+  write_linkopt_dir_to_bazelrc "build" "${SHARED_LIBRARY_DIR}"  # quoted so a path with spaces stays one argument
 fi
 
 # TODO(yifeif): do not hardcode path
 if [[ "$TF_NEED_CUDA" == "1" ]]; then
-  write_to_bazelrc "build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true"
+  write_to_bazelrc "build:cuda --experimental_repo_remote_exec"
+  write_to_bazelrc "build:cuda --spawn_strategy=standalone"
+  write_to_bazelrc "build:cuda --strategy=Genrule=standalone"
+  write_to_bazelrc "build:cuda -c opt"
+  write_to_bazelrc "build:cuda --cxxopt=\"-D_GLIBCXX_USE_CXX11_ABI=1\""
+  write_to_bazelrc "build:cuda --cxxopt=\"-std=c++17\""
+  write_to_bazelrc "build:cuda --cxxopt=\"-O3\""
+  write_to_bazelrc "build:cuda --cxxopt=\"-march=native\""
   write_to_bazelrc "build:cuda --@local_config_cuda//:enable_cuda"
   write_to_bazelrc "build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain"
 
-  write_action_env_to_bazelrc "TF_CUDA_VERSION" ${TF_CUDA_VERSION}
-  write_action_env_to_bazelrc "TF_CUDNN_VERSION" "8"
+  write_action_env_to_bazelrc "build:cuda" "TF_CUDA_VERSION" ${TF_CUDA_VERSION} 
+  write_action_env_to_bazelrc "build:cuda" "TF_CUDNN_VERSION" "8"
   if is_windows; then
-    write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${TF_CUDA_VERSION}"
-    write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${TF_CUDA_VERSION}"
+    write_action_env_to_bazelrc "build:cuda" "CUDNN_INSTALL_PATH" "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${TF_CUDA_VERSION}"
+    write_action_env_to_bazelrc "build:cuda" "CUDA_TOOLKIT_PATH" "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${TF_CUDA_VERSION}"
   else
-    write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "/usr/lib/x86_64-linux-gnu"
-    write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "/usr/local/cuda"
+    write_action_env_to_bazelrc "build:cuda" "CUDNN_INSTALL_PATH" "/usr/lib/x86_64-linux-gnu"
+    write_action_env_to_bazelrc "build:cuda" "CUDA_TOOLKIT_PATH" "/usr/local/cuda"
   fi
   write_to_bazelrc "build --config=cuda"
   write_to_bazelrc "test --config=cuda"
diff --git a/docs/install.md b/docs/install.md
index 3de77ecf9..575354cd6 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -16,7 +16,7 @@ TensorFlow Quantum is supported on Python 3.7, 3.8, and 3.9 and depends directly
 
 ### Requirements
 
-* pip 19.0 or later (requires `manylinux2010` support)
+* pip 23.0 or later (requires `manylinux2014` support)
 * [TensorFlow == 2.11.0](https://www.tensorflow.org/install/pip)
 
 See the [TensorFlow install guide](https://www.tensorflow.org/install/pip) to
@@ -187,20 +187,20 @@ We use the standard [fork and pull request workflow](https://guides.github.com/a
 <!-- common_typos_enable -->
 
 
-### 6. Build the TensorFlow Quantum pip package
+### 6. Build the TensorFlow Quantum pip package for CPU
 
 Build the TensorFlow Quantum pip package and install:
 
 <!-- common_typos_disable -->
 <pre class="devsite-click-to-copy">
-  <code class="devsite-terminal">./configure.sh</code>
+  <code class="devsite-terminal">./configure.sh  # Type 'Y' for the first question.</code>
   <code class="devsite-terminal">bazel build -c opt --cxxopt="-O3" --cxxopt="-march=native" --cxxopt="-std=c++17" --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=1" release:build_pip_package</code>
   <code class="devsite-terminal">bazel-bin/release/build_pip_package /tmp/tfquantum/</code>
   <code class="devsite-terminal">python3 -m pip install /tmp/tfquantum/<var>name_of_generated_wheel</var>.whl</code>
 </pre>
 <!-- common_typos_enable -->
 
-To confirm that TensorFlow Quantum has successfully been installed, you can run the tests:
+To confirm that TensorFlow Quantum for CPU has successfully been installed, you can run the tests:
 <!-- common_typos_disable -->
 <pre class="devsite-click-to-copy">
   <code class="devsite-terminal">./scripts/test_all.sh</code>
@@ -208,4 +208,33 @@ To confirm that TensorFlow Quantum has successfully been installed, you can run
 <!-- common_typos_enable -->
 
 
-Success: TensorFlow Quantum is now installed.
+Success: TensorFlow Quantum for CPU is now installed.
+
+### 7. Build the TensorFlow Quantum pip package for GPU
+
+To enable the GPU (cuQuantum) backend, cuStateVec must be installed; see the <a href="https://docs.nvidia.com/cuda/cuquantum/custatevec/getting_started.html">installation guide</a> for details. The `CUQUANTUM_ROOT` environment variable must also be set to your cuQuantum installation path:
+<pre class="devsite-click-to-copy">
+  <code class="devsite-terminal">export CUQUANTUM_ROOT=/path/to/cuquantum/installation/dir</code>
+</pre>
+
+Build the TensorFlow Quantum GPU pip package and install:
+
+<!-- common_typos_disable -->
+<pre class="devsite-click-to-copy">
+  <code class="devsite-terminal">bazel clean --expunge  # Only needed if you run into `.so`-related issues; this clears the Bazel cache.</code>
+  <code class="devsite-terminal">./configure.sh  # Type 'n' for the second question.</code>
+  <code class="devsite-terminal">bazel build -c opt --config=cuda --cxxopt="-O3" --cxxopt="-march=native" --cxxopt="-std=c++17" --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=1" release:build_pip_package</code>
+  <code class="devsite-terminal">bazel-bin/release/build_pip_package /tmp/tfquantum_gpu/</code>
+  <code class="devsite-terminal">python3 -m pip install /tmp/tfquantum_gpu/<var>name_of_generated_wheel</var>.whl</code>
+</pre>
+<!-- common_typos_enable -->
+
+To confirm that TensorFlow Quantum for GPU has successfully been installed, you can run the tests:
+<!-- common_typos_disable -->
+<pre class="devsite-click-to-copy">
+  <code class="devsite-terminal">./scripts/test_all.sh gpu</code>
+</pre>
+<!-- common_typos_enable -->
+
+
+Success: TensorFlow Quantum for GPU is now installed.
diff --git a/docs/tutorials/hello_many_worlds.ipynb b/docs/tutorials/hello_many_worlds.ipynb
index 229136219..801d388de 100644
--- a/docs/tutorials/hello_many_worlds.ipynb
+++ b/docs/tutorials/hello_many_worlds.ipynb
@@ -255,7 +255,7 @@
         "# Create a circuit on these qubits using the parameters you created above.\n",
         "circuit = cirq.Circuit(\n",
         "    cirq.rx(a).on(q0),\n",
-        "    cirq.ry(b).on(q1), cirq.CNOT(control=q0, target=q1))\n",
+        "    cirq.ry(b).on(q1), cirq.CNOT(q0, q1))\n",
         "\n",
         "SVGCircuit(circuit)"
       ]
diff --git a/docs/tutorials/qcnn.ipynb b/docs/tutorials/qcnn.ipynb
index f53182701..abbb8c560 100644
--- a/docs/tutorials/qcnn.ipynb
+++ b/docs/tutorials/qcnn.ipynb
@@ -554,7 +554,7 @@
         "    source_basis_selector = one_qubit_unitary(source_qubit, symbols[3:6])\n",
         "    pool_circuit.append(sink_basis_selector)\n",
         "    pool_circuit.append(source_basis_selector)\n",
-        "    pool_circuit.append(cirq.CNOT(control=source_qubit, target=sink_qubit))\n",
+        "    pool_circuit.append(cirq.CNOT(source_qubit, sink_qubit))\n",
         "    pool_circuit.append(sink_basis_selector**-1)\n",
         "    return pool_circuit"
       ]
diff --git a/docs/tutorials/research_tools.ipynb b/docs/tutorials/research_tools.ipynb
index 538fcf46c..29c7c3752 100644
--- a/docs/tutorials/research_tools.ipynb
+++ b/docs/tutorials/research_tools.ipynb
@@ -83,25 +83,26 @@
    },
    "outputs": [],
    "source": [
-    "!pip install tensorflow==2.7.0 tensorflow-quantum==0.7.2 tensorboard_plugin_profile==2.4.0"
+    "!pip install tensorflow==2.7.0 tensorflow-quantum==0.7.2 tensorboard_plugin_profile==2.4.0\n",
+    "!pip install --quiet git+https://github.com/quantumlib/ReCirq"
    ]
   },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "4Ql5PW-ACO0J"
-      },
-      "outputs": [],
-      "source": [
-        "# Update package resources to account for version changes.\n",
-        "import importlib, pkg_resources\n",
-        "importlib.reload(pkg_resources)"
-      ]
-    },
- {
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "4Ql5PW-ACO0J"
+   },
+   "outputs": [],
+   "source": [
+    "# Update package resources to account for version changes.\n",
+    "import importlib, pkg_resources\n",
+    "importlib.reload(pkg_resources)"
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
@@ -124,6 +125,7 @@
     "import datetime\n",
     "import time\n",
     "import cirq\n",
+    "from recirq import beyond_classical\n",
     "import tensorflow as tf\n",
     "import tensorflow_quantum as tfq\n",
     "from tensorflow.keras import layers\n",
@@ -155,7 +157,7 @@
    "source": [
     "def generate_circuit(qubits):\n",
     "    \"\"\"Generate a random circuit on qubits.\"\"\"\n",
-    "    random_circuit = cirq.generate_boixo_2018_supremacy_circuits_v2(\n",
+    "    random_circuit = beyond_classical.generate_boixo_2018_beyond_classical_v2(\n",
     "        qubits, cz_depth=2, seed=1234)\n",
     "    return random_circuit\n",
     "\n",
diff --git a/release/BUILD b/release/BUILD
index b588a6c5a..ff3db2ba0 100644
--- a/release/BUILD
+++ b/release/BUILD
@@ -69,7 +69,7 @@ sh_binary(
         "//tensorflow_quantum/python/optimizers:rotosolve_minimizer",
         "//tensorflow_quantum/python/optimizers:spsa_minimizer",
     ] + if_cuda_is_configured([
-        "//tensorflow_quantum/core/ops:tfq_simulate_ops_cuda_py",
         "//tensorflow_quantum/core/ops:tfq_simulate_ops_cuquantum_py",
+        "//tensorflow_quantum/core/ops:tfq_adj_grad_op_cuquantum_py",
     ]),
 )
diff --git a/release/setup.py b/release/setup.py
index 7b6037023..e9deb961f 100644
--- a/release/setup.py
+++ b/release/setup.py
@@ -51,15 +51,17 @@ def finalize_options(self):
 
 
 REQUIRED_PACKAGES = [
-    'cirq-core==0.13.1', 'cirq-google>=0.13.1', 'sympy == 1.8',
+    'cirq-core~=1.0', 'cirq-google~=1.0', 'sympy == 1.8',
     'googleapis-common-protos==1.52.0', 'google-api-core==1.21.0',
     'google-auth==1.18.0', 'protobuf==3.19.5'
 ]
 
+REQUIRED_GPU_PACKAGES = []
+
 # placed as extra to not have required overwrite existing nightly installs if
 # they exist.
 EXTRA_PACKAGES = ['tensorflow == 2.11.0']
-CUR_VERSION = '0.7.3'
+CUR_VERSION = '0.8.0'
 
 
 class BinaryDistribution(Distribution):
@@ -74,11 +76,21 @@ def has_ext_modules(self):
     nightly = True
     sys.argv.remove('--nightly')
 
+gpu = False
+if '--gpu' in sys.argv:
+    gpu = True
+    sys.argv.remove('--gpu')
+
 project_name = 'tensorflow-quantum'
 build_version = CUR_VERSION
+
+if gpu:
+    build_version = build_version + '.gpu'
+    REQUIRED_PACKAGES = REQUIRED_PACKAGES + REQUIRED_GPU_PACKAGES
+
 if nightly:
     project_name = 'tfq-nightly'
-    build_version = CUR_VERSION + '.dev' + str(date.today()).replace('-', '')
+    build_version = build_version + '.dev' + str(date.today()).replace('-', '')
 
 setup(
     name=project_name,
diff --git a/requirements.txt b/requirements.txt
index 578179982..f899bfc5b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-cirq-core==0.13.1
-cirq-google==0.13.1
+cirq-core~=1.0
+cirq-google~=1.0
 sympy==1.8
 numpy==1.24.2  # TensorFlow can detect if it was built against other versions.
 nbformat==4.4.0
diff --git a/scripts/ci_validate_tutorials.sh b/scripts/ci_validate_tutorials.sh
index d64361464..4fe94c465 100755
--- a/scripts/ci_validate_tutorials.sh
+++ b/scripts/ci_validate_tutorials.sh
@@ -24,6 +24,8 @@ pip install gym==0.24.1
 pip install seaborn==0.12.0
 # tf_docs pip package needed for noise tutorial.
 pip install -q git+https://github.com/tensorflow/docs
+# ReCirq pip package needed for research tools.
+pip install --quiet git+https://github.com/quantumlib/ReCirq
 # Leave the quantum directory, otherwise errors may occur
 cd ..
 examples_output=$(python3 quantum/scripts/test_tutorials.py)
diff --git a/scripts/test_all.sh b/scripts/test_all.sh
index 2795e0429..ffb43d42d 100755
--- a/scripts/test_all.sh
+++ b/scripts/test_all.sh
@@ -14,7 +14,21 @@
 # limitations under the License.
 # =============================================================================
 echo "Testing All Bazel py_test and cc_tests.";
-test_outputs=$(bazel test -c opt --experimental_repo_remote_exec --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=1" --cxxopt="-std=c++17" --cxxopt="-msse2" --cxxopt="-msse3" --cxxopt="-msse4" --notest_keep_going --test_output=errors //tensorflow_quantum/...)
+ENABLE_CUDA=${1}
+
+if [[ ${ENABLE_CUDA} == "gpu" ]]; then
+  echo "GPU mode. CUDA config is set."
+  CUDA_CONFIG="--config=cuda"
+  # Tests all including cuquantum ops.
+  TAG_FILTER=""
+else
+  echo "CPU mode."
+  CUDA_CONFIG=""
+  # Tests cpu only excluding cuquantum ops.
+  TAG_FILTER="--test_tag_filters=-cuquantum --build_tag_filters=-cuquantum"
+fi
+
+test_outputs=$(bazel test -c opt ${CUDA_CONFIG} ${TAG_FILTER} --experimental_repo_remote_exec --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=1" --cxxopt="-std=c++17" --cxxopt="-msse2" --cxxopt="-msse3" --cxxopt="-msse4" --test_output=errors //tensorflow_quantum/...)
 exit_code=$?
 if [ "$exit_code" == "0" ]; then
 	echo "Testing Complete!";
diff --git a/tensorflow_quantum/__init__.py b/tensorflow_quantum/__init__.py
index 44f4ba3ea..67d66f403 100644
--- a/tensorflow_quantum/__init__.py
+++ b/tensorflow_quantum/__init__.py
@@ -64,4 +64,4 @@
 del core
 # pylint: enable=undefined-variable
 
-__version__ = '0.7.2'
+__version__ = '0.8.0'
diff --git a/tensorflow_quantum/core/ops/BUILD b/tensorflow_quantum/core/ops/BUILD
index 84361cef1..cb764f25c 100644
--- a/tensorflow_quantum/core/ops/BUILD
+++ b/tensorflow_quantum/core/ops/BUILD
@@ -1,6 +1,4 @@
-# load op_wrapper
-load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_gpu_kernel_library", "tf_gen_op_wrapper_py")
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured", "if_cuda")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -50,8 +48,8 @@ py_library(
         "//tensorflow_quantum/core/ops/math_ops:fidelity_op_py",
         "//tensorflow_quantum/core/ops/noise:noisy_expectation_op_py",
     ] + if_cuda_is_configured([
-        ":tfq_simulate_ops_cuda_py",
         ":tfq_simulate_ops_cuquantum_py",
+        ":tfq_adj_grad_op_cuquantum_py",
     ]),
 )
 
@@ -539,7 +537,9 @@ py_library(
         ":tfq_simulate_ops_py",
         ":tfq_utility_ops_py",
         "//tensorflow_quantum/python:quantum_context",
-    ],
+    ] + if_cuda_is_configured([
+        ":tfq_simulate_ops_cuquantum_py",
+    ]),
 )
 
 py_test(
@@ -641,19 +641,6 @@ py_test(
     ],
 )
 
-py_library(
-    name = "tfq_simulate_ops_cuda_py",
-    srcs = ["tfq_simulate_ops_cuda.py"],
-    data = [
-        ":_tfq_simulate_ops_cuda.so",
-    ],
-    srcs_version = "PY3",
-    deps = [
-        # tensorflow framework for wrappers
-        ":load_module",
-    ],
-)
-
 py_library(
     name = "tfq_simulate_ops_cuquantum_py",
     srcs = ["tfq_simulate_ops_cuquantum.py"],
@@ -665,24 +652,29 @@ py_library(
         # tensorflow framework for wrappers
         ":load_module",
     ],
+    tags = ["cuquantum"],
 )
 
 py_test(
-    name = "tfq_simulate_ops_gpu_test",
-    srcs = ["tfq_simulate_ops_gpu_test.py"],
+    name = "tfq_simulate_ops_cuquantum_test",
+    timeout = "long",
+    srcs = ["tfq_simulate_ops_cuquantum_test.py"],
     deps = [
-        ":tfq_simulate_ops_cuda_py",
         ":tfq_simulate_ops_cuquantum_py",
         ":tfq_simulate_ops_py",
         "//tensorflow_quantum/python:util",
     ],
     srcs_version = "PY3",
+    tags = ["cuquantum"],
 )
 
 cc_binary(
-    name = "_tfq_simulate_ops_cuda.so",
+    name = "_tfq_simulate_ops_cuquantum.so",
     srcs = [
-        "tfq_simulate_expectation_op_cuda.cu.cc",
+        "tfq_simulate_expectation_op_cuquantum.cu.cc",
+        "tfq_simulate_sampled_expectation_op_cuquantum.cu.cc",
+        "tfq_simulate_samples_op_cuquantum.cu.cc",
+        "tfq_simulate_state_op_cuquantum.cu.cc",
     ],
     linkshared = 1,
     features = select({
@@ -717,14 +709,10 @@ cc_binary(
             "/wd4577",
             "/DNOGDI",
             "/UTF_COMPILE_LIBRARY",
+            "/D__CUSTATEVEC__",
         ],
         "//conditions:default": [
             "-Iexternal/local_cuda/cuda/include",
-            # "--cuda-gpu-arch=sm_86",
-            # "-L/usr/local/cuda/lib64",
-            # "-lcudart_static",
-            # "-ldl",
-            # "-lrt",
             "-pthread",
             "-std=c++17",
             "-D_GLIBCXX_USE_CXX11_ABI=1",
@@ -733,7 +721,14 @@ cc_binary(
             "-DNV_CUDNN_DISABLE_EXCEPTION",
             # "-fpermissive",
         ],
-    }) + if_cuda_is_configured(["-DTENSORFLOW_USE_NVCC=1", "-DGOOGLE_CUDA=1", "-x cuda", "-nvcc_options=relaxed-constexpr", "-nvcc_options=ftz=true"]),
+    }) + if_cuda_is_configured([
+        "-DTENSORFLOW_USE_NVCC=1",
+        "-DGOOGLE_CUDA=1",
+        "-x cuda",
+        "-nvcc_options=relaxed-constexpr",
+        "-nvcc_options=ftz=true",
+        "-D__CUSTATEVEC__",
+    ]),
     deps = [
         # cirq cc proto
         "//tensorflow_quantum/core/ops:parse_context",
@@ -750,15 +745,18 @@ cc_binary(
     ] + if_cuda_is_configured([
         ":cuda",
         "@local_config_cuda//cuda:cuda_headers",
-        "@qsim//lib:qsim_cuda_lib",
+        "@local_config_cuquantum//:cuquantum_headers",
+        "@local_config_cuquantum//:libcuquantum",
+        "@qsim//lib:qsim_cuquantum_lib",
     ]),
+    tags = ["cuquantum"],
     # alwayslink=1,
 )
 
 cc_binary(
-    name = "_tfq_simulate_ops_cuquantum.so",
+    name = "_tfq_adj_grad_cuquantum.so",
     srcs = [
-        "tfq_simulate_expectation_op_cuquantum.cu.cc",
+        "tfq_adj_grad_op_cuquantum.cu.cc",
     ],
     linkshared = 1,
     features = select({
@@ -793,14 +791,10 @@ cc_binary(
             "/wd4577",
             "/DNOGDI",
             "/UTF_COMPILE_LIBRARY",
+            "/D__CUSTATEVEC__",
         ],
         "//conditions:default": [
             "-Iexternal/local_cuda/cuda/include",
-            # "--cuda-gpu-arch=sm_86",
-            # "-L/usr/local/cuda/lib64",
-            # "-lcudart_static",
-            # "-ldl",
-            # "-lrt",
             "-pthread",
             "-std=c++17",
             "-D_GLIBCXX_USE_CXX11_ABI=1",
@@ -809,30 +803,60 @@ cc_binary(
             "-DNV_CUDNN_DISABLE_EXCEPTION",
             # "-fpermissive",
         ],
-    }) + if_cuda_is_configured(["-DTENSORFLOW_USE_NVCC=1", "-DGOOGLE_CUDA=1", "-x cuda", "-nvcc_options=relaxed-constexpr", "-nvcc_options=ftz=true"]),
+    }) + if_cuda_is_configured([
+        "-DTENSORFLOW_USE_NVCC=1",
+        "-DGOOGLE_CUDA=1",
+        "-x cuda",
+        "-nvcc_options=relaxed-constexpr",
+        "-nvcc_options=ftz=true",
+        "-D__CUSTATEVEC__",
+    ]),
     deps = [
-        # cirq cc proto
         "//tensorflow_quantum/core/ops:parse_context",
-        "//tensorflow_quantum/core/ops:tfq_simulate_utils",
-        "//tensorflow_quantum/core/proto:pauli_sum_cc_proto",
-        "//tensorflow_quantum/core/proto:program_cc_proto",
-        "//tensorflow_quantum/core/src:circuit_parser_qsim",
         "//tensorflow_quantum/core/src:util_qsim",
-        "@eigen//:eigen3",
-        # "@local_cuda//:cuda_headers"
-        # tensorflow core framework
-        # tensorflow core lib
-        # tensorflow core protos
+        "//tensorflow_quantum/core/src:adj_util",
+        # "//tensorflow_quantum/core/proto:pauli_sum_cc_proto",
+        # "//tensorflow_quantum/core/proto:program_cc_proto",
+        # "//tensorflow_quantum/core/src:circuit_parser_qsim",
+        # "@eigen//:eigen3",
     ] + if_cuda_is_configured([
         ":cuda",
-        "@cuquantum_libs//:custatevec",
-        "@cuquantum_libs//:custatevec_headers",
         "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_cuquantum//:cuquantum_headers",
+        "@local_config_cuquantum//:libcuquantum",
         "@qsim//lib:qsim_cuquantum_lib",
     ]),
+    tags = ["cuquantum"],
     # alwayslink=1,
 )
 
+py_library(
+    name = "tfq_adj_grad_op_cuquantum_py",
+    srcs = ["tfq_adj_grad_op_cuquantum.py"],
+    data = [":_tfq_adj_grad_cuquantum.so"],
+    srcs_version = "PY3",
+    deps = [
+        ":load_module",
+        # pauli sum cc proto
+        # projector sum cc proto
+        # tensorflow framework for wrappers
+    ],
+    tags = ["cuquantum"],
+)
+
+py_test(
+    name = "tfq_adj_grad_op_cuquantum_test",
+    srcs = ["tfq_adj_grad_op_cuquantum_test.py"],
+    python_version = "PY3",
+    deps = [
+        ":tfq_adj_grad_op_cuquantum_py",
+        ":tfq_adj_grad_op_py", # for testing cpu vs gpu diff
+        "//tensorflow_quantum/python:util",
+    ],
+    srcs_version = "PY3",
+    tags = ["cuquantum"],
+)
+
 py_library(
     name = "load_module",
     srcs = ["load_module.py"],
diff --git a/tensorflow_quantum/core/ops/circuit_execution_ops.py b/tensorflow_quantum/core/ops/circuit_execution_ops.py
index b2cc1ce70..b21298ad3 100644
--- a/tensorflow_quantum/core/ops/circuit_execution_ops.py
+++ b/tensorflow_quantum/core/ops/circuit_execution_ops.py
@@ -21,25 +21,65 @@
                                          tfq_utility_ops)
 from tensorflow_quantum.python import quantum_context
 
+try:
+    from tensorflow_quantum.core.ops import tfq_simulate_ops_cuquantum
+    _ENABLE_USE_CUQUANTUM = True
+except Exception:  # pylint: disable=broad-except
+    # No usable cuQuantum ops: alias the CPU ops so the enum below resolves.
+    _ENABLE_USE_CUQUANTUM = False
+    tfq_simulate_ops_cuquantum = tfq_simulate_ops
+
+
+def is_gpu_configured() -> bool:
+    """Returns True if the cuQuantum (GPU) ops were imported, else False."""
+    return _ENABLE_USE_CUQUANTUM
+
+
+def _preprocess_use_cuquantum(use_cuquantum: bool) -> bool:
+    """Returns `use_cuquantum`, forced to False when GPU ops are missing."""
+    if is_gpu_configured():
+        return use_cuquantum
+    # cuQuantum ops are unavailable: warn if they were requested, use CPU.
+    if use_cuquantum:
+        print("WARNING: cuQuantum was not set, "
+              "`use_cuquantum=True` option becomes effectless. Using CPU.")
+    return False
+
 
 class TFQStateVectorSimulator(enum.Enum):
     """Enum to make specifying TFQ simulators user-friendly."""
     expectation = tfq_simulate_ops.tfq_simulate_expectation
+    expectation_cuquantum = tfq_simulate_ops_cuquantum.tfq_simulate_expectation
+
     samples = tfq_simulate_ops.tfq_simulate_samples
+    samples_cuquantum = tfq_simulate_ops_cuquantum.tfq_simulate_samples
+
     state = tfq_simulate_ops.tfq_simulate_state
+    state_cuquantum = tfq_simulate_ops_cuquantum.tfq_simulate_state
+
     sampled_expectation = tfq_simulate_ops.tfq_simulate_sampled_expectation
+    sampled_expectation_cuquantum = (
+        tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation)
 
 
-def _check_quantum_concurrent(quantum_concurrent):
+def _check_quantum_concurrent(quantum_concurrent, use_cuquantum):
     if not isinstance(quantum_concurrent, bool):
         raise TypeError("quantum_concurrent must be type bool."
                         " Given: {}".format(str(type(quantum_concurrent))))
+    if not isinstance(use_cuquantum, bool):
+        raise TypeError("use_cuquantum must be type bool."
+                        " Given: {}".format(str(type(use_cuquantum))))
+    if use_cuquantum is True and quantum_concurrent is True:
+        raise ValueError("use_cuquantum and quantum_concurrent should "
+                         "not be True at the same time. Please set False to "
+                         "quantum_concurrent.")
 
 
 def get_expectation_op(
         backend=None,
         *,
-        quantum_concurrent=quantum_context.get_quantum_concurrent_op_mode()):
+        quantum_concurrent=quantum_context.get_quantum_concurrent_op_mode(),
+        use_cuquantum=False):
     """Get a TensorFlow op that will calculate batches of expectation values.
 
     This function produces a non-differentiable TF op that will calculate
@@ -80,8 +120,8 @@ def get_expectation_op(
         backend: Optional Python `object` that specifies what backend this op
             should use when evaluating circuits. Can be
             `cirq.DensityMatrixSimulator` or any
-            `cirq.sim.simulator.SimulatesExpectationValues`. If not provided the
-            default C++ analytical expectation calculation op is returned.
+            `cirq.sim.simulator.SimulatesExpectationValues`. If not provided
+            the default C++ analytical expectation calculation op is returned.
         quantum_concurrent: Optional Python `bool`. True indicates that the
             returned op should not block graph level parallelism on itself when
             executing. False indicates that graph level parallelism on itself
@@ -90,6 +130,8 @@ def get_expectation_op(
             (no blocking). This flag is only needed for advanced users when
             using TFQ for very large simulations, or when running on a real
             chip.
+        use_cuquantum: Set True to turn on TFQ cuQuantum version op, which
+            requires `quantum_concurrent` to be False.
 
     Returns:
         A `callable` with the following signature:
@@ -115,21 +157,28 @@ def get_expectation_op(
                 expectation value for each circuit with each op applied to it
                 (after resolving the corresponding parameters in).
     """
-
     # TODO (mbbrough): investigate how the above docstring renders.
-    _check_quantum_concurrent(quantum_concurrent)
+    _check_quantum_concurrent(quantum_concurrent, use_cuquantum)
+    use_cuquantum = _preprocess_use_cuquantum(use_cuquantum)
 
     op = None
     if backend is None:
-        op = TFQStateVectorSimulator.expectation
+        if use_cuquantum:
+            op = TFQStateVectorSimulator.expectation_cuquantum
+        else:
+            op = TFQStateVectorSimulator.expectation
 
     # TODO(zaqqwerty): remove DM check after cirq #3964
     if isinstance(backend, (cirq.sim.simulator.SimulatesExpectationValues,
                             cirq.DensityMatrixSimulator)):
+        if use_cuquantum:
+            raise ValueError(
+                "use_cuquantum is not supported for cirq simulator. Please "
+                "set use_cuquantum to False.")
         op = cirq_ops._get_cirq_analytical_expectation(backend)
 
     if op is not None:
-        if quantum_concurrent is True:
+        if use_cuquantum is False and quantum_concurrent is True:
             # Return an op that does not block graph level parallelism.
             return lambda programs, symbol_names, symbol_values, pauli_sums: \
                 op(programs, symbol_names, symbol_values, pauli_sums)
@@ -152,7 +201,8 @@ def get_expectation_op(
 def get_sampling_op(
         backend=None,
         *,
-        quantum_concurrent=quantum_context.get_quantum_concurrent_op_mode()):
+        quantum_concurrent=quantum_context.get_quantum_concurrent_op_mode(),
+        use_cuquantum=False):
     """Get a Tensorflow op that produces samples from given quantum circuits.
 
     This function produces a non-differentiable op that will calculate
@@ -190,6 +240,8 @@ def get_sampling_op(
             (no blocking). This flag is only needed for advanced users when
             using TFQ for very large simulations, or when running on a real
             chip.
+        use_cuquantum: Set True to turn on TFQ cuQuantum version op, which
+            requires `quantum_concurrent` to be False.
 
     Returns:
         A `callable` with the following signature:
@@ -216,17 +268,25 @@ def get_sampling_op(
     """
 
     # TODO (mbbrough): investigate how the above docstring renders.
-    _check_quantum_concurrent(quantum_concurrent)
+    _check_quantum_concurrent(quantum_concurrent, use_cuquantum)
+    use_cuquantum = _preprocess_use_cuquantum(use_cuquantum)
 
     op = None
     if backend is None:
-        op = TFQStateVectorSimulator.samples
+        if use_cuquantum:
+            op = TFQStateVectorSimulator.samples_cuquantum
+        else:
+            op = TFQStateVectorSimulator.samples
 
     if isinstance(backend, cirq.Sampler):
+        if use_cuquantum:
+            raise ValueError(
+                "use_cuquantum is not supported for cirq sampler. Please "
+                "set use_cuquantum to False.")
         op = cirq_ops._get_cirq_samples(backend)
 
     if op is not None:
-        if quantum_concurrent is True:
+        if use_cuquantum is False and quantum_concurrent is True:
             # Return an op that does not block graph level parallelism.
             return lambda programs, symbol_names, symbol_values, num_samples: \
                 tfq_utility_ops.padded_to_ragged(
@@ -244,7 +304,8 @@ def get_sampling_op(
 def get_state_op(
         backend=None,
         *,
-        quantum_concurrent=quantum_context.get_quantum_concurrent_op_mode()):
+        quantum_concurrent=quantum_context.get_quantum_concurrent_op_mode(),
+        use_cuquantum=False):
     """Get a TensorFlow op that produces states from given quantum circuits.
 
     This function produces a non-differentiable op that will calculate
@@ -282,6 +343,8 @@ def get_state_op(
             (no blocking). This flag is only needed for advanced users when
             using TFQ for very large simulations, or when running on a real
             chip.
+        use_cuquantum: Set True to turn on TFQ cuQuantum version op, which
+            requires `quantum_concurrent` to be False.
 
     Returns:
         A `callable` with the following signature:
@@ -305,17 +368,25 @@ def get_state_op(
     """
 
     # TODO (mbbrough): investigate how the above docstring renders.
-    _check_quantum_concurrent(quantum_concurrent)
+    _check_quantum_concurrent(quantum_concurrent, use_cuquantum)
+    use_cuquantum = _preprocess_use_cuquantum(use_cuquantum)
 
     op = None
     if backend is None:
-        op = TFQStateVectorSimulator.state
+        if use_cuquantum:
+            op = TFQStateVectorSimulator.state_cuquantum
+        else:
+            op = TFQStateVectorSimulator.state
 
     if isinstance(backend, (cirq.SimulatesFinalState)):
+        if use_cuquantum:
+            raise ValueError(
+                "use_cuquantum is not supported for cirq simulator. Please "
+                "set use_cuquantum to False.")
         op = cirq_ops._get_cirq_simulate_state(backend)
 
     if op is not None:
-        if quantum_concurrent is True:
+        if use_cuquantum is False and quantum_concurrent is True:
             # Return an op that does not block graph level parallelism.
             return lambda programs, symbol_names, symbol_values: \
                 tfq_utility_ops.padded_to_ragged(
@@ -334,7 +405,8 @@ def get_state_op(
 def get_sampled_expectation_op(
         backend=None,
         *,
-        quantum_concurrent=quantum_context.get_quantum_concurrent_op_mode()):
+        quantum_concurrent=quantum_context.get_quantum_concurrent_op_mode(),
+        use_cuquantum=False):
     """Get a TensorFlow op that will calculate sampled expectation values.
 
     This function produces a non-differentiable TF op that will calculate
@@ -386,6 +458,8 @@ def get_sampled_expectation_op(
             (no blocking). This flag is only needed for advanced users when
             using TFQ for very large simulations, or when running on a real
             chip.
+        use_cuquantum: Set True to turn on TFQ cuQuantum version op, which
+            requires `quantum_concurrent` to be False.
 
     Returns:
         A `callable` with the following signature:
@@ -416,17 +490,25 @@ def get_sampled_expectation_op(
                 (after resolving the corresponding parameters in).
     """
     # TODO (mbbrough): investigate how the above docstring renders.
-    _check_quantum_concurrent(quantum_concurrent)
+    _check_quantum_concurrent(quantum_concurrent, use_cuquantum)
+    use_cuquantum = _preprocess_use_cuquantum(use_cuquantum)
 
     op = None
     if backend is None:
-        op = TFQStateVectorSimulator.sampled_expectation
+        if use_cuquantum:
+            op = TFQStateVectorSimulator.sampled_expectation_cuquantum
+        else:
+            op = TFQStateVectorSimulator.sampled_expectation
 
     if isinstance(backend, cirq.Sampler):
+        if use_cuquantum:
+            raise ValueError(
+                "use_cuquantum is not supported for cirq sampler. Please "
+                "set use_cuquantum to False.")
         op = cirq_ops._get_cirq_sampled_expectation(backend)
 
     if op is not None:
-        if quantum_concurrent is True:
+        if use_cuquantum is False and quantum_concurrent is True:
             # Return an op that does not block graph level parallelism.
             return lambda programs, symbol_names, symbol_values, pauli_sums, \
                 num_samples: op(programs,
diff --git a/tensorflow_quantum/core/ops/circuit_execution_ops_test.py b/tensorflow_quantum/core/ops/circuit_execution_ops_test.py
index 08e4f5b6f..b89c85aa5 100644
--- a/tensorflow_quantum/core/ops/circuit_execution_ops_test.py
+++ b/tensorflow_quantum/core/ops/circuit_execution_ops_test.py
@@ -28,6 +28,7 @@
 from scipy import stats
 import cirq
 import cirq_google
+from cirq_google.engine.abstract_processor import AbstractProcessor
 
 from tensorflow_quantum.core.ops import batch_util, circuit_execution_ops
 from tensorflow_quantum.python import util
@@ -48,7 +49,11 @@
                                              quantum_concurrent=True),
     # For timing interests C++ backend is tested in quantum_concurrent mode.
     circuit_execution_ops.get_expectation_op(backend=None,
-                                             quantum_concurrent=False)
+                                             quantum_concurrent=False),
+    # For cuQuantum op. quantum_concurrent=True is not allowed.
+    circuit_execution_ops.get_expectation_op(backend=None,
+                                             quantum_concurrent=False,
+                                             use_cuquantum=True)
 ]
 
 SAMPLING_OPS = [
@@ -60,7 +65,11 @@
                                           quantum_concurrent=True),
     # For timing interests C++ backend is tested in quantum_concurrent mode.
     circuit_execution_ops.get_sampling_op(backend=None,
-                                          quantum_concurrent=False)
+                                          quantum_concurrent=False),
+    # For cuQuantum op. quantum_concurrent=True is not allowed.
+    circuit_execution_ops.get_sampling_op(backend=None,
+                                          quantum_concurrent=False,
+                                          use_cuquantum=True)
 ]
 
 STATE_OPS = [
@@ -68,8 +77,13 @@
     circuit_execution_ops.get_state_op(backend=WF_SIM, quantum_concurrent=True),
     circuit_execution_ops.get_state_op(backend=DM_SIM, quantum_concurrent=True),
     # For timing interests C++ backend is tested in quantum_concurrent mode.
-    circuit_execution_ops.get_state_op(backend=None, quantum_concurrent=False)
+    circuit_execution_ops.get_state_op(backend=None, quantum_concurrent=False),
+    # For cuQuantum op. quantum_concurrent=True is not allowed.
+    circuit_execution_ops.get_state_op(backend=None,
+                                       quantum_concurrent=False,
+                                       use_cuquantum=True)
 ]
+NO_DM_STATE_OPS = STATE_OPS[:2] + STATE_OPS[3:]  # skip DM_SIM op (index 2)
 
 SAMPLED_EXPECTATION_OPS = [
     circuit_execution_ops.get_sampled_expectation_op(backend=None,
@@ -81,9 +95,14 @@
     # For timing interests C++ backend is tested in quantum_concurrent mode.
     circuit_execution_ops.get_sampled_expectation_op(backend=None,
                                                      quantum_concurrent=False),
+    # For cuQuantum op. quantum_concurrent=True is not allowed.
+    circuit_execution_ops.get_sampled_expectation_op(backend=None,
+                                                     quantum_concurrent=False,
+                                                     use_cuquantum=True)
 ]
 
-SIMS = [WF_SIM, WF_SIM, DM_SIM, WF_SIM]
+SIMS = [WF_SIM, WF_SIM, DM_SIM, WF_SIM, WF_SIM]
+NO_DM_SIMS = SIMS[:2] + SIMS[3:]  # skip DM_SIM (index 2), matching the ops
 
 
 class OpGetterInputChecks(tf.test.TestCase):
@@ -98,11 +117,9 @@ def test_get_expectation_inputs(self):
         circuit_execution_ops.get_expectation_op()
         with self.assertRaisesRegex(NotImplementedError,
                                     expected_regex='Sample-based'):
-            mock_engine = mock.Mock()
+            mock_processor = mock.create_autospec(AbstractProcessor)
             circuit_execution_ops.get_expectation_op(
-                cirq_google.QuantumEngineSampler(engine=mock_engine,
-                                                 processor_id='test',
-                                                 gate_set=cirq_google.XMON))
+                cirq_google.ProcessorSampler(processor=mock_processor))
         with self.assertRaisesRegex(
                 TypeError,
                 expected_regex="cirq.sim.simulator.SimulatesExpectationValues"):
@@ -112,6 +129,15 @@ def test_get_expectation_inputs(self):
                                     expected_regex="must be type bool."):
             circuit_execution_ops.get_expectation_op(quantum_concurrent='junk')
 
+        with self.assertRaisesRegex(TypeError,
+                                    expected_regex="must be type bool."):
+            circuit_execution_ops.get_expectation_op(use_cuquantum='junk')
+
+        with self.assertRaisesRegex(
+                ValueError, expected_regex="not be True at the same time"):
+            circuit_execution_ops.get_expectation_op(quantum_concurrent=True,
+                                                     use_cuquantum=True)
+
     def test_get_sampled_expectation_inputs(self):
         """Test that get expectation only accepts inputs it should."""
         circuit_execution_ops.get_sampled_expectation_op()
@@ -119,11 +145,9 @@ def test_get_sampled_expectation_inputs(self):
             backend=cirq.Simulator())
         circuit_execution_ops.get_sampled_expectation_op(
             backend=cirq.DensityMatrixSimulator())
-        mock_engine = mock.Mock()
+        mock_processor = mock.create_autospec(AbstractProcessor)
         circuit_execution_ops.get_sampled_expectation_op(
-            cirq_google.QuantumEngineSampler(engine=mock_engine,
-                                             processor_id='test',
-                                             gate_set=cirq_google.XMON))
+            cirq_google.ProcessorSampler(processor=mock_processor))
         with self.assertRaisesRegex(TypeError, expected_regex="a Cirq.Sampler"):
             circuit_execution_ops.get_sampled_expectation_op(backend="junk")
 
@@ -132,17 +156,25 @@ def test_get_sampled_expectation_inputs(self):
             circuit_execution_ops.get_sampled_expectation_op(
                 quantum_concurrent='junk')
 
+        with self.assertRaisesRegex(TypeError,
+                                    expected_regex="must be type bool."):
+            circuit_execution_ops.get_sampled_expectation_op(
+                use_cuquantum='junk')
+
+        with self.assertRaisesRegex(
+                ValueError, expected_regex="not be True at the same time"):
+            circuit_execution_ops.get_sampled_expectation_op(
+                quantum_concurrent=True, use_cuquantum=True)
+
     def test_get_samples_inputs(self):
         """Test that get_samples only accepts inputs it should."""
         circuit_execution_ops.get_sampling_op()
         circuit_execution_ops.get_sampling_op(backend=cirq.Simulator())
         circuit_execution_ops.get_sampling_op(
             backend=cirq.DensityMatrixSimulator())
-        mock_engine = mock.Mock()
+        mock_processor = mock.create_autospec(AbstractProcessor)
         circuit_execution_ops.get_sampling_op(
-            backend=cirq_google.QuantumEngineSampler(engine=mock_engine,
-                                                     processor_id='test',
-                                                     gate_set=cirq_google.XMON))
+            backend=cirq_google.ProcessorSampler(processor=mock_processor))
         with self.assertRaisesRegex(TypeError,
                                     expected_regex="Expected a Cirq.Sampler"):
             circuit_execution_ops.get_sampling_op(backend="junk")
@@ -151,6 +183,15 @@ def test_get_samples_inputs(self):
                                     expected_regex="must be type bool."):
             circuit_execution_ops.get_sampling_op(quantum_concurrent='junk')
 
+        with self.assertRaisesRegex(TypeError,
+                                    expected_regex="must be type bool."):
+            circuit_execution_ops.get_sampling_op(use_cuquantum='junk')
+
+        with self.assertRaisesRegex(
+                ValueError, expected_regex="not be True at the same time"):
+            circuit_execution_ops.get_sampling_op(quantum_concurrent=True,
+                                                  use_cuquantum=True)
+
     def test_get_state_inputs(self):
         """Test that get_states only accepts inputs it should."""
         circuit_execution_ops.get_state_op()
@@ -162,17 +203,23 @@ def test_get_state_inputs(self):
             circuit_execution_ops.get_state_op(backend="junk")
         with self.assertRaisesRegex(TypeError,
                                     expected_regex="Cirq.SimulatesFinalState"):
-            mock_engine = mock.Mock()
+            mock_processor = mock.create_autospec(AbstractProcessor)
             circuit_execution_ops.get_state_op(
-                backend=cirq_google.QuantumEngineSampler(
-                    engine=mock_engine,
-                    processor_id='test',
-                    gate_set=cirq_google.XMON))
+                backend=cirq_google.ProcessorSampler(processor=mock_processor))
 
         with self.assertRaisesRegex(TypeError,
                                     expected_regex="must be type bool."):
             circuit_execution_ops.get_state_op(quantum_concurrent='junk')
 
+        with self.assertRaisesRegex(TypeError,
+                                    expected_regex="must be type bool."):
+            circuit_execution_ops.get_state_op(use_cuquantum='junk')
+
+        with self.assertRaisesRegex(
+                ValueError, expected_regex="not be True at the same time"):
+            circuit_execution_ops.get_state_op(quantum_concurrent=True,
+                                               use_cuquantum=True)
+
 
 class ExecutionOpsConsistentyTest(tf.test.TestCase, parameterized.TestCase):
     """Test all ops produce equivalent output to one another."""
@@ -277,9 +324,7 @@ def test_simulate_state_with_symbols(self, op_and_sim, n_qubits,
             util.kwargs_cartesian_product(
                 **{
                     'op_and_sim': [(op, sim) for (
-                        op,
-                        sim) in zip(STATE_OPS[:-2] +
-                                    [STATE_OPS[-1]], SIMS[:-2] + [SIMS[-1]])],
+                        op, sim) in zip(NO_DM_STATE_OPS, NO_DM_SIMS)],
                 })))
     def test_simulate_state_large(self, op_and_sim):
         """Test a reasonably large and complex circuit."""
@@ -287,7 +332,7 @@ def test_simulate_state_large(self, op_and_sim):
         symbol_names = []
         circuit_batch, resolver_batch = \
             util.random_circuit_resolver_batch(
-                cirq.GridQubit.rect(4, 4), 5)
+                cirq.GridQubit.rect(3, 3), 5)
 
         symbol_values_array = np.array(
             [[resolver[symbol]
diff --git a/tensorflow_quantum/core/ops/cirq_ops.py b/tensorflow_quantum/core/ops/cirq_ops.py
index 2c1039ac8..808296433 100644
--- a/tensorflow_quantum/core/ops/cirq_ops.py
+++ b/tensorflow_quantum/core/ops/cirq_ops.py
@@ -491,7 +491,7 @@ def _no_grad(grad):
         ]
         max_n_qubits = max(len(p.all_qubits()) for p in programs)
 
-        if isinstance(sampler, cirq_google.QuantumEngineSampler):
+        if isinstance(sampler, cirq_google.ProcessorSampler):
             # group samples from identical circuits to reduce communication
             # overhead. Have to keep track of the order in which things came
             # in to make sure the output is ordered correctly
diff --git a/tensorflow_quantum/core/ops/cirq_ops_test.py b/tensorflow_quantum/core/ops/cirq_ops_test.py
index 8dc0de4a9..a17b6f1f5 100644
--- a/tensorflow_quantum/core/ops/cirq_ops_test.py
+++ b/tensorflow_quantum/core/ops/cirq_ops_test.py
@@ -27,6 +27,7 @@
 from absl.testing import parameterized
 import cirq
 import cirq_google
+from cirq_google.engine.abstract_processor import AbstractProcessor
 
 from tensorflow_quantum.core.ops import cirq_ops
 from tensorflow_quantum.core.serialize import serializer
@@ -349,11 +350,9 @@ def test_get_cirq_sampling_op(self):
         cirq_ops._get_cirq_samples()
         cirq_ops._get_cirq_samples(cirq.Simulator())
         cirq_ops._get_cirq_samples(cirq.DensityMatrixSimulator())
-        mock_engine = mock.Mock()
+        mock_processor = mock.create_autospec(AbstractProcessor)
         cirq_ops._get_cirq_samples(
-            cirq_google.QuantumEngineSampler(engine=mock_engine,
-                                             processor_id='test',
-                                             gate_set=cirq_google.XMON))
+            cirq_google.ProcessorSampler(processor=mock_processor))
 
     def test_cirq_sampling_op_inputs(self):
         """test input checking in the cirq sampling op."""
@@ -452,7 +451,9 @@ class DummySampler(cirq.Sampler):
             def run_sweep(self, program, params, repetitions):
                 """Returns all ones in the correct sample shape."""
                 return [
-                    cirq.Result(
+                    cirq_google.EngineResult(
+                        job_id="1",
+                        job_finished_time="1",
                         params=param,
                         measurements={
                             'tfq':
diff --git a/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.cu.cc b/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.cu.cc
new file mode 100644
index 000000000..55213c78b
--- /dev/null
+++ b/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.cu.cc
@@ -0,0 +1,342 @@
+/* Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <custatevec.h>
+
+#include <memory>
+#include <vector>
+
+#include "../qsim/lib/circuit.h"
+#include "../qsim/lib/gate_appl.h"
+#include "../qsim/lib/gates_cirq.h"
+#include "../qsim/lib/seqfor.h"
+#include "../qsim/lib/simmux_gpu.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow_quantum/core/ops/parse_context.h"
+#include "tensorflow_quantum/core/proto/pauli_sum.pb.h"
+#include "tensorflow_quantum/core/proto/program.pb.h"
+#include "tensorflow_quantum/core/src/adj_util.h"
+#include "tensorflow_quantum/core/src/util_qsim.h"
+
+namespace tfq {
+
+namespace {
+// TODO(jaeyoo): Temporary hack for BulkSetAmpl with cuda ops.
+// Update the qsim custatevec-side BulkSetAmpl op, then remove these utilities.
+template <typename FP>
+__global__ void BulkSetAmplKernel(uint64_t mask, uint64_t bits, FP re, FP im,
+                                  bool exclude, FP* state) {
+  uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
+
+  bool set = ((k1 & mask) == bits) ^ exclude;
+
+  if (set) {
+    state[2 * k1] = re;
+    state[2 * k1 + 1] = im;
+  }
+}
+
+// Sets state[i] = complex(re, im) where (i & mask) == bits.
+// if `exclude` is true then the criteria becomes (i & mask) != bits.
+template <typename fp_type>
+void BulkSetAmpl(qsim::SimulatorCuStateVec<float>::StateSpace::State& state,
+                 uint64_t mask, uint64_t bits, fp_type re, fp_type im,
+                 bool exclude = false) {
+  uint64_t size = uint64_t{1} << state.num_qubits();
+
+  unsigned threads = std::min(size, uint64_t{512});
+  unsigned blocks = size / threads;
+
+  BulkSetAmplKernel<<<blocks, threads>>>(mask, bits, re, im, exclude,
+                                         state.get());
+  cudaPeekAtLastError();
+  cudaDeviceSynchronize();
+}
+}  // namespace
+
+using ::tensorflow::Status;
+using ::tfq::proto::PauliSum;
+using ::tfq::proto::Program;
+
+typedef qsim::Cirq::GateCirq<float> QsimGate;
+typedef qsim::Circuit<QsimGate> QsimCircuit;
+
+class TfqAdjointGradientCuquantumOp : public tensorflow::OpKernel {
+ public:
+  explicit TfqAdjointGradientCuquantumOp(
+      tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {
+    // create handles for simulator
+    cublasCreate(&cublas_handle_);
+    custatevecCreate(&custatevec_handle_);
+  }
+
+  ~TfqAdjointGradientCuquantumOp() {
+    // destroy handles in sync with simulator lifetime
+    cublasDestroy(cublas_handle_);
+    custatevecDestroy(custatevec_handle_);
+  }
+
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // TODO (mbbrough): add more dimension checks for other inputs here.
+    const int num_inputs = context->num_inputs();
+    OP_REQUIRES(context, num_inputs == 5,
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Expected 5 inputs, got ", num_inputs, " inputs.")));
+
+    // Create the output Tensor.
+    const int output_dim_batch_size = context->input(0).dim_size(0);
+    const int output_dim_param_size = context->input(2).dim_size(1);
+    tensorflow::TensorShape output_shape;
+    output_shape.AddDim(output_dim_batch_size);
+    output_shape.AddDim(output_dim_param_size);
+
+    tensorflow::Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+    auto output_tensor = output->matrix<float>();
+
+    // Parse program protos.
+    std::vector<Program> programs;
+    std::vector<int> num_qubits;
+    std::vector<std::vector<PauliSum>> pauli_sums;
+    OP_REQUIRES_OK(context, GetProgramsAndNumQubits(context, &programs,
+                                                    &num_qubits, &pauli_sums));
+
+    std::vector<SymbolMap> maps;
+    OP_REQUIRES_OK(context, GetSymbolMaps(context, &maps));
+
+    OP_REQUIRES(context, programs.size() == maps.size(),
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Number of circuits and symbol_values do not match. Got ",
+                    programs.size(), " circuits and ", maps.size(),
+                    " symbol values.")));
+
+    // Construct qsim circuits.
+    std::vector<QsimCircuit> qsim_circuits(programs.size(), QsimCircuit());
+    std::vector<std::vector<qsim::GateFused<QsimGate>>> full_fuse(
+        programs.size(), std::vector<qsim::GateFused<QsimGate>>({}));
+    std::vector<std::vector<std::vector<qsim::GateFused<QsimGate>>>>
+        partial_fused_circuits(
+            programs.size(),
+            std::vector<std::vector<qsim::GateFused<QsimGate>>>({}));
+
+    // track metadata.
+    std::vector<std::vector<tfq::GateMetaData>> gate_meta(
+        programs.size(), std::vector<tfq::GateMetaData>({}));
+
+    // track gradients
+    std::vector<std::vector<GradientOfGate>> gradient_gates(
+        programs.size(), std::vector<GradientOfGate>({}));
+
+    Status parse_status = ::tensorflow::Status();
+    auto p_lock = tensorflow::mutex();
+    auto construct_f = [&](int start, int end) {
+      for (int i = start; i < end; i++) {
+        Status local = QsimCircuitFromProgram(programs[i], maps[i],
+                                              num_qubits[i], &qsim_circuits[i],
+                                              &full_fuse[i], &gate_meta[i]);
+        NESTED_FN_STATUS_SYNC(parse_status, local, p_lock);
+        CreateGradientCircuit(qsim_circuits[i], gate_meta[i],
+                              &partial_fused_circuits[i], &gradient_gates[i]);
+      }
+    };
+
+    const int num_cycles = 1000;
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        programs.size(), num_cycles, construct_f);
+    OP_REQUIRES_OK(context, parse_status);
+
+    // Get downstream gradients.
+    std::vector<std::vector<float>> downstream_grads;
+    OP_REQUIRES_OK(context, GetPrevGrads(context, &downstream_grads));
+
+    OP_REQUIRES(context, downstream_grads.size() == programs.size(),
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Number of gradients and circuits do not match. Got ",
+                    downstream_grads.size(), " gradients and ", programs.size(),
+                    " circuits.")));
+
+    OP_REQUIRES(
+        context, context->input(4).dim_size(1) == context->input(3).dim_size(1),
+        tensorflow::errors::InvalidArgument(absl::StrCat(
+            "Number of gradients and pauli sum dimension do not match. Got ",
+            context->input(4).dim_size(1), " gradient entries and ",
+            context->input(3).dim_size(1), " paulis per circuit.")));
+
+    int max_num_qubits = 0;
+    for (const int num : num_qubits) {
+      max_num_qubits = std::max(max_num_qubits, num);
+    }
+
+    output_tensor.setZero();
+
+    ComputeLarge(num_qubits, qsim_circuits, maps, full_fuse,
+                 partial_fused_circuits, pauli_sums, gradient_gates,
+                 downstream_grads, context, &output_tensor);
+  }
+
+ private:
+  cublasHandle_t cublas_handle_;
+  custatevecHandle_t custatevec_handle_;
+
+  void ComputeLarge(
+      const std::vector<int>& num_qubits,
+      const std::vector<QsimCircuit>& qsim_circuits,
+      const std::vector<SymbolMap>& maps,
+      const std::vector<std::vector<qsim::GateFused<QsimGate>>>& full_fuse,
+      const std::vector<std::vector<std::vector<qsim::GateFused<QsimGate>>>>&
+          partial_fused_circuits,
+      const std::vector<std::vector<PauliSum>>& pauli_sums,
+      const std::vector<std::vector<tfq::GradientOfGate>>& gradient_gates,
+      const std::vector<std::vector<float>>& downstream_grads,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes<float, 1>::Matrix* output_tensor) {
+    // Instantiate qsim objects.
+    using Simulator = qsim::SimulatorCuStateVec<float>;
+    using StateSpace = Simulator::StateSpace;
+
+    // Begin simulation.
+    int largest_nq = 1;
+    Simulator sim = Simulator(cublas_handle_, custatevec_handle_);
+    StateSpace ss = StateSpace(cublas_handle_, custatevec_handle_);
+    auto sv = ss.Create(largest_nq);
+    auto scratch = ss.Create(largest_nq);
+    auto scratch2 = ss.Create(largest_nq);
+
+    for (size_t i = 0; i < partial_fused_circuits.size(); i++) {
+      int nq = num_qubits[i];
+
+      if (nq > largest_nq) {
+        // need to switch to larger statespace.
+        largest_nq = nq;
+        sv = ss.Create(largest_nq);
+        scratch = ss.Create(largest_nq);
+        scratch2 = ss.Create(largest_nq);
+      }
+
+      // (#679) Just ignore empty program
+      if (qsim_circuits[i].gates.size() == 0) {
+        continue;
+      }
+
+      ss.SetStateZero(sv);
+      for (size_t j = 0; j < full_fuse[i].size(); j++) {
+        qsim::ApplyFusedGate(sim, full_fuse[i][j], sv);
+      }
+
+      // sv now contains psi
+      // scratch contains (sum_j pauli_sums[i][j] * downstream_grads[j])|psi>
+      // scratch2 now contains psi as well.
+      [[maybe_unused]] Status unused = AccumulateOperators(
+          pauli_sums[i], downstream_grads[i], sim, ss, sv, scratch2, scratch);
+
+      for (int j = partial_fused_circuits[i].size() - 1; j >= 0; j--) {
+        for (int k = partial_fused_circuits[i][j].size() - 1; k >= 0; k--) {
+          ApplyFusedGateDagger(sim, partial_fused_circuits[i][j][k], sv);
+          ApplyFusedGateDagger(sim, partial_fused_circuits[i][j][k], scratch);
+        }
+        if (j == 0) {
+          // last layer will have no parametrized gates so can break.
+          break;
+        }
+
+        // Hit a parameterized gate.
+        // todo fix this copy.
+        auto cur_gate = qsim_circuits[i].gates[gradient_gates[i][j - 1].index];
+        ApplyGateDagger(sim, cur_gate, sv);
+
+        // if applicable compute control qubit mask and control value bits.
+        uint64_t mask = 0;
+        uint64_t cbits = 0;
+        for (size_t k = 0; k < cur_gate.controlled_by.size(); k++) {
+          uint64_t control_loc = cur_gate.controlled_by[k];
+          mask |= uint64_t{1} << control_loc;
+          cbits |= ((cur_gate.cmask >> k) & 1) << control_loc;
+        }
+
+        for (size_t k = 0; k < gradient_gates[i][j - 1].grad_gates.size();
+             k++) {
+          // Copy sv onto scratch2 in anticipation of non-unitary "gradient
+          // gate".
+          ss.Copy(sv, scratch2);
+          if (!cur_gate.controlled_by.empty()) {
+            // Gradient of controlled gates puts zeros on diagonal which is
+            // the same as collapsing the state and then applying the
+            // non-controlled version of the gradient gate.
+            BulkSetAmpl<float>(scratch2, mask, cbits, 0, 0, true);
+          }
+          qsim::ApplyGate(sim, gradient_gates[i][j - 1].grad_gates[k],
+                          scratch2);
+
+          // don't need not-found check since this is done upstream already.
+          const auto it = maps[i].find(gradient_gates[i][j - 1].params[k]);
+          const int loc = it->second.first;
+          // Apply finite differencing for adjoint gradients.
+          // Finite differencing enables applying multiple `gradient_gate`
+          // of a symbol at the same circuit. For analytic methods like
+          // parameter-shift we need to apply a single `gradient_gate`
+          // per a symbol.
+          (*output_tensor)(i, loc) += ss.RealInnerProduct(scratch2, scratch) +
+                                      ss.RealInnerProduct(scratch, scratch2);
+        }
+        ApplyGateDagger(sim, cur_gate, scratch);
+      }
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("TfqAdjointGradientCuquantum").Device(tensorflow::DEVICE_CPU),
+    TfqAdjointGradientCuquantumOp);
+
+REGISTER_OP("TfqAdjointGradientCuquantum")
+    .Input("programs: string")
+    .Input("symbol_names: string")
+    .Input("symbol_values: float")
+    .Input("pauli_sums: string")
+    .Input("downstream_grads: float")
+    .Output("grads: float")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      tensorflow::shape_inference::ShapeHandle programs_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_names_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &symbol_names_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_values_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &symbol_values_shape));
+
+      tensorflow::shape_inference::ShapeHandle pauli_sums_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &pauli_sums_shape));
+
+      tensorflow::shape_inference::ShapeHandle downstream_grads_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &downstream_grads_shape));
+
+      tensorflow::shape_inference::DimensionHandle output_rows =
+          c->Dim(programs_shape, 0);
+      tensorflow::shape_inference::DimensionHandle output_cols =
+          c->Dim(symbol_names_shape, 0);
+      c->set_output(0, c->Matrix(output_rows, output_cols));
+
+      return ::tensorflow::Status();
+    });
+
+}  // namespace tfq
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_ops_cuda.py b/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.py
similarity index 75%
rename from tensorflow_quantum/core/ops/tfq_simulate_ops_cuda.py
rename to tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.py
index d1a2ca4d2..e73775a45 100644
--- a/tensorflow_quantum/core/ops/tfq_simulate_ops_cuda.py
+++ b/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.py
@@ -12,15 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
-"""Module to register cuda simulation python op."""
+"""Module to register python op gradient."""
 import tensorflow as tf
 from tensorflow_quantum.core.ops.load_module import load_module
 
-SIM_OP_MODULE = load_module("_tfq_simulate_ops_cuda.so")
+SIM_OP_MODULE = load_module("_tfq_adj_grad_cuquantum.so")
 
 
-def tfq_simulate_expectation(programs, symbol_names, symbol_values, pauli_sums):
-    """Calculates the expectation value of circuits wrt some operator(s).
+def tfq_adj_grad(programs, symbol_names, symbol_values, pauli_sums, prev_grad):
+    """Calculate gradient of expectation value of circuits wrt some operator(s).
+
     Args:
         programs: `tf.Tensor` of strings with shape [batch_size] containing
             the string representations of the circuits to be executed.
@@ -35,10 +36,13 @@ def tfq_simulate_expectation(programs, symbol_names, symbol_values, pauli_sums):
         pauli_sums: `tf.Tensor` of strings with shape [batch_size, n_ops]
             containing the string representation of the operators that will
             be used on all of the circuits in the expectation calculations.
+        prev_grad: `tf.Tensor` of real numbers with shape [batch_size, n_ops]
+            backprop of values from downstream in the compute graph.
     Returns:
-        `tf.Tensor` with shape [batch_size, n_ops] that holds the
+        `tf.Tensor` with shape [batch_size, n_params] that holds the gradient of
             expectation value for each circuit with each op applied to it
             (after resolving the corresponding parameters in).
     """
-    return SIM_OP_MODULE.tfq_simulate_expectation_cuda(
-        programs, symbol_names, tf.cast(symbol_values, tf.float32), pauli_sums)
\ No newline at end of file
+    return SIM_OP_MODULE.tfq_adjoint_gradient_cuquantum(
+        programs, symbol_names, tf.cast(symbol_values, tf.float32), pauli_sums,
+        tf.cast(prev_grad, tf.float32))
diff --git a/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum_test.py b/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum_test.py
new file mode 100644
index 000000000..262f81728
--- /dev/null
+++ b/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum_test.py
@@ -0,0 +1,490 @@
+# Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Tests that specifically target tfq_adj_grad_op_cuquantum."""
+# Remove PYTHONPATH collisions for protobuf.
+# pylint: disable=wrong-import-position
+import sys
+NEW_PATH = [x for x in sys.path if 'com_google_protobuf' not in x]
+sys.path = NEW_PATH
+# pylint: enable=wrong-import-position
+
+import time
+import numpy as np
+from absl.testing import parameterized
+import tensorflow as tf
+import cirq
+import sympy
+
+from tensorflow_quantum.python import util
+from tensorflow_quantum.core.ops import tfq_adj_grad_op
+from tensorflow_quantum.core.ops import tfq_adj_grad_op_cuquantum
+
+
+def measure_average_runtime(
+        fn,
+        tag,
+        num_samples=10,
+        result_avg=False,
+):
+    """Measures the average wall-clock runtime of repeated calls to `fn`.
+
+    Args:
+        fn: Zero-argument callable to time.
+        tag: Label printed in front of the measured average time.
+        num_samples: Number of times `fn` is called; must be positive.
+        result_avg: If True, return the element-wise mean of all results.
+
+    Returns:
+        Tuple of (average seconds per call, last or averaged result of `fn`).
+    """
+    avg_time = []
+    avg_res = []
+    for _ in range(num_samples):
+        begin_time = time.perf_counter()  # Monotonic; immune to clock jumps.
+        result = fn()
+        duration = time.perf_counter() - begin_time
+        avg_time.append(duration)
+        if result_avg:
+            avg_res.append(result)
+    avg_time = sum(avg_time) / float(num_samples)
+    print(f"\n\t{tag} time: {avg_time}\n")
+    if result_avg:
+        result = np.average(avg_res, axis=0)
+    return avg_time, result
+
+
+class ADJGradTest(tf.test.TestCase, parameterized.TestCase):
+    """Tests tfq_adj_grad backed by cuQuantum."""
+
+    def test_calculate_adj_grad_cpu_vs_cuquantum(self):
+        """Make sure that cpu & gpu(cuquantum) ops have the same results."""
+        n_qubits = 20
+        batch_size = 5
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        pauli_sums_tensor = util.convert_to_tensor([[x] for x in pauli_sums])
+
+        prev_grads = tf.ones([batch_size, 1])  # [batch_size, n_ops]; n_ops=1.
+
+        cpu_avg_time, res_cpu = measure_average_runtime(
+            lambda: tfq_adj_grad_op.tfq_adj_grad(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), pauli_sums_tensor,
+                prev_grads),
+            "Adjoint CPU",
+            num_samples=10,
+            result_avg=True,
+        )
+
+        cuquantum_avg_time, res_cuquantum = measure_average_runtime(
+            lambda: tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), pauli_sums_tensor,
+                prev_grads),
+            "Adjoint cuQuantum",
+            num_samples=10,
+            result_avg=True,
+        )
+
+        # cuQuantum op should be faster than CPU op.
+        self.assertGreater(cpu_avg_time, cuquantum_avg_time)
+
+        # The CPU and cuQuantum results should match within a tolerance.
+        np.testing.assert_allclose(res_cpu,
+                                   res_cuquantum,
+                                   atol=1e-4,
+                                   err_msg="""
+        # If failed, the GPU architecture in this system may be unsupported.
+        # Please refer to the supported architectures here.
+        # https://docs.nvidia.com/cuda/cuquantum/getting_started.html#custatevec
+        """)
+
+    def test_adj_grad_inputs(self):
+        """Make sure the adjoint gradient op fails gracefully on bad inputs."""
+        n_qubits = 5
+        batch_size = 5
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        upstream_grads = np.ones((batch_size, 1))  # [batch_size, n_ops]
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'programs must be rank 1'):
+            # Circuit tensor has too many dimensions.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor([circuit_batch]), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_names must be rank 1.'):
+            # symbol_names tensor has too many dimensions.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), np.array([symbol_names]),
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2.'):
+            # symbol_values_array tensor has too many dimensions.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(np.array([symbol_values_array])),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2.'):
+            # symbol_values_array tensor has too few dimensions.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array[0]),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'pauli_sums must be rank 2.'):
+            # pauli_sums tensor has too few dimensions.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor(list(pauli_sums)),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'pauli_sums must be rank 2.'):
+            # pauli_sums tensor has too many dimensions.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[[x]] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Unparseable proto'):
+            # circuit tensor has the right type but invalid values.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                ['junk'] * batch_size, symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Could not find symbol in parameter map'):
+            # symbol_names tensor has the right type but invalid values.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), ['junk'],
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'qubits not found in circuit'):
+            # pauli_sums tensor has the right type but invalid values.
+            new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)]
+            new_pauli_sums = util.random_pauli_sums(new_qubits, 2, batch_size)
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in new_pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Unparseable proto'):
+            # pauli_sums tensor has the right type but invalid values 2.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                [['junk']] * batch_size, tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # circuits tensor has the wrong type.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                [1.0] * batch_size, symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # symbol_names tensor has the wrong type.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), [0.1234],
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.UnimplementedError, ''):
+            # symbol_values tensor has the wrong type.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                [['junk']] * batch_size,
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # pauli_sums tensor has the wrong type.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array), [[1.0]] * batch_size,
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(TypeError, 'missing'):
+            # we are missing an argument.
+            # pylint: disable=no-value-for-parameter
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                tf.convert_to_tensor(upstream_grads))
+            # pylint: enable=no-value-for-parameter
+
+        with self.assertRaisesRegex(TypeError, 'positional arguments'):
+            # pylint: disable=too-many-function-args
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads), [])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='do not match'):
+            # wrong op size.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor([cirq.Circuit()]), symbol_names,
+                symbol_values_array.astype(np.float64),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='rank 2'):
+            # wrong grad shape.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor([upstream_grads]))
+
+        with self.assertRaisesRegex(
+                tf.errors.InvalidArgumentError,
+                expected_regex='gradients and circuits do not match'):
+            # wrong grad batch size.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor([[0]]))  # Shape [1, n_ops], n_ops = 1.
+
+        with self.assertRaisesRegex(
+                tf.errors.InvalidArgumentError,
+                expected_regex='gradients and pauli sum dimension do not match'
+        ):
+            # wrong grad inner size.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor([[0, 0] for _ in range(len(circuit_batch))
+                                     ]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='cirq.Channel'):
+            # attempting to use noisy circuit.
+            noisy_circuit = cirq.Circuit(cirq.depolarize(0.3).on_each(*qubits))
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor([noisy_circuit for _ in circuit_batch]),
+                symbol_names, tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+    def test_calculate_adj_grad_empty(self):
+        """Verify an empty circuit with no symbols/ops yields shape (1, 0)."""
+        out = tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+            util.convert_to_tensor([cirq.Circuit()]),
+            tf.convert_to_tensor([], dtype=tf.dtypes.string),
+            tf.convert_to_tensor([[]]),
+            tf.convert_to_tensor([[]], dtype=tf.dtypes.string),
+            tf.convert_to_tensor([[]]))
+        self.assertShapeEqual(np.zeros((1, 0)), out)
+
+    def test_calculate_adj_grad_no_circuit(self):
+        """Verify that a zero-length batch yields an output of shape (0, 0)."""
+        out = tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+            tf.raw_ops.Empty(shape=(0,), dtype=tf.string),
+            tf.raw_ops.Empty(shape=(0,), dtype=tf.string),
+            tf.raw_ops.Empty(shape=(0, 0), dtype=tf.float32),
+            tf.raw_ops.Empty(shape=(0, 0), dtype=tf.string),
+            tf.raw_ops.Empty(shape=(0, 0), dtype=tf.float32),
+        )
+        self.assertShapeEqual(np.zeros((0, 0)), out)
+
+    def test_calculate_adj_grad_simple_case(self):
+        """Make sure that adjoint gradient works on simple input case."""
+        n_qubits = 2
+        batch_size = 1
+        symbol_names = ['alpha', 'beta']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+        [cirq.Circuit(cirq.X(qubits[0]) ** sympy.Symbol('alpha'),
+            cirq.Y(qubits[1]) ** sympy.Symbol('beta'),
+            cirq.CNOT(qubits[0], qubits[1]))], [{'alpha': 0.123, 'beta': 0.456}]
+
+        op_batch = [
+            [cirq.Z(qubits[0]), cirq.X(qubits[1])] for _ in range(batch_size)
+        ]
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        prev_grads = tf.ones([batch_size, len(op_batch[0])])
+
+        out = tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+            util.convert_to_tensor(circuit_batch),
+            tf.convert_to_tensor(symbol_names),
+            tf.convert_to_tensor(symbol_values_array),
+            util.convert_to_tensor(op_batch), prev_grads)
+
+        self.assertAllClose(out, np.array([[-1.18392, 0.43281]]), atol=1e-3)
+
+    def test_calculate_adj_grad_simple_case2(self):
+        """Make sure the adjoint gradient works with a symbolic FSimGate."""
+        n_qubits = 2
+        batch_size = 1
+        symbol_names = ['alpha', 'beta', 'gamma']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+        [cirq.Circuit(cirq.X(qubits[0]) ** sympy.Symbol('alpha'),
+            cirq.Y(qubits[1]) ** sympy.Symbol('beta'),
+            cirq.CNOT(qubits[0], qubits[1]),
+            cirq.FSimGate(sympy.Symbol('gamma'), 0.5)(qubits[0], qubits[1]))
+        ], [{'alpha': 0.123, 'beta': 0.456, 'gamma': 0.789}]
+
+        op_batch = [
+            [cirq.Z(qubits[0]), cirq.X(qubits[1])] for _ in range(batch_size)
+        ]
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        prev_grads = tf.ones([batch_size, len(op_batch[0])])
+
+        out = tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+            util.convert_to_tensor(circuit_batch),
+            tf.convert_to_tensor(symbol_names),
+            tf.convert_to_tensor(symbol_values_array),
+            util.convert_to_tensor(op_batch), prev_grads)
+
+        self.assertAllClose(out,
+                            np.array([[-2.100, -1.7412, -1.5120]]),
+                            atol=1e-3)
+
+    def test_calculate_adj_grad_simple_case_shared(self):
+        """Make sure the adjoint gradient works when a gate repeats a symbol."""
+        n_qubits = 2
+        batch_size = 1
+        symbol_names = ['alpha', 'beta', 'gamma']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+        [cirq.Circuit(cirq.X(qubits[0]) ** sympy.Symbol('alpha'),
+            cirq.Y(qubits[1]) ** sympy.Symbol('beta'),
+            cirq.CNOT(qubits[0], qubits[1]),
+            cirq.FSimGate(
+                sympy.Symbol('gamma'),
+                sympy.Symbol('gamma'))(qubits[0], qubits[1]))
+        ], [{'alpha': 0.123, 'beta': 0.456, 'gamma': 0.789}]
+
+        op_batch = [
+            [cirq.Z(qubits[0]), cirq.X(qubits[1])] for _ in range(batch_size)
+        ]
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        prev_grads = tf.ones([batch_size, len(op_batch[0])])
+
+        out = tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+            util.convert_to_tensor(circuit_batch),
+            tf.convert_to_tensor(symbol_names),
+            tf.convert_to_tensor(symbol_values_array),
+            util.convert_to_tensor(op_batch), prev_grads)
+
+        self.assertAllClose(out,
+                            np.array([[-2.3484, -1.7532, -1.64264]]),
+                            atol=1e-3)
+
+    def test_calculate_adj_grad_simple_case_single(self):
+        """Make sure the adjoint gradient works when gates share one symbol."""
+        n_qubits = 2
+        batch_size = 1
+        symbol_names = ['alpha', 'beta', 'gamma']
+        qubits = cirq.LineQubit.range(n_qubits)
+        circuit_batch, resolver_batch = \
+        [cirq.Circuit(cirq.X(qubits[0]) ** sympy.Symbol('alpha'),
+            cirq.Y(qubits[1]) ** sympy.Symbol('alpha'),
+            cirq.CNOT(qubits[0], qubits[1]),
+            cirq.FSimGate(
+                -0.56,
+                sympy.Symbol('alpha'))(qubits[0], qubits[1]))
+        ], [{'alpha': 0.123, 'beta': 0.456, 'gamma': 0.789}]
+
+        op_batch = [
+            [cirq.Z(qubits[0]), cirq.X(qubits[1])] for _ in range(batch_size)
+        ]
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        prev_grads = tf.ones([batch_size, len(op_batch[0])])
+
+        out = tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+            util.convert_to_tensor(circuit_batch),
+            tf.convert_to_tensor(symbol_names),
+            tf.convert_to_tensor(symbol_values_array),
+            util.convert_to_tensor(op_batch), prev_grads)
+
+        self.assertAllClose(out, np.array([[1.2993, 0, 0]]), atol=1e-3)
+
+
+if __name__ == "__main__":
+    tf.test.main()
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuquantum.cu.cc b/tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuquantum.cu.cc
index 6c9f03eb7..28fe5ee65 100644
--- a/tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuquantum.cu.cc
+++ b/tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuquantum.cu.cc
@@ -10,19 +10,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <memory>
-#include <vector>
+#include <custatevec.h>
 
 #include <chrono>
+#include <memory>
+#include <vector>
 
-#include "../cuquantum_libs/include/custatevec.h"
 #include "../qsim/lib/circuit.h"
 #include "../qsim/lib/gate_appl.h"
 #include "../qsim/lib/gates_cirq.h"
 #include "../qsim/lib/gates_qsim.h"
-#include "../qsim/lib/seqfor.h"
-#include "../qsim/lib/simulator_custatevec.h"
-#include "../qsim/lib/statespace_custatevec.h"
+#include "../qsim/lib/simmux_gpu.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -48,7 +46,17 @@ class TfqSimulateExpectationOpCuQuantum : public tensorflow::OpKernel {
  public:
   explicit TfqSimulateExpectationOpCuQuantum(
       tensorflow::OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    // Allocates handlers for initialization.
+    cublasCreate(&cublas_handle_);
+    custatevecCreate(&custatevec_handle_);
+  }
+
+  ~TfqSimulateExpectationOpCuQuantum() {
+    // Destroys handlers in sync with simulator lifetime.
+    cublasDestroy(cublas_handle_);
+    custatevecDestroy(custatevec_handle_);
+  }
 
   void Compute(tensorflow::OpKernelContext* context) override {
     // TODO (mbbrough): add more dimension checks for other inputs here.
@@ -93,7 +101,7 @@ class TfqSimulateExpectationOpCuQuantum : public tensorflow::OpKernel {
     std::vector<std::vector<qsim::GateFused<QsimGate>>> fused_circuits(
         programs.size(), std::vector<qsim::GateFused<QsimGate>>({}));
 
-    Status parse_status = Status::OK();
+    Status parse_status = ::tensorflow::Status();
     auto p_lock = tensorflow::mutex();
     auto construct_f = [&](int start, int end) {
       for (int i = start; i < end; i++) {
@@ -114,16 +122,8 @@ class TfqSimulateExpectationOpCuQuantum : public tensorflow::OpKernel {
       max_num_qubits = std::max(max_num_qubits, num);
     }
 
-    // create handles for simulator
-    cublasCreate(&cublas_handle_);
-    custatevecCreate(&custatevec_handle_);
-
     ComputeLarge(num_qubits, fused_circuits, pauli_sums, context,
                  &output_tensor);
-
-    // destroy handles in sync with simulator lifetime
-    cublasDestroy(cublas_handle_);
-    custatevecDestroy(custatevec_handle_);
   }
 
  private:
@@ -153,7 +153,7 @@ class TfqSimulateExpectationOpCuQuantum : public tensorflow::OpKernel {
     // Simulate programs one by one. Parallelizing over state vectors
     // we no longer parallelize over circuits. Each time we encounter a
     // a larger circuit we will grow the Statevector as necessary.
-    for (int i = 0; i < fused_circuits.size(); i++) {
+    for (size_t i = 0; i < fused_circuits.size(); i++) {
       int nq = num_qubits[i];
 
       if (nq > largest_nq) {
@@ -166,10 +166,10 @@ class TfqSimulateExpectationOpCuQuantum : public tensorflow::OpKernel {
       //  the state if there is a possibility that circuit[i] and
       //  circuit[i + 1] produce the same state.
       ss.SetStateZero(sv);
-      for (int j = 0; j < fused_circuits[i].size(); j++) {
+      for (size_t j = 0; j < fused_circuits[i].size(); j++) {
         qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
       }
-      for (int j = 0; j < pauli_sums[i].size(); j++) {
+      for (size_t j = 0; j < pauli_sums[i].size(); j++) {
         // (#679) Just ignore empty program
         if (fused_circuits[i].size() == 0) {
           (*output_tensor)(i, j) = -2.0;
@@ -214,7 +214,7 @@ REGISTER_OP("TfqSimulateExpectationCuquantum")
           c->Dim(pauli_sums_shape, 1);
       c->set_output(0, c->Matrix(output_rows, output_cols));
 
-      return tensorflow::Status::OK();
+      return ::tensorflow::Status();
     });
 
 }  // namespace tfq
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_ops_cuquantum.py b/tensorflow_quantum/core/ops/tfq_simulate_ops_cuquantum.py
index 785cce16c..27165e4d6 100644
--- a/tensorflow_quantum/core/ops/tfq_simulate_ops_cuquantum.py
+++ b/tensorflow_quantum/core/ops/tfq_simulate_ops_cuquantum.py
@@ -41,4 +41,94 @@ def tfq_simulate_expectation(programs, symbol_names, symbol_values, pauli_sums):
             (after resolving the corresponding parameters in).
     """
     return SIM_OP_MODULE.tfq_simulate_expectation_cuquantum(
-        programs, symbol_names, tf.cast(symbol_values, tf.float32), pauli_sums)
\ No newline at end of file
+        programs, symbol_names, tf.cast(symbol_values, tf.float32), pauli_sums)
+
+
+def tfq_simulate_state(programs, symbol_names, symbol_values):
+    """Returns the state of the programs using the C++ state vector simulator.
+
+    Simulate the final state of `programs` given `symbol_values` are placed
+    inside of the symbols with the name in `symbol_names` in each circuit.
+
+    Args:
+        programs: `tf.Tensor` of strings with shape [batch_size] containing
+            the string representations of the circuits to be executed.
+        symbol_names: `tf.Tensor` of strings with shape [n_params], which
+            is used to specify the order in which the values in
+            `symbol_values` should be placed inside of the circuits in
+            `programs`.
+        symbol_values: `tf.Tensor` of real numbers (cast to `tf.float32`)
+            with shape [batch_size, n_params] giving the parameter values to
+            resolve into the circuits specified by programs, following the
+            ordering dictated by `symbol_names`.
+    Returns:
+        A `tf.Tensor` containing the final state of each circuit in `programs`.
+    """
+    return SIM_OP_MODULE.tfq_simulate_state_cuquantum(
+        programs, symbol_names, tf.cast(symbol_values, tf.float32))
+
+
+def tfq_simulate_samples(programs, symbol_names, symbol_values, num_samples):
+    """Generate samples using the C++ state vector simulator.
+
+    Simulate the final state of `programs` given `symbol_values` are placed
+    inside of the symbols with the name in `symbol_names` in each circuit.
+    From there we will then sample from the final state using native TensorFlow
+    operations.
+
+    Args:
+        programs: `tf.Tensor` of strings with shape [batch_size] containing
+            the string representations of the circuits to be executed.
+        symbol_names: `tf.Tensor` of strings with shape [n_params], which
+            is used to specify the order in which the values in
+            `symbol_values` should be placed inside of the circuits in
+            `programs`.
+        symbol_values: `tf.Tensor` of real numbers (cast to `tf.float32`)
+            with shape [batch_size, n_params] giving the parameter values to
+            resolve into the circuits specified by programs, following the
+            ordering dictated by `symbol_names`.
+        num_samples: `tf.Tensor` with one element indicating the number of
+            samples to draw.
+    Returns:
+        A `tf.Tensor` containing the samples taken from each circuit in
+        `programs`.
+    """
+    return SIM_OP_MODULE.tfq_simulate_samples_cuquantum(
+        programs, symbol_names, tf.cast(symbol_values, tf.float32), num_samples)
+
+
+def tfq_simulate_sampled_expectation(programs, symbol_names, symbol_values,
+                                     pauli_sums, num_samples):
+    """Calculate the expectation value of circuits using samples.
+
+    Simulate the final state of `programs` given `symbol_values` are placed
+    inside of the symbols with the name in `symbol_names` in each circuit.
+    Then, sample the resulting state `num_samples` times and use these samples
+    to compute expectation values of the given `pauli_sums`.
+
+    Args:
+        programs: `tf.Tensor` of strings with shape [batch_size] containing
+            the string representations of the circuits to be executed.
+        symbol_names: `tf.Tensor` of strings with shape [n_params], which
+            is used to specify the order in which the values in
+            `symbol_values` should be placed inside of the circuits in
+            `programs`.
+        symbol_values: `tf.Tensor` of real numbers (cast to `tf.float32`)
+            with shape [batch_size, n_params] giving the parameter values to
+            resolve into the circuits specified by programs, following the
+            ordering dictated by `symbol_names`.
+        pauli_sums: `tf.Tensor` of strings with shape [batch_size, n_ops]
+            containing the string representation of the operators that will
+            be used on all of the circuits in the expectation calculations.
+        num_samples: `tf.Tensor` (cast to `tf.int32`) where
+            `num_samples[i][j]` is the number of samples to draw in each term
+            of `pauli_sums[i][j]` when estimating the expectation. Therefore,
+            `num_samples` must have the same shape as `pauli_sums`.
+    Returns:
+        `tf.Tensor` with shape [batch_size, n_ops] that holds the
+            expectation value for each circuit with each op applied to it
+            (after resolving the corresponding parameters in).
+    """
+    return SIM_OP_MODULE.tfq_simulate_sampled_expectation_cuquantum(
+        programs, symbol_names, tf.cast(symbol_values, tf.float32), pauli_sums,
+        tf.cast(num_samples, dtype=tf.int32))
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_ops_cuquantum_test.py b/tensorflow_quantum/core/ops/tfq_simulate_ops_cuquantum_test.py
new file mode 100644
index 000000000..f3854a3c8
--- /dev/null
+++ b/tensorflow_quantum/core/ops/tfq_simulate_ops_cuquantum_test.py
@@ -0,0 +1,918 @@
+# Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Tests that specifically target tfq_simulate_ops_cu*."""
+import time
+import numpy as np
+from absl.testing import parameterized
+import tensorflow as tf
+import cirq
+
+from tensorflow_quantum.core.ops import tfq_simulate_ops
+from tensorflow_quantum.core.ops import tfq_simulate_ops_cuquantum
+from tensorflow_quantum.python import util
+
+
+def measure_average_runtime(
+        fn,
+        tag,
+        num_samples=10,
+        result_avg=False,
+):
+    """Measures the average wall-clock runtime of `fn`.
+
+    Args:
+        fn: Zero-argument callable to time.
+        tag: Label printed in front of the measured average time.
+        num_samples: Number of times `fn` is invoked. Must be positive.
+        result_avg: If True, return the average over axis 0 (`np.average`)
+            of all collected results instead of the last result.
+
+    Returns:
+        A tuple `(avg_time, result)`. `avg_time` is the mean duration of one
+        call in seconds. `result` is the output of the final call or, when
+        `result_avg` is True, the average of all outputs.
+
+    Raises:
+        ValueError: If `num_samples` is not positive.
+    """
+    if num_samples <= 0:
+        # Without at least one call there is neither a result nor a mean.
+        raise ValueError(f"num_samples must be positive, got {num_samples}.")
+    durations = []
+    results = []
+    result = None
+    for _ in range(num_samples):
+        # perf_counter is monotonic, so the measurement cannot be skewed by
+        # system clock adjustments the way time.time() can.
+        begin_time = time.perf_counter()
+        result = fn()
+        durations.append(time.perf_counter() - begin_time)
+        if result_avg:
+            results.append(result)
+    avg_time = sum(durations) / float(num_samples)
+    print(f"\n\t{tag} time: {avg_time}\n")
+    if result_avg:
+        result = np.average(results, axis=0)
+    return avg_time, result
+
+
+class SimulateExpectationCuquantumTest(tf.test.TestCase):
+    """Tests the cuQuantum variant of tfq_simulate_expectation."""
+
+    def test_simulate_expectation_cpu_vs_cuquantum(self):
+        """CPU and cuQuantum expectation ops must agree within tolerance."""
+        symbol_names = ['alpha']
+        batch_size = 5
+        qubits = cirq.GridQubit.rect(1, 20)
+        circuit_batch, resolver_batch = (
+            util.random_symbol_circuit_resolver_batch(qubits, symbol_names,
+                                                      batch_size))
+        programs = util.convert_to_tensor(circuit_batch)
+
+        # One row of resolved parameter values per circuit in the batch.
+        symbol_values_array = np.array(
+            [[resolver[name] for name in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        ops = util.convert_to_tensor([[p_sum] for p_sum in pauli_sums])
+
+        def run_cpu():
+            return tfq_simulate_ops.tfq_simulate_expectation(
+                programs, symbol_names,
+                symbol_values_array.astype(np.float64), ops)
+
+        def run_cuquantum():
+            return tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                programs, symbol_names,
+                symbol_values_array.astype(np.float64), ops)
+
+        # Only the op outputs are compared; the timings are just printed.
+        _, res_cpu = measure_average_runtime(run_cpu,
+                                             "Expectation CPU",
+                                             num_samples=100)
+        _, res_cuquantum = measure_average_runtime(run_cuquantum,
+                                                   "Expectation cuQuantum",
+                                                   num_samples=100)
+
+        unsupported_gpu_hint = """
+        # If failed, the GPU architecture in this system may be unsupported.
+        # Please refer to the supported architectures here.
+        # https://docs.nvidia.com/cuda/cuquantum/getting_started.html#custatevec
+        """
+        # Both backends should produce the same values within the tolerance.
+        np.testing.assert_allclose(res_cpu,
+                                   res_cuquantum,
+                                   atol=1e-4,
+                                   err_msg=unsupported_gpu_hint)
+
+    def test_simulate_expectation_inputs(self):
+        """The expectation op must reject malformed inputs with clear errors."""
+        symbol_names = ['alpha']
+        batch_size = 5
+        qubits = cirq.GridQubit.rect(1, 5)
+        circuit_batch, resolver_batch = (
+            util.random_symbol_circuit_resolver_batch(qubits, symbol_names,
+                                                      batch_size))
+
+        symbol_values_array = np.array(
+            [[resolver[name] for name in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+
+        # Well-formed inputs shared by the cases below; each case swaps
+        # exactly one of them for a malformed value.
+        simulate = tfq_simulate_ops_cuquantum.tfq_simulate_expectation
+        invalid_arg = tf.errors.InvalidArgumentError
+        programs = util.convert_to_tensor(circuit_batch)
+        ops = util.convert_to_tensor([[p_sum] for p_sum in pauli_sums])
+        half_batch = batch_size // 2
+
+        # programs has too many dimensions.
+        with self.assertRaisesRegex(invalid_arg, 'programs must be rank 1'):
+            simulate(util.convert_to_tensor([circuit_batch]), symbol_names,
+                     symbol_values_array, ops)
+
+        # symbol_names has too many dimensions.
+        with self.assertRaisesRegex(invalid_arg,
+                                    'symbol_names must be rank 1.'):
+            simulate(programs, np.array([symbol_names]), symbol_values_array,
+                     ops)
+
+        # symbol_values has too many dimensions.
+        with self.assertRaisesRegex(invalid_arg,
+                                    'symbol_values must be rank 2.'):
+            simulate(programs, symbol_names, np.array([symbol_values_array]),
+                     ops)
+
+        # symbol_values has too few dimensions.
+        with self.assertRaisesRegex(invalid_arg,
+                                    'symbol_values must be rank 2.'):
+            simulate(programs, symbol_names, symbol_values_array[0], ops)
+
+        # pauli_sums has too few dimensions.
+        with self.assertRaisesRegex(invalid_arg, 'pauli_sums must be rank 2.'):
+            simulate(programs, symbol_names, symbol_values_array,
+                     util.convert_to_tensor(list(pauli_sums)))
+
+        # pauli_sums has too many dimensions.
+        with self.assertRaisesRegex(invalid_arg, 'pauli_sums must be rank 2.'):
+            simulate(programs, symbol_names, symbol_values_array,
+                     util.convert_to_tensor([[[p_sum]] for p_sum in pauli_sums
+                                            ]))
+
+        # programs has the right type but unparseable contents.
+        with self.assertRaisesRegex(invalid_arg, 'Unparseable proto'):
+            simulate(['junk'] * batch_size, symbol_names, symbol_values_array,
+                     ops)
+
+        # symbol_names has the right type but names an unknown symbol.
+        with self.assertRaisesRegex(invalid_arg,
+                                    'Could not find symbol in parameter map'):
+            simulate(programs, ['junk'], symbol_values_array, ops)
+
+        # pauli_sums act on qubits that are not part of the circuits.
+        with self.assertRaisesRegex(invalid_arg,
+                                    'qubits not found in circuit'):
+            new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)]
+            new_pauli_sums = util.random_pauli_sums(new_qubits, 2, batch_size)
+            simulate(programs, symbol_names, symbol_values_array,
+                     util.convert_to_tensor([[p_sum] for p_sum in new_pauli_sums
+                                            ]))
+
+        # pauli_sums has the right type but unparseable contents.
+        with self.assertRaisesRegex(invalid_arg, 'Unparseable proto'):
+            simulate(programs, symbol_names, symbol_values_array,
+                     [['junk']] * batch_size)
+
+        # programs has the wrong type.
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            simulate([1.0] * batch_size, symbol_names, symbol_values_array,
+                     ops)
+
+        # symbol_names has the wrong type.
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            simulate(programs, [0.1234], symbol_values_array, ops)
+
+        # symbol_values has the wrong type.
+        with self.assertRaisesRegex(tf.errors.UnimplementedError, ''):
+            simulate(programs, symbol_names, [['junk']] * batch_size, ops)
+
+        # pauli_sums has the wrong type.
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            simulate(programs, symbol_names, symbol_values_array,
+                     [[1.0]] * batch_size)
+
+        # One required argument is left out.
+        with self.assertRaisesRegex(TypeError, 'missing'):
+            # pylint: disable=no-value-for-parameter
+            simulate(programs, symbol_names, symbol_values_array)
+            # pylint: enable=no-value-for-parameter
+
+        # One argument too many is supplied.
+        with self.assertRaisesRegex(TypeError, 'positional arguments'):
+            # pylint: disable=too-many-function-args
+            simulate(programs, symbol_names, symbol_values_array, ops, [])
+
+        # pauli_sums batch dimension is smaller than the programs batch.
+        with self.assertRaisesRegex(invalid_arg, 'do not match'):
+            simulate(
+                programs, symbol_names, symbol_values_array,
+                util.convert_to_tensor([[p_sum] for p_sum in pauli_sums
+                                       ][:half_batch]))
+
+        # symbol_values batch dimension is smaller than the programs batch.
+        with self.assertRaisesRegex(invalid_arg, 'do not match'):
+            simulate(programs, symbol_names, symbol_values_array[:half_batch],
+                     ops)
+
+        # Noisy circuits are not accepted by this op.
+        with self.assertRaisesRegex(invalid_arg, 'cirq.Channel'):
+            noisy_circuit = cirq.Circuit(cirq.depolarize(0.3).on_each(*qubits))
+            simulate(
+                util.convert_to_tensor([noisy_circuit for _ in pauli_sums]),
+                symbol_names, symbol_values_array, ops)
+
+        # Valid call with float64 symbol values: the output dtype is float32.
+        empty_programs = util.convert_to_tensor(
+            [cirq.Circuit() for _ in pauli_sums])
+        res = simulate(empty_programs, symbol_names,
+                       symbol_values_array.astype(np.float64), ops)
+        self.assertDTypeEqual(res, np.float32)
+
+
+class SimulateSampledExpectationCuquantumTest(tf.test.TestCase):
+    """Tests the cuQuantum variant of tfq_simulate_sampled_expectation."""
+
+    def test_simulate_sampled_expectation_cpu_vs_cuquantum(self):
+        """CPU and cuQuantum sampled expectations must roughly agree."""
+        symbol_names = ['alpha']
+        batch_size = 5
+        n_samples = [[10000]] * batch_size
+        qubits = cirq.GridQubit.rect(1, 20)
+        circuit_batch, resolver_batch = (
+            util.random_symbol_circuit_resolver_batch(qubits, symbol_names,
+                                                      batch_size))
+        programs = util.convert_to_tensor(circuit_batch)
+
+        # One row of resolved parameter values per circuit in the batch.
+        symbol_values_array = np.array(
+            [[resolver[name] for name in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        ops = util.convert_to_tensor([[p_sum] for p_sum in pauli_sums])
+
+        def run_cpu():
+            return tfq_simulate_ops.tfq_simulate_sampled_expectation(
+                programs, symbol_names,
+                symbol_values_array.astype(np.float64), ops, n_samples)
+
+        def run_cuquantum():
+            return tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                programs, symbol_names,
+                symbol_values_array.astype(np.float64), ops, n_samples)
+
+        # Timings are only printed; no speed comparison is asserted here.
+        _, res_cpu = measure_average_runtime(run_cpu,
+                                             "SampledExpectation CPU",
+                                             num_samples=10,
+                                             result_avg=False)
+        _, res_cuquantum = measure_average_runtime(
+            run_cuquantum,
+            "SampledExpectation cuQuantum",
+            num_samples=10,
+            result_avg=False)
+
+        unsupported_gpu_hint = """
+        # If failed, the GPU architecture in this system may be unsupported.
+        # Please refer to the supported architectures here.
+        # https://docs.nvidia.com/cuda/cuquantum/getting_started.html#custatevec
+        """
+        # Sampling is stochastic, so a loose absolute tolerance is used.
+        np.testing.assert_allclose(res_cpu,
+                                   res_cuquantum,
+                                   atol=0.07,
+                                   err_msg=unsupported_gpu_hint)
+
+    def test_simulate_sampled_expectation_inputs(self):
+        """The sampled expectation op must reject malformed inputs."""
+        symbol_names = ['alpha']
+        batch_size = 5
+        qubits = cirq.GridQubit.rect(1, 5)
+        circuit_batch, resolver_batch = (
+            util.random_symbol_circuit_resolver_batch(qubits, symbol_names,
+                                                      batch_size))
+
+        symbol_values_array = np.array(
+            [[resolver[name] for name in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        num_samples = [[10]] * batch_size
+
+        # Well-formed inputs shared by the cases below; each case swaps
+        # exactly one of them for a malformed value.
+        simulate = tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation
+        invalid_arg = tf.errors.InvalidArgumentError
+        programs = util.convert_to_tensor(circuit_batch)
+        ops = util.convert_to_tensor([[p_sum] for p_sum in pauli_sums])
+        half_batch = batch_size // 2
+
+        # programs has too many dimensions.
+        with self.assertRaisesRegex(invalid_arg, 'programs must be rank 1'):
+            simulate(util.convert_to_tensor([circuit_batch]), symbol_names,
+                     symbol_values_array, ops, num_samples)
+
+        # symbol_names has too many dimensions.
+        with self.assertRaisesRegex(invalid_arg,
+                                    'symbol_names must be rank 1.'):
+            simulate(programs, np.array([symbol_names]), symbol_values_array,
+                     ops, num_samples)
+
+        # symbol_values has too many dimensions.
+        with self.assertRaisesRegex(invalid_arg,
+                                    'symbol_values must be rank 2.'):
+            simulate(programs, symbol_names, np.array([symbol_values_array]),
+                     ops, num_samples)
+
+        # symbol_values has too few dimensions.
+        with self.assertRaisesRegex(invalid_arg,
+                                    'symbol_values must be rank 2.'):
+            simulate(programs, symbol_names, symbol_values_array[0], ops,
+                     num_samples)
+
+        # pauli_sums has too few dimensions.
+        with self.assertRaisesRegex(invalid_arg, 'pauli_sums must be rank 2.'):
+            simulate(programs, symbol_names, symbol_values_array,
+                     util.convert_to_tensor(list(pauli_sums)), num_samples)
+
+        # pauli_sums has too many dimensions.
+        with self.assertRaisesRegex(invalid_arg, 'pauli_sums must be rank 2.'):
+            simulate(programs, symbol_names, symbol_values_array, [ops],
+                     num_samples)
+
+        # num_samples has too many dimensions.
+        with self.assertRaisesRegex(invalid_arg, 'num_samples must be rank 2'):
+            simulate(programs, symbol_names, symbol_values_array, ops,
+                     [num_samples])
+
+        # num_samples has too few dimensions.
+        with self.assertRaisesRegex(invalid_arg, 'num_samples must be rank 2'):
+            simulate(programs, symbol_names, symbol_values_array, ops,
+                     num_samples[0])
+
+        # programs has the right type but unparseable contents.
+        with self.assertRaisesRegex(invalid_arg, 'Unparseable proto'):
+            simulate(['junk'] * batch_size, symbol_names, symbol_values_array,
+                     ops, num_samples)
+
+        # symbol_names has the right type but names an unknown symbol.
+        with self.assertRaisesRegex(invalid_arg,
+                                    'Could not find symbol in parameter map'):
+            simulate(programs, ['junk'], symbol_values_array, ops, num_samples)
+
+        # pauli_sums act on qubits that are not part of the circuits.
+        with self.assertRaisesRegex(invalid_arg,
+                                    'qubits not found in circuit'):
+            new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)]
+            new_pauli_sums = util.random_pauli_sums(new_qubits, 2, batch_size)
+            simulate(
+                programs, symbol_names, symbol_values_array,
+                util.convert_to_tensor([[p_sum] for p_sum in new_pauli_sums]),
+                num_samples)
+
+        # pauli_sums has the right type but unparseable contents.
+        with self.assertRaisesRegex(invalid_arg, 'Unparseable proto'):
+            simulate(programs, symbol_names, symbol_values_array,
+                     [['junk']] * batch_size, num_samples)
+
+        # programs has the wrong type.
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            simulate([1.0] * batch_size, symbol_names, symbol_values_array,
+                     ops, num_samples)
+
+        # symbol_names has the wrong type.
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            simulate(programs, [0.1234], symbol_values_array, ops, num_samples)
+
+        # symbol_values has the wrong type.
+        with self.assertRaisesRegex(tf.errors.UnimplementedError, ''):
+            simulate(programs, symbol_names, [['junk']] * batch_size, ops,
+                     num_samples)
+
+        # pauli_sums has the wrong type.
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            simulate(programs, symbol_names, symbol_values_array,
+                     [[1.0]] * batch_size, num_samples)
+
+        # One required argument (pauli_sums) is left out.
+        with self.assertRaisesRegex(TypeError, 'missing'):
+            # pylint: disable=no-value-for-parameter
+            simulate(programs, symbol_names, symbol_values_array, num_samples)
+            # pylint: enable=no-value-for-parameter
+
+        # One argument too many is supplied.
+        with self.assertRaisesRegex(TypeError, 'positional arguments'):
+            # pylint: disable=too-many-function-args
+            simulate(programs, symbol_names, symbol_values_array, ops, [],
+                     num_samples)
+
+        # programs batch holds a single circuit while the others hold five.
+        with self.assertRaisesRegex(invalid_arg, 'do not match'):
+            simulate(util.convert_to_tensor([cirq.Circuit()]), symbol_names,
+                     symbol_values_array.astype(np.float64), ops, num_samples)
+
+        # Negative sample counts are rejected.
+        with self.assertRaisesRegex(invalid_arg, 'greater than 0'):
+            simulate(programs, symbol_names, symbol_values_array, ops,
+                     [[-1]] * batch_size)
+
+        # symbol_values batch dimension is smaller than the programs batch.
+        with self.assertRaisesRegex(invalid_arg, 'do not match'):
+            simulate(programs, symbol_names, symbol_values_array[:half_batch],
+                     ops, num_samples)
+
+        # Noisy circuits are not accepted by this op.
+        with self.assertRaisesRegex(invalid_arg, 'cirq.Channel'):
+            noisy_circuit = cirq.Circuit(cirq.depolarize(0.3).on_each(*qubits))
+            simulate(
+                util.convert_to_tensor([noisy_circuit for _ in pauli_sums]),
+                symbol_names, symbol_values_array, ops, num_samples)
+
+
+class SimulateSamplesCuquantumTest(tf.test.TestCase, parameterized.TestCase):
+    """Tests tfq_simulate_samples."""
+
+    def test_simulate_samples_cpu_vs_cuquantum(self):
+        """CPU and cuQuantum sampling ops must give similar sample averages."""
+        symbol_names = ['alpha']
+        batch_size = 5
+        n_samples = [100]
+        qubits = cirq.GridQubit.rect(1, 20)
+        circuit_batch, resolver_batch = (
+            util.random_symbol_circuit_resolver_batch(qubits, symbol_names,
+                                                      batch_size))
+        programs = util.convert_to_tensor(circuit_batch)
+
+        # One row of resolved parameter values per circuit in the batch.
+        symbol_values_array = np.array(
+            [[resolver[name] for name in symbol_names]
+             for resolver in resolver_batch])
+
+        def run_cpu():
+            return tfq_simulate_ops.tfq_simulate_samples(
+                programs, symbol_names,
+                symbol_values_array.astype(np.float64), n_samples)
+
+        def run_cuquantum():
+            return tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                programs, symbol_names,
+                symbol_values_array.astype(np.float64), n_samples)
+
+        # Timings are only printed; no speed comparison is asserted here.
+        _, samples_cpu = measure_average_runtime(run_cpu,
+                                                 "Samples CPU",
+                                                 num_samples=10,
+                                                 result_avg=False)
+        _, samples_cuquantum = measure_average_runtime(run_cuquantum,
+                                                       "Samples cuQuantum",
+                                                       num_samples=10,
+                                                       result_avg=False)
+
+        # Individual draws are random, so compare averages taken over axis 1.
+        mean_cpu = np.average(samples_cpu, axis=1)
+        mean_cuquantum = np.average(samples_cuquantum, axis=1)
+
+        unsupported_gpu_hint = """
+        # If failed, the GPU architecture in this system may be unsupported.
+        # Please refer to the supported architectures here.
+        # https://docs.nvidia.com/cuda/cuquantum/getting_started.html#custatevec
+        """
+        # The averages should be similar within a loose tolerance.
+        np.testing.assert_allclose(mean_cpu,
+                                   mean_cuquantum,
+                                   atol=0.3,
+                                   err_msg=unsupported_gpu_hint)
+
+    def test_simulate_samples_inputs(self):
+        """Make sure the sample op fails gracefully on bad inputs."""
+        n_qubits = 5
+        batch_size = 5
+        num_samples = 10
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'rank 1. Got rank 2'):
+            # programs tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor([circuit_batch]), symbol_names,
+                symbol_values_array, [num_samples])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'rank 1. Got rank 2'):
+            # symbol_names tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), np.array([symbol_names]),
+                symbol_values_array, [num_samples])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'rank 2. Got rank 3'):
+            # symbol_values tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                np.array([symbol_values_array]), [num_samples])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'rank 2. Got rank 1'):
+            # symbol_values tensor has the wrong shape 2.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array[0], [num_samples])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'rank 1. Got rank 2'):
+            # num_samples tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, [[num_samples]])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Unparseable proto'):
+            # programs tensor has the right type, but invalid value.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(\
+                                                  ['junk'] * batch_size,
+                                                  symbol_names,
+                                                  symbol_values_array,
+                                                  [num_samples])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Could not find symbol in parameter map'):
+            # symbol_names tensor has the right type, but invalid value.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), ['junk'],
+                symbol_values_array, [num_samples])
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # programs tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples([1] * batch_size,
+                                                            symbol_names,
+                                                            symbol_values_array,
+                                                            [num_samples])
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # symbol_names tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), [1], symbol_values_array,
+                [num_samples])
+
+        with self.assertRaisesRegex(tf.errors.UnimplementedError,
+                                    'Cast string to float is not supported'):
+            # symbol_values tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                [['junk']] * batch_size, [num_samples])
+
+        with self.assertRaisesRegex(Exception, 'junk'):
+            # num_samples tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, ['junk'])
+
+        with self.assertRaisesRegex(TypeError, 'missing'):
+            # too few tensors.
+            # pylint: disable=no-value-for-parameter
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array)
+            # pylint: enable=no-value-for-parameter
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='do not match'):
+            # wrong symbol_values size.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array[:int(batch_size * 0.5)], num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='cirq.Channel'):
+            # attempting to use noisy circuit.
+            noisy_circuit = cirq.Circuit(cirq.depolarize(0.3).on_each(*qubits))
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor([noisy_circuit for _ in circuit_batch]),
+                symbol_names, symbol_values_array, [num_samples])
+
+    @parameterized.parameters([
+        {
+            'all_n_qubits': [2, 3],
+            'n_samples': 10
+        },
+        {
+            'all_n_qubits': [1, 5, 8],
+            'n_samples': 10
+        },
+    ])
+    def test_sampling_output_padding(self, all_n_qubits, n_samples):
+        """Check that samples of narrower circuits are left-padded with -2."""
+        op = tfq_simulate_ops_cuquantum.tfq_simulate_samples
+        circuits = []
+        expected_outputs = []
+        for n_qubits in all_n_qubits:
+            this_expected_output = np.zeros((n_samples, max(all_n_qubits)))
+            this_expected_output[:, max(all_n_qubits) - n_qubits:] = 1
+            this_expected_output[:, :max(all_n_qubits) - n_qubits] = -2
+            expected_outputs.append(this_expected_output)
+            circuits.append(
+                cirq.Circuit(*cirq.X.on_each(
+                    *cirq.GridQubit.rect(1, n_qubits))))
+        results = op(util.convert_to_tensor(circuits), [], [[]] * len(circuits),
+                     [n_samples]).numpy()
+        self.assertAllClose(expected_outputs, results)
+
+
+class SimulateStateCuquantumTest(tf.test.TestCase, parameterized.TestCase):
+    """Tests tfq_simulate_state."""
+
+    def test_simulate_state_cpu_vs_cuquantum(self):
+        """Make sure that cpu & gpu(cuquantum) ops have the same results."""
+        n_qubits = 20
+        batch_size = 5
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        _, res_cpu = measure_average_runtime(
+            lambda: tfq_simulate_ops.tfq_simulate_state(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64)),
+            "State CPU",
+            num_samples=10,
+        )
+
+        _, res_cuquantum = measure_average_runtime(
+            lambda: tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64)),
+            "State cuQuantum",
+            num_samples=10,
+        )
+
+        # NOTE(review): the first return value (presumably the runtime) is unused.
+
+        # The CPU and cuQuantum results should match within a tolerance.
+        np.testing.assert_allclose(res_cpu,
+                                   res_cuquantum,
+                                   atol=1e-4,
+                                   err_msg="""
+        # If failed, the GPU architecture in this system may be unsupported.
+        # Please refer to the supported architectures here.
+        # https://docs.nvidia.com/cuda/cuquantum/getting_started.html#custatevec
+        """)
+
+    def test_simulate_state_inputs(self):
+        """Make sure the state op fails gracefully on bad inputs."""
+        n_qubits = 5
+        batch_size = 5
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'programs must be rank 1'):
+            # programs tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor([circuit_batch]), symbol_names,
+                symbol_values_array)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_names must be rank 1'):
+            # symbol_names tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), np.array([symbol_names]),
+                symbol_values_array)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2'):
+            # symbol_values tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                np.array([symbol_values_array]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2'):
+            # symbol_values tensor has the wrong shape 2.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array[0])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Unparseable proto'):
+            # programs tensor has the right type, but invalid value.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(['junk'] * batch_size,
+                                                          symbol_names,
+                                                          symbol_values_array)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Could not find symbol in parameter map'):
+            # symbol_names tensor has the right type, but invalid value.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), ['junk'],
+                symbol_values_array)
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # programs tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state([1] * batch_size,
+                                                          symbol_names,
+                                                          symbol_values_array)
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # symbol_names tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), [1], symbol_values_array)
+
+        with self.assertRaisesRegex(tf.errors.UnimplementedError, ''):
+            # symbol_values tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                [['junk']] * batch_size)
+
+        with self.assertRaisesRegex(TypeError, 'missing'):
+            # too few tensors.
+            # pylint: disable=no-value-for-parameter
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), symbol_names)
+            # pylint: enable=no-value-for-parameter
+
+        # TODO (mbbrough): determine if we should allow extra arguments ?
+        with self.assertRaisesRegex(TypeError, 'positional arguments'):
+            # pylint: disable=too-many-function-args
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, [])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='do not match'):
+            # wrong symbol_values size.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array[:int(batch_size * 0.5)])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='cirq.Channel'):
+            # attempting to use noisy circuit.
+            noisy_circuit = cirq.Circuit(cirq.depolarize(0.3).on_each(*qubits))
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor([noisy_circuit for _ in circuit_batch]),
+                symbol_names, symbol_values_array)
+
+    @parameterized.parameters([
+        {
+            'all_n_qubits': [2, 3]
+        },
+        {
+            'all_n_qubits': [1, 5, 8]
+        },
+    ])
+    def test_simulate_state_output_padding(self, all_n_qubits):
+        """If a tfq_simulate op is asked to simulate states given circuits
+        acting on different numbers of qubits, the op should return a tensor
+        padded with -2 up to the size of the largest circuit. The padding
+        should be physically correct, such that samples taken from the padded
+        states still match samples taken from the original circuit. """
+        circuit_batch = []
+        for n_qubits in all_n_qubits:
+            qubits = cirq.GridQubit.rect(1, n_qubits)
+            circuit_batch += util.random_circuit_resolver_batch(qubits, 1)[0]
+
+        tfq_results = tfq_simulate_ops_cuquantum.tfq_simulate_state(
+            util.convert_to_tensor(circuit_batch), [],
+            [[]] * len(circuit_batch))
+
+        # Don't use batch_util here to enforce consistent padding everywhere
+        # without extra tests.
+        sim = cirq.Simulator()
+        manual_padded_results = []
+        for circuit in circuit_batch:
+            result = sim.simulate(circuit)
+            wf = result.final_state_vector
+            blank_state = np.ones(
+                (2**max(all_n_qubits)), dtype=np.complex64) * -2
+            blank_state[:wf.shape[0]] = wf
+            manual_padded_results.append(blank_state)
+
+        self.assertAllClose(tfq_results, manual_padded_results, atol=1e-5)
+
+
+if __name__ == "__main__":
+    tf.test.main()  # Run every test case defined in this module.
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_sampled_expectation_op.cc b/tensorflow_quantum/core/ops/tfq_simulate_sampled_expectation_op.cc
index e0ed05a49..82abb74b9 100644
--- a/tensorflow_quantum/core/ops/tfq_simulate_sampled_expectation_op.cc
+++ b/tensorflow_quantum/core/ops/tfq_simulate_sampled_expectation_op.cc
@@ -49,7 +49,9 @@ class TfqSimulateSampledExpectationOp : public tensorflow::OpKernel {
  public:
   explicit TfqSimulateSampledExpectationOp(
       tensorflow::OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, random_gen_.Init(context));
+  }
 
   void Compute(tensorflow::OpKernelContext* context) override {
     // TODO (mbbrough): add more dimension checks for other inputs here.
@@ -141,6 +143,8 @@ class TfqSimulateSampledExpectationOp : public tensorflow::OpKernel {
   }
 
  private:
+  tensorflow::GuardedPhiloxRandom random_gen_;
+
   void ComputeLarge(
       const std::vector<int>& num_qubits,
       const std::vector<std::vector<qsim::GateFused<QsimGate>>>& fused_circuits,
@@ -160,22 +164,20 @@ class TfqSimulateSampledExpectationOp : public tensorflow::OpKernel {
     auto sv = ss.Create(largest_nq);
     auto scratch = ss.Create(largest_nq);
 
-    tensorflow::GuardedPhiloxRandom random_gen;
-    random_gen.Init(tensorflow::random::New64(), tensorflow::random::New64());
     int largest_sum = -1;
     for (const auto& sums : pauli_sums) {
       for (const auto& sum : sums) {
         largest_sum = std::max(largest_sum, sum.terms().size());
       }
     }
-    auto local_gen = random_gen.ReserveSamples32(
+    auto local_gen = random_gen_.ReserveSamples32(
         largest_sum * pauli_sums[0].size() * fused_circuits.size() + 1);
     tensorflow::random::SimplePhilox rand_source(&local_gen);
 
     // Simulate programs one by one. Parallelizing over state vectors
     // we no longer parallelize over circuits. Each time we encounter a
     // a larger circuit we will grow the Statevector as necessary.
-    for (int i = 0; i < fused_circuits.size(); i++) {
+    for (size_t i = 0; i < fused_circuits.size(); i++) {
       int nq = num_qubits[i];
 
       if (nq > largest_nq) {
@@ -188,10 +190,10 @@ class TfqSimulateSampledExpectationOp : public tensorflow::OpKernel {
       //  the state if there is a possibility that circuit[i] and
       //  circuit[i + 1] produce the same state.
       ss.SetStateZero(sv);
-      for (int j = 0; j < fused_circuits[i].size(); j++) {
+      for (size_t j = 0; j < fused_circuits[i].size(); j++) {
         qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
       }
-      for (int j = 0; j < pauli_sums[i].size(); j++) {
+      for (size_t j = 0; j < pauli_sums[i].size(); j++) {
         // (#679) Just ignore empty program
         if (fused_circuits[i].size() == 0) {
           (*output_tensor)(i, j) = -2.0;
@@ -219,8 +221,6 @@ class TfqSimulateSampledExpectationOp : public tensorflow::OpKernel {
 
     const int output_dim_op_size = output_tensor->dimension(1);
 
-    tensorflow::GuardedPhiloxRandom random_gen;
-    random_gen.Init(tensorflow::random::New64(), tensorflow::random::New64());
     int largest_sum = -1;
     for (const auto& sums : pauli_sums) {
       for (const auto& sum : sums) {
@@ -247,7 +247,7 @@ class TfqSimulateSampledExpectationOp : public tensorflow::OpKernel {
       int n_random = largest_sum * output_dim_op_size * fused_circuits.size();
       n_random /= num_threads;
       n_random += 1;
-      auto local_gen = random_gen.ReserveSamples32(n_random);
+      auto local_gen = random_gen_.ReserveSamples32(n_random);
       tensorflow::random::SimplePhilox rand_source(&local_gen);
 
       for (int i = start; i < end; i++) {
@@ -273,7 +273,7 @@ class TfqSimulateSampledExpectationOp : public tensorflow::OpKernel {
           // no need to update scratch_state since ComputeExpectation
           // will take care of things for us.
           ss.SetStateZero(sv);
-          for (int j = 0; j < fused_circuits[cur_batch_index].size(); j++) {
+          for (size_t j = 0; j < fused_circuits[cur_batch_index].size(); j++) {
             qsim::ApplyFusedGate(sim, fused_circuits[cur_batch_index][j], sv);
           }
         }
@@ -310,7 +310,10 @@ REGISTER_OP("TfqSimulateSampledExpectation")
     .Input("symbol_values: float")
     .Input("pauli_sums: string")
     .Input("num_samples: int32")
+    .SetIsStateful()
     .Output("expectations: float")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
     .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
       tensorflow::shape_inference::ShapeHandle programs_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape));
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuda.cu.cc b/tensorflow_quantum/core/ops/tfq_simulate_sampled_expectation_op_cuquantum.cu.cc
similarity index 64%
rename from tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuda.cu.cc
rename to tensorflow_quantum/core/ops/tfq_simulate_sampled_expectation_op_cuquantum.cu.cc
index 20e5bfe8c..5d4300fc5 100644
--- a/tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuda.cu.cc
+++ b/tensorflow_quantum/core/ops/tfq_simulate_sampled_expectation_op_cuquantum.cu.cc
@@ -1,8 +1,11 @@
 /* Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -10,25 +13,26 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <memory>
-#include <vector>
+#include <custatevec.h>
 
 #include <chrono>
+#include <memory>
+#include <vector>
 
 #include "../qsim/lib/circuit.h"
 #include "../qsim/lib/gate_appl.h"
 #include "../qsim/lib/gates_cirq.h"
-#include "../qsim/lib/gates_qsim.h"
-#include "../qsim/lib/seqfor.h"
-#include "../qsim/lib/simulator_cuda.h"
-#include "../qsim/lib/statespace_cuda.h"
+#include "../qsim/lib/simmux_gpu.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/util/guarded_philox_random.h"
 #include "tensorflow_quantum/core/ops/parse_context.h"
 #include "tensorflow_quantum/core/proto/pauli_sum.pb.h"
 #include "tensorflow_quantum/core/proto/program.pb.h"
@@ -43,18 +47,29 @@ using ::tfq::proto::Program;
 typedef qsim::Cirq::GateCirq<float> QsimGate;
 typedef qsim::Circuit<QsimGate> QsimCircuit;
 
-class TfqSimulateExpectationOpCuda : public tensorflow::OpKernel {
+class TfqSimulateSampledExpectationOpCuQuantum : public tensorflow::OpKernel {
  public:
-  explicit TfqSimulateExpectationOpCuda(
+  explicit TfqSimulateSampledExpectationOpCuQuantum(
       tensorflow::OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, random_gen_.Init(context));
+    // Creates the cuBLAS and cuStateVec handles owned by this kernel.
+    cublasCreate(&cublas_handle_);
+    custatevecCreate(&custatevec_handle_);
+  }
+
+  ~TfqSimulateSampledExpectationOpCuQuantum() {
+    // Destroys the cuBLAS and cuStateVec handles created in the constructor.
+    cublasDestroy(cublas_handle_);
+    custatevecDestroy(custatevec_handle_);
+  }
 
   void Compute(tensorflow::OpKernelContext* context) override {
     // TODO (mbbrough): add more dimension checks for other inputs here.
     const int num_inputs = context->num_inputs();
-    OP_REQUIRES(context, num_inputs == 4,
+    OP_REQUIRES(context, num_inputs == 5,
                 tensorflow::errors::InvalidArgument(absl::StrCat(
-                    "Expected 4 inputs, got ", num_inputs, " inputs.")));
+                    "Expected 5 inputs, got ", num_inputs, " inputs.")));
 
     // Create the output Tensor.
     const int output_dim_batch_size = context->input(0).dim_size(0);
@@ -64,13 +79,9 @@ class TfqSimulateExpectationOpCuda : public tensorflow::OpKernel {
     output_shape.AddDim(output_dim_op_size);
 
     tensorflow::Tensor* output = nullptr;
-    tensorflow::AllocatorAttributes alloc_attr;
-    alloc_attr.set_on_host(true);
-    alloc_attr.set_gpu_compatible(true);
-    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output,
-                                                     alloc_attr));
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
     auto output_tensor = output->matrix<float>();
-    // Parse program protos.
+
     std::vector<Program> programs;
     std::vector<int> num_qubits;
     std::vector<std::vector<PauliSum>> pauli_sums;
@@ -86,12 +97,28 @@ class TfqSimulateExpectationOpCuda : public tensorflow::OpKernel {
                     programs.size(), " circuits and ", maps.size(),
                     " symbol values.")));
 
+    std::vector<std::vector<int>> num_samples;
+    OP_REQUIRES_OK(context, GetNumSamples(context, &num_samples));
+
+    OP_REQUIRES(context, num_samples.size() == pauli_sums.size(),
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Dimension 0 of num_samples and pauli_sums do not match.",
+                    "Got ", num_samples.size(), " lists of sample sizes and ",
+                    pauli_sums.size(), " lists of pauli sums.")));
+
+    OP_REQUIRES(
+        context, context->input(4).dim_size(1) == context->input(3).dim_size(1),
+        tensorflow::errors::InvalidArgument(absl::StrCat(
+            "Dimension 1 of num_samples and pauli_sums do not match.", "Got ",
+            context->input(4).dim_size(1), " lists of sample sizes and ",
+            context->input(3).dim_size(1), " lists of pauli sums.")));
+
     // Construct qsim circuits.
     std::vector<QsimCircuit> qsim_circuits(programs.size(), QsimCircuit());
     std::vector<std::vector<qsim::GateFused<QsimGate>>> fused_circuits(
         programs.size(), std::vector<qsim::GateFused<QsimGate>>({}));
 
-    Status parse_status = Status();
+    Status parse_status = ::tensorflow::Status();
     auto p_lock = tensorflow::mutex();
     auto construct_f = [&](int start, int end) {
       for (int i = start; i < end; i++) {
@@ -111,35 +138,51 @@ class TfqSimulateExpectationOpCuda : public tensorflow::OpKernel {
     for (const int num : num_qubits) {
       max_num_qubits = std::max(max_num_qubits, num);
     }
-    ComputeLarge(num_qubits, fused_circuits, pauli_sums, context,
+
+    ComputeLarge(num_qubits, fused_circuits, pauli_sums, num_samples, context,
                  &output_tensor);
   }
 
  private:
-  int num_threads_in_sim_;
-  int block_count_;
+  cublasHandle_t cublas_handle_;
+  custatevecHandle_t custatevec_handle_;
+  tensorflow::GuardedPhiloxRandom random_gen_;
 
-  // Define the GPU implementation that launches the CUDA kernel.
   void ComputeLarge(
       const std::vector<int>& num_qubits,
       const std::vector<std::vector<qsim::GateFused<QsimGate>>>& fused_circuits,
       const std::vector<std::vector<PauliSum>>& pauli_sums,
+      const std::vector<std::vector<int>>& num_samples,
       tensorflow::OpKernelContext* context,
       tensorflow::TTypes<float, 1>::Matrix* output_tensor) {
     // Instantiate qsim objects.
-    using Simulator = qsim::SimulatorCUDA<float>;
+    using Simulator = qsim::SimulatorCuStateVec<float>;
     using StateSpace = Simulator::StateSpace;
-    // Begin simulation with default parameters.
+
+    // Begin simulation.
     int largest_nq = 1;
-    Simulator sim = Simulator();
-    StateSpace ss = StateSpace(StateSpace::Parameter());
+    Simulator sim = Simulator(cublas_handle_, custatevec_handle_);
+    StateSpace ss = StateSpace(cublas_handle_, custatevec_handle_);
     auto sv = ss.Create(largest_nq);
     auto scratch = ss.Create(largest_nq);
 
+    int largest_sum = 0;
+    for (const auto& sums : pauli_sums) {
+      for (const auto& sum : sums) {
+        largest_sum = std::max(largest_sum, sum.terms().size());
+      }
+    }
+    // If empty tensor is fed, just return.
+    if (fused_circuits.size() == 0) return;
+
+    auto local_gen = random_gen_.ReserveSamples32(
+        largest_sum * pauli_sums[0].size() * fused_circuits.size() + 1);
+    tensorflow::random::SimplePhilox rand_source(&local_gen);
+
     // Simulate programs one by one. Parallelizing over state vectors
     // we no longer parallelize over circuits. Each time we encounter a
     // a larger circuit we will grow the Statevector as necessary.
-    for (int i = 0; i < fused_circuits.size(); i++) {
+    for (size_t i = 0; i < fused_circuits.size(); i++) {
       int nq = num_qubits[i];
 
       if (nq > largest_nq) {
@@ -152,35 +195,39 @@ class TfqSimulateExpectationOpCuda : public tensorflow::OpKernel {
       //  the state if there is a possibility that circuit[i] and
       //  circuit[i + 1] produce the same state.
       ss.SetStateZero(sv);
-      for (int j = 0; j < fused_circuits[i].size(); j++) {
+      for (size_t j = 0; j < fused_circuits[i].size(); j++) {
         qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
       }
-      for (int j = 0; j < pauli_sums[i].size(); j++) {
+      for (size_t j = 0; j < pauli_sums[i].size(); j++) {
         // (#679) Just ignore empty program
         if (fused_circuits[i].size() == 0) {
           (*output_tensor)(i, j) = -2.0;
           continue;
         }
         float exp_v = 0.0;
-        OP_REQUIRES_OK(context,
-                       ComputeExpectationQsim(pauli_sums[i][j], sim, ss, sv,
-                                              scratch, &exp_v));
+        OP_REQUIRES_OK(context, ComputeSampledExpectationQsim(
+                                    pauli_sums[i][j], sim, ss, sv, scratch,
+                                    num_samples[i][j], rand_source, &exp_v));
         (*output_tensor)(i, j) = exp_v;
       }
     }
   }
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("TfqSimulateExpectationCuda").Device(tensorflow::DEVICE_CPU),
-    TfqSimulateExpectationOpCuda);
+REGISTER_KERNEL_BUILDER(Name("TfqSimulateSampledExpectationCuquantum")
+                            .Device(tensorflow::DEVICE_CPU),
+                        TfqSimulateSampledExpectationOpCuQuantum);
 
-REGISTER_OP("TfqSimulateExpectationCuda")
+REGISTER_OP("TfqSimulateSampledExpectationCuquantum")
     .Input("programs: string")
     .Input("symbol_names: string")
     .Input("symbol_values: float")
     .Input("pauli_sums: string")
+    .Input("num_samples: int32")
+    .SetIsStateful()
     .Output("expectations: float")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
     .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
       tensorflow::shape_inference::ShapeHandle programs_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape));
@@ -194,6 +241,9 @@ REGISTER_OP("TfqSimulateExpectationCuda")
       tensorflow::shape_inference::ShapeHandle pauli_sums_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &pauli_sums_shape));
 
+      tensorflow::shape_inference::ShapeHandle num_samples_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &num_samples_shape));
+
       tensorflow::shape_inference::DimensionHandle output_rows =
           c->Dim(programs_shape, 0);
       tensorflow::shape_inference::DimensionHandle output_cols =
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_samples_op.cc b/tensorflow_quantum/core/ops/tfq_simulate_samples_op.cc
index 0e68020e9..a5918ba27 100644
--- a/tensorflow_quantum/core/ops/tfq_simulate_samples_op.cc
+++ b/tensorflow_quantum/core/ops/tfq_simulate_samples_op.cc
@@ -48,7 +48,9 @@ typedef qsim::Circuit<QsimGate> QsimCircuit;
 class TfqSimulateSamplesOp : public tensorflow::OpKernel {
  public:
   explicit TfqSimulateSamplesOp(tensorflow::OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, random_gen_.Init(context));
+  }
 
   void Compute(tensorflow::OpKernelContext* context) override {
     // TODO (mbbrough): add more dimension checks for other inputs here.
@@ -129,6 +131,8 @@ class TfqSimulateSamplesOp : public tensorflow::OpKernel {
   }
 
  private:
+  tensorflow::GuardedPhiloxRandom random_gen_;
+
   void ComputeLarge(
       const std::vector<int>& num_qubits, const int max_num_qubits,
       const int num_samples,
@@ -146,15 +150,13 @@ class TfqSimulateSamplesOp : public tensorflow::OpKernel {
     StateSpace ss = StateSpace(tfq_for);
     auto sv = ss.Create(largest_nq);
 
-    tensorflow::GuardedPhiloxRandom random_gen;
-    random_gen.Init(tensorflow::random::New64(), tensorflow::random::New64());
-    auto local_gen = random_gen.ReserveSamples32(fused_circuits.size() + 1);
+    auto local_gen = random_gen_.ReserveSamples32(fused_circuits.size() + 1);
     tensorflow::random::SimplePhilox rand_source(&local_gen);
 
     // Simulate programs one by one. Parallelizing over state vectors
     // we no longer parallelize over circuits. Each time we encounter a
     // a larger circuit we will grow the Statevector as nescessary.
-    for (int i = 0; i < fused_circuits.size(); i++) {
+    for (size_t i = 0; i < fused_circuits.size(); i++) {
       int nq = num_qubits[i];
 
       if (nq > largest_nq) {
@@ -163,7 +165,7 @@ class TfqSimulateSamplesOp : public tensorflow::OpKernel {
         sv = ss.Create(largest_nq);
       }
       ss.SetStateZero(sv);
-      for (int j = 0; j < fused_circuits[i].size(); j++) {
+      for (size_t j = 0; j < fused_circuits[i].size(); j++) {
         qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
       }
 
@@ -198,16 +200,13 @@ class TfqSimulateSamplesOp : public tensorflow::OpKernel {
     using Simulator = qsim::Simulator<const qsim::SequentialFor&>;
     using StateSpace = Simulator::StateSpace;
 
-    tensorflow::GuardedPhiloxRandom random_gen;
-    random_gen.Init(tensorflow::random::New64(), tensorflow::random::New64());
-
     auto DoWork = [&](int start, int end) {
       int largest_nq = 1;
       Simulator sim = Simulator(tfq_for);
       StateSpace ss = StateSpace(tfq_for);
       auto sv = ss.Create(largest_nq);
 
-      auto local_gen = random_gen.ReserveSamples32(fused_circuits.size() + 1);
+      auto local_gen = random_gen_.ReserveSamples32(fused_circuits.size() + 1);
       tensorflow::random::SimplePhilox rand_source(&local_gen);
 
       for (int i = start; i < end; i++) {
@@ -219,7 +218,7 @@ class TfqSimulateSamplesOp : public tensorflow::OpKernel {
           sv = ss.Create(largest_nq);
         }
         ss.SetStateZero(sv);
-        for (int j = 0; j < fused_circuits[i].size(); j++) {
+        for (size_t j = 0; j < fused_circuits[i].size(); j++) {
           qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
         }
 
@@ -260,7 +259,10 @@ REGISTER_OP("TfqSimulateSamples")
     .Input("symbol_names: string")
     .Input("symbol_values: float")
     .Input("num_samples: int32")
+    .SetIsStateful()
     .Output("samples: int8")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
     .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
       tensorflow::shape_inference::ShapeHandle programs_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape));
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_samples_op_cuquantum.cu.cc b/tensorflow_quantum/core/ops/tfq_simulate_samples_op_cuquantum.cu.cc
new file mode 100644
index 000000000..3c4d8666c
--- /dev/null
+++ b/tensorflow_quantum/core/ops/tfq_simulate_samples_op_cuquantum.cu.cc
@@ -0,0 +1,232 @@
+/* Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <custatevec.h>
+#include <stdlib.h>
+
+#include <chrono>
+#include <string>
+
+#include "../qsim/lib/circuit.h"
+#include "../qsim/lib/gate_appl.h"
+#include "../qsim/lib/gates_cirq.h"
+#include "../qsim/lib/simmux_gpu.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/util/guarded_philox_random.h"
+#include "tensorflow_quantum/core/ops/parse_context.h"
+#include "tensorflow_quantum/core/proto/program.pb.h"
+#include "tensorflow_quantum/core/src/circuit_parser_qsim.h"
+#include "tensorflow_quantum/core/src/util_qsim.h"
+
+namespace tfq {
+
+using ::tensorflow::Status;
+using ::tfq::proto::Program;
+
+typedef qsim::Cirq::GateCirq<float> QsimGate;
+typedef qsim::Circuit<QsimGate> QsimCircuit;
+
+class TfqSimulateSamplesOpCuQuantum : public tensorflow::OpKernel {
+ public:
+  explicit TfqSimulateSamplesOpCuQuantum(
+      tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, random_gen_.Init(context));
+    // Creates the cuBLAS and cuStateVec handles used by the simulator.
+    cublasCreate(&cublas_handle_);
+    custatevecCreate(&custatevec_handle_);
+  }
+
+  ~TfqSimulateSamplesOpCuQuantum() {
+    // Releases the handles created in the constructor.
+    cublasDestroy(cublas_handle_);
+    custatevecDestroy(custatevec_handle_);
+  }
+
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // TODO (mbbrough): add more dimension checks for other inputs here.
+    DCHECK_EQ(4, context->num_inputs());
+
+    // Parse to Program Proto and num_qubits.
+    std::vector<Program> programs;
+    std::vector<int> num_qubits;
+    OP_REQUIRES_OK(context,
+                   GetProgramsAndNumQubits(context, &programs, &num_qubits));
+
+    // Parse symbol maps for parameter resolution in the circuits.
+    std::vector<SymbolMap> maps;
+    OP_REQUIRES_OK(context, GetSymbolMaps(context, &maps));
+    OP_REQUIRES(
+        context, maps.size() == programs.size(),
+        tensorflow::errors::InvalidArgument(absl::StrCat(
+            "Number of circuits and values do not match. Got ", programs.size(),
+            " circuits and ", maps.size(), " values.")));
+
+    int num_samples = 0;
+    OP_REQUIRES_OK(context, GetIndividualSample(context, &num_samples));
+
+    // Construct qsim circuits.
+    std::vector<QsimCircuit> qsim_circuits(programs.size(), QsimCircuit());
+    std::vector<std::vector<qsim::GateFused<QsimGate>>> fused_circuits(
+        programs.size(), std::vector<qsim::GateFused<QsimGate>>({}));
+
+    Status parse_status = ::tensorflow::Status();
+    auto p_lock = tensorflow::mutex();
+    auto construct_f = [&](int start, int end) {
+      for (int i = start; i < end; i++) {
+        Status local =
+            QsimCircuitFromProgram(programs[i], maps[i], num_qubits[i],
+                                   &qsim_circuits[i], &fused_circuits[i]);
+        NESTED_FN_STATUS_SYNC(parse_status, local, p_lock);
+      }
+    };
+
+    const int num_cycles = 1000;
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        programs.size(), num_cycles, construct_f);
+    OP_REQUIRES_OK(context, parse_status);
+
+    // Find largest circuit for tensor size padding and allocate
+    // the output tensor.
+    int max_num_qubits = 0;
+    for (const int num : num_qubits) {
+      max_num_qubits = std::max(max_num_qubits, num);
+    }
+
+    const int output_dim_size = maps.size();
+    tensorflow::TensorShape output_shape;
+    output_shape.AddDim(output_dim_size);
+    output_shape.AddDim(num_samples);
+    output_shape.AddDim(max_num_qubits);
+
+    tensorflow::Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+    auto output_tensor = output->tensor<int8_t, 3>();
+
+    if (num_samples == 0) {
+      return;  // bug in qsim dependency we can't control.
+    }
+
+    ComputeLarge(num_qubits, max_num_qubits, num_samples, fused_circuits,
+                 context, &output_tensor);
+  }
+
+ private:
+  cublasHandle_t cublas_handle_;
+  custatevecHandle_t custatevec_handle_;
+  tensorflow::GuardedPhiloxRandom random_gen_;
+
+  void ComputeLarge(
+      const std::vector<int>& num_qubits, const int max_num_qubits,
+      const int num_samples,
+      const std::vector<std::vector<qsim::GateFused<QsimGate>>>& fused_circuits,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes<int8_t, 3>::Tensor* output_tensor) {
+    // Instantiate qsim objects.
+    using Simulator = qsim::SimulatorCuStateVec<float>;
+    using StateSpace = Simulator::StateSpace;
+
+    // Begin simulation.
+    int largest_nq = 1;
+    Simulator sim = Simulator(cublas_handle_, custatevec_handle_);
+    StateSpace ss = StateSpace(cublas_handle_, custatevec_handle_);
+    auto sv = ss.Create(largest_nq);
+
+    auto local_gen = random_gen_.ReserveSamples32(fused_circuits.size() + 1);
+    tensorflow::random::SimplePhilox rand_source(&local_gen);
+
+    // Simulate programs one by one. Since we parallelize over state vectors,
+    // we no longer parallelize over circuits. Each time we encounter a
+    // larger circuit we will grow the state vector as necessary.
+    for (size_t i = 0; i < fused_circuits.size(); i++) {
+      int nq = num_qubits[i];
+
+      if (nq > largest_nq) {
+        // need to switch to larger statespace.
+        largest_nq = nq;
+        sv = ss.Create(largest_nq);
+      }
+      ss.SetStateZero(sv);
+      for (size_t j = 0; j < fused_circuits[i].size(); j++) {
+        qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
+      }
+
+      auto samples = ss.Sample(sv, num_samples, rand_source.Rand32());
+      for (int j = 0; j < num_samples; j++) {
+        uint64_t q_ind = 0;
+        uint64_t mask = 1;
+        bool val = 0;
+        while (q_ind < nq) {
+          val = samples[j] & mask;
+          (*output_tensor)(
+              i, j, static_cast<ptrdiff_t>(max_num_qubits - q_ind - 1)) = val;
+          q_ind++;
+          mask <<= 1;
+        }
+        while (q_ind < max_num_qubits) {
+          (*output_tensor)(
+              i, j, static_cast<ptrdiff_t>(max_num_qubits - q_ind - 1)) = -2;
+          q_ind++;
+        }
+      }
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("TfqSimulateSamplesCuquantum").Device(tensorflow::DEVICE_CPU),
+    TfqSimulateSamplesOpCuQuantum);
+
+REGISTER_OP("TfqSimulateSamplesCuquantum")
+    .Input("programs: string")
+    .Input("symbol_names: string")
+    .Input("symbol_values: float")
+    .Input("num_samples: int32")
+    .SetIsStateful()
+    .Output("samples: int8")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      tensorflow::shape_inference::ShapeHandle programs_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_names_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &symbol_names_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_values_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &symbol_values_shape));
+
+      tensorflow::shape_inference::ShapeHandle num_samples_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &num_samples_shape));
+
+      // [batch_size, n_samples, largest_n_qubits]
+      c->set_output(
+          0, c->MakeShape(
+                 {c->Dim(programs_shape, 0),
+                  tensorflow::shape_inference::InferenceContext::kUnknownDim,
+                  tensorflow::shape_inference::InferenceContext::kUnknownDim}));
+
+      return ::tensorflow::Status();
+    });
+
+}  // namespace tfq
\ No newline at end of file
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_state_op_cuquantum.cu.cc b/tensorflow_quantum/core/ops/tfq_simulate_state_op_cuquantum.cu.cc
new file mode 100644
index 000000000..0ad5feb2d
--- /dev/null
+++ b/tensorflow_quantum/core/ops/tfq_simulate_state_op_cuquantum.cu.cc
@@ -0,0 +1,217 @@
+/* Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <custatevec.h>
+
+#include <chrono>
+#include <string>
+#include <vector>
+
+#include "../qsim/lib/circuit.h"
+#include "../qsim/lib/gate_appl.h"
+#include "../qsim/lib/gates_cirq.h"
+#include "../qsim/lib/simmux_gpu.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow_quantum/core/ops/parse_context.h"
+#include "tensorflow_quantum/core/proto/program.pb.h"
+#include "tensorflow_quantum/core/src/circuit_parser_qsim.h"
+#include "tensorflow_quantum/core/src/util_qsim.h"
+
+namespace tfq {
+
+using ::tensorflow::Status;
+using ::tfq::proto::Program;
+
+typedef qsim::Cirq::GateCirq<float> QsimGate;
+typedef qsim::Circuit<QsimGate> QsimCircuit;
+
+class TfqSimulateStateOpCuQuantum : public tensorflow::OpKernel {
+ public:
+  explicit TfqSimulateStateOpCuQuantum(
+      tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {
+    // Creates the cuBLAS and cuStateVec handles used by the simulator.
+    cublasCreate(&cublas_handle_);
+    custatevecCreate(&custatevec_handle_);
+  }
+
+  ~TfqSimulateStateOpCuQuantum() {
+    // Releases the handles created in the constructor.
+    cublasDestroy(cublas_handle_);
+    custatevecDestroy(custatevec_handle_);
+  }
+
+  // Simulates each input circuit and outputs its padded state vector.
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // TODO (mbbrough): add more dimension checks for other inputs here.
+    DCHECK_EQ(3, context->num_inputs());
+
+    // Parse to Program Proto and num_qubits.
+    std::vector<Program> programs;
+    std::vector<int> num_qubits;
+    OP_REQUIRES_OK(context,
+                   GetProgramsAndNumQubits(context, &programs, &num_qubits));
+
+    // Parse symbol maps for parameter resolution in the circuits.
+    std::vector<SymbolMap> maps;
+    OP_REQUIRES_OK(context, GetSymbolMaps(context, &maps));
+    OP_REQUIRES(
+        context, maps.size() == programs.size(),
+        tensorflow::errors::InvalidArgument(absl::StrCat(
+            "Number of circuits and values do not match. Got ", programs.size(),
+            " circuits and ", maps.size(), " values.")));
+
+    // Construct qsim circuits.
+    std::vector<QsimCircuit> qsim_circuits(programs.size(), QsimCircuit());
+    std::vector<std::vector<qsim::GateFused<QsimGate>>> fused_circuits(
+        programs.size(), std::vector<qsim::GateFused<QsimGate>>({}));
+
+    Status parse_status = ::tensorflow::Status();
+    auto p_lock = tensorflow::mutex();
+    auto construct_f = [&](int start, int end) {
+      for (int i = start; i < end; i++) {
+        Status local =
+            QsimCircuitFromProgram(programs[i], maps[i], num_qubits[i],
+                                   &qsim_circuits[i], &fused_circuits[i]);
+        NESTED_FN_STATUS_SYNC(parse_status, local, p_lock);
+      }
+    };
+
+    const int num_cycles = 1000;
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        programs.size(), num_cycles, construct_f);
+    OP_REQUIRES_OK(context, parse_status);
+
+    // Find the largest circuit to size (and pad) the output tensor.
+    int max_num_qubits = 0;
+    for (const int num : num_qubits) {
+      max_num_qubits = std::max(max_num_qubits, num);
+    }
+
+    const int output_dim_size = maps.size();
+    tensorflow::TensorShape output_shape;
+    output_shape.AddDim(output_dim_size);
+    // Shift in 64 bits: `1 << n` overflows a 32-bit int once n >= 31.
+    output_shape.AddDim(int64_t(1) << max_num_qubits);
+
+    tensorflow::Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+    auto output_tensor = output->matrix<std::complex<float>>();
+
+    ComputeLarge(num_qubits, max_num_qubits, fused_circuits, context,
+                 &output_tensor);
+  }
+
+ private:
+  cublasHandle_t cublas_handle_;
+  custatevecHandle_t custatevec_handle_;
+
+  void ComputeLarge(
+      const std::vector<int>& num_qubits, const int max_num_qubits,
+      const std::vector<std::vector<qsim::GateFused<QsimGate>>>& fused_circuits,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes<std::complex<float>, 1>::Matrix* output_tensor) {
+    // Instantiate qsim objects.
+    using Simulator = qsim::SimulatorCuStateVec<float>;
+    using StateSpace = Simulator::StateSpace;
+
+    // Begin simulation.
+    Simulator sim = Simulator(cublas_handle_, custatevec_handle_);
+    StateSpace ss = StateSpace(cublas_handle_, custatevec_handle_);
+    // Start from a one-qubit state; it is regrown for larger circuits.
+    int largest_nq = 1;
+    auto sv = ss.Create(largest_nq);
+    std::vector<float> sv_host;
+    sv_host.resize(2 * (uint64_t(1) << largest_nq));
+
+    // Simulate programs one by one. Since we parallelize over state vectors,
+    // we no longer parallelize over circuits. Each time we encounter a
+    // larger circuit we will grow the state vector as necessary.
+    for (size_t i = 0; i < fused_circuits.size(); i++) {
+      int nq = num_qubits[i];
+
+      if (nq > largest_nq) {
+        // need to switch to larger statespace.
+        largest_nq = nq;
+        sv = ss.Create(largest_nq);
+        sv_host.resize(2 * (uint64_t(1) << largest_nq));
+      }
+      ss.SetStateZero(sv);
+      for (size_t j = 0; j < fused_circuits[i].size(); j++) {
+        qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
+      }
+
+      // Copy the whole state vector from GPU to CPU memory in one transfer.
+      // Do not use ss.GetAmpl() here: it copies amplitudes one at a time,
+      // which is a huge slowdown and ends up even slower than the CPU op.
+      ss.Copy(sv, sv_host.data());
+      // Parallel copy state vector information from qsim into tensorflow
+      // tensors. We need type conversions from 2 floats to std::complex.
+      // Each shard writes only its own [start, end) slice of row i: amplitudes
+      // below `crossover` come from sv_host, the rest is padded with -2.
+      auto copy_f = [i, nq, &output_tensor, &sv_host](uint64_t start,
+                                                      uint64_t end) {
+        const uint64_t crossover = uint64_t(1) << nq;
+        const uint64_t upper = std::min(end, crossover);
+        // Begin at `start`, not 0, so shards never rewrite each other's slices.
+        for (uint64_t j = start; j < upper; j++) {
+          (*output_tensor)(i, j) =
+              std::complex<float>(sv_host[2 * j], sv_host[2 * j + 1]);
+        }
+        for (uint64_t j = std::max(start, crossover); j < end; j++) {
+          (*output_tensor)(i, j) = std::complex<float>(-2, 0);
+        }
+      };
+      const int num_cycles_copy = 50;
+      context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+          uint64_t(1) << max_num_qubits, num_cycles_copy, copy_f);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("TfqSimulateStateCuquantum").Device(tensorflow::DEVICE_CPU),
+    TfqSimulateStateOpCuQuantum);
+
+REGISTER_OP("TfqSimulateStateCuquantum")
+    .Input("programs: string")
+    .Input("symbol_names: string")
+    .Input("symbol_values: float")
+    .Output("state_vector: complex64")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      tensorflow::shape_inference::ShapeHandle programs_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_names_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &symbol_names_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_values_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &symbol_values_shape));
+
+      c->set_output(
+          0, c->MakeShape(
+                 {c->Dim(programs_shape, 0),
+                  tensorflow::shape_inference::InferenceContext::kUnknownDim}));
+
+      return ::tensorflow::Status();
+    });
+
+}  // namespace tfq
\ No newline at end of file
diff --git a/tensorflow_quantum/core/serialize/op_deserializer_test.py b/tensorflow_quantum/core/serialize/op_deserializer_test.py
index 634561d2b..466e32200 100644
--- a/tensorflow_quantum/core/serialize/op_deserializer_test.py
+++ b/tensorflow_quantum/core/serialize/op_deserializer_test.py
@@ -39,7 +39,7 @@ def op_proto(json_dict):
 
 
 @cirq.value_equality
-class GateWithAttribute(cirq.SingleQubitGate):
+class GateWithAttribute(cirq.testing.SingleQubitGate):
     """GateAttribute helper class."""
 
     def __init__(self, val, not_req=None):
diff --git a/tensorflow_quantum/core/serialize/op_serializer_test.py b/tensorflow_quantum/core/serialize/op_serializer_test.py
index 432dd117f..d0e4bc80b 100644
--- a/tensorflow_quantum/core/serialize/op_serializer_test.py
+++ b/tensorflow_quantum/core/serialize/op_serializer_test.py
@@ -39,14 +39,14 @@ def op_proto(json):
     return op
 
 
-class GateWithAttribute(cirq.SingleQubitGate):
+class GateWithAttribute(cirq.testing.SingleQubitGate):
     """GateAttribute helper class."""
 
     def __init__(self, val):
         self.val = val
 
 
-class GateWithProperty(cirq.SingleQubitGate):
+class GateWithProperty(cirq.testing.SingleQubitGate):
     """GateProperty helper class."""
 
     def __init__(self, val, not_req=None):
@@ -59,7 +59,7 @@ def val(self):
         return self._val
 
 
-class GateWithMethod(cirq.SingleQubitGate):
+class GateWithMethod(cirq.testing.SingleQubitGate):
     """GateMethod helper class."""
 
     def __init__(self, val):
diff --git a/tensorflow_quantum/python/differentiators/BUILD b/tensorflow_quantum/python/differentiators/BUILD
index 9e5f28aab..33103e4e7 100644
--- a/tensorflow_quantum/python/differentiators/BUILD
+++ b/tensorflow_quantum/python/differentiators/BUILD
@@ -1,3 +1,5 @@
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+
 package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])
@@ -25,7 +27,9 @@ py_library(
     deps = [
         ":differentiator",
         "//tensorflow_quantum/core/ops:tfq_adj_grad_op_py",
-    ],
+    ] + if_cuda_is_configured([
+        "//tensorflow_quantum/core/ops:tfq_adj_grad_op_cuquantum_py",
+    ]),
 )
 
 py_test(
@@ -35,6 +39,7 @@ py_test(
     deps = [
         ":adjoint",
         "//tensorflow_quantum/core/ops:circuit_execution_ops",
+        "//tensorflow_quantum/python:util",
     ],
 )
 
@@ -118,6 +123,7 @@ py_test(
 py_test(
     name = "gradient_test",
     timeout = "eternal",
+    shard_count = 5,
     srcs = ["gradient_test.py"],
     python_version = "PY3",
     deps = [
diff --git a/tensorflow_quantum/python/differentiators/adjoint.py b/tensorflow_quantum/python/differentiators/adjoint.py
index 44b8e9da6..57ccd304b 100644
--- a/tensorflow_quantum/python/differentiators/adjoint.py
+++ b/tensorflow_quantum/python/differentiators/adjoint.py
@@ -16,6 +16,13 @@
 import tensorflow as tf
 
 from tensorflow_quantum.core.ops import tfq_adj_grad_op
+try:
+    from tensorflow_quantum.core.ops import tfq_adj_grad_op_cuquantum
+    _ENABLE_USE_CUQUANTUM = True
+except Exception:  # cuQuantum op missing or failed to load; use CPU op.
+    _ENABLE_USE_CUQUANTUM = False
+    tfq_adj_grad_op_cuquantum = tfq_adj_grad_op
+
 from tensorflow_quantum.python.differentiators import differentiator
 
 
@@ -32,9 +39,10 @@ class Adjoint(differentiator.Differentiator):
     https://academic.oup.com/gji/article-pdf/167/2/495/1492368/167-2-495.pdf).
     The Adjoint method differentiates the input circuits in roughly one forward
     and backward pass over the circuits, to calculate the gradient of
-    a symbol only a constant number of gate operations need to be applied to the
-    circuits state. When the number of parameters in a circuit is very large,
-    this differentiator performs much better than all the others found in TFQ.
+    a symbol only a constant number of gate operations need to be applied to
+    the circuits state. When the number of parameters in a circuit is very
+    large, this differentiator performs much better than all the others found
+    in TFQ.
 
 
     >>> my_op = tfq.get_expectation_op()
@@ -62,7 +70,11 @@ class Adjoint(differentiator.Differentiator):
 
     """
 
-    def generate_differentiable_op(self, *, sampled_op=None, analytic_op=None):
+    def generate_differentiable_op(self,
+                                   *,
+                                   sampled_op=None,
+                                   analytic_op=None,
+                                   use_cuquantum=False):
         """Generate a differentiable op by attaching self to an op.
 
         See `tfq.differentiators.Differentiator`. This has been partially
@@ -75,6 +87,8 @@ def generate_differentiable_op(self, *, sampled_op=None, analytic_op=None):
                 using this differentiator's `differentiate_sampled` method.
             analytic_op: A `callable` op that you want to make differentiable
                 using this differentiators `differentiate_analytic` method.
+            use_cuquantum: A `bool` indicating whether to use the cuQuantum
+                version of the adjoint gradient op.
 
         Returns:
             A `callable` op that who's gradients are now registered to be
@@ -85,8 +99,10 @@ def generate_differentiable_op(self, *, sampled_op=None, analytic_op=None):
             raise ValueError("sample base backends are not supported by the "
                              "Adjoint method, please use analytic expectation"
                              " or choose another differentiator.")
+        use_cuquantum = _ENABLE_USE_CUQUANTUM and use_cuquantum
 
-        return super().generate_differentiable_op(analytic_op=analytic_op)
+        return super().generate_differentiable_op(analytic_op=analytic_op,
+                                                  use_cuquantum=use_cuquantum)
 
     @tf.function
     def get_gradient_circuits(self, programs, symbol_names, symbol_values):
@@ -97,13 +113,60 @@ def get_gradient_circuits(self, programs, symbol_names, symbol_values):
 
     @differentiator.catch_empty_inputs
     @tf.function
-    def differentiate_analytic(self, programs, symbol_names, symbol_values,
-                               pauli_sums, forward_pass_vals, grad):
+    def differentiate_analytic_cuquantum(
+            self,
+            programs,
+            symbol_names,
+            symbol_values,
+            pauli_sums,
+            forward_pass_vals,
+            grad,
+    ):
+        """Returns cuquantum adjoint gradient op result."""
+        return tfq_adj_grad_op_cuquantum.tfq_adj_grad(programs, symbol_names,
+                                                      symbol_values, pauli_sums,
+                                                      grad)
+
+    @differentiator.catch_empty_inputs
+    @tf.function
+    def differentiate_analytic(
+            self,
+            programs,
+            symbol_names,
+            symbol_values,
+            pauli_sums,
+            forward_pass_vals,
+            grad,
+    ):
+        """Returns cpu adjoint gradient op result."""
         return tfq_adj_grad_op.tfq_adj_grad(programs, symbol_names,
                                             symbol_values, pauli_sums, grad)
 
-    def differentiate_sampled(self, programs, symbol_names, symbol_values,
-                              pauli_sums, num_samples, forward_pass_vals, grad):
+    def differentiate_sampled_cuquantum(
+            self,
+            programs,
+            symbol_names,
+            symbol_values,
+            pauli_sums,
+            num_samples,
+            forward_pass_vals,
+            grad,
+    ):
+        raise NotImplementedError(
+            "Adjoint state methods are not supported in sample based settings."
+            " Please use analytic expectation calculation or a different "
+            "tfq.differentiator.")
+
+    def differentiate_sampled(
+            self,
+            programs,
+            symbol_names,
+            symbol_values,
+            pauli_sums,
+            num_samples,
+            forward_pass_vals,
+            grad,
+    ):
         raise NotImplementedError(
             "Adjoint state methods are not supported in sample based settings."
             " Please use analytic expectation calculation or a different "
diff --git a/tensorflow_quantum/python/differentiators/adjoint_test.py b/tensorflow_quantum/python/differentiators/adjoint_test.py
index 640a87d30..9cb5c30a9 100644
--- a/tensorflow_quantum/python/differentiators/adjoint_test.py
+++ b/tensorflow_quantum/python/differentiators/adjoint_test.py
@@ -20,20 +20,65 @@
 NEW_PATH = [x for x in sys.path if 'com_google_protobuf' not in x]
 sys.path = NEW_PATH
 # pylint: enable=wrong-import-position
+from unittest import mock
 
+from absl.testing import parameterized
+import cirq
+import numpy as np
+import sympy
 import tensorflow as tf
 
-from tensorflow_quantum.python.differentiators import adjoint
 from tensorflow_quantum.core.ops import circuit_execution_ops
+from tensorflow_quantum.python import util
+from tensorflow_quantum.python.differentiators import adjoint
 
 
-class AdjointTest(tf.test.TestCase):
+class AdjointTest(tf.test.TestCase, parameterized.TestCase):
     """Test that we can properly subclass differentiator."""
 
     def test_instantiation(self):
         """Test that adjoint can be created."""
         adjoint.Adjoint()
 
+    @parameterized.parameters(
+        list(util.kwargs_cartesian_product(**{
+            'use_cuquantum': [False, True],
+        })))
+    def test_use_cuquantum(self, use_cuquantum):
+        """Ensure that use_cuquantum switches to cuquantum ops well."""
+        if not circuit_execution_ops.is_gpu_configured():
+            # Ignores this test if gpu is not configured.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        # Prepares a simple circuit.
+        qubit = cirq.GridQubit(0, 0)
+        circuit = util.convert_to_tensor(
+            [cirq.Circuit(cirq.X(qubit)**sympy.Symbol('alpha'))])
+        psums = util.convert_to_tensor([[cirq.Z(qubit)]])
+        symbol_values_array = np.array([[0.123]], dtype=np.float32)
+        symbol_values_tensor = tf.convert_to_tensor(symbol_values_array)
+
+        # Mocks `Adjoint.differentiate_analytic*()` to check if
+        # it's called once correctly.
+        method_name = ("differentiate_analytic_cuquantum"
+                       if use_cuquantum else "differentiate_analytic")
+        with mock.patch.object(adjoint.Adjoint,
+                               method_name,
+                               return_value=None,
+                               autospec=True) as mock_adj:
+            dif = adjoint.Adjoint()
+            op = circuit_execution_ops.get_expectation_op(
+                use_cuquantum=use_cuquantum, quantum_concurrent=False)
+            diff_op = dif.generate_differentiable_op(
+                analytic_op=op, use_cuquantum=use_cuquantum)
+
+            # Calculate tfq gradient.
+            with tf.GradientTape() as g:
+                g.watch(symbol_values_tensor)
+                expectations = diff_op(circuit, tf.convert_to_tensor(['alpha']),
+                                       symbol_values_tensor, psums)
+            _ = g.gradient(expectations, symbol_values_tensor)
+        mock_adj.assert_called_once()
+
     def test_sample_errors(self):
         """Ensure that the adjoint method won't attach to sample ops."""
 
@@ -41,6 +86,8 @@ def test_sample_errors(self):
         op = circuit_execution_ops.get_sampled_expectation_op()
         with self.assertRaisesRegex(ValueError, expected_regex='not supported'):
             dif.generate_differentiable_op(sampled_op=op)
+        with self.assertRaisesRegex(ValueError, expected_regex='not supported'):
+            dif.generate_differentiable_op(sampled_op=op, use_cuquantum=True)
 
     def test_no_gradient_circuits(self):
         """Confirm the adjoint differentiator has no gradient circuits."""
diff --git a/tensorflow_quantum/python/differentiators/differentiator.py b/tensorflow_quantum/python/differentiators/differentiator.py
index bb3668a92..72ee25e28 100644
--- a/tensorflow_quantum/python/differentiators/differentiator.py
+++ b/tensorflow_quantum/python/differentiators/differentiator.py
@@ -55,12 +55,16 @@ class Differentiator(metaclass=abc.ABCMeta):
     to backpropagate through a quantum circuit.
     """
 
-    def generate_differentiable_op(self, *, sampled_op=None, analytic_op=None):
+    def generate_differentiable_op(self,
+                                   *,
+                                   sampled_op=None,
+                                   analytic_op=None,
+                                   use_cuquantum=False):
         """Generate a differentiable op by attaching self to an op.
 
         This function returns a `tf.function` that passes values through to
-        `forward_op` during the forward pass and this differentiator (`self`) to
-        backpropagate through the op during the backward pass. If sampled_op
+        `forward_op` during the forward pass and this differentiator (`self`)
+        to backpropagate through the op during the backward pass. If sampled_op
         is provided the differentiators `differentiate_sampled` method will
         be invoked (which requires sampled_op to be a sample based expectation
         op with num_samples input tensor). If analytic_op is provided the
@@ -80,6 +84,8 @@ def generate_differentiable_op(self, *, sampled_op=None, analytic_op=None):
                 using this differentiator's `differentiate_sampled` method.
             analytic_op: A `callable` op that you want to make differentiable
                 using this differentiators `differentiate_analytic` method.
+            use_cuquantum: A `bool` indicating whether to use the cuQuantum
+                version of the op.
 
         Returns:
             A `callable` op that who's gradients are now registered to be
@@ -112,6 +118,9 @@ def generate_differentiable_op(self, *, sampled_op=None, analytic_op=None):
             raise TypeError('Provided arguments must be callable tensorflow '
                             'ops.')
 
+        if not isinstance(use_cuquantum, bool):
+            raise TypeError('use_cuquantum should be boolean.')
+
         # TODO (mbbrough): find a better workaround than this to ensure
         #   that the correct sample based expectation wasn't accidentally
         #   put inside of the analytical_op argument or vice versa.
@@ -149,6 +158,12 @@ def generate_differentiable_op(self, *, sampled_op=None, analytic_op=None):
                                      'Given arg: {}.'.format(str(key)) + ''
                                      'The signature should contain: {}.'.format(
                                          list(expected_signature)))
+        if use_cuquantum:
+            _differentiate_ana, _differentiate_sam = (
+                self._differentiate_ana_cq, self._differentiate_sam_cq)
+        else:
+            _differentiate_ana, _differentiate_sam = (self._differentiate_ana,
+                                                      self._differentiate_sam)
 
         @tf.custom_gradient
         def op_wrapper_analytic(programs, symbol_names, symbol_values,
@@ -157,9 +172,8 @@ def op_wrapper_analytic(programs, symbol_names, symbol_values,
                                             symbol_values, pauli_sums)
 
             def gradient(grad):
-                return self._differentiate_ana(programs, symbol_names,
-                                               symbol_values, pauli_sums,
-                                               forward_pass_vals, grad)
+                return _differentiate_ana(programs, symbol_names, symbol_values,
+                                          pauli_sums, forward_pass_vals, grad)
 
             return forward_pass_vals, gradient
 
@@ -171,10 +185,9 @@ def op_wrapper_sampled(programs, symbol_names, symbol_values,
                                            num_samples)
 
             def gradient(grad):
-                return self._differentiate_sam(programs, symbol_names,
-                                               symbol_values, pauli_sums,
-                                               num_samples, forward_pass_vals,
-                                               grad)
+                return _differentiate_sam(programs, symbol_names, symbol_values,
+                                          pauli_sums, num_samples,
+                                          forward_pass_vals, grad)
 
             return forward_pass_vals, gradient
 
@@ -186,6 +199,13 @@ def gradient(grad):
 
         return return_func
 
+    def _differentiate_ana_cq(self, programs, symbol_names, symbol_values,
+                              pauli_sums, forward_pass_vals, grad):
+        return None, None, self.differentiate_analytic_cuquantum(
+            programs, symbol_names, symbol_values,
+            pauli_sums, forward_pass_vals, grad), \
+               None
+
     def _differentiate_ana(self, programs, symbol_names, symbol_values,
                            pauli_sums, forward_pass_vals, grad):
         return None, None, self.differentiate_analytic(
@@ -193,6 +213,13 @@ def _differentiate_ana(self, programs, symbol_names, symbol_values,
             pauli_sums, forward_pass_vals, grad), \
                None
 
+    def _differentiate_sam_cq(self, programs, symbol_names, symbol_values,
+                              pauli_sums, num_samples, forward_pass_vals, grad):
+        return None, None, self.differentiate_sampled_cuquantum(
+            programs, symbol_names, symbol_values,
+            pauli_sums, num_samples, forward_pass_vals, grad), \
+               None, None
+
     def _differentiate_sam(self, programs, symbol_names, symbol_values,
                            pauli_sums, num_samples, forward_pass_vals, grad):
         return None, None, self.differentiate_sampled(
@@ -324,6 +351,30 @@ def get_gradient_circuits(self, programs, symbol_names, symbol_values):
                 the output `batch_weights`.
         """
 
+    @catch_empty_inputs
+    @tf.function
+    def differentiate_analytic_cuquantum(self, programs, symbol_names,
+                                         symbol_values, pauli_sums,
+                                         forward_pass_vals, grad):
+        """Differentiate a circuit with analytical expectation with GPU ops."""
+        # `self.expectation_op` is already set to cuquantum op at
+        # generate_differentiable_op._differentiate_ana.
+        return self.differentiate_analytic(programs, symbol_names,
+                                           symbol_values, pauli_sums,
+                                           forward_pass_vals, grad)
+
+    @catch_empty_inputs
+    @tf.function
+    def differentiate_sampled_cuquantum(self, programs, symbol_names,
+                                        symbol_values, pauli_sums, num_samples,
+                                        forward_pass_vals, grad):
+        """Differentiate a circuit with sampled expectation with GPU ops."""
+        # `self.expectation_op` is already set to cuquantum op at
+        # generate_differentiable_op._differentiate_sam.
+        return self.differentiate_sampled(programs, symbol_names, symbol_values,
+                                          pauli_sums, num_samples,
+                                          forward_pass_vals, grad)
+
     @catch_empty_inputs
     @tf.function
     def differentiate_analytic(self, programs, symbol_names, symbol_values,
diff --git a/tensorflow_quantum/python/differentiators/differentiator_test.py b/tensorflow_quantum/python/differentiators/differentiator_test.py
index b61b2a323..f4d4544fc 100644
--- a/tensorflow_quantum/python/differentiators/differentiator_test.py
+++ b/tensorflow_quantum/python/differentiators/differentiator_test.py
@@ -73,6 +73,26 @@ def test_generate_differentiable_op(self):
             WorkingDifferentiator().generate_differentiable_op(
                 sampled_op=lambda programs, symbol_names, pauli_sums: 1)
 
+    def test_generate_differentiable_op_cuquantum(self):
+        """Test the type checking on this method with `use_cuquantum`."""
+        WorkingDifferentiator().generate_differentiable_op(
+            analytic_op=lambda programs, symbol_names, symbol_values,
+            pauli_sums: 1,
+            use_cuquantum=True)
+        WorkingDifferentiator().generate_differentiable_op(
+            sampled_op=lambda programs, symbol_names, symbol_values, pauli_sums,
+            num_samples: 1,
+            use_cuquantum=True)
+        with self.assertRaisesRegex(TypeError, expected_regex='boolean'):
+            WorkingDifferentiator().generate_differentiable_op(
+                analytic_op=lambda programs, symbol_names, symbol_values,
+                pauli_sums: 1,
+                use_cuquantum='junk')
+        with self.assertRaisesRegex(TypeError, expected_regex='boolean'):
+            WorkingDifferentiator().generate_differentiable_op(
+                sampled_op=lambda programs, symbol_names, pauli_sums: 1,
+                use_cuquantum='junk')
+
     def test_single_op_link(self):
         """Tests if the `one-differentiator-per-op` policy is working well."""
         wd = WorkingDifferentiator()
diff --git a/tensorflow_quantum/python/differentiators/gradient_test.py b/tensorflow_quantum/python/differentiators/gradient_test.py
index b85506b0d..9c06f4035 100644
--- a/tensorflow_quantum/python/differentiators/gradient_test.py
+++ b/tensorflow_quantum/python/differentiators/gradient_test.py
@@ -37,6 +37,8 @@
 from tensorflow_quantum.core.ops.noise import noisy_expectation_op
 from tensorflow_quantum.core.ops.noise import noisy_sampled_expectation_op
 
+RANDOM_SEED = 1234
+
 ANALYTIC_DIFFS = [
     linear_combination.ForwardDifference(grid_spacing=0.0001),
     linear_combination.ForwardDifference(error_order=2, grid_spacing=0.0001),
@@ -58,12 +60,22 @@
     circuit_execution_ops.get_expectation_op()  # C++
 ]
 
+ANALYTIC_GPU_OPS = [
+    circuit_execution_ops.get_expectation_op(use_cuquantum=True,
+                                             quantum_concurrent=False)
+]
+
 SAMPLED_OPS = [
     circuit_execution_ops.get_sampled_expectation_op(
         cirq.sim.Simulator()),  # WF
     circuit_execution_ops.get_sampled_expectation_op()  # C++
 ]
 
+SAMPLED_GPU_OPS = [
+    circuit_execution_ops.get_sampled_expectation_op(use_cuquantum=True,
+                                                     quantum_concurrent=False)
+]
+
 NOISY_OPS = [
     noisy_sampled_expectation_op.sampled_expectation,
     noisy_expectation_op.expectation
@@ -118,19 +130,35 @@ class AnalyticGradientCorrectnessTest(tf.test.TestCase, parameterized.TestCase):
 
     @parameterized.parameters(
         list(
-            util.kwargs_cartesian_product(**{
-                'differentiator': ANALYTIC_DIFFS,
-                'op': ANALYTIC_OPS
-            })) + [{
-                'differentiator': adjoint.Adjoint(),
-                'op': circuit_execution_ops.get_expectation_op()
-            }])
-    def test_backprop(self, differentiator, op):
+            util.kwargs_cartesian_product(
+                **{
+                    'differentiator': ANALYTIC_DIFFS,
+                    'op': ANALYTIC_OPS,
+                    'use_cuquantum': [False],
+                })) + [{
+                    'differentiator': adjoint.Adjoint(),
+                    'op': circuit_execution_ops.get_expectation_op(),
+                    'use_cuquantum': False,
+                }] +
+        list(
+            util.kwargs_cartesian_product(
+                **{
+                    'differentiator': ANALYTIC_DIFFS + [adjoint.Adjoint()],
+                    'op': ANALYTIC_GPU_OPS,
+                    'use_cuquantum': [True],
+                })))
+    def test_backprop(self, differentiator, op, use_cuquantum):
         """Test that gradients are correctly backpropagated through a quantum
         circuit via comparison to analytical results.
         """
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
         differentiator.refresh()
-        op = differentiator.generate_differentiable_op(analytic_op=op)
+        op = differentiator.generate_differentiable_op(
+            analytic_op=op,
+            use_cuquantum=use_cuquantum,
+        )
 
         def exact_grad(theta):
             new_theta = 2 * np.pi * theta
@@ -165,23 +193,42 @@ def exact_grad(theta):
                     'n_qubits': [5],
                     'n_programs': [3],
                     'n_ops': [3],
-                    'symbol_names': [['a', 'b']]
+                    'symbol_names': [['a', 'b']],
+                    'use_cuquantum': [False],
                 })) + [{
                     'differentiator': adjoint.Adjoint(),
                     'op': circuit_execution_ops.get_expectation_op(),
                     'n_qubits': 10,
                     'n_programs': 5,
                     'n_ops': 3,
-                    'symbol_names': ['a', 'b']
-                }])
+                    'symbol_names': ['a', 'b'],
+                    'use_cuquantum': False,
+                }] +
+        list(
+            util.kwargs_cartesian_product(
+                **{
+                    'differentiator': ANALYTIC_DIFFS + [adjoint.Adjoint()],
+                    'op': ANALYTIC_GPU_OPS,
+                    'n_qubits': [5],
+                    'n_programs': [3],
+                    'n_ops': [3],
+                    'symbol_names': [['a', 'b']],
+                    'use_cuquantum': [True],
+                })))
     def test_gradients_vs_cirq_finite_difference(self, differentiator, op,
                                                  n_qubits, n_programs, n_ops,
-                                                 symbol_names):
+                                                 symbol_names, use_cuquantum):
         """Compare TFQ differentiators to fine-grained noiseless cirq finite
         differencing.
         """
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
         differentiator.refresh()
-        op = differentiator.generate_differentiable_op(analytic_op=op)
+        op = differentiator.generate_differentiable_op(
+            analytic_op=op,
+            use_cuquantum=use_cuquantum,
+        )
 
         qubits = cirq.GridQubit.rect(1, n_qubits)
         circuit_batch, resolver_batch = \
@@ -220,18 +267,39 @@ def test_gradients_vs_cirq_finite_difference(self, differentiator, op,
 
     @parameterized.parameters(
         list(
-            util.kwargs_cartesian_product(**{
-                'differentiator': ANALYTIC_DIFFS,
-                'op': ANALYTIC_OPS,
-            })) + [{
-                'differentiator': adjoint.Adjoint(),
-                'op': circuit_execution_ops.get_expectation_op(),
-            }])
-    def test_analytic_value_with_simple_circuit(self, differentiator, op):
+            util.kwargs_cartesian_product(
+                **{
+                    'differentiator': ANALYTIC_DIFFS,
+                    'op': ANALYTIC_OPS,
+                    'use_cuquantum': [False],
+                })) + [{
+                    'differentiator': adjoint.Adjoint(),
+                    'op': circuit_execution_ops.get_expectation_op(),
+                    'use_cuquantum': False,
+                }] +
+        list(
+            util.kwargs_cartesian_product(
+                **{
+                    'differentiator': ANALYTIC_DIFFS + [adjoint.Adjoint()],
+                    'op': ANALYTIC_GPU_OPS,
+                    'use_cuquantum': [True],
+                })))
+    def test_analytic_value_with_simple_circuit(
+            self,
+            differentiator,
+            op,
+            use_cuquantum,
+    ):
         """Test the value of differentiator with simple circuit."""
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
         # Get an expectation op, with this differentiator attached.
         differentiator.refresh()
-        op = differentiator.generate_differentiable_op(analytic_op=op)
+        op = differentiator.generate_differentiable_op(
+            analytic_op=op,
+            use_cuquantum=use_cuquantum,
+        )
         qubit = cirq.GridQubit(0, 0)
         circuit = util.convert_to_tensor(
             [cirq.Circuit(cirq.X(qubit)**sympy.Symbol('alpha'))])
@@ -249,15 +317,28 @@ def test_analytic_value_with_simple_circuit(self, differentiator, op):
 
     @parameterized.parameters(
         list(
-            util.kwargs_cartesian_product(**{
-                'differentiator': ANALYTIC_DIFFS,
-                'op': ANALYTIC_OPS,
-            })) + [{
-                'differentiator': adjoint.Adjoint(),
-                'op': circuit_execution_ops.get_expectation_op(),
-            }])
-    def test_empty_circuit_grad(self, differentiator, op):
+            util.kwargs_cartesian_product(
+                **{
+                    'differentiator': ANALYTIC_DIFFS,
+                    'op': ANALYTIC_OPS,
+                    'use_cuquantum': [False],
+                })) + [{
+                    'differentiator': adjoint.Adjoint(),
+                    'op': circuit_execution_ops.get_expectation_op(),
+                    'use_cuquantum': False,
+                }] +
+        list(
+            util.kwargs_cartesian_product(
+                **{
+                    'differentiator': ANALYTIC_DIFFS + [adjoint.Adjoint()],
+                    'op': ANALYTIC_GPU_OPS,
+                    'use_cuquantum': [True],
+                })))
+    def test_empty_circuit_grad(self, differentiator, op, use_cuquantum):
         """Test that providing no circuits will fail gracefully."""
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
         differentiator.refresh()
         op = differentiator.generate_differentiable_op(analytic_op=op)
         circuit = tf.convert_to_tensor([], dtype=tf.string)
@@ -284,11 +365,23 @@ class SampledGradientCorrectnessTest(tf.test.TestCase, parameterized.TestCase):
                 **{
                     'differentiator': SAMPLED_DIFFS,
                     'op': SAMPLED_OPS,
-                    'num_samples': [20000]
-                })))
+                    'num_samples': [20000],
+                    'use_cuquantum': [False],
+                })) + list(
+                    util.kwargs_cartesian_product(
+                        **{
+                            'differentiator': SAMPLED_DIFFS,
+                            'op': SAMPLED_GPU_OPS,
+                            'num_samples': [20000],
+                            'use_cuquantum': [True],
+                        })))
     def test_sampled_value_with_simple_circuit(self, differentiator, op,
-                                               num_samples):
+                                               num_samples, use_cuquantum):
         """Test the value of sampled differentiator with simple circuit."""
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        tf.random.set_seed(RANDOM_SEED)
         # Get an expectation op, with this differentiator attached.
         differentiator.refresh()
         op = differentiator.generate_differentiable_op(sampled_op=op)
@@ -318,15 +411,33 @@ def test_sampled_value_with_simple_circuit(self, differentiator, op,
                     'n_programs': [5],
                     'n_ops': [2],
                     'symbol_names': [['a', 'b']],
-                    'num_samples': [30000]
+                    'num_samples': [30000],
+                    'use_cuquantum': [False],
+                })) +
+        list(
+            util.kwargs_cartesian_product(
+                **{
+                    'diff_and_tol': zip(SAMPLED_DIFFS, SAMPLED_DIFFS_TOLS),
+                    'op': SAMPLED_GPU_OPS,
+                    'n_qubits': [3],
+                    'n_programs': [5],
+                    'n_ops': [2],
+                    'symbol_names': [['a', 'b']],
+                    'num_samples': [30000],
+                    'use_cuquantum': [True],
                 })))
     def test_approx_equality_shallow(self, diff_and_tol, op, n_qubits,
                                      symbol_names, n_ops, n_programs,
-                                     num_samples):
+                                     num_samples, use_cuquantum):
         """Test small circuits with limited depth."""
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        tf.random.set_seed(RANDOM_SEED)
         differentiator, tol = diff_and_tol
         differentiator.refresh()
-        op = differentiator.generate_differentiable_op(sampled_op=op)
+        op = differentiator.generate_differentiable_op(
+            sampled_op=op, use_cuquantum=use_cuquantum)
 
         qubits = cirq.GridQubit.rect(1, n_qubits)
         circuit_batch, resolver_batch = \
@@ -369,12 +480,25 @@ def test_approx_equality_shallow(self, diff_and_tol, op, n_qubits,
 
     @parameterized.parameters(
         list(
-            util.kwargs_cartesian_product(**{
-                'differentiator': SAMPLED_DIFFS,
-                'op': SAMPLED_OPS,
-            })))
-    def test_empty_circuit_sampled_grad(self, differentiator, op):
+            util.kwargs_cartesian_product(
+                **{
+                    'differentiator': SAMPLED_DIFFS,
+                    'op': SAMPLED_OPS,
+                    'use_cuquantum': [False],
+                })) + list(
+                    util.kwargs_cartesian_product(
+                        **{
+                            'differentiator': SAMPLED_DIFFS,
+                            'op': SAMPLED_GPU_OPS,
+                            'use_cuquantum': [True],
+                        })))
+    def test_empty_circuit_sampled_grad(self, differentiator, op,
+                                        use_cuquantum):
         """Test that providing no circuits will fail gracefully."""
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        tf.random.set_seed(RANDOM_SEED)
         differentiator.refresh()
         op = differentiator.generate_differentiable_op(sampled_op=op)
         circuit = tf.convert_to_tensor([], dtype=tf.string)
diff --git a/tensorflow_quantum/python/layers/circuit_executors/BUILD b/tensorflow_quantum/python/layers/circuit_executors/BUILD
index ae8feff9c..1e4541c42 100644
--- a/tensorflow_quantum/python/layers/circuit_executors/BUILD
+++ b/tensorflow_quantum/python/layers/circuit_executors/BUILD
@@ -40,7 +40,7 @@ py_library(
         "//tensorflow_quantum/python/differentiators:adjoint",
         "//tensorflow_quantum/python/differentiators:differentiator",
         "//tensorflow_quantum/python/differentiators:parameter_shift",
-    ],
+    ]
 )
 
 py_library(
diff --git a/tensorflow_quantum/python/layers/circuit_executors/expectation.py b/tensorflow_quantum/python/layers/circuit_executors/expectation.py
index 35d0c1b50..cba81e7b5 100644
--- a/tensorflow_quantum/python/layers/circuit_executors/expectation.py
+++ b/tensorflow_quantum/python/layers/circuit_executors/expectation.py
@@ -21,6 +21,7 @@
 import cirq
 from tensorflow_quantum.core.ops import circuit_execution_ops
 from tensorflow_quantum.core.ops.noise import noisy_expectation_op
+from tensorflow_quantum.python import quantum_context
 from tensorflow_quantum.python.differentiators import adjoint
 from tensorflow_quantum.python.differentiators import parameter_shift
 from tensorflow_quantum.python.differentiators import differentiator as diff
@@ -205,7 +206,11 @@ class Expectation(tf.keras.layers.Layer):
 
     """
 
-    def __init__(self, backend='noiseless', differentiator=None, **kwargs):
+    def __init__(self,
+                 backend='noiseless',
+                 differentiator=None,
+                 use_cuquantum=False,
+                 **kwargs):
         """Instantiate this Layer.
 
         Create a layer that will output expectation values gained from
@@ -225,6 +230,7 @@ def __init__(self, backend='noiseless', differentiator=None, **kwargs):
                 which uses `tfq.differentiators.ParameterShift()`. If
                 `backend` is also 'noiseless' then default is
                 `tfq.differentiators.Adjoint`.
+            use_cuquantum: If True, use the TFQ cuQuantum (GPU) version op.
 
         """
         super().__init__(**kwargs)
@@ -238,20 +244,29 @@ def __init__(self, backend='noiseless', differentiator=None, **kwargs):
                             "Please use SampledExpectation instead.")
         used_op = None
         self.noisy = False
-        if backend == 'noiseless':
-            backend = None
 
         # Ingest differentiator.
         if differentiator is None:
             differentiator = parameter_shift.ParameterShift()
-            if backend is None:
+            if backend == 'noiseless' or backend is None:
                 differentiator = adjoint.Adjoint()
 
         if not isinstance(differentiator, diff.Differentiator):
             raise TypeError("Differentiator must inherit from "
                             "tfq.differentiators.Differentiator")
 
-        if backend == 'noisy':
+        if backend == 'noiseless' or backend is None:
+            mode = quantum_context.get_quantum_concurrent_op_mode()
+            quantum_concurrent = False if use_cuquantum else mode
+            used_op = circuit_execution_ops.get_expectation_op(
+                backend=None,
+                use_cuquantum=use_cuquantum,
+                quantum_concurrent=quantum_concurrent)
+            self._expectation_op = differentiator.generate_differentiable_op(
+                analytic_op=used_op, use_cuquantum=use_cuquantum)
+        elif backend == 'noisy':
+            if use_cuquantum:
+                raise ValueError("noisy backend does not currently support GPU")
             used_op = noisy_expectation_op.expectation
             self._expectation_op = differentiator.generate_differentiable_op(
                 sampled_op=used_op)
@@ -270,15 +285,20 @@ def call(self,
              symbol_values=None,
              operators=None,
              repetitions=None,
-             initializer=tf.keras.initializers.RandomUniform(0, 2 * np.pi)):
+             initializer=None):
         """Keras call function.
 
-        Input options:
-            `inputs`, `symbol_names`, `symbol_values`:
-                see `input_checks.expand_circuits`
-            `operators`: see `input_checks.expand_operators`
-
-        Output shape:
+        Args:
+            inputs: See `input_checks.expand_circuits`.
+            symbol_names: See `input_checks.expand_circuits`.
+            symbol_values: See `input_checks.expand_circuits`.
+            operators: See `input_checks.expand_operators`.
+            repetitions: A Python `int` or a pre-converted `tf.Tensor`
+                containing a single `int` entry.
+            initializer: The Keras initializer object for weights.
+                Defaults to `RandomUniform(0, 2 * np.pi)` when `None`.
+
+        Returns:
             `tf.Tensor` with shape [batch_size, n_ops] that holds the
                 expectation value for each circuit with each op applied to it
                 (after resolving the corresponding parameters in).
@@ -287,6 +307,9 @@ def call(self,
         if symbol_values is None:
             values_empty = True
 
+        if initializer is None:
+            initializer = tf.keras.initializers.RandomUniform(0, 2 * np.pi)
+
         inputs, symbol_names, symbol_values = input_checks.expand_circuits(
             inputs, symbol_names, symbol_values)
 
diff --git a/tensorflow_quantum/python/layers/circuit_executors/expectation_test.py b/tensorflow_quantum/python/layers/circuit_executors/expectation_test.py
index 1ef7b99fc..f36396232 100644
--- a/tensorflow_quantum/python/layers/circuit_executors/expectation_test.py
+++ b/tensorflow_quantum/python/layers/circuit_executors/expectation_test.py
@@ -27,10 +27,13 @@
 import tensorflow as tf
 
 import cirq
+from tensorflow_quantum.core.ops import circuit_execution_ops
 from tensorflow_quantum.python.layers.circuit_executors import expectation
 from tensorflow_quantum.python.differentiators import linear_combination
 from tensorflow_quantum.python import util
 
+RANDOM_SEED = 1234
+
 
 def _gen_single_bit_rotation_problem(bit, symbols, noisy):
     """Generate a toy problem on 1 qubit."""
@@ -48,7 +51,7 @@ def _gen_single_bit_rotation_problem(bit, symbols, noisy):
     return circuit
 
 
-class ExpectationTest(tf.test.TestCase):
+class ExpectationTest(parameterized.TestCase, tf.test.TestCase):
     """Basic tests for the expectation layer."""
 
     def test_expectation_instantiate(self):
@@ -76,11 +79,15 @@ def run_sweep(self):
             expectation.Expectation(backend=MySampler())
 
         with self.assertRaisesRegex(
-                TypeError, expected_regex="SimulatesExpectationValues or None"):
+                TypeError,
+                expected_regex="SimulatesExpectationValues or None",
+        ):
             expectation.Expectation(backend='junk')
 
         with self.assertRaisesRegex(
-                TypeError, expected_regex="tfq.differentiators.Differentiator"):
+                TypeError,
+                expected_regex="tfq.differentiators.Differentiator",
+        ):
             expectation.Expectation(differentiator='junk')
 
     def test_expectation_type_inputs_error(self):
@@ -189,7 +196,10 @@ def test_static_cases(self):
 
         # Ensure tiling up of circuits works as expected.
         expectation.Expectation()(reg_circuit, operators=test_psum)
-        expectation.Expectation()(reg_circuit, operators=[test_psum, test_psum])
+        expectation.Expectation()(
+            reg_circuit,
+            operators=[test_psum, test_psum],
+        )
 
         # Ensure tiling up of symbol_values works as expected.
         expectation.Expectation()(symb_circuit,
@@ -276,10 +286,17 @@ def test_static_cases_noisy(self):
             ], [cirq.Z(bit), cirq.Z(bit), cirq.Z(bit)]],
             repetitions=[[1, 2, 3], [4, 5, 6]])
 
-    def test_expectation_simple_tf_train(self):
+    @parameterized.parameters([{
+        'use_cuquantum': False,
+    }, {
+        'use_cuquantum': True,
+    }])
+    def test_expectation_simple_tf_train(self, use_cuquantum):
         """Train a layer using standard tf (not keras).
         This is a subtle test that will work since we don't use keras compile.
         """
+        tf.random.set_seed(RANDOM_SEED)
+        initializer = tf.keras.initializers.RandomUniform(0, 2 * np.pi)
         bit = cirq.GridQubit(0, 0)
         circuit = \
             cirq.Circuit(cirq.rx(sympy.Symbol('theta'))(bit))
@@ -290,7 +307,8 @@ def test_expectation_simple_tf_train(self):
             with tf.GradientTape() as tape:
                 circuit_out = layer(circuit,
                                     symbol_names=['theta'],
-                                    operators=op)
+                                    operators=op,
+                                    initializer=initializer)
                 mse = tf.square(tf.reduce_sum(tf.subtract(circuit_out, -1)))
             grads = tape.gradient(mse, layer.trainable_weights)
             optimizer.apply_gradients(zip(grads, layer.trainable_weights))
@@ -302,19 +320,30 @@ class ExpectationFunctionalTests(parameterized.TestCase, tf.test.TestCase):
 
     @parameterized.parameters([
         {
-            'backend': 'noisy'
+            'backend': 'noisy',
+            'use_cuquantum': False,
+        },
+        {
+            'backend': None,  # old API usage
+            'use_cuquantum': False,
         },
         {
-            'backend': None  # old API usage
+            'backend': None,
+            'use_cuquantum': True,
         }
     ])
-    def test_simple_param_value_input(self, backend):
+    def test_simple_param_value_input(self, backend, use_cuquantum):
         """Train a densely connected hybrid model.
 
-        This model will put a qubit in the zero or one state from a random state
-        given the input zero or one. This tests the input signature:
+        This model will put a qubit in the zero or one state from a random
+        state given the input zero or one. This tests the input signature:
         Expectation([input_value_batch]).
         """
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        tf.random.set_seed(RANDOM_SEED)
+        initializer = tf.keras.initializers.RandomUniform(0, 2 * np.pi)
         noisy = backend == 'noisy'
         bit = cirq.GridQubit(0, 0)
         symbols = sympy.symbols('x y z')
@@ -325,12 +354,15 @@ def test_simple_param_value_input(self, backend):
         l1 = tf.keras.layers.Dense(10)(inputs)
         l2 = tf.keras.layers.Dense(3)(l1)
         reps = 1000 if noisy else None
-        outputs = expectation.Expectation(backend=backend)(
-            datum,
-            symbol_names=symbols,
-            operators=cirq.Z(bit),
-            symbol_values=l2,
-            repetitions=reps)
+        outputs = expectation.Expectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(datum,
+          symbol_names=symbols,
+          operators=cirq.Z(bit),
+          symbol_values=l2,
+          repetitions=reps,
+          initializer=initializer)
         model = tf.keras.Model(inputs=[datum, inputs], outputs=outputs)
 
         data_in = np.array([[1], [0]], dtype=np.float32)
@@ -347,18 +379,29 @@ def test_simple_param_value_input(self, backend):
 
     @parameterized.parameters([
         {
-            'backend': 'noisy'
+            'backend': 'noisy',
+            'use_cuquantum': False,
         },
         {
-            'backend': None  # old API usage
+            'backend': None,  # old API usage
+            'use_cuquantum': False,
+        },
+        {
+            'backend': None,
+            'use_cuquantum': True,
         }
     ])
-    def test_simple_op_input(self, backend):
+    def test_simple_op_input(self, backend, use_cuquantum):
         """Test a simple operator input
 
         Learn qubit in the z+ state using two different measurement operators.
         This tests input signature Expectation([operator_batch])
         """
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        tf.random.set_seed(RANDOM_SEED)
+        normal_initializer = tf.keras.initializers.RandomNormal()
         noisy = backend == 'noisy'
         bit = cirq.GridQubit(0, 0)
         symbols = sympy.symbols('x, y, z')
@@ -373,14 +416,19 @@ def test_simple_op_input(self, backend):
         op_input = tf.keras.Input(shape=(1,), dtype=tf.dtypes.string)
 
         reps = 1000 if noisy else None
-        output = expectation.Expectation(backend=backend)(
-            circuit_input,
-            symbol_names=symbols,
-            operators=op_input,
-            initializer=tf.keras.initializers.RandomNormal(),
-            repetitions=reps)
-
-        model = tf.keras.Model(inputs=[circuit_input, op_input], outputs=output)
+        output = expectation.Expectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(circuit_input,
+          symbol_names=symbols,
+          operators=op_input,
+          initializer=normal_initializer,
+          repetitions=reps)
+
+        model = tf.keras.Model(
+            inputs=[circuit_input, op_input],
+            outputs=output,
+        )
 
         model.compile(
             optimizer=tf.keras.optimizers.Adam(learning_rate=0.05),
@@ -395,22 +443,34 @@ def test_simple_op_input(self, backend):
 
     @parameterized.parameters([
         {
-            'backend': 'noisy'
+            'backend': 'noisy',
+            'use_cuquantum': False,
+        },
+        {
+            'backend': None,  # old api usage.
+            'use_cuquantum': False,
         },
         {
-            'backend': None  # old api usage.
+            'backend': None,
+            'use_cuquantum': True,
         },
         {
-            'backend': cirq.Simulator()
+            'backend': cirq.Simulator(),
+            'use_cuquantum': False,
         }
     ])
-    def test_simple_op_and_param_input(self, backend):
+    def test_simple_op_and_param_input(self, backend, use_cuquantum):
         """Test a simple operator and parameter input.
 
         Train a NN to put a qubit in the z+ or x+ states based on a classical
         binary input. This tests the input signature:
         Expectation([value_batch, operator_batch]).
         """
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        tf.random.set_seed(RANDOM_SEED)
+        initializer = tf.keras.initializers.RandomUniform(0, 2 * np.pi)
         noisy = backend == 'noisy'
         bit = cirq.GridQubit(0, 0)
         symbols = sympy.symbols('x, y, z')
@@ -426,12 +486,15 @@ def test_simple_op_and_param_input(self, backend):
         dense_1 = tf.keras.layers.Dense(10)(data_inp)
         dense_2 = tf.keras.layers.Dense(3)(dense_1)
         reps = 1000 if noisy else None
-        circuit_output = expectation.Expectation(backend=backend)(
-            circuit_inp,
-            symbol_names=symbols,
-            symbol_values=dense_2,
-            operators=op_inp,
-            repetitions=reps)
+        circuit_output = expectation.Expectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(circuit_inp,
+          symbol_names=symbols,
+          symbol_values=dense_2,
+          operators=op_inp,
+          repetitions=reps,
+          initializer=initializer)
 
         functional_model = tf.keras.Model(
             inputs=[data_inp, op_inp, circuit_inp], outputs=[circuit_output])
@@ -448,18 +511,30 @@ def test_simple_op_and_param_input(self, backend):
 
     @parameterized.parameters([
         {
-            'backend': 'noisy'
+            'backend': 'noisy',
+            'use_cuquantum': False,
         },
         {
-            'backend': None  # old api usage.
+            'backend': None,  # old API usage
+            'use_cuquantum': False,
+        },
+        {
+            'backend': None,
+            'use_cuquantum': True,
         }
     ])
-    def test_dnn_qnn_dnn(self, backend):
+    def test_dnn_qnn_dnn(self, backend, use_cuquantum):
         """Train a fully hybrid network using an Expectation layer.
 
         Train the network to output +-5 given an input of 1 or 0. This tests
         that everything works when Expectation layer is a middle layers.
         """
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        tf.random.set_seed(RANDOM_SEED)
+        initializer = tf.keras.initializers.RandomUniform(0, 2 * np.pi)
+
         noisy = backend == 'noisy'
         bit = cirq.GridQubit(0, 0)
         symbols = sympy.symbols('x, y, z')
@@ -473,12 +548,15 @@ def test_dnn_qnn_dnn(self, backend):
         d1 = tf.keras.layers.Dense(10)(classical_input)
         d2 = tf.keras.layers.Dense(3)(d1)
         reps = 1000 if noisy else None
-        quantum = expectation.Expectation(backend=backend)(
-            circuit_input,
-            symbol_names=symbols,
-            symbol_values=d2,
-            operators=cirq.Z(bit),
-            repetitions=reps)
+        quantum = expectation.Expectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(circuit_input,
+          symbol_names=symbols,
+          symbol_values=d2,
+          operators=cirq.Z(bit),
+          repetitions=reps,
+          initializer=initializer)
         d3 = tf.keras.layers.Dense(1)(quantum)
 
         model = tf.keras.Model(inputs=[circuit_input, classical_input],
diff --git a/tensorflow_quantum/python/layers/circuit_executors/sample.py b/tensorflow_quantum/python/layers/circuit_executors/sample.py
index 750885c20..3ba53c6bf 100644
--- a/tensorflow_quantum/python/layers/circuit_executors/sample.py
+++ b/tensorflow_quantum/python/layers/circuit_executors/sample.py
@@ -19,6 +19,7 @@
 
 from tensorflow_quantum.core.ops import circuit_execution_ops
 from tensorflow_quantum.core.ops.noise import noisy_samples_op
+from tensorflow_quantum.python import quantum_context
 from tensorflow_quantum.python.layers.circuit_executors import input_checks
 
 
@@ -139,7 +140,7 @@ class Sample(tf.keras.layers.Layer):
 
     """
 
-    def __init__(self, backend='noiseless', **kwargs):
+    def __init__(self, backend='noiseless', use_cuquantum=False, **kwargs):
         """Instantiate this Layer.
 
         Create a layer that will output bitstring samples taken from either a
@@ -150,12 +151,20 @@ def __init__(self, backend='noiseless', **kwargs):
                 to the noiseless simulator. Options are {'noisy', 'noiseless'},
                 however users may also specify a preconfigured cirq execution
                 object to use instead, which must inherit `cirq.Sampler`.
+            use_cuquantum: If True, use the cuQuantum (GPU) version of the op.
         """
         super().__init__(**kwargs)
         used_op = None
-        if backend == 'noiseless':
-            used_op = circuit_execution_ops.get_sampling_op(None)
+        if backend == 'noiseless' or backend is None:
+            mode = quantum_context.get_quantum_concurrent_op_mode()
+            quantum_concurrent = False if use_cuquantum else mode
+            used_op = circuit_execution_ops.get_sampling_op(
+                None,
+                use_cuquantum=use_cuquantum,
+                quantum_concurrent=quantum_concurrent)
         elif backend == 'noisy':
+            if use_cuquantum:
+                raise ValueError('noisy backend has no GPU support.')
             used_op = noisy_samples_op.samples
         else:
             used_op = circuit_execution_ops.get_sampling_op(backend)
@@ -170,17 +179,18 @@ def call(self,
              repetitions=None):
         """Keras call function.
 
-        Input options:
-            `inputs`, `symbol_names`, `symbol_values`:
-                see `input_checks.expand_circuits`
-            `repetitions`: a Python `int` or a pre-converted
-                `tf.Tensor` containing a single `int` entry.
+        Args:
+            inputs: See `input_checks.expand_circuits`.
+            symbol_names: See `input_checks.expand_circuits`.
+            symbol_values: See `input_checks.expand_circuits`.
+            repetitions: A Python `int` or a pre-converted `tf.Tensor`
+                containing a single `int` entry.
 
-        Output shape:
+        Returns:
             `tf.RaggedTensor` with shape:
-                [batch size of symbol_values, repetitions, <ragged string size>]
-                    or
-                [number of circuits, repetitions, <ragged string size>]
+            [batch size of symbol_values, repetitions, <ragged string size>]
+                or
+            [number of circuits, repetitions, <ragged string size>]
         """
         if repetitions is None:
             raise ValueError("Number of repetitions not specified.")
diff --git a/tensorflow_quantum/python/layers/circuit_executors/sample_test.py b/tensorflow_quantum/python/layers/circuit_executors/sample_test.py
index 7103759b4..379fbaee3 100644
--- a/tensorflow_quantum/python/layers/circuit_executors/sample_test.py
+++ b/tensorflow_quantum/python/layers/circuit_executors/sample_test.py
@@ -27,9 +27,12 @@
 import tensorflow as tf
 import cirq
 
+from tensorflow_quantum.core.ops import circuit_execution_ops
 from tensorflow_quantum.python.layers.circuit_executors import sample
 from tensorflow_quantum.python import util
 
+RANDOM_SEED = 1234
+
 
 class SampleTest(tf.test.TestCase, parameterized.TestCase):
     """Tests for the Sample layer."""
@@ -86,21 +89,32 @@ def test_sample_invalid_shape_inputs(self):
 
     @parameterized.parameters([
         {
-            'backend': 'noiseless'
+            'backend': 'noiseless',
+            'use_cuquantum': False,
+        },
+        {
+            'backend': 'noisy',
+            'use_cuquantum': False,
         },
         {
-            'backend': 'noisy'
+            'backend': cirq.Simulator(),
+            'use_cuquantum': False,
         },
         {
-            'backend': cirq.Simulator()
+            'backend': None,  # old API usage.
+            'use_cuquantum': False,
         },
         {
-            'backend': None  # old API usage.
+            'backend': None,
+            'use_cuquantum': True,
         }
     ])
-    def test_sample_invalid_combinations(self, backend):
+    def test_sample_invalid_combinations(self, backend, use_cuquantum):
         """Test with valid type inputs and valid value, but incorrect combo."""
-        sampler = sample.Sample(backend)
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        sampler = sample.Sample(backend, use_cuquantum=use_cuquantum)
         symbol = sympy.Symbol('alpha')
         circuit = cirq.Circuit(cirq.H(cirq.GridQubit(0, 0))**symbol)
         with self.assertRaisesRegex(Exception, expected_regex=""):
@@ -142,9 +156,17 @@ def test_sample_invalid_combinations(self, backend):
                     symbol_values=np.zeros((3, 1)),
                     repetitions=5)
 
-    def test_sample_basic_inputs(self):
+    @parameterized.parameters([{
+        'use_cuquantum': False,
+    }, {
+        'use_cuquantum': True,
+    }])
+    def test_sample_basic_inputs(self, use_cuquantum):
         """Test that sample ingests inputs correctly in simple settings."""
-        sampler = sample.Sample()
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        sampler = sample.Sample(use_cuquantum=use_cuquantum)
         sampler(cirq.Circuit(), repetitions=10)
         sampler([cirq.Circuit()], repetitions=10)
         sampler(cirq.Circuit(),
@@ -156,31 +178,49 @@ def test_sample_basic_inputs(self):
                 symbol_values=[[0.5]],
                 repetitions=10)
 
-    def test_sample_outputs_simple(self):
+    @parameterized.parameters([{
+        'use_cuquantum': False,
+    }, {
+        'use_cuquantum': True,
+    }])
+    def test_sample_outputs_simple(self, use_cuquantum):
         """Test the simplest call where nothing but circuits are provided."""
-        sampler = sample.Sample()
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        sampler = sample.Sample(use_cuquantum=use_cuquantum)
         circuit = cirq.Circuit(cirq.H(cirq.GridQubit(0, 0)))
         output = sampler([circuit, circuit], repetitions=5)
         self.assertShapeEqual(np.empty((2, 5, 1)), output.to_tensor())
 
-    # TODO(trevormccrt): add QuantumEngineSampler to this once it is available
+    # TODO(trevormccrt): add ProcessorSampler to this once it is available
     @parameterized.parameters(
         list(
             util.kwargs_cartesian_product(
                 backend=['noiseless', 'noisy',
                          cirq.Simulator(), None],
+                use_cuquantum=[False, True],
                 all_n_qubits=[[3, 4, 10]],
                 n_samples=[1],
                 symbol_names=[[], ['a', 'b']])))
-    def test_sample_output(self, backend, all_n_qubits, n_samples,
-                           symbol_names):
+    def test_sample_output(self, backend, use_cuquantum, all_n_qubits,
+                           n_samples, symbol_names):
         """Test that expected output format is preserved.
 
         Check that any pre or post processing done inside the layers does not
         cause what is output from the layer to structurally deviate from what
         is expected.
         """
-        sampler = sample.Sample(backend=backend)
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        tf.random.set_seed(RANDOM_SEED)
+        if use_cuquantum:
+            # cuQuantum is only tested with the None/'noiseless' backends.
+            if backend is not None and backend != 'noiseless':
+                return
+            # Passes backend=None or backend == 'noiseless' only.
+        sampler = sample.Sample(backend=backend, use_cuquantum=use_cuquantum)
         bits = cirq.GridQubit.rect(1, max(all_n_qubits))
         programs = []
         expected_outputs = []
diff --git a/tensorflow_quantum/python/layers/circuit_executors/sampled_expectation.py b/tensorflow_quantum/python/layers/circuit_executors/sampled_expectation.py
index fa434e332..0fdcc421f 100644
--- a/tensorflow_quantum/python/layers/circuit_executors/sampled_expectation.py
+++ b/tensorflow_quantum/python/layers/circuit_executors/sampled_expectation.py
@@ -22,6 +22,7 @@
 import cirq
 from tensorflow_quantum.core.ops import circuit_execution_ops
 from tensorflow_quantum.core.ops.noise import noisy_sampled_expectation_op
+from tensorflow_quantum.python import quantum_context
 from tensorflow_quantum.python.differentiators import differentiator as diff
 from tensorflow_quantum.python.differentiators import parameter_shift
 from tensorflow_quantum.python.layers.circuit_executors import input_checks
@@ -213,7 +214,11 @@ class SampledExpectation(tf.keras.layers.Layer):
 
     """
 
-    def __init__(self, backend='noiseless', differentiator=None, **kwargs):
+    def __init__(self,
+                 backend='noiseless',
+                 differentiator=None,
+                 use_cuquantum=False,
+                 **kwargs):
         """Instantiate this Layer.
 
         Create a layer that will output expectation values gained from
@@ -227,6 +232,7 @@ def __init__(self, backend='noiseless', differentiator=None, **kwargs):
                 derivative values of given operators_to_measure and circuit,
                 which must inherit `tfq.differentiators.Differentiator`.
                 Defaults to `parameter_shift.ParameterShift()` (None argument).
+            use_cuquantum: If True, use the cuQuantum (GPU) version of the op.
 
         """
         super().__init__(**kwargs)
@@ -246,10 +252,17 @@ def __init__(self, backend='noiseless', differentiator=None, **kwargs):
                             "not cirq.Sampler. Please use Expectation instead.")
 
         used_op = None
-        if backend == 'noiseless':
-            backend = None
-
-        if backend == 'noisy':
+        if backend == 'noiseless' or backend is None:
+            mode = quantum_context.get_quantum_concurrent_op_mode()
+            quantum_concurrent = False if use_cuquantum else mode
+            used_op = circuit_execution_ops.get_sampled_expectation_op(
+                backend=None,
+                use_cuquantum=use_cuquantum,
+                quantum_concurrent=quantum_concurrent,
+            )
+        elif backend == 'noisy':
+            if use_cuquantum:
+                raise ValueError('noisy backend does not currently support GPU')
             used_op = noisy_sampled_expectation_op.sampled_expectation
         else:
             used_op = circuit_execution_ops.get_sampled_expectation_op(
@@ -267,25 +280,31 @@ def call(self,
              symbol_values=None,
              operators=None,
              repetitions=None,
-             initializer=tf.keras.initializers.RandomUniform(0, 2 * np.pi)):
+             initializer=None):
         """Keras call function.
 
-        Input options:
-            `inputs`, `symbol_names`, `symbol_values`:
-                see `input_checks.expand_circuits`
-            `operators`: see `input_checks.expand_operators`
-            `repetitions`: a Python `int` or a pre-converted
-                `tf.Tensor` containing a single `int` entry.
-
-        Output shape:
+        Args:
+            inputs: See `input_checks.expand_circuits`.
+            symbol_names: See `input_checks.expand_circuits`.
+            symbol_values: See `input_checks.expand_circuits`.
+            operators: See `input_checks.expand_operators`.
+            repetitions: A Python `int` or a pre-converted `tf.Tensor`
+                containing a single `int` entry.
+            initializer: The keras initializer object for weights.
+                Defaults to `RandomUniform(0, 2 * np.pi)` when `None`.
+
+        Returns:
             `tf.Tensor` with shape [batch_size, n_ops] that holds the
-                expectation value for each circuit with each op applied to it
-                (after resolving the corresponding parameters in).
+            expectation value for each circuit with each op applied to it
+            (after resolving the corresponding parameters in).
         """
         values_empty = False
         if symbol_values is None:
             values_empty = True
 
+        if initializer is None:
+            initializer = tf.keras.initializers.RandomUniform(0, 2 * np.pi)
+
         inputs, symbol_names, symbol_values = input_checks.expand_circuits(
             inputs, symbol_names, symbol_values)
 
diff --git a/tensorflow_quantum/python/layers/circuit_executors/sampled_expectation_test.py b/tensorflow_quantum/python/layers/circuit_executors/sampled_expectation_test.py
index b027cfbe5..c13afbfd4 100644
--- a/tensorflow_quantum/python/layers/circuit_executors/sampled_expectation_test.py
+++ b/tensorflow_quantum/python/layers/circuit_executors/sampled_expectation_test.py
@@ -27,11 +27,14 @@
 import tensorflow as tf
 
 import cirq
+from tensorflow_quantum.core.ops import circuit_execution_ops
 from tensorflow_quantum.python.layers.circuit_executors import \
     sampled_expectation
 from tensorflow_quantum.python.differentiators import linear_combination
 from tensorflow_quantum.python import util
 
+RANDOM_SEED = 1234
+
 
 class CustomSampler(cirq.Sampler):
     """Wrapper for cirq.Simulator to confirm that custom samplers work."""
@@ -98,23 +101,40 @@ def simulate_sweep(self):
 
     @parameterized.parameters([
         {
-            'backend': 'noisy'
+            'backend': 'noisy',
+            'use_cuquantum': False,
+        },
+        {
+            'backend': 'noiseless',
+            'use_cuquantum': False,
         },
         {
-            'backend': 'noiseless'
+            'backend': 'noiseless',
+            'use_cuquantum': True,
         },
         {
-            'backend': cirq.Simulator()
+            'backend': cirq.Simulator(),
+            'use_cuquantum': False,
         },
         {
-            'backend': CustomSampler()
+            'backend': CustomSampler(),
+            'use_cuquantum': False,
         },
         {
-            'backend': None  # older API usage.
+            'backend': None,  # older API usage.
+            'use_cuquantum': False,
+        },
+        {
+            'backend': None,
+            'use_cuquantum': True,
         }
     ])
-    def test_sampled_expectation_type_inputs_error(self, backend):
+    def test_sampled_expectation_type_inputs_error(self, backend,
+                                                   use_cuquantum):
         """Test that SampledExpectation errors within Keras call."""
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
 
         bit = cirq.GridQubit(0, 0)
         symbol = sympy.Symbol('alpha')
@@ -125,44 +145,67 @@ def test_sampled_expectation_type_inputs_error(self, backend):
 
         with self.assertRaisesRegex(RuntimeError,
                                     expected_regex="repetitions not provided"):
-            sampled_expectation.SampledExpectation(backend=backend)(
-                symb_circuit,
-                symbol_names=[symbol],
-                symbol_values=[[0.5]],
-                operators=test_psum)
+            sampled_expectation.SampledExpectation(
+                backend=backend,
+                use_cuquantum=use_cuquantum,
+            )(symb_circuit,
+              symbol_names=[symbol],
+              symbol_values=[[0.5]],
+              operators=test_psum)
 
         with self.assertRaisesRegex(Exception,
                                     expected_regex="Unknown initializer"):
-            sampled_expectation.SampledExpectation(backend=backend)(
-                reg_circuit,
-                operators=test_psum,
-                initializer='junk',
-                repetitions=1)
+            sampled_expectation.SampledExpectation(
+                backend=backend,
+                use_cuquantum=use_cuquantum,
+            )(reg_circuit,
+              operators=test_psum,
+              initializer='junk',
+              repetitions=1)
 
         with self.assertRaisesRegex(Exception,
                                     expected_regex="cannot be parsed"):
-            sampled_expectation.SampledExpectation(backend=backend)(
-                reg_circuit, operators=test_psum, repetitions='junk')
+            sampled_expectation.SampledExpectation(
+                backend=backend,
+                use_cuquantum=use_cuquantum,
+            )(reg_circuit, operators=test_psum, repetitions='junk')
 
     @parameterized.parameters([
         {
-            'backend': 'noisy'
+            'backend': 'noisy',
+            'use_cuquantum': False,
+        },
+        {
+            'backend': 'noiseless',
+            'use_cuquantum': False,
+        },
+        {
+            'backend': 'noiseless',
+            'use_cuquantum': True,
         },
         {
-            'backend': 'noiseless'
+            'backend': cirq.Simulator(),
+            'use_cuquantum': False,
         },
         {
-            'backend': cirq.Simulator()
+            'backend': CustomSampler(),
+            'use_cuquantum': False,
         },
         {
-            'backend': CustomSampler()
+            'backend': None,  # older API usage.
+            'use_cuquantum': False,
         },
         {
-            'backend': None  # older API usage.
+            'backend': None,
+            'use_cuquantum': True,
         }
     ])
-    def test_sampled_expectation_op_error(self, backend):
+    def test_sampled_expectation_op_error(self, backend, use_cuquantum):
         """Test that expectation errors within underlying ops correctly."""
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+
         # Note the expected_regex is left blank here since there is a
         # discrepancy between the error strings provided between backends.
         bit = cirq.GridQubit(0, 0)
@@ -174,73 +217,101 @@ def test_sampled_expectation_op_error(self, backend):
 
         with self.assertRaisesRegex(Exception, expected_regex="pauli"):
             # Operators has wrong rank. Parse error.
-            sampled_expectation.SampledExpectation(backend=backend)(
-                [reg_circuit],
-                operators=util.convert_to_tensor([test_psum]),
-                repetitions=1)
+            sampled_expectation.SampledExpectation(
+                backend=backend,
+                use_cuquantum=use_cuquantum,
+            )([reg_circuit],
+              operators=util.convert_to_tensor([test_psum]),
+              repetitions=1)
 
         with self.assertRaisesRegex(Exception, expected_regex="symbol_values"):
             # symbol_values has wrong rank.
-            sampled_expectation.SampledExpectation(backend=backend)(
-                [symb_circuit],
-                symbol_names=[symbol],
-                symbol_values=[0.5],
-                operators=test_psum,
-                repetitions=1)
+            sampled_expectation.SampledExpectation(
+                backend=backend,
+                use_cuquantum=use_cuquantum,
+            )([symb_circuit],
+              symbol_names=[symbol],
+              symbol_values=[0.5],
+              operators=test_psum,
+              repetitions=1)
 
         with self.assertRaisesRegex(Exception, expected_regex="pauli"):
             # Wrong batch size for pauli operators.
-            sampled_expectation.SampledExpectation(backend=backend)(
-                symb_circuit,
-                symbol_names=[symbol],
-                operators=[[test_psum], [test_psum]],
-                repetitions=1)
+            sampled_expectation.SampledExpectation(
+                backend=backend,
+                use_cuquantum=use_cuquantum,
+            )(symb_circuit,
+              symbol_names=[symbol],
+              operators=[[test_psum], [test_psum]],
+              repetitions=1)
 
         with self.assertRaisesRegex(Exception, expected_regex="pauli"):
             # Wrong batch size for pauli operators.
-            sampled_expectation.SampledExpectation(backend=backend)(
-                reg_circuit,
-                operators=[[test_psum], [test_psum]],
-                repetitions=1)
+            sampled_expectation.SampledExpectation(
+                backend=backend,
+                use_cuquantum=use_cuquantum,
+            )(reg_circuit, operators=[[test_psum], [test_psum]], repetitions=1)
 
         with self.assertRaisesRegex(Exception, expected_regex="0"):
             # Wrong repetitions.
-            sampled_expectation.SampledExpectation(backend=backend)(
-                reg_circuit, operators=test_psum, repetitions=-1)
+            sampled_expectation.SampledExpectation(
+                backend=backend,
+                use_cuquantum=use_cuquantum,
+            )(reg_circuit, operators=test_psum, repetitions=-1)
 
         with self.assertRaisesRegex(Exception, expected_regex=""):
             # Wrong second dimension size for repetitions & pauli operators.
-            sampled_expectation.SampledExpectation(backend=backend)(
-                reg_circuit, operators=test_psum, repetitions=[5, 4, 3])
+            sampled_expectation.SampledExpectation(
+                backend=backend,
+                use_cuquantum=use_cuquantum,
+            )(reg_circuit, operators=test_psum, repetitions=[5, 4, 3])
 
         with self.assertRaisesRegex(Exception, expected_regex=""):
             # Wrong batch_size for symbol values.
-            sampled_expectation.SampledExpectation(backend=backend)(
-                [reg_circuit],
-                symbol_names=[symbol],
-                symbol_values=np.zeros((3, 1)),
-                operators=test_psum,
-                repetitions=5)
+            sampled_expectation.SampledExpectation(
+                backend=backend,
+                use_cuquantum=use_cuquantum,
+            )([reg_circuit],
+              symbol_names=[symbol],
+              symbol_values=np.zeros((3, 1)),
+              operators=test_psum,
+              repetitions=5)
 
     @parameterized.parameters([
         {
-            'backend': 'noisy'
+            'backend': 'noisy',
+            'use_cuquantum': False,
+        },
+        {
+            'backend': 'noiseless',
+            'use_cuquantum': False,
         },
         {
-            'backend': 'noiseless'
+            'backend': 'noiseless',
+            'use_cuquantum': True,
         },
         {
-            'backend': cirq.Simulator()
+            'backend': cirq.Simulator(),
+            'use_cuquantum': False,
         },
         {
-            'backend': CustomSampler()
+            'backend': CustomSampler(),
+            'use_cuquantum': False,
         },
         {
-            'backend': None  # older API usage.
+            'backend': None,  # older API usage.
+            'use_cuquantum': False,
+        },
+        {
+            'backend': None,
+            'use_cuquantum': True,
         }
     ])
-    def test_static_cases(self, backend):
+    def test_static_cases(self, backend, use_cuquantum):
         """Run inputs through in complex cases."""
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
 
         bit = cirq.GridQubit(0, 0)
         symbol = sympy.Symbol('alpha')
@@ -250,72 +321,102 @@ def test_static_cases(self, backend):
         reg_circuit = cirq.Circuit(cirq.H(bit))
 
         # Passing a 2d operators input requires a 1d circuit input.
-        sampled_expectation.SampledExpectation(backend=backend)(
-            [reg_circuit, reg_circuit],
-            operators=[[test_psum, test_psum], [test_psum, test_psum]],
-            repetitions=1)
+        sampled_expectation.SampledExpectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )([reg_circuit, reg_circuit],
+          operators=[[test_psum, test_psum], [test_psum, test_psum]],
+          repetitions=1)
 
         # Passing 2d operators along with other inputs.
-        sampled_expectation.SampledExpectation(backend=backend)(
-            [symb_circuit, symb_circuit],
-            symbol_names=[symbol],
-            operators=[[test_psum, test_psum], [test_psum, test_psum]],
-            repetitions=1)
-        sampled_expectation.SampledExpectation(backend=backend)(
-            [symb_circuit, symb_circuit],
-            symbol_names=[symbol],
-            symbol_values=[[0.5], [0.8]],
-            operators=[[test_psum, test_psum], [test_psum, test_psum]],
-            repetitions=1)
+        sampled_expectation.SampledExpectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )([symb_circuit, symb_circuit],
+          symbol_names=[symbol],
+          operators=[[test_psum, test_psum], [test_psum, test_psum]],
+          repetitions=1)
+        sampled_expectation.SampledExpectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )([symb_circuit, symb_circuit],
+          symbol_names=[symbol],
+          symbol_values=[[0.5], [0.8]],
+          operators=[[test_psum, test_psum], [test_psum, test_psum]],
+          repetitions=1)
 
         # Ensure tiling up of circuits works as expected.
-        sampled_expectation.SampledExpectation(backend=backend)(
-            reg_circuit, operators=test_psum, repetitions=1)
-        sampled_expectation.SampledExpectation(backend=backend)(
-            reg_circuit, operators=[test_psum, test_psum], repetitions=1)
+        sampled_expectation.SampledExpectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(reg_circuit, operators=test_psum, repetitions=1)
+        sampled_expectation.SampledExpectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(reg_circuit, operators=[test_psum, test_psum], repetitions=1)
 
         # Ensure tiling up of symbol_values works as expected.
-        sampled_expectation.SampledExpectation(backend=backend)(
-            symb_circuit,
-            symbol_names=[symbol],
-            symbol_values=[[0.5], [0.8]],
-            operators=test_psum,
-            repetitions=1)
-        sampled_expectation.SampledExpectation(backend=backend)(
-            symb_circuit,
-            symbol_names=[symbol],
-            symbol_values=[[0.5]],
-            operators=test_psum,
-            repetitions=1)
+        sampled_expectation.SampledExpectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(symb_circuit,
+          symbol_names=[symbol],
+          symbol_values=[[0.5], [0.8]],
+          operators=test_psum,
+          repetitions=1)
+        sampled_expectation.SampledExpectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(symb_circuit,
+          symbol_names=[symbol],
+          symbol_values=[[0.5]],
+          operators=test_psum,
+          repetitions=1)
 
         # Test multiple operators with integer valued repetition.
-        sampled_expectation.SampledExpectation(backend=backend)(
-            symb_circuit,
-            symbol_names=[symbol],
-            symbol_values=[[0.5]],
-            operators=[-1.0 * cirq.Z(bit),
-                       cirq.X(bit) + 2.0 * cirq.Z(bit)],
-            repetitions=1)
-        sampled_expectation.SampledExpectation(backend=backend)(
-            symb_circuit,
-            symbol_names=[symbol],
-            symbol_values=[[0.5]],
-            operators=[-1.0 * cirq.Z(bit),
-                       cirq.X(bit) + 2.0 * cirq.Z(bit)],
-            repetitions=[5, 1])
-
-    def test_sampled_expectation_simple_tf_train(self):
+        sampled_expectation.SampledExpectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(symb_circuit,
+          symbol_names=[symbol],
+          symbol_values=[[0.5]],
+          operators=[-1.0 * cirq.Z(bit),
+                     cirq.X(bit) + 2.0 * cirq.Z(bit)],
+          repetitions=1)
+        sampled_expectation.SampledExpectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(symb_circuit,
+          symbol_names=[symbol],
+          symbol_values=[[0.5]],
+          operators=[-1.0 * cirq.Z(bit),
+                     cirq.X(bit) + 2.0 * cirq.Z(bit)],
+          repetitions=[5, 1])
+
+    @parameterized.parameters([{
+        'use_cuquantum': False,
+    }, {
+        'use_cuquantum': True,
+    }])
+    def test_sampled_expectation_simple_tf_train(self, use_cuquantum):
         """Train a layer using standard tf (not keras)."""
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        tf.random.set_seed(RANDOM_SEED)
+        initializer = tf.keras.initializers.RandomUniform(0, 2 * np.pi)
         bit = cirq.GridQubit(0, 0)
         circuit = cirq.Circuit(cirq.rx(sympy.Symbol('theta'))(bit))
-        layer = sampled_expectation.SampledExpectation()
+        layer = sampled_expectation.SampledExpectation(
+            use_cuquantum=use_cuquantum)
         optimizer = tf.optimizers.Adam(learning_rate=0.05)
-        for _ in range(10):
+        for _ in range(20):
             with tf.GradientTape() as tape:
                 circuit_out = layer(circuit,
                                     symbol_names=['theta'],
                                     operators=cirq.Z(bit),
-                                    repetitions=100)
+                                    repetitions=1000,
+                                    initializer=initializer)
                 mse = tf.square(tf.reduce_sum(tf.subtract(circuit_out, -1)))
             grads = tape.gradient(mse, layer.trainable_weights)
             optimizer.apply_gradients(zip(grads, layer.trainable_weights))
@@ -326,13 +427,27 @@ class SampledExpectationFunctionalTests(parameterized.TestCase,
                                         tf.test.TestCase):
     """Test hybrid/integrated models that include a SampledExpectation layer."""
 
-    @parameterized.parameters([{'backend': 'noisy'}, {'backend': 'noiseless'}])
-    def test_simple_param_value_input(self, backend):
+    @parameterized.parameters([{
+        'backend': 'noisy',
+        'use_cuquantum': False,
+    }, {
+        'backend': 'noiseless',
+        'use_cuquantum': False,
+    }, {
+        'backend': 'noiseless',
+        'use_cuquantum': True,
+    }])
+    def test_simple_param_value_input(self, backend, use_cuquantum):
         """Train a densely connected hybrid model.
 
         This model will put a qubit in the zero or one state from a random state
         given the input zero or one.
         """
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        tf.random.set_seed(RANDOM_SEED)
+        initializer = tf.keras.initializers.RandomUniform(0, 2 * np.pi)
         bit = cirq.GridQubit(0, 0)
         symbols = sympy.symbols('x y z')
         circuit = _gen_single_bit_rotation_problem(
@@ -342,12 +457,15 @@ def test_simple_param_value_input(self, backend):
         datum = tf.keras.Input(shape=(), dtype=tf.dtypes.string)
         l1 = tf.keras.layers.Dense(10)(inputs)
         l2 = tf.keras.layers.Dense(3)(l1)
-        outputs = sampled_expectation.SampledExpectation(backend=backend)(
-            datum,
-            symbol_names=symbols,
-            operators=cirq.Z(bit),
-            symbol_values=l2,
-            repetitions=5000)
+        outputs = sampled_expectation.SampledExpectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(datum,
+          symbol_names=symbols,
+          operators=cirq.Z(bit),
+          symbol_values=l2,
+          repetitions=5000,
+          initializer=initializer)
         model = tf.keras.Model(inputs=[datum, inputs], outputs=outputs)
 
         data_in = np.array([[1], [0]], dtype=np.float32)
@@ -361,12 +479,26 @@ def test_simple_param_value_input(self, backend):
         history = model.fit(x=[circuits, data_in], y=data_out, epochs=30)
         self.assertAllClose(history.history['loss'][-1], 0, atol=0.3)
 
-    @parameterized.parameters([{'backend': 'noisy'}, {'backend': 'noiseless'}])
-    def test_simple_op_input(self, backend):
+    @parameterized.parameters([{
+        'backend': 'noisy',
+        'use_cuquantum': False,
+    }, {
+        'backend': 'noiseless',
+        'use_cuquantum': False,
+    }, {
+        'backend': 'noiseless',
+        'use_cuquantum': True,
+    }])
+    def test_simple_op_input(self, backend, use_cuquantum):
         """Test a simple operator input
 
         Learn qubit in the z+ state using two different measurement operators.
         """
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        tf.random.set_seed(RANDOM_SEED)
+        initializer = tf.keras.initializers.RandomUniform(0, 2 * np.pi)
         bit = cirq.GridQubit(0, 0)
         symbols = sympy.symbols('x y z')
         ops = util.convert_to_tensor([[cirq.Z(bit)], [cirq.Z(bit)]])
@@ -382,10 +514,13 @@ def test_simple_op_input(self, backend):
         n_inp = tf.keras.Input(shape=(1,), dtype=tf.dtypes.int32)
         circuit_inp = tf.keras.Input(shape=(), dtype=tf.dtypes.string)
         circuit_output = sampled_expectation.SampledExpectation(
-            backend=backend)(circuit_inp,
-                             symbol_names=symbols,
-                             operators=op_inp,
-                             repetitions=n_inp)
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(circuit_inp,
+          symbol_names=symbols,
+          operators=op_inp,
+          repetitions=n_inp,
+          initializer=initializer)
         model = tf.keras.Model(inputs=[circuit_inp, op_inp, n_inp],
                                outputs=[circuit_output])
 
@@ -400,13 +535,27 @@ def test_simple_op_input(self, backend):
 
         self.assertAllClose(history.history['loss'][-1], 0, atol=1e-2)
 
-    @parameterized.parameters([{'backend': 'noisy'}, {'backend': 'noiseless'}])
-    def test_simple_op_and_param_input(self, backend):
+    @parameterized.parameters([{
+        'backend': 'noisy',
+        'use_cuquantum': False,
+    }, {
+        'backend': 'noiseless',
+        'use_cuquantum': False,
+    }, {
+        'backend': 'noiseless',
+        'use_cuquantum': True,
+    }])
+    def test_simple_op_and_param_input(self, backend, use_cuquantum):
         """Test a simple operator and parameter input.
 
         Train a NN to put a qubit in the z+ or x+ states based on a classical
         binary input.
         """
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        tf.random.set_seed(RANDOM_SEED)
+        initializer = tf.keras.initializers.RandomUniform(0, 2 * np.pi)
         bit = cirq.GridQubit(0, 0)
         symbols = sympy.symbols('x y z')
         ops = util.convert_to_tensor([[cirq.Z(bit)], [cirq.Z(bit)]])
@@ -425,11 +574,14 @@ def test_simple_op_and_param_input(self, backend):
         dense_1 = tf.keras.layers.Dense(10)(data_inp)
         dense_2 = tf.keras.layers.Dense(3)(dense_1)
         circuit_output = sampled_expectation.SampledExpectation(
-            backend=backend)(circuit_inp,
-                             symbol_names=symbols,
-                             symbol_values=dense_2,
-                             operators=op_inp,
-                             repetitions=n_inp)
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(circuit_inp,
+          symbol_names=symbols,
+          symbol_values=dense_2,
+          operators=op_inp,
+          repetitions=n_inp,
+          initializer=initializer)
 
         functional_model = tf.keras.Model(
             inputs=[circuit_inp, data_inp, op_inp, n_inp],
@@ -444,13 +596,27 @@ def test_simple_op_and_param_input(self, backend):
                                        epochs=20)
         self.assertAllClose(history.history['loss'][-1], 0, atol=3)
 
-    @parameterized.parameters([{'backend': 'noisy'}, {'backend': 'noiseless'}])
-    def test_dnn_qnn_dnn(self, backend):
+    @parameterized.parameters([{
+        'backend': 'noisy',
+        'use_cuquantum': False,
+    }, {
+        'backend': 'noiseless',
+        'use_cuquantum': False,
+    }, {
+        'backend': 'noiseless',
+        'use_cuquantum': True,
+    }])
+    def test_dnn_qnn_dnn(self, backend, use_cuquantum):
         """Train a fully hybrid network using an SampledExpectation layer.
 
         Train the network to output +-5 given an input of 1 or 0. This tests
         that everything works when SampledExpectation layer is a middle layers.
         """
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        tf.random.set_seed(RANDOM_SEED)
+        initializer = tf.keras.initializers.RandomUniform(0, 2 * np.pi)
         bit = cirq.GridQubit(0, 0)
         symbols = sympy.symbols('x, y, z')
         circuits = util.convert_to_tensor([
@@ -464,12 +630,15 @@ def test_dnn_qnn_dnn(self, backend):
         circuit_input = tf.keras.Input(shape=(), dtype=tf.dtypes.string)
         d1 = tf.keras.layers.Dense(10)(classical_input)
         d2 = tf.keras.layers.Dense(3)(d1)
-        quantum = sampled_expectation.SampledExpectation(backend=backend)(
-            circuit_input,
-            symbol_names=symbols,
-            symbol_values=d2,
-            operators=cirq.Z(bit),
-            repetitions=5000)
+        quantum = sampled_expectation.SampledExpectation(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )(circuit_input,
+          symbol_names=symbols,
+          symbol_values=d2,
+          operators=cirq.Z(bit),
+          repetitions=5000,
+          initializer=initializer)
         d3 = tf.keras.layers.Dense(1)(quantum)
 
         model = tf.keras.Model(inputs=[circuit_input, classical_input],
diff --git a/tensorflow_quantum/python/layers/circuit_executors/state.py b/tensorflow_quantum/python/layers/circuit_executors/state.py
index f2b213ee1..456a83463 100644
--- a/tensorflow_quantum/python/layers/circuit_executors/state.py
+++ b/tensorflow_quantum/python/layers/circuit_executors/state.py
@@ -16,6 +16,7 @@
 import tensorflow as tf
 
 from tensorflow_quantum.core.ops import circuit_execution_ops
+from tensorflow_quantum.python import quantum_context
 from tensorflow_quantum.python.layers.circuit_executors import input_checks
 
 
@@ -112,7 +113,7 @@ class State(tf.keras.layers.Layer):
 
     """
 
-    def __init__(self, backend=None, **kwargs):
+    def __init__(self, backend=None, use_cuquantum=False, **kwargs):
         """Instantiate a State Layer.
 
         Create a layer that will simulate a quantum state and output it into
@@ -126,18 +127,35 @@ def __init__(self, backend=None, **kwargs):
                 `cirq.SimulatesFinalState`. Note that C++ Density Matrix
                 simulation is not yet supported so to do Density Matrix
                 simulation please use `cirq.DensityMatrixSimulator`.
+            use_cuquantum: Calls TFQ GPU version op.
         """
         super().__init__(**kwargs)
-        self.state_op = circuit_execution_ops.get_state_op(backend)
+
+        used_op = None
+        if backend == 'noiseless' or backend is None:
+            mode = quantum_context.get_quantum_concurrent_op_mode()
+            quantum_concurrent = False if use_cuquantum else mode
+            used_op = circuit_execution_ops.get_state_op(
+                backend=None,
+                use_cuquantum=use_cuquantum,
+                quantum_concurrent=quantum_concurrent,
+            )
+        elif backend == 'noisy':
+            raise ValueError('noisy backend is not supported in State layer.')
+        else:
+            used_op = circuit_execution_ops.get_state_op(backend=backend)
+
+        self.state_op = used_op
 
     def call(self, inputs, *, symbol_names=None, symbol_values=None):
         """Keras call function.
 
-        Input options:
-            `inputs`, `symbol_names`, `symbol_values`:
-                see `input_checks.expand_circuits`
+        Args:
+            inputs: See `input_checks.expand_circuits`.
+            symbol_names: See `input_checks.expand_circuits`.
+            symbol_values: See `input_checks.expand_circuits`.
 
-        Output shape:
+        Returns:
             `tf.RaggedTensor` with shape:
                 [batch size of symbol_values, <size of state>]
                     or
diff --git a/tensorflow_quantum/python/layers/circuit_executors/state_test.py b/tensorflow_quantum/python/layers/circuit_executors/state_test.py
index 8904014b7..21286cfb6 100644
--- a/tensorflow_quantum/python/layers/circuit_executors/state_test.py
+++ b/tensorflow_quantum/python/layers/circuit_executors/state_test.py
@@ -27,6 +27,7 @@
 import tensorflow as tf
 import cirq
 
+from tensorflow_quantum.core.ops import circuit_execution_ops
 from tensorflow_quantum.python.layers.circuit_executors import state
 from tensorflow_quantum.python import util
 
@@ -46,15 +47,24 @@ def test_state_create(self):
             state.State('junk')
 
     @parameterized.parameters([{
-        'backend': None
+        'backend': None,
+        'use_cuquantum': False,
     }, {
-        'backend': cirq.Simulator()
+        'backend': None,
+        'use_cuquantum': True,
     }, {
-        'backend': cirq.DensityMatrixSimulator()
+        'backend': cirq.Simulator(),
+        'use_cuquantum': False,
+    }, {
+        'backend': cirq.DensityMatrixSimulator(),
+        'use_cuquantum': False,
     }])
-    def test_state_invalid_combinations(self, backend):
+    def test_state_invalid_combinations(self, backend, use_cuquantum):
         """Test with valid type inputs and valid value, but incorrect combo."""
-        state_calc = state.State(backend)
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
+        state_calc = state.State(backend, use_cuquantum=use_cuquantum)
         symbol = sympy.Symbol('alpha')
         circuit = cirq.Circuit(cirq.H(cirq.GridQubit(0, 0))**symbol)
         with self.assertRaisesRegex(Exception, expected_regex=""):
@@ -110,18 +120,26 @@ def test_sample_outputs_simple(self):
 
     @parameterized.parameters([
         {
-            'backend_output': (None, WF_OUTPUT)
+            'backend_output': (None, WF_OUTPUT),
+            'use_cuquantum': False,
+        },
+        {
+            'backend_output': (None, WF_OUTPUT),
+            'use_cuquantum': True,
         },
         {
-            'backend_output': (cirq.sim.sparse_simulator.Simulator(), WF_OUTPUT)
+            'backend_output':
+                (cirq.sim.sparse_simulator.Simulator(), WF_OUTPUT),
+            'use_cuquantum': False,
         },
         {
             'backend_output':
                 (cirq.sim.density_matrix_simulator.DensityMatrixSimulator(),
-                 DM_OUTPUT)
+                 DM_OUTPUT),
+            'use_cuquantum': False,
         },
     ])
-    def test_state_output(self, backend_output):
+    def test_state_output(self, backend_output, use_cuquantum):
         """Check that any output type is as expected.
 
         This layer only allows for 2 different outputs, depending on whether a
@@ -129,9 +147,15 @@ def test_state_output(self, backend_output):
         post processing done inside the layers should not cause output from the
         layer to structurally deviate from what is expected.
         """
+        if use_cuquantum and not circuit_execution_ops.is_gpu_configured():
+            # GPU is not set. Ignores this sub-test.
+            self.skipTest("GPU is not set. Ignoring gpu tests...")
         backend = backend_output[0]
         output = backend_output[1]
-        state_executor = state.State(backend=backend)
+        state_executor = state.State(
+            backend=backend,
+            use_cuquantum=use_cuquantum,
+        )
         bits = cirq.GridQubit.rect(1, 2)
         circuit = cirq.Circuit()
         circuit.append(cirq.H.on(bits[0]))
diff --git a/tensorflow_quantum/python/layers/high_level/controlled_pqc.py b/tensorflow_quantum/python/layers/high_level/controlled_pqc.py
index a61b4b50e..7a781f852 100644
--- a/tensorflow_quantum/python/layers/high_level/controlled_pqc.py
+++ b/tensorflow_quantum/python/layers/high_level/controlled_pqc.py
@@ -128,6 +128,7 @@ def __init__(self,
                  *,
                  repetitions=None,
                  backend='noiseless',
+                 use_cuquantum=False,
                  differentiator=None,
                  **kwargs):
         """Instantiate this layer.
@@ -153,6 +154,8 @@ def __init__(self,
             `sampled_based` is True or it must inherit
             `cirq.sim.simulator.SimulatesExpectationValues` if `sample_based` is
             False.
+        use_cuquantum: Optional Python `bool` indicating whether or not to use
+            GPU ops
         differentiator: Optional `tfq.differentiator` object to specify how
             gradients of `model_circuit` should be calculated.
         """
@@ -235,10 +238,13 @@ def __init__(self,
 
         if self._analytic:
             self._layer = expectation.Expectation(backend=backend,
-                                                  differentiator=differentiator)
+                                                  differentiator=differentiator,
+                                                  use_cuquantum=use_cuquantum)
         else:
             self._layer = sampled_expectation.SampledExpectation(
-                backend=backend, differentiator=differentiator)
+                backend=backend,
+                differentiator=differentiator,
+                use_cuquantum=use_cuquantum)
 
         self._append_layer = elementary.AddCircuit()
 
diff --git a/tensorflow_quantum/python/layers/high_level/noisy_controlled_pqc.py b/tensorflow_quantum/python/layers/high_level/noisy_controlled_pqc.py
index 9a3b86eec..2d6565dfc 100644
--- a/tensorflow_quantum/python/layers/high_level/noisy_controlled_pqc.py
+++ b/tensorflow_quantum/python/layers/high_level/noisy_controlled_pqc.py
@@ -142,6 +142,7 @@ def __init__(self,
                  repetitions=None,
                  sample_based=None,
                  differentiator=None,
+                 use_cuquantum=False,
                  **kwargs):
         """Instantiate this layer.
 
@@ -163,6 +164,8 @@ def __init__(self,
             trajectory.
         differentiator: Optional `tfq.differentiator` object to specify how
             gradients of `model_circuit` should be calculated.
+        use_cuquantum: Optional `bool` indicating whether to use GPU for
+            simulation or not. Defaults to `False`. NOT IMPLEMENTED YET.
         """
         super().__init__(**kwargs)
         # Ingest model_circuit.
@@ -218,6 +221,11 @@ def __init__(self,
         if differentiator is None:
             differentiator = parameter_shift.ParameterShift()
 
+        # GPU (cuQuantum) execution is not supported for this layer yet.
+        if use_cuquantum:
+            raise NotImplementedError("GPU support for noisy controlled PQC "
+                                      "is not yet implemented.")
+
         # Ingest and promote sample based.
         if sample_based is None:
             raise ValueError("Please specify sample_based=False for analytic "
@@ -256,4 +264,4 @@ def call(self, inputs):
         tiled_up_repetitions = tf.tile(self._repetitions,
                                        [circuit_batch_dim, 1])
         return self._executor(model_appended, self._symbols, inputs[1],
-                              tiled_up_operators, tiled_up_repetitions)
+                              tiled_up_operators, tiled_up_repetitions)
diff --git a/tensorflow_quantum/python/layers/high_level/noisy_pqc.py b/tensorflow_quantum/python/layers/high_level/noisy_pqc.py
index 2c6796231..0c72668ce 100644
--- a/tensorflow_quantum/python/layers/high_level/noisy_pqc.py
+++ b/tensorflow_quantum/python/layers/high_level/noisy_pqc.py
@@ -139,6 +139,7 @@ def __init__(
             repetitions=None,
             sample_based=None,
             differentiator=None,
+            use_cuquantum=False,
             initializer=tf.keras.initializers.RandomUniform(0, 2 * np.pi),
             regularizer=None,
             constraint=None,
@@ -164,6 +165,8 @@ def __init__(
             trajectory.
         differentiator: Optional `tfq.differentiator` object to specify how
             gradients of `model_circuit` should be calculated.
+        use_cuquantum: Python `bool` indicating whether to use GPU ops
+            (currently not supported/implemented).
         initializer: Optional `tf.keras.initializer` object to specify how the
             symbols in `model_circuit` should be initialized when creating
             the managed variables.
@@ -220,6 +223,11 @@ def __init__(
             [[repetitions for _ in range(len(operators))]],
             dtype=tf.dtypes.int32)
 
+        # GPU (cuQuantum) execution is not supported for this layer yet.
+        if use_cuquantum:
+            raise NotImplementedError("GPU support for noisy PQC is not "
+                                      "yet implemented.")
+
         # Ingest differentiator.
         if differentiator is None:
             differentiator = parameter_shift.ParameterShift()
@@ -292,4 +300,4 @@ def call(self, inputs):
                                        [circuit_batch_dim, 1])
         return self._executor(model_appended, self._symbols,
                               tiled_up_parameters, tiled_up_operators,
-                              tiled_up_repetitions)
+                              tiled_up_repetitions)
diff --git a/tensorflow_quantum/python/layers/high_level/pqc.py b/tensorflow_quantum/python/layers/high_level/pqc.py
index 229ded921..a4c5c3f05 100644
--- a/tensorflow_quantum/python/layers/high_level/pqc.py
+++ b/tensorflow_quantum/python/layers/high_level/pqc.py
@@ -137,6 +137,7 @@ def __init__(
             *,
             repetitions=None,
             backend='noiseless',
+            use_cuquantum=False,
             differentiator=None,
             initializer=tf.keras.initializers.RandomUniform(0, 2 * np.pi),
             regularizer=None,
@@ -166,6 +167,8 @@ def __init__(
             `cirq.sim.simulator.SimulatesExpectationValues` if analytic
             expectations are desired or `cirq.Sampler` if sampled expectations
             are desired.
+        use_cuquantum: Optional Python `bool` indicating whether or not to use
+            GPU ops.
         differentiator: Optional `tfq.differentiator` object to specify how
             gradients of `model_circuit` should be calculated.
         initializer: Optional `tf.keras.initializer` object to specify how the
@@ -248,10 +251,14 @@ def __init__(
                             "cirq.sim.simulator.SimulatesExpectationValues.")
         if self._analytic:
             self._executor = expectation.Expectation(
-                backend=backend, differentiator=differentiator)
+                backend=backend,
+                differentiator=differentiator,
+                use_cuquantum=use_cuquantum)
         else:
             self._executor = sampled_expectation.SampledExpectation(
-                backend=backend, differentiator=differentiator)
+                backend=backend,
+                differentiator=differentiator,
+                use_cuquantum=use_cuquantum)
 
         self._append_layer = elementary.AddCircuit()
 
diff --git a/tensorflow_quantum/python/optimizers/rotosolve_minimizer_test.py b/tensorflow_quantum/python/optimizers/rotosolve_minimizer_test.py
index 0687f4ee3..8bdcd28a7 100755
--- a/tensorflow_quantum/python/optimizers/rotosolve_minimizer_test.py
+++ b/tensorflow_quantum/python/optimizers/rotosolve_minimizer_test.py
@@ -146,7 +146,7 @@ def convert_to_circuit(input_data):
         a, b = sympy.symbols('a b')  # parameters for the circuit
         circuit = cirq.Circuit(
             cirq.rx(a).on(q0),
-            cirq.ry(b).on(q1), cirq.CNOT(control=q0, target=q1))
+            cirq.ry(b).on(q1), cirq.CNOT(q0, q1))
 
         # Build the Keras model.
         model = tf.keras.Sequential([
diff --git a/tensorflow_quantum/python/optimizers/spsa_minimizer_test.py b/tensorflow_quantum/python/optimizers/spsa_minimizer_test.py
index 33dcb180e..a22a72079 100644
--- a/tensorflow_quantum/python/optimizers/spsa_minimizer_test.py
+++ b/tensorflow_quantum/python/optimizers/spsa_minimizer_test.py
@@ -249,7 +249,7 @@ def convert_to_circuit(input_data):
         a, b = sympy.symbols('a b')  # parameters for the circuit
         circuit = cirq.Circuit(
             cirq.rx(a).on(q0),
-            cirq.ry(b).on(q1), cirq.CNOT(control=q0, target=q1))
+            cirq.ry(b).on(q1), cirq.CNOT(q0, q1))
 
         # Build the Keras model.
         model = tf.keras.Sequential([
diff --git a/third_party/cuquantum/BUILD b/third_party/cuquantum/BUILD
new file mode 100644
index 000000000..e69de29bb
diff --git a/third_party/cuquantum/BUILD.tpl b/third_party/cuquantum/BUILD.tpl
new file mode 100644
index 000000000..0ec87701f
--- /dev/null
+++ b/third_party/cuquantum/BUILD.tpl
@@ -0,0 +1,23 @@
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "cuquantum_headers",
+    linkstatic = 1,
+    srcs = [":cuquantum_header_include"],
+    includes = ["include"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "libcuquantum",
+    srcs = [
+        ":libcustatevec.so",
+    ],
+    linkopts = [
+        "-Wl,-rpath,%{CUQUANTUM_LIBRARY_PATH}",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+%{CUQUANTUM_HEADER_GENRULE}
+%{CUSTATEVEC_SHARED_LIBRARY_GENRULE}
diff --git a/third_party/cuquantum/cuquantum_configure.bzl b/third_party/cuquantum/cuquantum_configure.bzl
new file mode 100644
index 000000000..1a301ebb0
--- /dev/null
+++ b/third_party/cuquantum/cuquantum_configure.bzl
@@ -0,0 +1,257 @@
+"""Setup cuQuantum as external dependency."""
+_CUQUANTUM_ROOT = "CUQUANTUM_ROOT"  # Name of the environment variable looked up for the cuQuantum install root.
+
+
+def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
+    """Expands //third_party/cuquantum:<tpl>.tpl with `substitutions`.
+
+    The result is written to `out`, or to a file named `tpl` if `out` is falsy.
+    """
+    destination = out if out else tpl
+    template_label = Label("//third_party/cuquantum:%s.tpl" % tpl)
+    repository_ctx.template(destination, template_label, substitutions)
+
+
+def _fail(msg):
+    """Aborts repository setup with `msg` behind a cuQuantum error banner."""
+    red = "\033[0;31m"
+    no_color = "\033[0m"
+    fail("%scuQuantum Configuration Error:%s %s\n" % (red, no_color, msg))
+
+
+def _warn(msg):
+    """Prints `msg` behind an ANSI-highlighted auto-configuration warning banner."""
+    highlight, reset = "\033[1;33m", "\033[0m"
+    banner_args = (highlight, reset, msg)
+    print("\n%sAuto-Configuration Warning:%s %s\n" % banner_args)
+
+
+def _execute(
+        repository_ctx,
+        cmdline,
+        error_msg = None,
+        error_details = None,
+        empty_stdout_fine = False):
+    """Runs a command and aborts configuration if it did not succeed.
+
+    The command counts as failed when it exits with a non-zero status, writes
+    anything to stderr, or (unless `empty_stdout_fine`) produces no stdout.
+
+    Args:
+      repository_ctx: the repository_ctx object
+      cmdline: list of strings, the command to execute
+      error_msg: string, a summary of the error if the command fails
+      error_details: string, details about the error or steps to fix it
+      empty_stdout_fine: bool, if True, an empty stdout result is fine
+
+    Returns:
+      the result of repository_ctx.execute(cmdline)
+    """
+    result = repository_ctx.execute(cmdline)
+    missing_output = not (empty_stdout_fine or result.stdout)
+    if result.return_code != 0 or result.stderr or missing_output:
+        summary = error_msg.strip() if error_msg else "Repository command failed"
+        _fail("\n".join([summary, result.stderr.strip(), error_details or ""]))
+    return result
+
+
+def _read_dir(repository_ctx, src_dir):
+    """Lists every regular file found under `src_dir`.
+
+    Runs `find <src_dir> -follow -type f` through _execute(), so subfolders
+    are traversed and symlinks are followed. An empty listing is accepted.
+
+    Returns:
+      The stdout of `find` as a single string, one file path per line.
+    """
+
+    # -follow dereferences symlinks; -type f keeps only regular files.
+    find_cmd = ["find", src_dir, "-follow", "-type", "f"]
+    listing = _execute(repository_ctx, find_cmd, empty_stdout_fine = True)
+    return listing.stdout
+
+
+def _find_file(repository_ctx, filename):
+    """Returns the path of the first non-hidden `filename` under "/", or an empty string."""
+    # `timeout 10` abandons the search after 10 seconds. The -not -path filter
+    # must precede -print/-quit because find evaluates left to right, and the
+    # pattern carries no quote characters because no shell is there to strip them.
+    cmd = ["timeout", "10", "find", "/", "-name", filename, "-not", "-path", "*/.*", "-print", "-quit"]
+    stdout = repository_ctx.execute(cmd).stdout
+    end = stdout.find(filename)
+    return stdout[:end + len(filename)] if end >= 0 else ""
+
+
+def _genrule(genrule_name, command, outs):
+    """Renders the BUILD-file text of a genrule target.
+
+    All three arguments are spliced into the text verbatim, so `outs` must
+    already be formatted as the quoted entries of a list.
+
+    Args:
+        genrule_name: A unique name for genrule target.
+        command: The command to run; it is placed inside a triple-quoted string.
+        outs: Text listing the files generated by this rule.
+
+    Returns:
+        A string containing the genrule definition.
+    """
+    name_attr = '    name = "' + genrule_name + '",\n'
+    outs_attr = "    outs = [\n" + outs + "\n    ],\n"
+    # The command sits between triple quotes, so it may span several lines.
+    cmd_attr = '    cmd = """\n' + command + '\n   """,\n'
+    return "".join([
+        "genrule(\n",
+        name_attr,
+        outs_attr,
+        cmd_attr,
+        ")\n",
+    ])
+
+def _norm_path(path):
+    """Returns path with backslashes turned into '/' and one trailing '/' removed."""
+    normalized = path.replace("\\", "/")
+    if normalized.endswith("/"):  # endswith() tolerates "", unlike indexing [-1].
+        return normalized[:-1]
+    return normalized
+
+
+def _symlink_genrule_for_dir(
+        repository_ctx,
+        src_dir,
+        dest_dir,
+        genrule_name,
+        src_files = [],
+        dest_files = [],
+        is_empty_genrule = False):
+    """Returns the text of a genrule that copies a set of files with `cp -f`.
+
+    If src_dir is passed, the files are discovered by listing that directory;
+    otherwise src_files and dest_files are used as given. If is_empty_genrule
+    is True, nothing is copied and the rule only touches a placeholder output.
+    Examples of generated rules:
+
+    ```
+    genrule(
+        name = "cuquantum_header_include",
+        outs = [
+            "include/custatevec.h",
+            "include/cutensornet.h",
+            "include/cutensornet/types.h",
+            "include/cutensornet/typesDistributed.h",
+        ],
+        cmd = [some copy command lines based on users' local environment],
+    )
+    genrule(
+        name = "libcustatevec.so",
+        outs = [
+            "libcustatevec.so",
+        ],
+        cmd = [some copy command lines based on users' local environment],
+    )
+    ```
+
+    Args:
+        repository_ctx: the repository_ctx object.
+        src_dir: source directory, or None to use src_files/dest_files.
+        dest_dir: directory prefix for the output paths of the copied files.
+        genrule_name: genrule name.
+        src_files: list of source files instead of src_dir.
+        dest_files: list of corresponding destination files.
+        is_empty_genrule: True to emit the placeholder rule; the caller sets
+            it when the header or library location is unknown.
+
+    Returns:
+        The genrule definition as a string.
+    """
+    if is_empty_genrule:
+        # Nothing to copy: the rule only touches a placeholder output so that
+        # the label referenced from BUILD.tpl still exists.
+        if dest_dir != "":
+            target_path = "%s/%s.h" % (dest_dir, genrule_name)
+        else:
+            target_path = genrule_name
+        return _genrule(genrule_name, "touch $(OUTS)", "'%s'" % target_path)
+
+    if src_dir != None:
+        src_dir = _norm_path(src_dir)
+        dest_dir = _norm_path(dest_dir)
+        src_files = sorted(_read_dir(repository_ctx, src_dir).splitlines())
+
+        # Strip only the leading src_dir: a global replace() would also remove
+        # later occurrences of the same text inside a path.
+        dest_files = [
+            path[len(src_dir):] if path.startswith(src_dir) else path
+            for path in src_files
+        ]
+
+    # If we have only one file to copy we do not want to use the dest_dir, as
+    # $(@D) then already expands to the directory containing that file.
+    single_output = len(dest_files) == 1
+    command = []
+    outs = []
+    for src, rel_dest in zip(src_files, dest_files):
+        if rel_dest == "":
+            continue
+        dest = "$(@D)/" + rel_dest if single_output else "$(@D)/" + dest_dir + rel_dest
+
+        # Copy the files to create a sandboxable setup.
+        command.append('cp -f "%s" "%s"' % (src, dest))
+        outs.append('        "' + dest_dir + rel_dest + '",')
+
+    return _genrule(genrule_name, " && ".join(command), "\n".join(outs))
+
+
+def _cuquantum_pip_impl(repository_ctx):
+    """Writes the cuQuantum BUILD file from CUQUANTUM_ROOT or a filesystem search."""
+    cuquantum_root = repository_ctx.os.environ.get(_CUQUANTUM_ROOT, "")
+    if cuquantum_root == "":
+        # CUQUANTUM_ROOT is unset or empty. Let's find the files lazily.
+        cuquantum_header_path = _find_file(repository_ctx, "custatevec.h").rpartition("/")[0]
+        custatevec_shared_library_path = _find_file(repository_ctx, "libcustatevec.so")
+        # Use the directory that really holds the library; slicing at a fixed
+        # "/lib/lib" marker yields a bogus root for layouts such as lib64/.
+        library_dir = custatevec_shared_library_path.rpartition("/")[0]
+        cuquantum_root = library_dir.rpartition("/")[0]
+        if custatevec_shared_library_path == "":
+            _warn("'CUQUANTUM_ROOT' environment variable is not set, no library was found too. If it is CPU mode, please ignore this warning")
+        else:
+            _warn("'CUQUANTUM_ROOT' environment variable is not set, using '%s' as default" % cuquantum_root)
+    else:
+        cuquantum_header_path = "%s/include" % cuquantum_root
+        library_dir = "%s/lib" % cuquantum_root
+        custatevec_shared_library_path = "%s/libcustatevec.so" % library_dir
+
+    # Placeholder genrules are emitted when either the headers or the library are missing.
+    is_empty_genrule = cuquantum_header_path == "" or custatevec_shared_library_path == ""
+    cuquantum_header_rule = _symlink_genrule_for_dir(
+        repository_ctx,
+        cuquantum_header_path,
+        "include",
+        "cuquantum_header_include",
+        is_empty_genrule = is_empty_genrule,
+    )
+    custatevec_shared_library_rule = _symlink_genrule_for_dir(
+        repository_ctx,
+        None,
+        "",
+        "libcustatevec.so",
+        [custatevec_shared_library_path],
+        ["libcustatevec.so"],
+        is_empty_genrule = is_empty_genrule,
+    )
+
+    # With no library directory known, the rpath value falls back to "<root>/lib".
+    _tpl(repository_ctx, "BUILD", {
+        "%{CUQUANTUM_LIBRARY_PATH}": library_dir or "%s/lib" % cuquantum_root,
+        "%{CUQUANTUM_HEADER_GENRULE}": cuquantum_header_rule,
+        "%{CUSTATEVEC_SHARED_LIBRARY_GENRULE}": custatevec_shared_library_rule,
+    })
+
+
+# Repository rule whose implementation generates the cuQuantum BUILD file.
+# CUQUANTUM_ROOT is declared in `environ` as a variable the rule depends on.
+cuquantum_configure = repository_rule(
+    environ = [_CUQUANTUM_ROOT],
+    implementation = _cuquantum_pip_impl,
+)