diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp
index 9570baad23..ad46884bf1 100644
--- a/source/adapters/cuda/command_buffer.cpp
+++ b/source/adapters/cuda/command_buffer.cpp
@@ -9,48 +9,143 @@
 //===----------------------------------------------------------------------===//
 
 #include "command_buffer.hpp"
+
 #include "common.hpp"
+#include "enqueue.hpp"
+#include "event.hpp"
+#include "kernel.hpp"
+#include "memory.hpp"
+#include "queue.hpp"
+
+#include <cstring>
+
+ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
+    ur_context_handle_t hContext, ur_device_handle_t hDevice)
+    : Context(hContext),
+      Device(hDevice), CudaGraph{nullptr}, CudaGraphExec{nullptr}, RefCount{1} {
+  urContextRetain(hContext);
+  urDeviceRetain(hDevice);
+}
+
+/// The ur_exp_command_buffer_handle_t_ destructor releases
+/// all the memory objects allocated for command_buffer management
+ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
+  // Release the memory allocated to the Context stored in the command_buffer
+  UR_TRACE(urContextRelease(Context));
+
+  // Release the device
+  UR_TRACE(urDeviceRelease(Device));
+
+  // Release the memory allocated to the CudaGraph
+  cuGraphDestroy(CudaGraph);
+
+  // Release the memory allocated to the CudaGraphExec
+  cuGraphExecDestroy(CudaGraphExec);
+}
 
-/// Stub implementations of UR experimental feature command-buffers
+/// Helper function for finding the CUDA graph nodes associated with the
+/// commands in a command-buffer; each node is pointed to by a sync-point in
+/// the wait list.
+///
+/// @param[in] CommandBuffer to lookup the nodes from.
+/// @param[in] NumSyncPointsInWaitList Length of \p SyncPointWaitList.
+/// @param[in] SyncPointWaitList List of sync points in \p CommandBuffer
+/// to find the nodes for.
+/// @param[out] CuNodesList Return parameter for the Cuda Nodes associated with
+/// each sync-point in \p SyncPointWaitList.
+///
+/// @return UR_RESULT_SUCCESS or an error code on failure
+static ur_result_t getNodesFromSyncPoints(
+    const ur_exp_command_buffer_handle_t &CommandBuffer,
+    size_t NumSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
+    std::vector<CUgraphNode> &CuNodesList) {
+  // Map of ur_exp_command_buffer_sync_point_t to CUgraphNode defining
+  // the node associated with each sync-point
+  auto SyncPoints = CommandBuffer->SyncPoints;
+
+  // For each sync-point add associated CUDA graph node to the return list.
+  for (size_t i = 0; i < NumSyncPointsInWaitList; i++) {
+    if (auto NodeHandle = SyncPoints.find(SyncPointWaitList[i]);
+        NodeHandle != SyncPoints.end()) {
+      CuNodesList.push_back(*NodeHandle->second.get());
+    } else {
+      return UR_RESULT_ERROR_INVALID_VALUE;
+    }
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+/// Set parameters for a general 1D memory copy.
+/// If the source and/or destination is on the device, SrcPtr and/or DstPtr
+/// must be a pointer to a CUdeviceptr
+static void setCopyParams(const void *SrcPtr, const CUmemorytype_enum SrcType,
+                          void *DstPtr, const CUmemorytype_enum DstType,
+                          size_t Size, CUDA_MEMCPY3D &Params) {
+  // Set all params to 0 first
+  std::memset(&Params, 0, sizeof(CUDA_MEMCPY3D));
+
+  Params.srcMemoryType = SrcType;
+  Params.srcDevice = SrcType == CU_MEMORYTYPE_DEVICE
+                         ? *static_cast<const CUdeviceptr *>(SrcPtr)
+                         : 0;
+  Params.srcHost = SrcType == CU_MEMORYTYPE_HOST ? SrcPtr : nullptr;
+  Params.dstMemoryType = DstType;
+  Params.dstDevice =
+      DstType == CU_MEMORYTYPE_DEVICE ? *static_cast<CUdeviceptr *>(DstPtr) : 0;
+  Params.dstHost = DstType == CU_MEMORYTYPE_HOST ? DstPtr : nullptr;
+  Params.WidthInBytes = Size;
+  Params.Height = 1;
+  Params.Depth = 1;
+}
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
     ur_context_handle_t hContext, ur_device_handle_t hDevice,
     const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
     ur_exp_command_buffer_handle_t *phCommandBuffer) {
-  (void)hContext;
-  (void)hDevice;
   (void)pCommandBufferDesc;
-  (void)phCommandBuffer;
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+
+  try {
+    *phCommandBuffer = new ur_exp_command_buffer_handle_t_(hContext, hDevice);
+  } catch (const std::bad_alloc &) {
+    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+  } catch (...) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+
+  try {
+    UR_CHECK_ERROR(cuGraphCreate(&(*phCommandBuffer)->CudaGraph, 0));
+  } catch (...) {
+    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+  }
+
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL
 urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
-  (void)hCommandBuffer;
-
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  hCommandBuffer->incrementReferenceCount();
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL
 urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
-  (void)hCommandBuffer;
+  if (hCommandBuffer->decrementReferenceCount() != 0)
+    return UR_RESULT_SUCCESS;
 
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  delete hCommandBuffer;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL
 urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
-  (void)hCommandBuffer;
-
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  try {
+    UR_CHECK_ERROR(cuGraphInstantiate(&hCommandBuffer->CudaGraphExec,
+                                      hCommandBuffer->CudaGraph, 0));
+  } catch (...) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
@@ -60,19 +155,85 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     uint32_t numSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
     ur_exp_command_buffer_sync_point_t *pSyncPoint) {
-  (void)hCommandBuffer;
-  (void)hKernel;
-  (void)workDim;
-  (void)pGlobalWorkOffset;
-  (void)pGlobalWorkSize;
-  (void)pLocalWorkSize;
-  (void)numSyncPointsInWaitList;
-  (void)pSyncPointWaitList;
-  (void)pSyncPoint;
+  // Preconditions
+  UR_ASSERT(hCommandBuffer->Context == hKernel->getContext(),
+            UR_RESULT_ERROR_INVALID_KERNEL);
+  UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  CUgraphNode GraphNode;
+
+  std::vector<CUgraphNode> DepsList;
+
+  UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                 pSyncPointWaitList, DepsList),
+          Result);
+
+  if (Result != UR_RESULT_SUCCESS) {
+    return Result;
+  }
+
+  if (*pGlobalWorkSize == 0) {
+    try {
+      // Create an empty node if the kernel workload size is zero
+      UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
+                                         DepsList.data(), DepsList.size()));
+
+      // Get sync point and register the cuNode with it.
+      *pSyncPoint = hCommandBuffer->AddSyncPoint(
+          std::make_shared<CUgraphNode>(GraphNode));
+    } catch (ur_result_t Err) {
+      Result = Err;
+    }
+    return Result;
+  }
+
+  // Set the number of threads per block to the number of threads per warp
+  // by default unless user has provided a better number
+  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
+  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  uint32_t LocalSize = hKernel->getLocalSize();
+  CUfunction CuFunc = hKernel->get();
+  Result =
+      setKernelParams(hCommandBuffer->Context, hCommandBuffer->Device, workDim,
+                      pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
+                      hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
+  if (Result != UR_RESULT_SUCCESS) {
+    return Result;
+  }
+
+  try {
+    // Set node param structure with the kernel related data
+    auto &ArgIndices = hKernel->getArgIndices();
+    CUDA_KERNEL_NODE_PARAMS NodeParams;
+    NodeParams.func = CuFunc;
+    NodeParams.gridDimX = BlocksPerGrid[0];
+    NodeParams.gridDimY = BlocksPerGrid[1];
+    NodeParams.gridDimZ = BlocksPerGrid[2];
+    NodeParams.blockDimX = ThreadsPerBlock[0];
+    NodeParams.blockDimY = ThreadsPerBlock[1];
+    NodeParams.blockDimZ = ThreadsPerBlock[2];
+    NodeParams.sharedMemBytes = LocalSize;
+    NodeParams.kernelParams = const_cast<void **>(ArgIndices.data());
+    NodeParams.extra = nullptr;
+
+    // Create and add a new kernel node to the Cuda graph
+    UR_CHECK_ERROR(cuGraphAddKernelNode(&GraphNode, hCommandBuffer->CudaGraph,
+                                        DepsList.data(), DepsList.size(),
+                                        &NodeParams));
+
+    if (LocalSize != 0)
+      hKernel->clearLocalSize();
+
+    // Get sync point and register the cuNode with it.
+    *pSyncPoint =
+        hCommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+  return Result;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp(
@@ -80,17 +241,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp(
     size_t size, uint32_t numSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
    ur_exp_command_buffer_sync_point_t *pSyncPoint) {
-  (void)hCommandBuffer;
-  (void)pDst;
-  (void)pSrc;
-  (void)size;
-  (void)numSyncPointsInWaitList;
-  (void)pSyncPointWaitList;
-  (void)pSyncPoint;
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  CUgraphNode GraphNode;
+  std::vector<CUgraphNode> DepsList;
+  UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                 pSyncPointWaitList, DepsList),
+          Result);
 
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  if (Result != UR_RESULT_SUCCESS) {
+    return Result;
+  }
+
+  try {
+    CUDA_MEMCPY3D NodeParams = {};
+    setCopyParams(pSrc, CU_MEMORYTYPE_HOST, pDst, CU_MEMORYTYPE_HOST, size,
+                  NodeParams);
+
+    UR_CHECK_ERROR(cuGraphAddMemcpyNode(
+        &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
+        &NodeParams, hCommandBuffer->Device->getContext()));
+
+    // Get sync point and register the cuNode with it.
+    *pSyncPoint =
+        hCommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+  return Result;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp(
@@ -99,19 +276,42 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp(
     uint32_t numSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
     ur_exp_command_buffer_sync_point_t *pSyncPoint) {
-  (void)hCommandBuffer;
-  (void)hSrcMem;
-  (void)hDstMem;
-  (void)srcOffset;
-  (void)dstOffset;
-  (void)size;
-  (void)numSyncPointsInWaitList;
-  (void)pSyncPointWaitList;
-  (void)pSyncPoint;
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  CUgraphNode GraphNode;
+  std::vector<CUgraphNode> DepsList;
+
+  UR_ASSERT(size + dstOffset <= std::get<BufferMem>(hDstMem->Mem).getSize(),
+            UR_RESULT_ERROR_INVALID_SIZE);
+  UR_ASSERT(size + srcOffset <= std::get<BufferMem>(hSrcMem->Mem).getSize(),
+            UR_RESULT_ERROR_INVALID_SIZE);
 
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                 pSyncPointWaitList, DepsList),
+          Result);
+
+  if (Result != UR_RESULT_SUCCESS) {
+    return Result;
+  }
+
+  try {
+    auto Src = std::get<BufferMem>(hSrcMem->Mem).get() + srcOffset;
+    auto Dst = std::get<BufferMem>(hDstMem->Mem).get() + dstOffset;
+
+    CUDA_MEMCPY3D NodeParams = {};
+    setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, &Dst, CU_MEMORYTYPE_DEVICE, size,
+                  NodeParams);
+
+    UR_CHECK_ERROR(cuGraphAddMemcpyNode(
+        &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
+        &NodeParams, hCommandBuffer->Device->getContext()));
+
+    // Get sync point and register the cuNode with it.
+ *pSyncPoint = + hCommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( @@ -122,23 +322,37 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hSrcMem; - (void)hDstMem; - (void)srcOrigin; - (void)dstOrigin; - (void)region; - (void)srcRowPitch; - (void)srcSlicePitch; - (void)dstRowPitch; - (void)dstSlicePitch; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_result_t Result = UR_RESULT_SUCCESS; + CUgraphNode GraphNode; + std::vector DepsList; + UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList), + Result); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + + try { + CUdeviceptr SrcPtr = std::get(hSrcMem->Mem).get(); + CUdeviceptr DstPtr = std::get(hDstMem->Mem).get(); + CUDA_MEMCPY3D NodeParams = {}; + + setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, + srcRowPitch, srcSlicePitch, &DstPtr, CU_MEMORYTYPE_DEVICE, + dstOrigin, dstRowPitch, dstSlicePitch, NodeParams); + + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. + *pSyncPoint = + hCommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT @@ -148,18 +362,35 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)offset; - (void)size; - (void)pSrc; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; + ur_result_t Result = UR_RESULT_SUCCESS; + CUgraphNode GraphNode; + std::vector DepsList; + UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList), + Result); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + + try { + auto Dst = std::get(hBuffer->Mem).get() + offset; + + CUDA_MEMCPY3D NodeParams = {}; + setCopyParams(pSrc, CU_MEMORYTYPE_HOST, &Dst, CU_MEMORYTYPE_DEVICE, size, + NodeParams); - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. 
+ *pSyncPoint = + hCommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT @@ -168,18 +399,35 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadExp( size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)offset; - (void)size; - (void)pDst; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; + ur_result_t Result = UR_RESULT_SUCCESS; + CUgraphNode GraphNode; + std::vector DepsList; + UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList), + Result); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + + try { + auto Src = std::get(hBuffer->Mem).get() + offset; + + CUDA_MEMCPY3D NodeParams = {}; + setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, pDst, CU_MEMORYTYPE_HOST, size, + NodeParams); - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. + *pSyncPoint = + hCommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT @@ -191,23 +439,37 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferWriteRectExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)bufferOffset; - (void)hostOffset; - (void)region; - (void)bufferRowPitch; - (void)bufferSlicePitch; - (void)hostRowPitch; - (void)hostSlicePitch; - (void)pSrc; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_result_t Result = UR_RESULT_SUCCESS; + CUgraphNode GraphNode; + std::vector DepsList; + UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList), + Result); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + + try { + CUdeviceptr DstPtr = std::get(hBuffer->Mem).get(); + CUDA_MEMCPY3D NodeParams = {}; + + setCopyRectParams(region, pSrc, CU_MEMORYTYPE_HOST, hostOffset, + hostRowPitch, hostSlicePitch, &DstPtr, + CU_MEMORYTYPE_DEVICE, bufferOffset, bufferRowPitch, + bufferSlicePitch, NodeParams); + + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. 
+ *pSyncPoint = + hCommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT @@ -219,37 +481,75 @@ ur_result_t UR_APICALL urCommandBufferAppendMembufferReadRectExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hBuffer; - (void)bufferOffset; - (void)hostOffset; - (void)region; - (void)bufferRowPitch; - (void)bufferSlicePitch; - (void)hostRowPitch; - (void)hostSlicePitch; - (void)pDst; - - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_result_t Result = UR_RESULT_SUCCESS; + CUgraphNode GraphNode; + std::vector DepsList; + UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList), + Result); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + + try { + CUdeviceptr SrcPtr = std::get(hBuffer->Mem).get(); + CUDA_MEMCPY3D NodeParams = {}; + + setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, bufferOffset, + bufferRowPitch, bufferSlicePitch, pDst, + CU_MEMORYTYPE_HOST, hostOffset, hostRowPitch, + hostSlicePitch, NodeParams); + + UR_CHECK_ERROR(cuGraphAddMemcpyNode( + &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), + &NodeParams, hCommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. + *pSyncPoint = + hCommandBuffer->AddSyncPoint(std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - (void)hCommandBuffer; - (void)hQueue; - (void)numEventsInWaitList; - (void)phEventWaitList; - (void)phEvent; - - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + std::unique_ptr RetImplEvent{nullptr}; + ScopedContext Active(hQueue->getContext()); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + + if ((Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)) != UR_RESULT_SUCCESS) { + return Result; + } + + if (phEvent) { + RetImplEvent = std::unique_ptr( + ur_event_handle_t_::makeNative(UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, + hQueue, CuStream, StreamToken)); + UR_CHECK_ERROR(RetImplEvent->start()); + } + + // Launch graph + UR_CHECK_ERROR(cuGraphLaunch(hCommandBuffer->CudaGraphExec, CuStream)); + + if (phEvent) { + UR_CHECK_ERROR(RetImplEvent->record()); + *phEvent = RetImplEvent.release(); + } + } catch (ur_result_t Err) { + Result = Err; + } + + return Result; } diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp index 0fe0be0a6d..4ceab42062 100644 --- a/source/adapters/cuda/command_buffer.hpp +++ b/source/adapters/cuda/command_buffer.hpp @@ -9,7 +9,222 @@ //===----------------------------------------------------------------------===// #include +#include -/// Stub implementation of 
command-buffers for CUDA +#include "context.hpp" +#include +#include -struct ur_exp_command_buffer_handle_t_ {}; +static inline const char *getUrResultString(ur_result_t Result) { + switch (Result) { + case UR_RESULT_SUCCESS: + return "UR_RESULT_SUCCESS"; + case UR_RESULT_ERROR_INVALID_OPERATION: + return "UR_RESULT_ERROR_INVALID_OPERATION"; + case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES: + return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES"; + case UR_RESULT_ERROR_INVALID_QUEUE: + return "UR_RESULT_ERROR_INVALID_QUEUE"; + case UR_RESULT_ERROR_INVALID_VALUE: + return "UR_RESULT_ERROR_INVALID_VALUE"; + case UR_RESULT_ERROR_INVALID_CONTEXT: + return "UR_RESULT_ERROR_INVALID_CONTEXT"; + case UR_RESULT_ERROR_INVALID_PLATFORM: + return "UR_RESULT_ERROR_INVALID_PLATFORM"; + case UR_RESULT_ERROR_INVALID_BINARY: + return "UR_RESULT_ERROR_INVALID_BINARY"; + case UR_RESULT_ERROR_INVALID_PROGRAM: + return "UR_RESULT_ERROR_INVALID_PROGRAM"; + case UR_RESULT_ERROR_INVALID_SAMPLER: + return "UR_RESULT_ERROR_INVALID_SAMPLER"; + case UR_RESULT_ERROR_INVALID_BUFFER_SIZE: + return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE"; + case UR_RESULT_ERROR_INVALID_MEM_OBJECT: + return "UR_RESULT_ERROR_INVALID_MEM_OBJECT"; + case UR_RESULT_ERROR_INVALID_EVENT: + return "UR_RESULT_ERROR_INVALID_EVENT"; + case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST"; + case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET: + return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET"; + case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE: + return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE"; + case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE: + return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE"; + case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE: + return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE"; + case UR_RESULT_ERROR_DEVICE_NOT_FOUND: + return "UR_RESULT_ERROR_DEVICE_NOT_FOUND"; + case UR_RESULT_ERROR_INVALID_DEVICE: + return "UR_RESULT_ERROR_INVALID_DEVICE"; + case UR_RESULT_ERROR_DEVICE_LOST: + return "UR_RESULT_ERROR_DEVICE_LOST"; + case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET: + return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET"; + case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE: + return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE"; + case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED: + return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED"; + case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT: + return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT"; + case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE: + return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE"; + case UR_RESULT_ERROR_INVALID_WORK_DIMENSION: + return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION"; + case UR_RESULT_ERROR_INVALID_KERNEL_ARGS: + return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS"; + case UR_RESULT_ERROR_INVALID_KERNEL: + return "UR_RESULT_ERROR_INVALID_KERNEL"; + case UR_RESULT_ERROR_INVALID_KERNEL_NAME: + return "UR_RESULT_ERROR_INVALID_KERNEL_NAME"; + case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX: + return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX"; + case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE: + return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE"; + case UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE: + return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE"; + case UR_RESULT_ERROR_INVALID_IMAGE_SIZE: + return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE"; + case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: + return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED: + return 
"UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED"; + case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE: + return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE"; + case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE: + return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE"; + case UR_RESULT_ERROR_UNINITIALIZED: + return "UR_RESULT_ERROR_UNINITIALIZED"; + case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY: + return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY"; + case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY: + return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY"; + case UR_RESULT_ERROR_OUT_OF_RESOURCES: + return "UR_RESULT_ERROR_OUT_OF_RESOURCES"; + case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE: + return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE"; + case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE: + return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE"; + case UR_RESULT_ERROR_UNSUPPORTED_VERSION: + return "UR_RESULT_ERROR_UNSUPPORTED_VERSION"; + case UR_RESULT_ERROR_UNSUPPORTED_FEATURE: + return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE"; + case UR_RESULT_ERROR_INVALID_ARGUMENT: + return "UR_RESULT_ERROR_INVALID_ARGUMENT"; + case UR_RESULT_ERROR_INVALID_NULL_HANDLE: + return "UR_RESULT_ERROR_INVALID_NULL_HANDLE"; + case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE: + return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE"; + case UR_RESULT_ERROR_INVALID_NULL_POINTER: + return "UR_RESULT_ERROR_INVALID_NULL_POINTER"; + case UR_RESULT_ERROR_INVALID_SIZE: + return "UR_RESULT_ERROR_INVALID_SIZE"; + case UR_RESULT_ERROR_UNSUPPORTED_SIZE: + return "UR_RESULT_ERROR_UNSUPPORTED_SIZE"; + case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT: + return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT"; + case UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT: + return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT"; + case UR_RESULT_ERROR_INVALID_ENUMERATION: + return "UR_RESULT_ERROR_INVALID_ENUMERATION"; + case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION: + return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION"; + case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT: + return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT"; + case UR_RESULT_ERROR_INVALID_NATIVE_BINARY: + return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY"; + case UR_RESULT_ERROR_INVALID_GLOBAL_NAME: + return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME"; + case UR_RESULT_ERROR_INVALID_FUNCTION_NAME: + return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME"; + case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION: + return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION"; + case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION: + return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION"; + case UR_RESULT_ERROR_PROGRAM_UNLINKED: + return "UR_RESULT_ERROR_PROGRAM_UNLINKED"; + case UR_RESULT_ERROR_OVERLAPPING_REGIONS: + return "UR_RESULT_ERROR_OVERLAPPING_REGIONS"; + case UR_RESULT_ERROR_INVALID_HOST_PTR: + return "UR_RESULT_ERROR_INVALID_HOST_PTR"; + case UR_RESULT_ERROR_INVALID_USM_SIZE: + return "UR_RESULT_ERROR_INVALID_USM_SIZE"; + case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE: + return "UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE"; + case UR_RESULT_ERROR_ADAPTER_SPECIFIC: + return "UR_RESULT_ERROR_ADAPTER_SPECIFIC"; + default: + return "UR_RESULT_ERROR_UNKNOWN"; + } +} + +// Trace an internal UR call +#define UR_TRACE(Call) \ + { \ + ur_result_t Result; \ + UR_CALL(Call, Result); \ + } + +// Trace an internal UR call and return the result to the user. 
+#define UR_CALL(Call, Result)                                                 \
+  {                                                                           \
+    if (PrintTrace)                                                           \
+      fprintf(stderr, "UR ---> %s\n", #Call);                                 \
+    Result = (Call);                                                          \
+    if (PrintTrace)                                                           \
+      fprintf(stderr, "UR <--- %s(%s)\n", #Call, getUrResultString(Result));  \
+  }
+
+struct ur_exp_command_buffer_handle_t_ {
+
+  ur_exp_command_buffer_handle_t_(ur_context_handle_t hContext,
+                                  ur_device_handle_t hDevice);
+
+  ~ur_exp_command_buffer_handle_t_();
+
+  void RegisterSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint,
+                         std::shared_ptr<CUgraphNode> CuNode) {
+    SyncPoints[SyncPoint] = CuNode;
+    NextSyncPoint++;
+  }
+
+  ur_exp_command_buffer_sync_point_t GetNextSyncPoint() const {
+    return NextSyncPoint;
+  }
+
+  // Helper to register next sync point
+  // @param CuNode Node to register as next sync point
+  // @return Sync point associated with the registered node
+  ur_exp_command_buffer_sync_point_t
+  AddSyncPoint(std::shared_ptr<CUgraphNode> CuNode) {
+    ur_exp_command_buffer_sync_point_t SyncPoint = NextSyncPoint;
+    RegisterSyncPoint(SyncPoint, CuNode);
+    return SyncPoint;
+  }
+
+  // UR context associated with this command-buffer
+  ur_context_handle_t Context;
+  // Device associated with this command buffer
+  ur_device_handle_t Device;
+  // Cuda Graph handle
+  CUgraph CudaGraph;
+  // Cuda Graph Exec handle
+  CUgraphExec CudaGraphExec;
+  // Atomic variable counting the number of references to this command_buffer
+  // using std::atomic prevents data races when incrementing/decrementing.
+  std::atomic_uint32_t RefCount;
+
+  // Map of sync_points to CUDA graph nodes
+  std::unordered_map<ur_exp_command_buffer_sync_point_t,
+                     std::shared_ptr<CUgraphNode>>
+      SyncPoints;
+  // Next sync_point value (may need to consider ways to reuse values if 32-bits
+  // is not enough)
+  ur_exp_command_buffer_sync_point_t NextSyncPoint;
+
+  // Used when retaining an object.
+  uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
+  // Used when releasing an object.
+  uint32_t decrementReferenceCount() noexcept { return --RefCount; }
+  uint32_t getReferenceCount() const noexcept { return RefCount; }
+};
diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
index 0c00210eb2..76984ca744 100644
--- a/source/adapters/cuda/device.cpp
+++ b/source/adapters/cuda/device.cpp
@@ -613,6 +613,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     std::string SupportedExtensions = "cl_khr_fp64 cl_khr_subgroups ";
     SupportedExtensions += "pi_ext_intel_devicelib_assert ";
+    // Report support for the UR command-buffer experimental feature
+    SupportedExtensions += "ur_exp_command_buffer ";
     SupportedExtensions += " ";
 
     int Major = 0;
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 1022be1f09..6e6515908e 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -187,6 +187,146 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device,
   return BlockSize * Kernel->getRegsPerThread() > Device->getMaxRegsPerBlock();
 }
 
+// Helper to compute kernel parameters from workload
+// dimensions.
+// @param [in] Context handler to the target Context +// @param [in] Device handler to the target Device +// @param [in] WorkDim workload dimension +// @param [in] GlobalWorkOffset pointer workload global offsets +// @param [in] LocalWorkOffset pointer workload local offsets +// @param [inout] Kernel handler to the kernel +// @param [inout] CuFunc handler to the cuda function attached to the kernel +// @param [out] ThreadsPerBlock Number of threads per block we should run +// @param [out] BlocksPerGrid Number of blocks per grid we should run +ur_result_t +setKernelParams(const ur_context_handle_t Context, + const ur_device_handle_t Device, const uint32_t WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel, + CUfunction &CuFunc, size_t (&ThreadsPerBlock)[3], + size_t (&BlocksPerGrid)[3]) { + ur_result_t Result = UR_RESULT_SUCCESS; + size_t MaxWorkGroupSize = 0u; + size_t MaxThreadsPerBlock[3] = {}; + bool ProvidedLocalWorkGroupSize = LocalWorkSize != nullptr; + uint32_t LocalSize = Kernel->getLocalSize(); + + try { + // Set the active context here as guessLocalWorkSize needs an active context + ScopedContext Active(Context); + { + size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock; + MaxWorkGroupSize = Device->getMaxWorkGroupSize(); + Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), + MaxThreadsPerBlock); + + if (ProvidedLocalWorkGroupSize) { + auto IsValid = [&](int Dim) { + if (ReqdThreadsPerBlock[Dim] != 0 && + LocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + + if (LocalWorkSize[Dim] > MaxThreadsPerBlock[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + // Checks that local work sizes are a divisor of the global work sizes + // which includes that the local work sizes are neither larger than + // the global work sizes and not 0. + if (0u == LocalWorkSize[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + if (0u != (GlobalWorkSize[Dim] % LocalWorkSize[Dim])) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + ThreadsPerBlock[Dim] = LocalWorkSize[Dim]; + return UR_RESULT_SUCCESS; + }; + + size_t KernelLocalWorkGroupSize = 0; + for (size_t Dim = 0; Dim < WorkDim; Dim++) { + auto Err = IsValid(Dim); + if (Err != UR_RESULT_SUCCESS) + return Err; + // If no error then sum the total local work size per dim. 
+ KernelLocalWorkGroupSize += LocalWorkSize[Dim]; + } + + if (hasExceededMaxRegistersPerBlock(Device, Kernel, + KernelLocalWorkGroupSize)) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + } else { + guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim, + MaxThreadsPerBlock, Kernel, LocalSize); + } + } + + if (MaxWorkGroupSize < + ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2]) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + + for (size_t i = 0; i < WorkDim; i++) { + BlocksPerGrid[i] = + (GlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i]; + } + + // Set the implicit global offset parameter if kernel has offset variant + if (Kernel->get_with_offset_parameter()) { + std::uint32_t CudaImplicitOffset[3] = {0, 0, 0}; + if (GlobalWorkOffset) { + for (size_t i = 0; i < WorkDim; i++) { + CudaImplicitOffset[i] = + static_cast(GlobalWorkOffset[i]); + if (GlobalWorkOffset[i] != 0) { + CuFunc = Kernel->get_with_offset_parameter(); + } + } + } + Kernel->setImplicitOffsetArg(sizeof(CudaImplicitOffset), + CudaImplicitOffset); + } + + if (Context->getDevice()->maxLocalMemSizeChosen()) { + // Set up local memory requirements for kernel. + auto Device = Context->getDevice(); + if (Device->getMaxChosenLocalMem() < 0) { + bool EnvVarHasURPrefix = + std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr; + setErrorMessage(EnvVarHasURPrefix ? "Invalid value specified for " + "UR_CUDA_MAX_LOCAL_MEM_SIZE" + : "Invalid value specified for " + "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + if (LocalSize > static_cast(Device->getMaxCapacityLocalMem())) { + setErrorMessage("Too much local memory allocated for device", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + if (LocalSize > static_cast(Device->getMaxChosenLocalMem())) { + bool EnvVarHasURPrefix = + std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr; + setErrorMessage( + EnvVarHasURPrefix + ? "Local memory for kernel exceeds the amount requested using " + "UR_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the value of " + "UR_CUDA_MAX_LOCAL_MEM_SIZE." + : "Local memory for kernel exceeds the amount requested using " + "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the the " + "value of SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE.", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + UR_CHECK_ERROR(cuFuncSetAttribute( + CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + Device->getMaxChosenLocalMem())); + } + + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + /// Enqueues a wait on the given CUstream for all specified events (See /// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued /// wait will wait on all previous events in the queue. 
@@ -291,100 +431,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Set the number of threads per block to the number of threads per warp // by default unless user has provided a better number size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; - size_t MaxWorkGroupSize = 0u; - size_t MaxThreadsPerBlock[3] = {}; - bool ProvidedLocalWorkGroupSize = (pLocalWorkSize != nullptr); + size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + uint32_t LocalSize = hKernel->getLocalSize(); ur_result_t Result = UR_RESULT_SUCCESS; + CUfunction CuFunc = hKernel->get(); - try { - // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext Active(hQueue->getContext()); - { - size_t *ReqdThreadsPerBlock = hKernel->ReqdThreadsPerBlock; - MaxWorkGroupSize = hQueue->Device->getMaxWorkGroupSize(); - hQueue->Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), - MaxThreadsPerBlock); - - if (ProvidedLocalWorkGroupSize) { - auto IsValid = [&](int Dim) { - if (ReqdThreadsPerBlock[Dim] != 0 && - pLocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim]) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - - if (pLocalWorkSize[Dim] > MaxThreadsPerBlock[Dim]) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - // Checks that local work sizes are a divisor of the global work sizes - // which includes that the local work sizes are neither larger than - // the global work sizes and not 0. - if (0u == pLocalWorkSize[Dim]) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - if (0u != (pGlobalWorkSize[Dim] % pLocalWorkSize[Dim])) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - ThreadsPerBlock[Dim] = pLocalWorkSize[Dim]; - return UR_RESULT_SUCCESS; - }; - - size_t KernelLocalWorkGroupSize = 0; - for (size_t Dim = 0; Dim < workDim; Dim++) { - auto Err = IsValid(Dim); - if (Err != UR_RESULT_SUCCESS) - return Err; - // If no error then sum the total local work size per dim. 
- KernelLocalWorkGroupSize += pLocalWorkSize[Dim]; - } - - if (hasExceededMaxRegistersPerBlock(hQueue->Device, hKernel, - KernelLocalWorkGroupSize)) { - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - } else { - guessLocalWorkSize(hQueue->Device, ThreadsPerBlock, pGlobalWorkSize, - workDim, MaxThreadsPerBlock, hKernel, LocalSize); - } - } - - if (MaxWorkGroupSize < - ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2]) { - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - - size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - - for (size_t i = 0; i < workDim; i++) { - BlocksPerGrid[i] = - (pGlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i]; - } + Result = setKernelParams(hQueue->getContext(), hQueue->Device, workDim, + pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + try { std::unique_ptr RetImplEvent{nullptr}; uint32_t StreamToken; ur_stream_guard_ Guard; CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - CUfunction CuFunc = hKernel->get(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - // Set the implicit global offset parameter if kernel has offset variant - if (hKernel->get_with_offset_parameter()) { - std::uint32_t CudaImplicitOffset[3] = {0, 0, 0}; - if (pGlobalWorkOffset) { - for (size_t i = 0; i < workDim; i++) { - CudaImplicitOffset[i] = - static_cast(pGlobalWorkOffset[i]); - if (pGlobalWorkOffset[i] != 0) { - CuFunc = hKernel->get_with_offset_parameter(); - } - } - } - hKernel->setImplicitOffsetArg(sizeof(CudaImplicitOffset), - CudaImplicitOffset); - } - - auto &ArgIndices = hKernel->getArgIndices(); - if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( @@ -392,47 +462,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->start()); } - if (hQueue->getContext()->getDevice()->maxLocalMemSizeChosen()) { - // Set up local memory requirements for kernel. - auto Device = hQueue->getContext()->getDevice(); - if (Device->getMaxChosenLocalMem() < 0) { - bool EnvVarHasURPrefix = - (std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr); - setErrorMessage(EnvVarHasURPrefix ? "Invalid value specified for " - "UR_CUDA_MAX_LOCAL_MEM_SIZE" - : "Invalid value specified for " - "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - if (LocalSize > static_cast(Device->getMaxCapacityLocalMem())) { - setErrorMessage("Too much local memory allocated for device", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - if (LocalSize > static_cast(Device->getMaxChosenLocalMem())) { - bool EnvVarHasURPrefix = - (std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE") != nullptr); - setErrorMessage( - EnvVarHasURPrefix - ? "Local memory for kernel exceeds the amount requested using " - "UR_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the value of " - "UR_CUDA_MAX_LOCAL_MEM_SIZE." - : "Local memory for kernel exceeds the amount requested using " - "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE. 
Try increasing the the " - "value of SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE.", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - UR_CHECK_ERROR(cuFuncSetAttribute( - CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - Device->getMaxChosenLocalMem())); - } - + auto &ArgIndices = hKernel->getArgIndices(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, CuStream, const_cast(ArgIndices.data()), nullptr)); + if (LocalSize != 0) hKernel->clearLocalSize(); @@ -440,29 +475,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } + } catch (ur_result_t Err) { Result = Err; } return Result; } -/// General 3D memory copy operation. -/// This function requires the corresponding CUDA context to be at the top of -/// the context stack +/// Set parameters for general 3D memory copy. /// If the source and/or destination is on the device, SrcPtr and/or DstPtr /// must be a pointer to a CUdeviceptr -static ur_result_t commonEnqueueMemBufferCopyRect( - CUstream cu_stream, ur_rect_region_t region, const void *SrcPtr, - const CUmemorytype_enum SrcType, ur_rect_offset_t src_offset, - size_t src_row_pitch, size_t src_slice_pitch, void *DstPtr, - const CUmemorytype_enum DstType, ur_rect_offset_t dst_offset, - size_t dst_row_pitch, size_t dst_slice_pitch) { - - UR_ASSERT(SrcType == CU_MEMORYTYPE_DEVICE || SrcType == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(DstType == CU_MEMORYTYPE_DEVICE || DstType == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - +void setCopyRectParams(ur_rect_region_t region, const void *SrcPtr, + const CUmemorytype_enum SrcType, + ur_rect_offset_t src_offset, size_t src_row_pitch, + size_t src_slice_pitch, void *DstPtr, + const CUmemorytype_enum DstType, + ur_rect_offset_t dst_offset, size_t dst_row_pitch, + size_t dst_slice_pitch, CUDA_MEMCPY3D ¶ms) { src_row_pitch = (!src_row_pitch) ? region.width + src_offset.x : src_row_pitch; src_slice_pitch = (!src_slice_pitch) @@ -474,8 +503,6 @@ static ur_result_t commonEnqueueMemBufferCopyRect( ? ((region.height + dst_offset.y) * dst_row_pitch) : dst_slice_pitch; - CUDA_MEMCPY3D params = {}; - params.WidthInBytes = region.width; params.Height = region.height; params.Depth = region.depth; @@ -500,6 +527,29 @@ static ur_result_t commonEnqueueMemBufferCopyRect( params.dstZ = dst_offset.z; params.dstPitch = dst_row_pitch; params.dstHeight = dst_slice_pitch / dst_row_pitch; +} + +/// General 3D memory copy operation. 
+/// This function requires the corresponding CUDA context to be at the top of +/// the context stack +/// If the source and/or destination is on the device, SrcPtr and/or DstPtr +/// must be a pointer to a CUdeviceptr +static ur_result_t commonEnqueueMemBufferCopyRect( + CUstream cu_stream, ur_rect_region_t region, const void *SrcPtr, + const CUmemorytype_enum SrcType, ur_rect_offset_t src_offset, + size_t src_row_pitch, size_t src_slice_pitch, void *DstPtr, + const CUmemorytype_enum DstType, ur_rect_offset_t dst_offset, + size_t dst_row_pitch, size_t dst_slice_pitch) { + UR_ASSERT(SrcType == CU_MEMORYTYPE_DEVICE || SrcType == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(DstType == CU_MEMORYTYPE_DEVICE || DstType == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + CUDA_MEMCPY3D params = {}; + + setCopyRectParams(region, SrcPtr, SrcType, src_offset, src_row_pitch, + src_slice_pitch, DstPtr, DstType, dst_offset, dst_row_pitch, + dst_slice_pitch, params); UR_CHECK_ERROR(cuMemcpy3DAsync(¶ms, cu_stream)); diff --git a/source/adapters/cuda/enqueue.hpp b/source/adapters/cuda/enqueue.hpp index 7226a5536b..64c590f742 100644 --- a/source/adapters/cuda/enqueue.hpp +++ b/source/adapters/cuda/enqueue.hpp @@ -16,3 +16,28 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList); + +void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock, + const size_t *GlobalWorkSize, const uint32_t WorkDim, + const size_t MaxThreadsPerBlock[3], + ur_kernel_handle_t Kernel, uint32_t LocalSize); + +bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device, + ur_kernel_handle_t Kernel, + size_t BlockSize); + +ur_result_t +setKernelParams(const ur_context_handle_t Context, + const ur_device_handle_t Device, const uint32_t WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel, + CUfunction &CuFunc, size_t (&ThreadsPerBlock)[3], + size_t (&BlocksPerGrid)[3]); + +void setCopyRectParams(ur_rect_region_t region, const void *SrcPtr, + const CUmemorytype_enum SrcType, + ur_rect_offset_t src_offset, size_t src_row_pitch, + size_t src_slice_pitch, void *DstPtr, + const CUmemorytype_enum DstType, + ur_rect_offset_t dst_offset, size_t dst_row_pitch, + size_t dst_slice_pitch, CUDA_MEMCPY3D ¶ms);