diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h
index 4451c36038d11..7ec89c02eba0c 100644
--- a/sycl/include/sycl/detail/pi.h
+++ b/sycl/include/sycl/detail/pi.h
@@ -52,11 +52,10 @@
 // 10.13 Added new PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS queue property.
 // 10.14 Add PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY as an extension for
 // piDeviceGetInfo.
-// 10.15 Add new PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION queue property
-// 10.16 Add command-buffer extension methods
+// 10.15 Add command-buffer extension methods
 
 #define _PI_H_VERSION_MAJOR 10
-#define _PI_H_VERSION_MINOR 16
+#define _PI_H_VERSION_MINOR 15
 
 #define _PI_STRING_HELPER(a) #a
 #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -572,14 +571,6 @@ constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE = (1 << 1);
 constexpr pi_queue_properties PI_QUEUE_ON_DEVICE = (1 << 2);
 constexpr pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT = (1 << 3);
 constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS = (1 << 4);
-// In a lazy queue, enqueued commands are not submitted for execution
-// immediately, instead they are submitted for execution once the queue is
-// flushed.
-//
-// This is to enable prototyping of the SYCL_EXT_ONEAPI_GRAPH extension,
-// before a native command-list interface in PI can be designed and
-// implemented.
-constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION = (1 << 5);
 
 using pi_result = _pi_result;
 using pi_platform_info = _pi_platform_info;
diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
index f3fef118d3aa4..a9cc6798b73f8 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
@@ -27,6 +27,7 @@ namespace experimental {
 namespace detail {
 struct node_impl;
 struct graph_impl;
+class exec_graph_impl;
 
 } // namespace detail
 
@@ -125,18 +126,20 @@ template <> class __SYCL_EXPORT command_graph<graph_state::executable> {
 public:
   command_graph() = delete;
 
-  command_graph(const std::shared_ptr<detail::graph_impl> &g,
-                const sycl::context &ctx)
-      : MTag(rand()), MCtx(ctx), impl(g) {}
+  command_graph(const std::shared_ptr<detail::graph_impl> &Graph,
+                const sycl::context &Ctx);
 
 private:
   template <class Obj>
   friend decltype(Obj::impl)
   sycl::detail::getSyclObjImpl(const Obj &SyclObject);
 
+  // Creates a backend representation of the graph in impl
+  void finalize_impl();
+
   int MTag;
   const sycl::context &MCtx;
-  std::shared_ptr<detail::graph_impl> impl;
+  std::shared_ptr<detail::exec_graph_impl> impl;
 };
 } // namespace experimental
 } // namespace oneapi
diff --git a/sycl/include/sycl/properties/queue_properties.hpp b/sycl/include/sycl/properties/queue_properties.hpp
index 643f16fbe7cb8..2457b4c561dcf 100644
--- a/sycl/include/sycl/properties/queue_properties.hpp
+++ b/sycl/include/sycl/properties/queue_properties.hpp
@@ -28,8 +28,6 @@ namespace property {
 namespace queue {
 class discard_events
     : public ::sycl::detail::DataLessProperty<::sycl::detail::DiscardEvents> {};
-class lazy_execution
-    : public ::sycl::detail::DataLessProperty<::sycl::detail::LazyExecution> {};
 } // namespace queue
 } // namespace property
 
@@ -67,9 +65,6 @@ template <>
 struct is_property<ext::oneapi::property::queue::discard_events>
     : std::true_type {};
 template <>
-struct is_property<ext::oneapi::property::queue::lazy_execution>
-    : std::true_type {};
-template <>
 struct is_property<property::queue::cuda::use_default_stream> : std::true_type {
 };
 template <>
@@ -85,9 +80,6 @@ template <>
 struct is_property_of<ext::oneapi::property::queue::discard_events, queue>
     : std::true_type {};
 template <>
-struct is_property_of<ext::oneapi::property::queue::lazy_execution, queue>
-    : std::true_type {};
-template <>
 struct is_property_of<property::queue::cuda::use_default_stream, queue>
     : std::true_type {};
 template <>
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 2ea4db7ab2c8c..91e7f07771984 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -1307,49 +1307,6 @@ pi_result resetCommandLists(pi_queue Queue) {
 pi_result _pi_context::getAvailableCommandList(
     pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine,
     bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) {
-
-#if SYCL_EXT_ONEAPI_GRAPH
-  // This is a hack. TODO: Proper CommandList allocation per Executable Graph.
-  if( Queue->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) {
-    // TODO: Create new Command List.
-    if(Queue->LazyCommandListMap.empty()) {
-      const bool UseCopyEngine = false;
-      // Adding createCommandList() to LazyCommandListMap
-      ze_fence_handle_t ZeFence;
-      ZeStruct<ze_fence_desc_t> ZeFenceDesc;
-      ze_command_list_handle_t ZeCommandList;
-
-      uint32_t QueueGroupOrdinal;
-      auto &QGroup = Queue->getQueueGroup(UseCopyEngine);
-      auto &ZeCommandQueue =
-          // ForcedCmdQueue ? *ForcedCmdQueue :
-          QGroup.getZeQueue(&QueueGroupOrdinal);
-      // if (ForcedCmdQueue)
-      //  QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue);
-
-      ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
-      ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
-
-      ZE_CALL(zeCommandListCreate,
-              (Queue->Context->ZeContext, Queue->Device->ZeDevice,
-               &ZeCommandListDesc, &ZeCommandList));
-
-      ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
-      std::tie(CommandList, std::ignore) = Queue->LazyCommandListMap.insert(
-          std::pair<ze_command_list_handle_t, pi_command_list_info_t>(
-              ZeCommandList,
-              {ZeFence, false, ZeCommandQueue, QueueGroupOrdinal}));
-
-      Queue->insertActiveBarriers(CommandList, UseCopyEngine);
-      //
-      CommandList->second.ZeFenceInUse = true;
-    } else {
-        CommandList = Queue->LazyCommandListMap.begin();
-    }
-    return PI_SUCCESS;
-  }
-#endif
-    
   // Immediate commandlists have been pre-allocated and are always available.
   if (Queue->Device->useImmediateCommandLists()) {
     CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList();
@@ -1588,13 +1545,6 @@ void _pi_queue::CaptureIndirectAccesses() {
 pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
                                         bool IsBlocking,
                                         bool OKToBatchCommand) {
-  // When executing a Graph, defer execution if this is a command
-  // which could be batched (i.e. likely a kernel submission)
-#if SYCL_EXT_ONEAPI_GRAPH
-  if (this->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION && OKToBatchCommand)
-    return PI_SUCCESS;
-#endif
-
   bool UseCopyEngine = CommandList->second.isCopy(this);
 
   // If the current LastCommandEvent is the nullptr, then it means
@@ -3560,8 +3510,7 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
   PI_ASSERT(!(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
                              PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE |
                              PI_QUEUE_ON_DEVICE_DEFAULT |
-                             PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS |
-                             PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION)),
+                             PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS)),
             PI_ERROR_INVALID_VALUE);
 
   PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
@@ -3835,18 +3784,6 @@ pi_result piQueueFinish(pi_queue Queue) {
 // Flushing cross-queue dependencies is covered by createAndRetainPiZeEventList,
 // so this can be left as a no-op.
 pi_result piQueueFlush(pi_queue Queue) {
-#if SYCL_EXT_ONEAPI_GRAPH
-  if( Queue->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) {
-
-    pi_command_list_ptr_t CommandList{};
-    // TODO: 
-    CommandList = Queue->LazyCommandListMap.begin();
-
-    Queue->executeCommandList(CommandList, false, false);
-  }
-#else
-  (void)Queue;
-#endif
   return PI_SUCCESS;
 }
 
diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
index 28dfc16223448..6c14f72907206 100644
--- a/sycl/source/detail/graph_impl.cpp
+++ b/sycl/source/detail/graph_impl.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include <detail/graph_impl.hpp>
+#include <detail/program_manager/program_manager.hpp>
 #include <detail/queue_impl.hpp>
 #include <detail/scheduler/commands.hpp>
 #include <sycl/queue.hpp>
@@ -20,48 +21,30 @@ namespace oneapi {
 namespace experimental {
 namespace detail {
 
-void graph_impl::exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue) {
+void exec_graph_impl::schedule() {
   if (MSchedule.empty()) {
-    for (auto Node : MRoots) {
-      Node->topology_sort(MSchedule);
+    for (auto Node : MGraphImpl->MRoots) {
+      Node->topology_sort(Node, MSchedule);
     }
   }
-  for (auto Node : MSchedule)
-    Node->exec(Queue);
 }
 
-void graph_impl::exec_and_wait(
-    const std::shared_ptr<sycl::detail::queue_impl> &Queue) {
-  bool IsSubGraph = Queue->getIsGraphSubmitting();
-  if (!IsSubGraph) {
-    Queue->setIsGraphSubmitting(true);
-  }
-#if SYCL_EXT_ONEAPI_GRAPH
-  if (MFirst) {
-    exec(Queue);
-    MFirst = false;
-  }
-#else
-  exec(Queue);
-#endif
-  if (!IsSubGraph) {
-    Queue->setIsGraphSubmitting(false);
-    Queue->wait();
-  }
+sycl::event
+exec_graph_impl::exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue) {
+  // TODO: Support subgraphs
+  sycl::event RetEvent = enqueue(Queue);
+  // TODO: Remove this queue wait. Currently waiting on the event returned from
+  // graph execution does not work.
+  Queue->wait();
+  return RetEvent;
 }
 
 void graph_impl::add_root(const std::shared_ptr<node_impl> &Root) {
   MRoots.insert(Root);
-  for (auto Node : MSchedule)
-    Node->MScheduled = false;
-  MSchedule.clear();
 }
 
 void graph_impl::remove_root(const std::shared_ptr<node_impl> &Root) {
   MRoots.erase(Root);
-  for (auto Node : MSchedule)
-    Node->MScheduled = false;
-  MSchedule.clear();
 }
 
 // Recursive check if a graph node or its successors contains a given kernel
@@ -99,7 +82,7 @@ graph_impl::add(const std::shared_ptr<graph_impl> &Impl,
   // TODO: Encapsulate in separate function to avoid duplication
   if (!Dep.empty()) {
     for (auto N : Dep) {
-      N->register_successor(NodeImpl); // register successor
+      N->register_successor(NodeImpl, N); // register successor
       this->remove_root(NodeImpl);     // remove receiver from root node
                                        // list
     }
@@ -118,10 +101,11 @@ graph_impl::add(const std::shared_ptr<graph_impl> &Impl,
   sycl::handler Handler{Impl};
   CGF(Handler);
 
+  // TODO: Do we need to pass event dependencies here for the explicit API?
   return this->add(Impl, Handler.MKernel, Handler.MNDRDesc,
                    Handler.MOSModuleHandle, Handler.MKernelName,
                    Handler.MAccStorage, Handler.MLocalAccStorage,
-                   Handler.MRequirements, Handler.MArgs, {});
+                   Handler.MRequirements, Handler.MArgs, Dep);
 }
 
 std::shared_ptr<node_impl> graph_impl::add(
@@ -133,7 +117,8 @@ std::shared_ptr<node_impl> graph_impl::add(
     const std::vector<sycl::detail::LocalAccessorImplPtr> &LocalAccStorage,
     const std::vector<sycl::detail::AccessorImplHost *> &Requirements,
     const std::vector<sycl::detail::ArgDesc> &Args,
-    const std::vector<std::shared_ptr<node_impl>> &Dep) {
+    const std::vector<std::shared_ptr<node_impl>> &Dep,
+    const std::vector<std::shared_ptr<sycl::detail::event_impl>> &DepEvents) {
   const std::shared_ptr<node_impl> &NodeImpl = std::make_shared<node_impl>(
       Impl, Kernel, NDRDesc, OSModuleHandle, KernelName, AccStorage,
       LocalAccStorage, Requirements, Args);
@@ -154,9 +139,21 @@ std::shared_ptr<node_impl> graph_impl::add(
 
   // Add any deps determined from accessor arguments into the dependency list
   Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end());
+
+  // Add any nodes specified by event dependencies into the dependency list
+  for (auto Dep : DepEvents) {
+    if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl != MEventsMap.end()) {
+      Deps.push_back(NodeImpl->second);
+    } else {
+      throw sycl::exception(errc::invalid,
+                            "Event dependency from handler::depends_on does "
+                            "not correspond to a node within the graph");
+    }
+  }
+
   if (!Deps.empty()) {
     for (auto N : Deps) {
-      N->register_successor(NodeImpl); // register successor
+      N->register_successor(NodeImpl, N); // register successor
       this->remove_root(NodeImpl);     // remove receiver from root node
                                        // list
     }
@@ -177,39 +174,201 @@ bool graph_impl::clear_queues() {
   return AnyQueuesCleared;
 }
 
-void node_impl::exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue
-                         _CODELOCPARAMDEF(&CodeLoc)) {
-  std::vector<sycl::event> Deps;
-  for (auto Sender : MPredecessors)
-    Deps.push_back(Sender->get_event());
+// Check if nodes are empty and if so loop back through predecessors until we
+// find the real dependency.
+void find_real_deps(std::vector<pi_ext_sync_point> &Deps,
+                    std::shared_ptr<node_impl> CurrentNode) {
+  if (CurrentNode->is_empty()) {
+    for (auto &N : CurrentNode->MPredecessors) {
+      auto NodeImpl = N.lock();
+      find_real_deps(Deps, NodeImpl);
+    }
+  } else {
+    // Check if the dependency has already been added.
+    if (std::find(Deps.begin(), Deps.end(), CurrentNode->MPiSyncPoint) ==
+        Deps.end())
+      Deps.push_back(CurrentNode->MPiSyncPoint);
+  }
+}
+
+void exec_graph_impl::create_pi_command_buffers(sycl::device D,
+                                                const sycl::context &Ctx) {
+  // TODO we only have a single command-buffer per graph here, but
+  // this will need to be multiple command-buffers for non-trivial graphs
+  pi_ext_command_buffer OutCommandBuffer;
+  pi_ext_command_buffer_desc Desc{};
+  auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx);
+  const sycl::detail::plugin &Plugin = ContextImpl->getPlugin();
+  auto DeviceImpl = sycl::detail::getSyclObjImpl(D);
+  pi_result Res =
+      Plugin.call_nocheck<sycl::detail::PiApiKind::piextCommandBufferCreate>(
+          ContextImpl->getHandleRef(), DeviceImpl->getHandleRef(), &Desc,
+          &OutCommandBuffer);
+  if (Res != pi_result::PI_SUCCESS) {
+    throw sycl::exception(errc::invalid, "Failed to create PI command-buffer");
+  }
+
+  MPiCommandBuffers[D] = OutCommandBuffer;
+
+  // TODO extract kernel bundle logic from enqueueImpKernel
+  for (auto Node : MSchedule) {
+    pi_kernel PiKernel = nullptr;
+    std::mutex *KernelMutex = nullptr;
+    pi_program PiProgram = nullptr;
+
+    auto Kernel = Node->MKernel;
+    if (Kernel != nullptr) {
+      PiKernel = Kernel->getHandleRef();
+    } else {
+      std::tie(PiKernel, KernelMutex, PiProgram) =
+          sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
+              Node->MOSModuleHandle, ContextImpl, DeviceImpl, Node->MKernelName,
+              nullptr);
+    }
+
+    sycl::detail::ProgramManager::KernelArgMask EliminatedArgMask;
+    if (nullptr == Node->MKernel || !Node->MKernel->isCreatedFromSource()) {
+      EliminatedArgMask =
+          sycl::detail::ProgramManager::getInstance()
+              .getEliminatedKernelArgMask(Node->MOSModuleHandle, PiProgram,
+                                          Node->MKernelName);
+    }
+
+    auto SetFunc = [&Plugin, &PiKernel, &Ctx](sycl::detail::ArgDesc &Arg,
+                                              size_t NextTrueIndex) {
+      sycl::detail::SetArgBasedOnType(
+          Plugin, PiKernel,
+          nullptr /* TODO: Handle spec constants and pass device image here */,
+          nullptr /* TODO: Pass getMemAllocation function for buffers */, Ctx,
+          false, Arg, NextTrueIndex);
+    };
+    std::vector<sycl::detail::ArgDesc> Args;
+    sycl::detail::applyFuncOnFilteredArgs(EliminatedArgMask, Node->MArgs,
+                                          SetFunc);
+
+    std::vector<pi_ext_sync_point> Deps;
+    for (auto &N : Node->MPredecessors) {
+      find_real_deps(Deps, N.lock());
+    }
+
+    // add commands
+    // Remember this information before the range dimensions are reversed
+    const bool HasLocalSize = (Node->MNDRDesc.LocalSize[0] != 0);
+
+    size_t RequiredWGSize[3] = {0, 0, 0};
+    size_t *LocalSize = nullptr;
+
+    if (HasLocalSize)
+      LocalSize = &Node->MNDRDesc.LocalSize[0];
+    else {
+      Plugin.call<sycl::detail::PiApiKind::piKernelGetGroupInfo>(
+          PiKernel, DeviceImpl->getHandleRef(),
+          PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(RequiredWGSize),
+          RequiredWGSize, /* param_value_size_ret = */ nullptr);
+
+      const bool EnforcedLocalSize =
+          (RequiredWGSize[0] != 0 || RequiredWGSize[1] != 0 ||
+           RequiredWGSize[2] != 0);
+      if (EnforcedLocalSize)
+        LocalSize = RequiredWGSize;
+    }
+
+    Res = Plugin.call_nocheck<
+        sycl::detail::PiApiKind::piextCommandBufferNDRangeKernel>(
+        OutCommandBuffer, PiKernel, Node->MNDRDesc.Dims,
+        &Node->MNDRDesc.GlobalOffset[0], &Node->MNDRDesc.GlobalSize[0],
+        LocalSize, Deps.size(), Deps.size() ? Deps.data() : nullptr,
+        &Node->MPiSyncPoint);
+
+    if (Res != pi_result::PI_SUCCESS) {
+      throw sycl::exception(errc::invalid,
+                            "Failed to add kernel to PI command-buffer");
+    }
+  }
 
-  // Enqueue kernel here instead of submit
+  Res =
+      Plugin.call_nocheck<sycl::detail::PiApiKind::piextCommandBufferFinalize>(
+          OutCommandBuffer);
+  if (Res != pi_result::PI_SUCCESS) {
+    throw sycl::exception(errc::invalid,
+                          "Failed to finalize PI command-buffer");
+  }
+}
 
+exec_graph_impl::~exec_graph_impl() {
+  MSchedule.clear();
+  for (auto Iter : MPiCommandBuffers) {
+    const sycl::detail::plugin &Plugin =
+        sycl::detail::getSyclObjImpl(MContext)->getPlugin();
+    auto CmdBuf = Iter.second;
+    pi_result Res =
+        Plugin.call_nocheck<sycl::detail::PiApiKind::piextCommandBufferRelease>(
+            CmdBuf);
+    (void)Res;
+    assert(Res == pi_result::PI_SUCCESS);
+  }
+}
+
+sycl::event exec_graph_impl::enqueue(
+    const std::shared_ptr<sycl::detail::queue_impl> &Queue) {
   std::vector<pi_event> RawEvents;
-  pi_event *OutEvent = nullptr;
-  auto NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
-  NewEvent->setContextImpl(Queue->getContextImplPtr());
-  NewEvent->setStateIncomplete();
-  OutEvent = &NewEvent->getHandleRef();
+  auto CreateNewEvent([&]() {
+    pi_event *OutEvent = nullptr;
+    auto NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
+    NewEvent->setContextImpl(Queue->getContextImplPtr());
+    NewEvent->setStateIncomplete();
+    OutEvent = &NewEvent->getHandleRef();
+    pi_result Res =
+        Queue->getPlugin().call_nocheck<sycl::detail::PiApiKind::piEventCreate>(
+            sycl::detail::getSyclObjImpl(Queue->get_context())->getHandleRef(),
+            OutEvent);
+    if (Res != pi_result::PI_SUCCESS) {
+      throw sycl::exception(errc::event,
+                            "Failed to create event for node submission");
+    }
+    return NewEvent;
+  });
+#if SYCL_EXT_ONEAPI_GRAPH
+  auto NewEvent = CreateNewEvent();
+  pi_event *OutEvent = &NewEvent->getHandleRef();
+  auto CommandBuffer = MPiCommandBuffers[Queue->get_device()];
   pi_result Res =
-      Queue->getPlugin().call_nocheck<sycl::detail::PiApiKind::piEventCreate>(
-          sycl::detail::getSyclObjImpl(Queue->get_context())->getHandleRef(),
-          OutEvent);
+      Queue->getPlugin()
+          .call_nocheck<sycl::detail::PiApiKind::piextEnqueueCommandBuffer>(
+              CommandBuffer, Queue->getHandleRef(), RawEvents.size(),
+              RawEvents.empty() ? nullptr : &RawEvents[0], OutEvent);
   if (Res != pi_result::PI_SUCCESS) {
     throw sycl::exception(errc::event,
-                          "Failed to create event for node submission");
+                          "Failed to enqueue event for node submission");
   }
 
-  pi_int32 Result = enqueueImpKernel(
-      Queue, MNDRDesc, MArgs, /* KernelBundleImpPtr */ nullptr, MKernel,
-      MKernelName, MOSModuleHandle, RawEvents, OutEvent, nullptr);
-  if (Result != pi_result::PI_SUCCESS) {
-    throw sycl::exception(errc::kernel, "Error enqueuing graph node kernel");
+#else
+  std::vector<std::shared_ptr<sycl::detail::event_impl>> ScheduledEvents;
+  for (auto &NodeImpl : MSchedule) {
+    std::vector<RT::PiEvent> RawEvents;
+    auto NewEvent = CreateNewEvent();
+    pi_event *OutEvent = &NewEvent->getHandleRef();
+    pi_int32 Res = sycl::detail::enqueueImpKernel(
+        Queue, NodeImpl->MNDRDesc, NodeImpl->MArgs,
+        nullptr /* TODO: Handle KernelBundles */, NodeImpl->MKernel,
+        NodeImpl->MKernelName, NodeImpl->MOSModuleHandle, RawEvents, OutEvent,
+        nullptr /* TODO: Pass mem allocation func for accessors */);
+    if (Res != pi_result::PI_SUCCESS) {
+      throw sycl::exception(
+          sycl::errc::kernel,
+          "Error during emulated graph command group submission.");
+    }
+    ScheduledEvents.push_back(NewEvent);
   }
+  // Create an event which has all kernel events as dependencies
+  auto NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
+  NewEvent->setStateIncomplete();
+  NewEvent->getPreparedDepsEvents() = ScheduledEvents;
+#endif
+
   sycl::event QueueEvent =
       sycl::detail::createSyclObjFromImpl<sycl::event>(NewEvent);
-  Queue->addEvent(QueueEvent);
-  MEvent = QueueEvent;
+  return QueueEvent;
 }
 } // namespace detail
 
@@ -251,7 +410,8 @@ void command_graph<graph_state::modifiable>::make_edge(node Sender,
   std::shared_ptr<detail::node_impl> ReceiverImpl =
       sycl::detail::getSyclObjImpl(Receiver);
 
-  SenderImpl->register_successor(ReceiverImpl); // register successor
+  SenderImpl->register_successor(ReceiverImpl,
+                                 SenderImpl); // register successor
   impl->remove_root(ReceiverImpl); // remove receiver from root node list
 }
 
@@ -322,6 +482,23 @@ bool command_graph<graph_state::modifiable>::end_recording(
   return QueueStateChanged;
 }
 
+command_graph<graph_state::executable>::command_graph(
+    const std::shared_ptr<detail::graph_impl> &Graph, const sycl::context &Ctx)
+    : MTag(rand()), MCtx(Ctx),
+      impl(std::make_shared<detail::exec_graph_impl>(Ctx, Graph)) {
+  finalize_impl(); // Create backend representation for executable graph
+}
+
+void command_graph<graph_state::executable>::finalize_impl() {
+  // Create PI command-buffers for each device in the finalized context
+  impl->schedule();
+#if SYCL_EXT_ONEAPI_GRAPH
+  for (auto device : MCtx.get_devices()) {
+    impl->create_pi_command_buffers(device, MCtx);
+  }
+#endif
+}
+
 } // namespace experimental
 } // namespace oneapi
 } // namespace ext
diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp
index 6bfc274c35176..f820a0baea0d3 100644
--- a/sycl/source/detail/graph_impl.hpp
+++ b/sycl/source/detail/graph_impl.hpp
@@ -28,29 +28,18 @@ namespace oneapi {
 namespace experimental {
 namespace detail {
 
-class wrapper {
-  using T = std::function<void(sycl::handler &)>;
-  T MFunc;
-  std::vector<sycl::event> MDeps;
-
-public:
-  wrapper(T Func, const std::vector<sycl::event> &Deps)
-      : MFunc(Func), MDeps(Deps){};
-
-  void operator()(sycl::handler &CGH) {
-    CGH.depends_on(MDeps);
-    std::invoke(MFunc, CGH);
-  }
-};
-
 struct node_impl {
-  bool MScheduled;
-
   std::shared_ptr<graph_impl> MGraph;
-  sycl::event MEvent;
+  /// ID representing this node in the graph
+  /// TODO this should be attached to an executable graph, rather than
+  /// a modifiable graph
+  pi_ext_sync_point MPiSyncPoint;
 
+  // List of successors to this node.
   std::vector<std::shared_ptr<node_impl>> MSuccessors;
-  std::vector<std::shared_ptr<node_impl>> MPredecessors;
+  // List of predecessors to this node. Using weak_ptr here to prevent circular
+  // references between nodes.
+  std::vector<std::weak_ptr<node_impl>> MPredecessors;
 
   /// Kernel to be executed by this node
   std::shared_ptr<sycl::detail::kernel_impl> MKernel;
@@ -71,22 +60,20 @@ struct node_impl {
   // may go out of scope before execution.
   std::vector<std::vector<std::byte>> MArgStorage;
 
-  void exec(const std::shared_ptr<sycl::detail::queue_impl> &Queue
-                _CODELOCPARAM(&CodeLoc));
+  bool MIsEmpty = false;
 
-  void register_successor(const std::shared_ptr<node_impl> &Node) {
+  void register_successor(const std::shared_ptr<node_impl> &Node,
+                          const std::shared_ptr<node_impl> &Prev) {
     MSuccessors.push_back(Node);
-    Node->register_predecessor(std::shared_ptr<node_impl>(this));
+    Node->register_predecessor(Prev);
   }
 
   void register_predecessor(const std::shared_ptr<node_impl> &Node) {
     MPredecessors.push_back(Node);
   }
 
-  sycl::event get_event(void) const { return MEvent; }
-
   node_impl(const std::shared_ptr<graph_impl> &Graph)
-      : MScheduled(false), MGraph(Graph) {}
+      : MGraph(Graph), MIsEmpty(true) {}
 
   node_impl(
       const std::shared_ptr<graph_impl> &Graph,
@@ -97,7 +84,7 @@ struct node_impl {
       const std::vector<sycl::detail::LocalAccessorImplPtr> &LocalAccStorage,
       const std::vector<sycl::detail::AccessorImplHost *> &Requirements,
       const std::vector<sycl::detail::ArgDesc> &args)
-      : MScheduled(false), MGraph(Graph), MKernel(Kernel), MNDRDesc(NDRDesc),
+      : MGraph(Graph), MKernel(Kernel), MNDRDesc(NDRDesc),
         MOSModuleHandle(OSModuleHandle), MKernelName(KernelName),
         MAccStorage(AccStorage), MLocalAccStorage(LocalAccStorage),
         MRequirements(Requirements), MArgs(args), MArgStorage() {
@@ -116,14 +103,17 @@ struct node_impl {
   }
 
   // Recursively adding nodes to execution stack:
-  void topology_sort(std::list<std::shared_ptr<node_impl>> &Schedule) {
-    MScheduled = true;
+  void topology_sort(std::shared_ptr<node_impl> NodeImpl,
+                     std::list<std::shared_ptr<node_impl>> &Schedule) {
     for (auto Next : MSuccessors) {
-      if (!Next->MScheduled)
-        Next->topology_sort(Schedule);
+      // Check if we've already scheduled this node
+      if (std::find(Schedule.begin(), Schedule.end(), Next) == Schedule.end())
+        Next->topology_sort(Next, Schedule);
     }
-    if (MKernel != nullptr)
-      Schedule.push_front(std::shared_ptr<node_impl>(this));
+    // We don't need to schedule empty nodes as they are only used when
+    // calculating dependencies
+    if (!NodeImpl->is_empty())
+      Schedule.push_front(NodeImpl);
   }
 
   bool has_arg(const sycl::detail::ArgDesc &Arg) {
@@ -140,19 +130,15 @@ struct node_impl {
     }
     return false;
   }
+
+  bool is_empty() const { return MIsEmpty; }
 };
 
 struct graph_impl {
   std::set<std::shared_ptr<node_impl>> MRoots;
-  std::list<std::shared_ptr<node_impl>> MSchedule;
-  // TODO: Change one time initialization to per executable object
-  bool MFirst;
 
   std::shared_ptr<graph_impl> MParent;
 
-  void exec(const std::shared_ptr<sycl::detail::queue_impl> &);
-  void exec_and_wait(const std::shared_ptr<sycl::detail::queue_impl> &);
-
   void add_root(const std::shared_ptr<node_impl> &);
   void remove_root(const std::shared_ptr<node_impl> &);
 
@@ -165,7 +151,9 @@ struct graph_impl {
       const std::vector<sycl::detail::LocalAccessorImplPtr> &LocalAccStorage,
       const std::vector<sycl::detail::AccessorImplHost *> &Requirements,
       const std::vector<sycl::detail::ArgDesc> &Args,
-      const std::vector<std::shared_ptr<node_impl>> &Dep = {});
+      const std::vector<std::shared_ptr<node_impl>> &Dep = {},
+      const std::vector<std::shared_ptr<sycl::detail::event_impl>> &DepEvents =
+          {});
 
   std::shared_ptr<node_impl>
   add(const std::shared_ptr<graph_impl> &Impl,
@@ -177,7 +165,7 @@ struct graph_impl {
   add(const std::shared_ptr<graph_impl> &Impl,
       const std::vector<std::shared_ptr<node_impl>> &Dep = {});
 
-  graph_impl() : MFirst(true) {}
+  graph_impl() = default;
 
   /// Add a queue to the set of queues which are currently recording to this
   /// graph.
@@ -198,8 +186,45 @@ struct graph_impl {
   /// removed.
   bool clear_queues();
 
+  void add_event_for_node(std::shared_ptr<sycl::detail::event_impl> EventImpl,
+                          std::shared_ptr<node_impl> NodeImpl) {
+    MEventsMap[EventImpl] = NodeImpl;
+  }
+
 private:
   std::set<std::shared_ptr<sycl::detail::queue_impl>> MRecordingQueues;
+  // Map of events to their associated recorded nodes.
+  std::unordered_map<std::shared_ptr<sycl::detail::event_impl>,
+                     std::shared_ptr<node_impl>>
+      MEventsMap;
+};
+
+class exec_graph_impl {
+public:
+  exec_graph_impl(sycl::context Context,
+                  const std::shared_ptr<graph_impl> &GraphImpl)
+      : MSchedule(), MGraphImpl(GraphImpl), MPiCommandBuffers(),
+        MContext(Context) {}
+  ~exec_graph_impl();
+  /// Add nodes to MSchedule
+  void schedule();
+  /// Enqueues the backend objects for the graph to the parametrized queue
+  sycl::event enqueue(const std::shared_ptr<sycl::detail::queue_impl> &);
+  /// Called by handler::ext_oneapi_command_graph() to schedule graph for
+  /// execution
+  sycl::event exec(const std::shared_ptr<sycl::detail::queue_impl> &);
+  /// Turns our internal graph representation into PI command-buffers for a
+  /// device
+  void create_pi_command_buffers(sycl::device D, const sycl::context &Ctx);
+
+private:
+  std::list<std::shared_ptr<node_impl>> MSchedule;
+  // Pointer to the modifiable graph impl associated with this executable graph
+  std::shared_ptr<graph_impl> MGraphImpl;
+  // Map of devices to command buffers
+  std::unordered_map<sycl::device, pi_ext_command_buffer> MPiCommandBuffers;
+  // Context associated with this executable graph
+  sycl::context MContext;
 };
 
 } // namespace detail
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 61c860ab3b060..6e9eab5828d2f 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -279,14 +279,6 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
   TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId);
 #endif
 
-#if SYCL_EXT_ONEAPI_GRAPH
-  if (has_property<ext::oneapi::property::queue::lazy_execution>()) {
-    const detail::plugin &Plugin = getPlugin();
-    if (Plugin.getBackend() == backend::ext_oneapi_level_zero)
-      Plugin.call<detail::PiApiKind::piQueueFlush>(getHandleRef());
-  }
-#endif
-
   std::vector<std::weak_ptr<event_impl>> WeakEvents;
   std::vector<event> SharedEvents;
   {
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index ba67dcfe482d9..ce121ab290af4 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -318,10 +318,7 @@ class queue_impl {
       // queue property.
       CreationFlags |= PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS;
     }
-    if (MPropList
-            .has_property<ext::oneapi::property::queue::lazy_execution>()) {
-      CreationFlags |= PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION;
-    }
+
     RT::PiQueue Queue{};
     RT::PiContext Context = MContext->getHandleRef();
     RT::PiDevice Device = MDevice->getHandleRef();
@@ -652,7 +649,8 @@ class queue_impl {
 
   // Command graph which is associated with this queue for the purposes of
   // recording commands to it.
-  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph;
+  std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph =
+      nullptr;
   // This flag is set to true if a command_graph is currently submitting
   // commands to this queue. Used by subgraphs to determine if they are part of
   // a larger command graph submission.
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 8f5f18a090aa8..e38ce4075750d 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -84,7 +84,7 @@ static std::string deviceToString(device Device) {
     return "UNKNOWN";
 }
 
-static void applyFuncOnFilteredArgs(
+void applyFuncOnFilteredArgs(
     const ProgramManager::KernelArgMask &EliminatedArgMask,
     std::vector<ArgDesc> &Args,
     std::function<void(detail::ArgDesc &Arg, int NextTrueIndex)> Func) {
@@ -1985,6 +1985,72 @@ static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) {
   }
 }
 
+void SetArgBasedOnType(
+    const detail::plugin &Plugin, RT::PiKernel Kernel,
+    const std::shared_ptr<device_image_impl> &DeviceImageImpl,
+    const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
+    const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg,
+    size_t NextTrueIndex) {
+  switch (Arg.MType) {
+  case kernel_param_kind_t::kind_stream:
+    break;
+  case kernel_param_kind_t::kind_accessor: {
+    Requirement *Req = (Requirement *)(Arg.MPtr);
+    if (getMemAllocationFunc == nullptr)
+      throw sycl::exception(make_error_code(errc::kernel_argument),
+                            "placeholder accessor must be bound by calling "
+                            "handler::require() before it can be used.");
+
+    RT::PiMem MemArg = (RT::PiMem)getMemAllocationFunc(Req);
+    if (Plugin.getBackend() == backend::opencl) {
+      Plugin.call<PiApiKind::piKernelSetArg>(Kernel, NextTrueIndex,
+                                             sizeof(RT::PiMem), &MemArg);
+    } else {
+      Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, NextTrueIndex,
+                                                      &MemArg);
+    }
+    break;
+  }
+  case kernel_param_kind_t::kind_std_layout: {
+    Plugin.call<PiApiKind::piKernelSetArg>(Kernel, NextTrueIndex, Arg.MSize,
+                                           Arg.MPtr);
+    break;
+  }
+  case kernel_param_kind_t::kind_sampler: {
+    sampler *SamplerPtr = (sampler *)Arg.MPtr;
+    RT::PiSampler Sampler =
+        detail::getSyclObjImpl(*SamplerPtr)->getOrCreateSampler(Context);
+    Plugin.call<PiApiKind::piextKernelSetArgSampler>(Kernel, NextTrueIndex,
+                                                     &Sampler);
+    break;
+  }
+  case kernel_param_kind_t::kind_pointer: {
+    Plugin.call<PiApiKind::piextKernelSetArgPointer>(Kernel, NextTrueIndex,
+                                                     Arg.MSize, Arg.MPtr);
+    break;
+  }
+  case kernel_param_kind_t::kind_specialization_constants_buffer: {
+    if (IsHost) {
+      throw sycl::feature_not_supported(
+          "SYCL2020 specialization constants are not yet supported on host "
+          "device",
+          PI_ERROR_INVALID_OPERATION);
+    }
+    assert(DeviceImageImpl != nullptr);
+    RT::PiMem SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref();
+    // Avoid taking an address of nullptr
+    RT::PiMem *SpecConstsBufferArg =
+        SpecConstsBuffer ? &SpecConstsBuffer : nullptr;
+    Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, NextTrueIndex,
+                                                    SpecConstsBufferArg);
+    break;
+  }
+  case kernel_param_kind_t::kind_invalid:
+    throw runtime_error("Invalid kernel param kind", PI_ERROR_INVALID_VALUE);
+    break;
+  }
+}
+
 static pi_result SetKernelParamsAndLaunch(
     const QueueImplPtr &Queue, std::vector<ArgDesc> &Args,
     const std::shared_ptr<device_image_impl> &DeviceImageImpl,
@@ -1996,64 +2062,9 @@ static pi_result SetKernelParamsAndLaunch(
 
   auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
                   &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
-    switch (Arg.MType) {
-    case kernel_param_kind_t::kind_stream:
-      break;
-    case kernel_param_kind_t::kind_accessor: {
-      Requirement *Req = (Requirement *)(Arg.MPtr);
-      if (getMemAllocationFunc == nullptr)
-        throw sycl::exception(make_error_code(errc::kernel_argument),
-                              "placeholder accessor must be bound by calling "
-                              "handler::require() before it can be used.");
-
-      RT::PiMem MemArg = (RT::PiMem)getMemAllocationFunc(Req);
-      if (Plugin.getBackend() == backend::opencl) {
-        Plugin.call<PiApiKind::piKernelSetArg>(Kernel, NextTrueIndex,
-                                               sizeof(RT::PiMem), &MemArg);
-      } else {
-        Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, NextTrueIndex,
-                                                        &MemArg);
-      }
-      break;
-    }
-    case kernel_param_kind_t::kind_std_layout: {
-      Plugin.call<PiApiKind::piKernelSetArg>(Kernel, NextTrueIndex, Arg.MSize,
-                                             Arg.MPtr);
-      break;
-    }
-    case kernel_param_kind_t::kind_sampler: {
-      sampler *SamplerPtr = (sampler *)Arg.MPtr;
-      RT::PiSampler Sampler = detail::getSyclObjImpl(*SamplerPtr)
-                                  ->getOrCreateSampler(Queue->get_context());
-      Plugin.call<PiApiKind::piextKernelSetArgSampler>(Kernel, NextTrueIndex,
-                                                       &Sampler);
-      break;
-    }
-    case kernel_param_kind_t::kind_pointer: {
-      Plugin.call<PiApiKind::piextKernelSetArgPointer>(Kernel, NextTrueIndex,
-                                                       Arg.MSize, Arg.MPtr);
-      break;
-    }
-    case kernel_param_kind_t::kind_specialization_constants_buffer: {
-      if (Queue->is_host()) {
-        throw sycl::feature_not_supported(
-            "SYCL2020 specialization constants are not yet supported on host "
-            "device",
-            PI_ERROR_INVALID_OPERATION);
-      }
-      assert(DeviceImageImpl != nullptr);
-      RT::PiMem SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref();
-      // Avoid taking an address of nullptr
-      RT::PiMem *SpecConstsBufferArg =
-          SpecConstsBuffer ? &SpecConstsBuffer : nullptr;
-      Plugin.call<PiApiKind::piextKernelSetArgMemObj>(Kernel, NextTrueIndex,
-                                                      SpecConstsBufferArg);
-      break;
-    }
-    case kernel_param_kind_t::kind_invalid:
-      throw runtime_error("Invalid kernel param kind", PI_ERROR_INVALID_VALUE);
-      break;
-    }
+    SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc,
+                      Queue->get_context(), Queue->is_host(), Arg,
+                      NextTrueIndex);
   };
 
   applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 77afa4936bc0a..ab86f8fbd9f3f 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -601,6 +601,21 @@ class UpdateHostRequirementCommand : public Command {
   void **MDstPtr = nullptr;
 };
 
+// Sets arguments for a given kernel and device based on the argument type.
+// Refactored from SetKernelParamsAndLaunch to allow it to be used in the graphs
+// extension.
+void SetArgBasedOnType(
+    const detail::plugin &Plugin, RT::PiKernel Kernel,
+    const std::shared_ptr<device_image_impl> &DeviceImageImpl,
+    const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
+    const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg,
+    size_t NextTrueIndex);
+
+void applyFuncOnFilteredArgs(
+    const ProgramManager::KernelArgMask &EliminatedArgMask,
+    std::vector<ArgDesc> &Args,
+    std::function<void(detail::ArgDesc &Arg, int NextTrueIndex)> Func);
+
 } // namespace detail
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index e0546e5066453..8a550137e6bfd 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -100,10 +100,15 @@ event handler::finalize() {
   if (auto GraphImpl = MQueue->getCommandGraph(); GraphImpl != nullptr) {
     // Extract relevant data from the handler and pass to graph to create a new
     // node representing this command group.
-    GraphImpl->add(GraphImpl, MKernel, MNDRDesc, MOSModuleHandle, MKernelName,
-                   MAccStorage, MLocalAccStorage, MRequirements, MArgs, {});
-    return detail::createSyclObjFromImpl<event>(
-        std::make_shared<detail::event_impl>());
+    auto NodeImpl = GraphImpl->add(
+        GraphImpl, MKernel, MNDRDesc, MOSModuleHandle, MKernelName, MAccStorage,
+        MLocalAccStorage, MRequirements, MArgs, {}, MEvents);
+
+    // Create and associated an event with this new node
+    auto EventImpl = std::make_shared<detail::event_impl>();
+    GraphImpl->add_event_for_node(EventImpl, NodeImpl);
+
+    return detail::createSyclObjFromImpl<event>(EventImpl);
   }
 
   std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImpPtr = nullptr;
@@ -716,7 +721,9 @@ void handler::ext_oneapi_graph(
         ext::oneapi::experimental::graph_state::executable>
         Graph) {
   auto GraphImpl = detail::getSyclObjImpl(Graph);
-  GraphImpl->exec_and_wait(MQueue);
+  auto GraphCompletionEvent = GraphImpl->exec(MQueue);
+  auto EventImpl = detail::getSyclObjImpl(GraphCompletionEvent);
+  MEvents.push_back(EventImpl);
 }
 
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp
index 20fc669b8d591..47cdc71c70f9c 100644
--- a/sycl/source/queue.cpp
+++ b/sycl/source/queue.cpp
@@ -212,10 +212,5 @@ bool queue::device_has(aspect Aspect) const {
   // avoid creating sycl object from impl
   return impl->getDeviceImplPtr()->has(Aspect);
 }
-
-template __SYCL_EXPORT bool
-queue::has_property<ext::oneapi::property::queue::lazy_execution>() const;
-template __SYCL_EXPORT ext::oneapi::property::queue::lazy_execution
-queue::get_property<ext::oneapi::property::queue::lazy_execution>() const;
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/test/graph/graph-explicit-dotp-buffer.cpp b/sycl/test/graph/graph-explicit-dotp-buffer.cpp
index 0b795714f98dd..d84b3483656e7 100644
--- a/sycl/test/graph/graph-explicit-dotp-buffer.cpp
+++ b/sycl/test/graph/graph-explicit-dotp-buffer.cpp
@@ -24,11 +24,7 @@ int main() {
   float beta = 2.0f;
   float gamma = 3.0f;
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order{},
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::gpu_selector_v, properties};
+  sycl::queue q{sycl::gpu_selector_v};
 
   sycl::ext::oneapi::experimental::command_graph g;
 
diff --git a/sycl/test/graph/graph-explicit-dotp-device-mem.cpp b/sycl/test/graph/graph-explicit-dotp-device-mem.cpp
index 3163a6fb2ea64..d7bbfe0880294 100644
--- a/sycl/test/graph/graph-explicit-dotp-device-mem.cpp
+++ b/sycl/test/graph/graph-explicit-dotp-device-mem.cpp
@@ -24,11 +24,7 @@ int main() {
   float beta = 2.0f;
   float gamma = 3.0f;
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order{},
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::gpu_selector_v, properties};
+  sycl::queue q{sycl::gpu_selector_v};
 
   sycl::ext::oneapi::experimental::command_graph g;
 
@@ -68,12 +64,22 @@ int main() {
 
   auto node_c = g.add(
       [&](sycl::handler &h) {
+#ifdef TEST_GRAPH_REDUCTIONS
         h.parallel_for(sycl::range<1>{n},
                        sycl::reduction(dotp, 0.0f, std::plus()),
                        [=](sycl::id<1> it, auto &sum) {
                          const size_t i = it[0];
                          sum += x[i] * z[i];
                        });
+#else
+        h.single_task([=]() {
+          // Doing a manual reduction here because reduction objects cause
+          // issues with graphs.
+          for (size_t j = 0; j < n; j++) {
+            dotp[0] += x[j] * z[j];
+          }
+        });
+#endif
       },
       {node_a, node_b});
 
diff --git a/sycl/test/graph/graph-explicit-dotp.cpp b/sycl/test/graph/graph-explicit-dotp.cpp
index 7d0a5bcdae30e..0020db49737c9 100644
--- a/sycl/test/graph/graph-explicit-dotp.cpp
+++ b/sycl/test/graph/graph-explicit-dotp.cpp
@@ -24,11 +24,7 @@ int main() {
   float beta = 2.0f;
   float gamma = 3.0f;
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order{},
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::gpu_selector_v, properties};
+  sycl::queue q{sycl::gpu_selector_v};
 
   sycl::ext::oneapi::experimental::command_graph g;
 
@@ -68,12 +64,22 @@ int main() {
 
   auto node_c = g.add(
       [&](sycl::handler &h) {
+#ifdef TEST_GRAPH_REDUCTIONS
         h.parallel_for(sycl::range<1>{n},
                        sycl::reduction(dotp, 0.0f, std::plus()),
                        [=](sycl::id<1> it, auto &sum) {
                          const size_t i = it[0];
                          sum += x[i] * z[i];
                        });
+#else
+        h.single_task([=]() {
+          // Doing a manual reduction here because reduction objects cause
+          // issues with graphs.
+          for (size_t j = 0; j < n; j++) {
+            dotp[0] += x[j] * z[j];
+          }
+        });
+#endif
       },
       {node_a, node_b});
 
diff --git a/sycl/test/graph/graph-explicit-empty.cpp b/sycl/test/graph/graph-explicit-empty.cpp
index 7917a98fdf2f4..9b3a9b5bf7726 100644
--- a/sycl/test/graph/graph-explicit-empty.cpp
+++ b/sycl/test/graph/graph-explicit-empty.cpp
@@ -5,46 +5,48 @@
 
 int main() {
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order{},
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::gpu_selector_v, properties};
+  sycl::queue q{sycl::gpu_selector_v};
 
   sycl::ext::oneapi::experimental::command_graph g;
 
   const size_t n = 10;
-  float *h_arr = sycl::malloc_host<float>(n, q);
   float *arr = sycl::malloc_device<float>(n, q);
-  
+
   auto start = g.add();
 
-  auto init = g.add([&](sycl::handler &h) {
-      h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) {
-        size_t i = idx;
-        arr[i] = 0;
-      });
-  }, {start});
-    
+  auto init = g.add(
+      [&](sycl::handler &h) {
+        h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) {
+          size_t i = idx;
+          arr[i] = 0;
+        });
+      },
+      {start});
+
   auto empty = g.add({init});
+  auto empty2 = g.add({empty});
 
-  g.add([&](sycl::handler &h) {
-    h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) {
-      size_t i = idx;
-      arr[i] = 1;
-    });
-  }, {empty});
+  g.add(
+      [&](sycl::handler &h) {
+        h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) {
+          size_t i = idx;
+          arr[i] = 1;
+        });
+      },
+      {empty2});
 
   auto executable_graph = g.finalize(q.get_context());
 
-  q.submit([&](sycl::handler &h) { h.ext_oneapi_graph(executable_graph); }).wait();
+  q.submit([&](sycl::handler &h) {
+     h.ext_oneapi_graph(executable_graph);
+   }).wait();
 
-  q.memcpy(&(h_arr[0]), arr, n).wait();
+  std::vector<float> HostData(n);
+  q.memcpy(HostData.data(), arr, n * sizeof(float)).wait();
 
   for (int i = 0; i < n; i++)
-    assert(h_arr[i] == 1.0f);
+    assert(HostData[i] == 1.f);
 
-  sycl::free(h_arr, q);
   sycl::free(arr, q);
 
   return 0;
diff --git a/sycl/test/graph/graph-explicit-multiple-exec-graphs.cpp b/sycl/test/graph/graph-explicit-multiple-exec-graphs.cpp
new file mode 100644
index 0000000000000..ece78bda980f3
--- /dev/null
+++ b/sycl/test/graph/graph-explicit-multiple-exec-graphs.cpp
@@ -0,0 +1,100 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+#include <sycl/sycl.hpp>
+
+#include <sycl/ext/oneapi/experimental/graph.hpp>
+
+const size_t n = 10;
+
+float host_gold_result() {
+  float alpha = 1.0f;
+  float beta = 2.0f;
+  float gamma = 3.0f;
+
+  float sum = 0.0f;
+
+  for (size_t i = 0; i < n; ++i) {
+    sum += (alpha * 1.0f + beta * 2.0f) * (gamma * 3.0f + beta * 2.0f);
+  }
+
+  return sum;
+}
+
+int main() {
+  float alpha = 1.0f;
+  float beta = 2.0f;
+  float gamma = 3.0f;
+
+  sycl::queue q{sycl::gpu_selector_v};
+
+  sycl::ext::oneapi::experimental::command_graph g;
+
+  float *dotp = sycl::malloc_shared<float>(1, q);
+
+  float *x = sycl::malloc_shared<float>(n, q);
+  float *y = sycl::malloc_shared<float>(n, q);
+  float *z = sycl::malloc_shared<float>(n, q);
+
+  /* init data on the device */
+  auto n_i = g.add([&](sycl::handler &h) {
+    h.parallel_for(n, [=](sycl::id<1> it) {
+      const size_t i = it[0];
+      x[i] = 1.0f;
+      y[i] = 2.0f;
+      z[i] = 3.0f;
+      dotp[0] = 0.0f;
+    });
+  });
+
+  auto node_a = g.add(
+      [&](sycl::handler &h) {
+        h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
+          const size_t i = it[0];
+          x[i] = alpha * x[i] + beta * y[i];
+        });
+      },
+      {n_i});
+
+  auto node_b = g.add(
+      [&](sycl::handler &h) {
+        h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
+          const size_t i = it[0];
+          z[i] = gamma * z[i] + beta * y[i];
+        });
+      },
+      {n_i});
+
+  auto node_c = g.add(
+      [&](sycl::handler &h) {
+        h.single_task([=]() {
+          for (size_t j = 0; j < n; j++) {
+            dotp[0] += x[j] * z[j];
+          }
+        });
+      },
+      {node_a, node_b});
+
+  auto executable_graph = g.finalize(q.get_context());
+
+  // Add an extra node for the second executable graph which modifies the output
+  auto node_d =
+      g.add([&](sycl::handler &h) { h.single_task([=]() { dotp[0] += 1; }); },
+            {node_c});
+
+  auto executable_graph_2 = g.finalize(q.get_context());
+
+  // Using shortcut for executing a graph of commands
+  q.ext_oneapi_graph(executable_graph).wait();
+
+  assert(*dotp == host_gold_result());
+
+  q.ext_oneapi_graph(executable_graph_2).wait();
+
+  assert(*dotp == host_gold_result() + 1);
+
+  sycl::free(dotp, q);
+  sycl::free(x, q);
+  sycl::free(y, q);
+  sycl::free(z, q);
+
+  return 0;
+}
diff --git a/sycl/test/graph/graph-explicit-node-ordering.cpp b/sycl/test/graph/graph-explicit-node-ordering.cpp
index 2ac11bbba28f2..5f6bf86bdbb23 100644
--- a/sycl/test/graph/graph-explicit-node-ordering.cpp
+++ b/sycl/test/graph/graph-explicit-node-ordering.cpp
@@ -5,11 +5,7 @@
 
 int main() {
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order{},
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::gpu_selector_v, properties};
+  sycl::queue q{sycl::gpu_selector_v};
 
   sycl::ext::oneapi::experimental::command_graph g;
 
diff --git a/sycl/test/graph/graph-explicit-queue-shortcuts.cpp b/sycl/test/graph/graph-explicit-queue-shortcuts.cpp
index 8f6d0c668af8b..145e10c241c9b 100644
--- a/sycl/test/graph/graph-explicit-queue-shortcuts.cpp
+++ b/sycl/test/graph/graph-explicit-queue-shortcuts.cpp
@@ -5,11 +5,7 @@
 
 int main() {
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order{},
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::gpu_selector_v, properties};
+  sycl::queue q{sycl::gpu_selector_v};
 
   // Test passing empty property list, which is the default
   sycl::property_list empty_properties;
diff --git a/sycl/test/graph/graph-explicit-reduction.cpp b/sycl/test/graph/graph-explicit-reduction.cpp
index d652c1d9e3ada..d18ef5dd48a3e 100644
--- a/sycl/test/graph/graph-explicit-reduction.cpp
+++ b/sycl/test/graph/graph-explicit-reduction.cpp
@@ -4,11 +4,7 @@
 #include <sycl/ext/oneapi/experimental/graph.hpp>
 
 int main() {
-  sycl::property_list properties{
-      sycl::property::queue::in_order{},
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::gpu_selector_v, properties};
+  sycl::queue q{sycl::gpu_selector_v};
 
   sycl::ext::oneapi::experimental::command_graph g;
 
diff --git a/sycl/test/graph/graph-explicit-repeated-exec.cpp b/sycl/test/graph/graph-explicit-repeated-exec.cpp
index 23b9f8fbead7b..9fe8c7cfe6708 100644
--- a/sycl/test/graph/graph-explicit-repeated-exec.cpp
+++ b/sycl/test/graph/graph-explicit-repeated-exec.cpp
@@ -5,11 +5,7 @@
 
 int main() {
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order{},
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::gpu_selector_v, properties};
+  sycl::queue q{sycl::gpu_selector_v};
 
   sycl::ext::oneapi::experimental::command_graph g;
 
diff --git a/sycl/test/graph/graph-explicit-saxpy.cpp b/sycl/test/graph/graph-explicit-saxpy.cpp
index e8a422ad23f3a..fd4bf797d73ba 100644
--- a/sycl/test/graph/graph-explicit-saxpy.cpp
+++ b/sycl/test/graph/graph-explicit-saxpy.cpp
@@ -5,11 +5,7 @@
 
 int main() {
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order{},
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::gpu_selector_v, properties};
+  sycl::queue q{sycl::gpu_selector_v};
 
   sycl::ext::oneapi::experimental::command_graph g;
 
diff --git a/sycl/test/graph/graph-explicit-single-node.cpp b/sycl/test/graph/graph-explicit-single-node.cpp
index b1461f308edc0..6047b6008cea8 100644
--- a/sycl/test/graph/graph-explicit-single-node.cpp
+++ b/sycl/test/graph/graph-explicit-single-node.cpp
@@ -6,11 +6,7 @@
 
 int main() {
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order{},
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::gpu_selector_v, properties};
+  sycl::queue q{sycl::gpu_selector_v};
 
   sycl::ext::oneapi::experimental::command_graph g;
 
@@ -23,7 +19,7 @@ int main() {
   g.add([&](sycl::handler &h) {
     h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) {
       size_t i = idx;
-      arr[i] = 1;
+      arr[i] = 3.14f;
     });
   });
 
@@ -42,7 +38,7 @@ int main() {
    }).wait();
 
   for (int i = 0; i < n; i++)
-    assert(arr[i] == 1);
+    assert(arr[i] == 3.14f);
 
   sycl::free(arr, q);
 
diff --git a/sycl/test/graph/graph-explicit-subgraph.cpp b/sycl/test/graph/graph-explicit-subgraph.cpp
index 5510f62a50c9c..5b33e53d417f5 100644
--- a/sycl/test/graph/graph-explicit-subgraph.cpp
+++ b/sycl/test/graph/graph-explicit-subgraph.cpp
@@ -27,11 +27,7 @@ int main() {
   float beta = 2.0f;
   float gamma = 3.0f;
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order{},
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::gpu_selector_v, properties};
+  sycl::queue q{sycl::gpu_selector_v};
 
   sycl::ext::oneapi::experimental::command_graph g;
   sycl::ext::oneapi::experimental::command_graph subGraph;
diff --git a/sycl/test/graph/graph-record-dotp-buffer.cpp b/sycl/test/graph/graph-record-dotp-buffer.cpp
index 0ee55942f031b..39da200194539 100644
--- a/sycl/test/graph/graph-record-dotp-buffer.cpp
+++ b/sycl/test/graph/graph-record-dotp-buffer.cpp
@@ -24,11 +24,7 @@ int main() {
   float beta = 2.0f;
   float gamma = 3.0f;
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order(),
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::gpu_selector_v, properties};
+  sycl::queue q{sycl::gpu_selector_v};
 
   sycl::ext::oneapi::experimental::command_graph g;
 
diff --git a/sycl/test/graph/graph-record-dotp.cpp b/sycl/test/graph/graph-record-dotp.cpp
index cfe9d59f39735..d6eb70f359370 100644
--- a/sycl/test/graph/graph-record-dotp.cpp
+++ b/sycl/test/graph/graph-record-dotp.cpp
@@ -24,11 +24,7 @@ int main() {
   float beta = 2.0f;
   float gamma = 3.0f;
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order(),
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::gpu_selector_v, properties};
+  sycl::queue q{sycl::gpu_selector_v};
 
   sycl::ext::oneapi::experimental::command_graph g;
 
@@ -41,7 +37,7 @@ int main() {
   g.begin_recording(q);
 
   /* init data on the device */
-  q.submit([&](sycl::handler &h) {
+  auto initEvent = q.submit([&](sycl::handler &h) {
     h.parallel_for(n, [=](sycl::id<1> it) {
       const size_t i = it[0];
       x[i] = 1.0f;
@@ -50,14 +46,16 @@ int main() {
     });
   });
 
-  q.submit([&](sycl::handler &h) {
+  auto eventA = q.submit([&](sycl::handler &h) {
+    h.depends_on(initEvent);
     h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
       const size_t i = it[0];
       x[i] = alpha * x[i] + beta * y[i];
     });
   });
 
-  q.submit([&](sycl::handler &h) {
+  auto eventB = q.submit([&](sycl::handler &h) {
+    h.depends_on(initEvent);
     h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
       const size_t i = it[0];
       z[i] = gamma * z[i] + beta * y[i];
@@ -65,6 +63,7 @@ int main() {
   });
 
   q.submit([&](sycl::handler &h) {
+    h.depends_on({eventA, eventB});
 #ifdef TEST_GRAPH_REDUCTIONS
     h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()),
                    [=](sycl::id<1> it, auto &sum) {
diff --git a/sycl/test/graph/graph-record-simple.cpp b/sycl/test/graph/graph-record-simple.cpp
index d1956d8704d70..b5e844ff7adaa 100644
--- a/sycl/test/graph/graph-record-simple.cpp
+++ b/sycl/test/graph/graph-record-simple.cpp
@@ -8,11 +8,7 @@ int main() {
   const size_t n = 10;
   const float expectedValue = 7.f;
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order(),
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::default_selector_v, properties};
+  sycl::queue q{sycl::default_selector_v};
   sycl::queue q2;
 
   sycl::ext::oneapi::experimental::command_graph g;
diff --git a/sycl/test/graph/graph-record-temp-scope.cpp b/sycl/test/graph/graph-record-temp-scope.cpp
index a4ac296dd6d29..1adb83e6e6cf0 100644
--- a/sycl/test/graph/graph-record-temp-scope.cpp
+++ b/sycl/test/graph/graph-record-temp-scope.cpp
@@ -20,11 +20,7 @@ void run_some_kernel(sycl::queue q, float *data) {
 
 int main() {
 
-  sycl::property_list properties{
-      sycl::property::queue::in_order(),
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue q{sycl::default_selector_v, properties};
+  sycl::queue q{sycl::default_selector_v};
 
   sycl::ext::oneapi::experimental::command_graph g;
 
diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp
index ccaab4e0c1fef..94cb586b16872 100644
--- a/sycl/unittests/Extensions/CommandGraph.cpp
+++ b/sycl/unittests/Extensions/CommandGraph.cpp
@@ -35,15 +35,6 @@ class CommandGraphTest : public ::testing::Test {
   experimental::command_graph<experimental::graph_state::modifiable> Graph;
 };
 
-TEST_F(CommandGraphTest, LazyQueueProperty) {
-  sycl::property_list Props{
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue Queue{Dev, Props};
-  bool hasProp =
-      Queue.has_property<sycl::ext::oneapi::property::queue::lazy_execution>();
-  ASSERT_TRUE(hasProp);
-}
 
 TEST_F(CommandGraphTest, AddNode) {
   using namespace sycl::ext::oneapi;
@@ -98,11 +89,8 @@ TEST_F(CommandGraphTest, MakeEdge) {
 TEST_F(CommandGraphTest, BeginEndRecording) {
   using namespace sycl::ext::oneapi;
 
-  sycl::property_list Props{
-      sycl::ext::oneapi::property::queue::lazy_execution{}};
-
-  sycl::queue Queue{Dev, Props};
-  sycl::queue Queue2{Dev, Props};
+  sycl::queue Queue{Dev};
+  sycl::queue Queue2{Dev};
 
   // Test throwing behaviour
   // Check we can repeatedly begin recording on the same queues