IntelPython · oleksandr-pavlyk · Mar 13, 2023 · Mar 9, 2023 · Mar 9, 2023 · Mar 9, 2023
@@ -70,6 +70,7 @@ using dpctl::utils::keep_args_alive;
 
 sycl::event _populate_packed_shape_strides_for_copycast_kernel(
     sycl::queue exec_q,
+    std::vector<sycl::event> &host_task_events,
     py::ssize_t *device_shape_strides, // to be populated
     const std::vector<py::ssize_t> &common_shape,
     const std::vector<py::ssize_t> &src_strides,
@@ -102,13 +103,14 @@ sycl::event _populate_packed_shape_strides_for_copycast_kernel(
         shp_host_shape_strides->data(), device_shape_strides,
         shp_host_shape_strides->size());
 
-    exec_q.submit([&](sycl::handler &cgh) {
+    auto shared_ptr_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(copy_shape_ev);
         cgh.host_task([shp_host_shape_strides]() {
             // increment shared pointer ref-count to keep it alive
             // till copy operation completes;
         });
     });
+    host_task_events.push_back(shared_ptr_cleanup_ev);
 
     return copy_shape_ev;
 }
@@ -306,26 +308,30 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
         throw std::runtime_error("Unabled to allocate device memory");
     }
 
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
     sycl::event copy_shape_ev =
         _populate_packed_shape_strides_for_copycast_kernel(
-            exec_q, shape_strides, simplified_shape, simplified_src_strides,
-            simplified_dst_strides);
+            exec_q, host_task_events, shape_strides, simplified_shape,
+            simplified_src_strides, simplified_dst_strides);
 
     sycl::event copy_and_cast_generic_ev = copy_and_cast_fn(
         exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data,
         dst_offset, depends, {copy_shape_ev});
 
     // async free of shape_strides temporary
     auto ctx = exec_q.get_context();
-    exec_q.submit([&](sycl::handler &cgh) {
+    auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(copy_and_cast_generic_ev);
         cgh.host_task(
             [ctx, shape_strides]() { sycl::free(shape_strides, ctx); });
     });
 
-    return std::make_pair(
-        keep_args_alive(exec_q, {src, dst}, {copy_and_cast_generic_ev}),
-        copy_and_cast_generic_ev);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          temporaries_cleanup_ev);
 }
 
 void init_copy_and_cast_usm_to_usm_dispatch_tables(void)

@@ -202,11 +202,14 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src,
                       dst_nd);
     }
 
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
     // copy packed shapes and strides from host to devices
     sycl::event packed_shape_strides_copy_ev = exec_q.copy<py::ssize_t>(
         packed_host_shapes_strides_shp->data(), packed_shapes_strides,
         packed_host_shapes_strides_shp->size());
-    exec_q.submit([&](sycl::handler &cgh) {
+    auto shared_ptr_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(packed_shape_strides_copy_ev);
         cgh.host_task([packed_host_shapes_strides_shp] {
             // Capturing shared pointer ensures that the underlying vector is
@@ -215,6 +218,8 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src,
         });
     });
 
+    host_task_events.push_back(shared_ptr_cleanup_ev);
+
     char *src_data = src.get_data();
     char *dst_data = dst.get_data();
 
@@ -226,17 +231,18 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src,
         fn(exec_q, shift, src_nelems, src_nd, dst_nd, packed_shapes_strides,
            src_data, dst_data, all_deps);
 
-    exec_q.submit([&](sycl::handler &cgh) {
+    auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(copy_for_reshape_event);
         auto ctx = exec_q.get_context();
         cgh.host_task([packed_shapes_strides, ctx]() {
             sycl::free(packed_shapes_strides, ctx);
         });
     });
 
-    return std::make_pair(
-        keep_args_alive(exec_q, {src, dst}, {copy_for_reshape_event}),
-        copy_for_reshape_event);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          temporaries_cleanup_ev);
 }
 
 void init_copy_for_reshape_dispatch_vectors(void)

@@ -180,17 +180,15 @@ _populate_kernel_params(sycl::queue exec_q,
         host_along_sh_st_shp->data(), device_along_sh_st,
         host_along_sh_st_shp->size());
 
-    sycl::event shared_ptr_cleanup_host_task =
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on({device_along_sh_st_copy_ev,
-                            device_orthog_sh_st_copy_ev,
-                            device_ind_offsets_copy_ev,
-                            device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev});
-            cgh.host_task([host_ind_offsets_shp, host_ind_sh_st_shp,
-                           host_ind_ptrs_shp, host_orthog_sh_st_shp,
-                           host_along_sh_st_shp]() {});
-        });
-    host_task_events.push_back(shared_ptr_cleanup_host_task);
+    sycl::event shared_ptr_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on({device_along_sh_st_copy_ev, device_orthog_sh_st_copy_ev,
+                        device_ind_offsets_copy_ev, device_ind_sh_st_copy_ev,
+                        device_ind_ptrs_copy_ev});
+        cgh.host_task([host_ind_offsets_shp, host_ind_sh_st_shp,
+                       host_ind_ptrs_shp, host_orthog_sh_st_shp,
+                       host_along_sh_st_shp]() {});
+    });
+    host_task_events.push_back(shared_ptr_cleanup_ev);
 
     std::vector<sycl::event> sh_st_pack_deps{
         device_ind_ptrs_copy_ev, device_ind_sh_st_copy_ev,

@@ -250,7 +250,7 @@ usm_ndarray_triul(sycl::queue exec_q,
                dev_shape_and_strides, k, depends, {copy_shape_and_strides});
     }
 
-    exec_q.submit([&](sycl::handler &cgh) {
+    auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on({tri_ev});
         auto ctx = exec_q.get_context();
         cgh.host_task(
@@ -261,8 +261,9 @@ usm_ndarray_triul(sycl::queue exec_q,
             });
     });
 
-    return std::make_pair(keep_args_alive(exec_q, {src, dst}, {tri_ev}),
-                          tri_ev);
+    return std::make_pair(
+        keep_args_alive(exec_q, {src, dst}, {temporaries_cleanup_ev}),
+        temporaries_cleanup_ev);
 }
 
 void init_triul_ctor_dispatch_vectors(void)