Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Host task cleanup #1111

Merged
merged 4 commits into from
Mar 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ using dpctl::utils::keep_args_alive;

sycl::event _populate_packed_shape_strides_for_copycast_kernel(
sycl::queue exec_q,
std::vector<sycl::event> &host_task_events,
py::ssize_t *device_shape_strides, // to be populated
const std::vector<py::ssize_t> &common_shape,
const std::vector<py::ssize_t> &src_strides,
Expand Down Expand Up @@ -102,13 +103,14 @@ sycl::event _populate_packed_shape_strides_for_copycast_kernel(
shp_host_shape_strides->data(), device_shape_strides,
shp_host_shape_strides->size());

exec_q.submit([&](sycl::handler &cgh) {
auto shared_ptr_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(copy_shape_ev);
cgh.host_task([shp_host_shape_strides]() {
// increment shared pointer ref-count to keep it alive
// till copy operation completes;
});
});
host_task_events.push_back(shared_ptr_cleanup_ev);

return copy_shape_ev;
}
Expand Down Expand Up @@ -306,26 +308,30 @@ copy_usm_ndarray_into_usm_ndarray(dpctl::tensor::usm_ndarray src,
throw std::runtime_error("Unabled to allocate device memory");
}

std::vector<sycl::event> host_task_events;
host_task_events.reserve(2);

sycl::event copy_shape_ev =
_populate_packed_shape_strides_for_copycast_kernel(
exec_q, shape_strides, simplified_shape, simplified_src_strides,
simplified_dst_strides);
exec_q, host_task_events, shape_strides, simplified_shape,
simplified_src_strides, simplified_dst_strides);

sycl::event copy_and_cast_generic_ev = copy_and_cast_fn(
exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data,
dst_offset, depends, {copy_shape_ev});

// async free of shape_strides temporary
auto ctx = exec_q.get_context();
exec_q.submit([&](sycl::handler &cgh) {
auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(copy_and_cast_generic_ev);
cgh.host_task(
[ctx, shape_strides]() { sycl::free(shape_strides, ctx); });
});

return std::make_pair(
keep_args_alive(exec_q, {src, dst}, {copy_and_cast_generic_ev}),
copy_and_cast_generic_ev);
host_task_events.push_back(temporaries_cleanup_ev);

return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
temporaries_cleanup_ev);
}

void init_copy_and_cast_usm_to_usm_dispatch_tables(void)
Expand Down
16 changes: 11 additions & 5 deletions dpctl/tensor/libtensor/source/copy_for_reshape.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,14 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src,
dst_nd);
}

std::vector<sycl::event> host_task_events;
host_task_events.reserve(2);

// copy packed shapes and strides from host to devices
sycl::event packed_shape_strides_copy_ev = exec_q.copy<py::ssize_t>(
packed_host_shapes_strides_shp->data(), packed_shapes_strides,
packed_host_shapes_strides_shp->size());
exec_q.submit([&](sycl::handler &cgh) {
auto shared_ptr_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(packed_shape_strides_copy_ev);
cgh.host_task([packed_host_shapes_strides_shp] {
// Capturing shared pointer ensures that the underlying vector is
Expand All @@ -215,6 +218,8 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src,
});
});

host_task_events.push_back(shared_ptr_cleanup_ev);

char *src_data = src.get_data();
char *dst_data = dst.get_data();

Expand All @@ -226,17 +231,18 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src,
fn(exec_q, shift, src_nelems, src_nd, dst_nd, packed_shapes_strides,
src_data, dst_data, all_deps);

exec_q.submit([&](sycl::handler &cgh) {
auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(copy_for_reshape_event);
auto ctx = exec_q.get_context();
cgh.host_task([packed_shapes_strides, ctx]() {
sycl::free(packed_shapes_strides, ctx);
});
});

return std::make_pair(
keep_args_alive(exec_q, {src, dst}, {copy_for_reshape_event}),
copy_for_reshape_event);
host_task_events.push_back(temporaries_cleanup_ev);

return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
temporaries_cleanup_ev);
}

void init_copy_for_reshape_dispatch_vectors(void)
Expand Down
20 changes: 9 additions & 11 deletions dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,17 +180,15 @@ _populate_kernel_params(sycl::queue exec_q,
host_along_sh_st_shp->data(), device_along_sh_st,
host_along_sh_st_shp->size());

sycl::event shared_ptr_cleanup_host_task =
exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on({device_along_sh_st_copy_ev,
device_orthog_sh_st_copy_ev,
device_ind_offsets_copy_ev,
device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev});
cgh.host_task([host_ind_offsets_shp, host_ind_sh_st_shp,
host_ind_ptrs_shp, host_orthog_sh_st_shp,
host_along_sh_st_shp]() {});
});
host_task_events.push_back(shared_ptr_cleanup_host_task);
sycl::event shared_ptr_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on({device_along_sh_st_copy_ev, device_orthog_sh_st_copy_ev,
device_ind_offsets_copy_ev, device_ind_sh_st_copy_ev,
device_ind_ptrs_copy_ev});
cgh.host_task([host_ind_offsets_shp, host_ind_sh_st_shp,
host_ind_ptrs_shp, host_orthog_sh_st_shp,
host_along_sh_st_shp]() {});
});
host_task_events.push_back(shared_ptr_cleanup_ev);

std::vector<sycl::event> sh_st_pack_deps{
device_ind_ptrs_copy_ev, device_ind_sh_st_copy_ev,
Expand Down
7 changes: 4 additions & 3 deletions dpctl/tensor/libtensor/source/triul_ctor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ usm_ndarray_triul(sycl::queue exec_q,
dev_shape_and_strides, k, depends, {copy_shape_and_strides});
}

exec_q.submit([&](sycl::handler &cgh) {
auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on({tri_ev});
auto ctx = exec_q.get_context();
cgh.host_task(
Expand All @@ -261,8 +261,9 @@ usm_ndarray_triul(sycl::queue exec_q,
});
});

return std::make_pair(keep_args_alive(exec_q, {src, dst}, {tri_ev}),
tri_ev);
return std::make_pair(
keep_args_alive(exec_q, {src, dst}, {temporaries_cleanup_ev}),
temporaries_cleanup_ev);
}

void init_triul_ctor_dispatch_vectors(void)
Expand Down