Skip to content

Commit

Permalink
Merge pull request #1806 from IntelPython/fixme-async-memset
Browse files Browse the repository at this point in the history
  • Loading branch information
oleksandr-pavlyk authored Aug 22, 2024
2 parents 4297fef + 640e706 commit cfba263
Show file tree
Hide file tree
Showing 7 changed files with 298 additions and 6 deletions.
1 change: 1 addition & 0 deletions dpctl/tensor/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ set(_tensor_impl_sources
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp
Expand Down
7 changes: 5 additions & 2 deletions dpctl/tensor/_ctors.py
Original file line number Diff line number Diff line change
Expand Up @@ -945,8 +945,11 @@ def zeros(
order=order,
buffer_ctor_kwargs={"queue": sycl_queue},
)
# FIXME: replace with asynchronous call to ti
res.usm_data.memset()
_manager = dpctl.utils.SequentialOrderManager[sycl_queue]
# populating new allocation, no dependent events
hev, zeros_ev = ti._zeros_usm_ndarray(res, sycl_queue)
_manager.add_event_pair(hev, zeros_ev)

return res


Expand Down
62 changes: 58 additions & 4 deletions dpctl/tensor/libtensor/source/full_ctor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,65 @@ sycl::event full_contig_impl(sycl::queue &exec_q,
{
dstTy fill_v = py::cast<dstTy>(py_value);

using dpctl::tensor::kernels::constructors::full_contig_impl;
sycl::event fill_ev;

sycl::event fill_ev =
full_contig_impl<dstTy>(exec_q, nelems, fill_v, dst_p, depends);
if constexpr (sizeof(dstTy) == sizeof(char)) {
const auto memset_val = sycl::bit_cast<unsigned char>(fill_v);
fill_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);

cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
nelems * sizeof(dstTy));
});
}
else {
bool is_zero = false;
if constexpr (sizeof(dstTy) == 1) {
is_zero = (std::uint8_t{0} == sycl::bit_cast<std::uint8_t>(fill_v));
}
else if constexpr (sizeof(dstTy) == 2) {
is_zero =
(std::uint16_t{0} == sycl::bit_cast<std::uint16_t>(fill_v));
}
else if constexpr (sizeof(dstTy) == 4) {
is_zero =
(std::uint32_t{0} == sycl::bit_cast<std::uint32_t>(fill_v));
}
else if constexpr (sizeof(dstTy) == 8) {
is_zero =
(std::uint64_t{0} == sycl::bit_cast<std::uint64_t>(fill_v));
}
else if constexpr (sizeof(dstTy) == 16) {
struct UInt128
{

constexpr UInt128() : v1{}, v2{} {}
UInt128(const UInt128 &) = default;

operator bool() const { return bool(!v1) && bool(!v2); }

std::uint64_t v1;
std::uint64_t v2;
};
is_zero = static_cast<bool>(sycl::bit_cast<UInt128>(fill_v));
}

if (is_zero) {
constexpr int memset_val = 0;
fill_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);

cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
nelems * sizeof(dstTy));
});
}
else {
using dpctl::tensor::kernels::constructors::full_contig_impl;

fill_ev =
full_contig_impl<dstTy>(exec_q, nelems, fill_v, dst_p, depends);
}
}

return fill_ev;
}
Expand Down Expand Up @@ -126,7 +181,6 @@ usm_ndarray_full(const py::object &py_value,
int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);

char *dst_data = dst.get_data();
sycl::event full_event;

if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) {
auto fn = full_contig_dispatch_vector[dst_typeid];
Expand Down
10 changes: 10 additions & 0 deletions dpctl/tensor/libtensor/source/tensor_ctors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
#include "utils/memory_overlap.hpp"
#include "utils/strided_iters.hpp"
#include "where.hpp"
#include "zeros_ctor.hpp"

namespace py = pybind11;

Expand Down Expand Up @@ -92,6 +93,10 @@ using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step;

using dpctl::tensor::py_internal::usm_ndarray_full;

/* ================ Zeros ================== */

using dpctl::tensor::py_internal::usm_ndarray_zeros;

/* ============== Advanced Indexing ============= */
using dpctl::tensor::py_internal::usm_ndarray_put;
using dpctl::tensor::py_internal::usm_ndarray_take;
Expand Down Expand Up @@ -142,6 +147,7 @@ void init_dispatch_vectors(void)
init_copy_for_roll_dispatch_vectors();
init_linear_sequences_dispatch_vectors();
init_full_ctor_dispatch_vectors();
init_zeros_ctor_dispatch_vectors();
init_eye_ctor_dispatch_vectors();
init_triul_ctor_dispatch_vectors();

Expand Down Expand Up @@ -291,6 +297,10 @@ PYBIND11_MODULE(_tensor_impl, m)
py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
py::arg("depends") = py::list());

m.def("_zeros_usm_ndarray", &usm_ndarray_zeros,
"Populate usm_ndarray `dst` with zeros.", py::arg("dst"),
py::arg("sycl_queue"), py::arg("depends") = py::list());

m.def("_full_usm_ndarray", &usm_ndarray_full,
"Populate usm_ndarray `dst` with given fill_value.",
py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"),
Expand Down
154 changes: 154 additions & 0 deletions dpctl/tensor/libtensor/source/zeros_ctor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
//
// Data Parallel Control (dpctl)
//
// Copyright 2020-2024 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===--------------------------------------------------------------------===//
///
/// \file
/// This file defines functions of dpctl.tensor._tensor_impl extensions
//===--------------------------------------------------------------------===//

#include "dpctl4pybind11.hpp"
#include <complex>
#include <pybind11/complex.h>
#include <pybind11/pybind11.h>
#include <sycl/sycl.hpp>
#include <utility>
#include <vector>

#include "kernels/constructors.hpp"
#include "utils/output_validation.hpp"
#include "utils/type_dispatch.hpp"
#include "utils/type_utils.hpp"

#include "zeros_ctor.hpp"

namespace py = pybind11;
namespace td_ns = dpctl::tensor::type_dispatch;

namespace dpctl
{
namespace tensor
{
namespace py_internal
{

using dpctl::utils::keep_args_alive;

typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &,
size_t,
char *,
const std::vector<sycl::event> &);

/*!
* @brief Function to submit kernel to fill given contiguous memory allocation
* with zeros.
*
* @param exec_q Sycl queue to which kernel is submitted for execution.
* @param nelems Length of the sequence
* @param dst_p Kernel accessible USM pointer to the start of array to be
* populated.
* @param depends List of events to wait for before starting computations, if
* any.
*
* @return Event to wait on to ensure that computation completes.
* @defgroup CtorKernels
*/
template <typename dstTy>
sycl::event zeros_contig_impl(sycl::queue &exec_q,
                              size_t nelems,
                              char *dst_p,
                              const std::vector<sycl::event> &depends)
{
    // Zeroing any element type is a plain byte-wise memset of the
    // whole contiguous allocation.
    const size_t n_bytes = nelems * sizeof(dstTy);

    return exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);
        cgh.memset(static_cast<void *>(dst_p), 0, n_bytes);
    });
}

template <typename fnT, typename Ty> struct ZerosContigFactory
{
    // Yields the typed zero-fill implementation as a generic function
    // pointer for the dispatch table.
    fnT get() { return zeros_contig_impl<Ty>; }
};

static zeros_contig_fn_ptr_t zeros_contig_dispatch_vector[td_ns::num_types];

std::pair<sycl::event, sycl::event>
usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
                  sycl::queue &exec_q,
                  const std::vector<sycl::event> &depends)
{
    const py::ssize_t n_elems = dst.get_size();

    // Empty array: nothing to submit, return ready events.
    if (n_elems == 0) {
        return std::make_pair(sycl::event(), sycl::event());
    }

    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
        throw py::value_error(
            "Execution queue is not compatible with the allocation queue");
    }

    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);

    // Map the array's typenum to the dispatch-table index.
    auto array_types = td_ns::usm_ndarray_types();
    const int type_id = array_types.typenum_to_lookup_id(dst.get_typenum());

    char *data_p = dst.get_data();

    // A single element is trivially contiguous; otherwise require C- or
    // F-contiguity so a single memset covers the whole allocation.
    const bool is_contig =
        (n_elems == 1) || dst.is_c_contiguous() || dst.is_f_contiguous();
    if (!is_contig) {
        throw std::runtime_error(
            "Only population of contiguous usm_ndarray objects is supported.");
    }

    auto fn = zeros_contig_dispatch_vector[type_id];
    sycl::event fill_ev =
        fn(exec_q, static_cast<size_t>(n_elems), data_p, depends);

    // First event keeps `dst` alive until the fill completes; second is
    // the computation event callers may depend on.
    return std::make_pair(keep_args_alive(exec_q, {dst}, {fill_ev}), fill_ev);
}

void init_zeros_ctor_dispatch_vectors(void)
{
using namespace td_ns;

DispatchVectorBuilder<zeros_contig_fn_ptr_t, ZerosContigFactory, num_types>
dvb;
dvb.populate_dispatch_vector(zeros_contig_dispatch_vector);

return;
}

} // namespace py_internal
} // namespace tensor
} // namespace dpctl
49 changes: 49 additions & 0 deletions dpctl/tensor/libtensor/source/zeros_ctor.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
//
// Data Parallel Control (dpctl)
//
// Copyright 2020-2024 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//===--------------------------------------------------------------------===//
///
/// \file
/// This file declares functions of dpctl.tensor._tensor_impl extensions
//===--------------------------------------------------------------------===//

#pragma once
#include <sycl/sycl.hpp>
#include <utility>
#include <vector>

#include "dpctl4pybind11.hpp"
#include <pybind11/pybind11.h>

namespace dpctl
{
namespace tensor
{
namespace py_internal
{

/// Submits a kernel filling contiguous `dst` with zeros on `exec_q`.
/// Returns a pair of events: the first keeps arguments alive until
/// completion, the second is the fill event callers may depend on.
extern std::pair<sycl::event, sycl::event>
usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
                  sycl::queue &exec_q,
                  const std::vector<sycl::event> &depends = {});

/// Populates the per-dtype dispatch table used by usm_ndarray_zeros;
/// must be called once before the first use.
extern void init_zeros_ctor_dispatch_vectors(void);

} // namespace py_internal
} // namespace tensor
} // namespace dpctl
21 changes: 21 additions & 0 deletions dpctl/tests/test_usm_ndarray_ctor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1682,6 +1682,27 @@ def test_full(dtype):
assert np.array_equal(dpt.asnumpy(X), np.full(10, 4, dtype=dtype))


def test_full_cmplx128():
    """Check ``dpt.full`` for complex128 fill values against NumPy.

    Covers a fully non-zero value, an imaginary-only value, and the
    all-zero value (the zero case exercises the memset fast path added
    in the C++ ``full_contig_impl``).
    """
    q = get_queue_or_skip()
    dtype = "c16"
    skip_if_dtype_not_supported(dtype, q)
    # Same check for each representative fill value, deduplicated.
    for fill_v in (1 + 1j, 0 + 1j, 0 + 0j):
        X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
        assert np.array_equal(
            dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
        )


def test_full_dtype_inference():
try:
X = dpt.full(10, 4)
Expand Down

0 comments on commit cfba263

Please sign in to comment.