diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
index 7ddea50564..65b96d981e 100644
--- a/dpctl/tensor/CMakeLists.txt
+++ b/dpctl/tensor/CMakeLists.txt
@@ -129,6 +129,7 @@
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp
diff --git a/dpctl/tensor/_ctors.py b/dpctl/tensor/_ctors.py
index 90a6208c87..e95280523e 100644
--- a/dpctl/tensor/_ctors.py
+++ b/dpctl/tensor/_ctors.py
@@ -945,8 +945,11 @@ def zeros(
         order=order,
         buffer_ctor_kwargs={"queue": sycl_queue},
     )
-    # FIXME: replace with asynchronous call to ti
-    res.usm_data.memset()
+    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+    # populating new allocation, no dependent events
+    hev, zeros_ev = ti._zeros_usm_ndarray(res, sycl_queue)
+    _manager.add_event_pair(hev, zeros_ev)
+
     return res


diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp
index 41b3093652..26029e028a 100644
--- a/dpctl/tensor/libtensor/source/full_ctor.cpp
+++ b/dpctl/tensor/libtensor/source/full_ctor.cpp
@@ -80,10 +80,65 @@ sycl::event full_contig_impl(sycl::queue &exec_q,
 {
     dstTy fill_v = py::cast<dstTy>(py_value);

-    using dpctl::tensor::kernels::constructors::full_contig_impl;
+    sycl::event fill_ev;

-    sycl::event fill_ev =
-        full_contig_impl<dstTy>(exec_q, nelems, fill_v, dst_p, depends);
+    if constexpr (sizeof(dstTy) == sizeof(char)) {
+        const auto memset_val = sycl::bit_cast<unsigned char>(fill_v);
+        fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
+                       nelems * sizeof(dstTy));
+        });
+    }
+    else {
+        bool is_zero = false;
+        if constexpr (sizeof(dstTy) == 1) {
+            is_zero = (std::uint8_t{0} == sycl::bit_cast<std::uint8_t>(fill_v));
+        }
+        else if constexpr (sizeof(dstTy) == 2) {
+            is_zero =
+                (std::uint16_t{0} == sycl::bit_cast<std::uint16_t>(fill_v));
+        }
+        else if constexpr (sizeof(dstTy) == 4) {
+            is_zero =
+                (std::uint32_t{0} == sycl::bit_cast<std::uint32_t>(fill_v));
+        }
+        else if constexpr (sizeof(dstTy) == 8) {
+            is_zero =
+                (std::uint64_t{0} == sycl::bit_cast<std::uint64_t>(fill_v));
+        }
+        else if constexpr (sizeof(dstTy) == 16) {
+            struct UInt128
+            {
+
+                constexpr UInt128() : v1{}, v2{} {}
+                UInt128(const UInt128 &) = default;
+
+                operator bool() const { return bool(!v1) && bool(!v2); }
+
+                std::uint64_t v1;
+                std::uint64_t v2;
+            };
+            is_zero = static_cast<bool>(sycl::bit_cast<UInt128>(fill_v));
+        }
+
+        if (is_zero) {
+            constexpr int memset_val = 0;
+            fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(depends);
+
+                cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
+                           nelems * sizeof(dstTy));
+            });
+        }
+        else {
+            using dpctl::tensor::kernels::constructors::full_contig_impl;
+
+            fill_ev =
+                full_contig_impl<dstTy>(exec_q, nelems, fill_v, dst_p, depends);
+        }
+    }

     return fill_ev;
 }
@@ -126,7 +181,6 @@ usm_ndarray_full(const py::object &py_value,
     int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);

     char *dst_data = dst.get_data();
-    sycl::event full_event;

     if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) {
         auto fn = full_contig_dispatch_vector[dst_typeid];
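The `is_zero` chain above compares the object representation (the raw bytes) of `fill_v` against an all-zero pattern, since `memset(0)` can only reproduce values whose bytes are all zero. A minimal NumPy sketch (illustration only, not part of the patch) of why `+0.0` and `0+0j` qualify for the memset fast path while `-0.0` does not:

    import numpy as np

    # +0.0 is represented by all-zero bytes, so memset(0) reproduces it
    assert np.array([0.0], dtype=np.float64).view(np.uint64)[0] == 0
    # -0.0 sets the IEEE-754 sign bit, so the generic fill kernel runs instead
    assert np.array([-0.0], dtype=np.float64).view(np.uint64)[0] == 1 << 63
    # complex128 is two float64s; 0+0j is sixteen zero bytes
    assert not np.array([0j], dtype=np.complex128).view(np.uint64).any()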
diff --git a/dpctl/tensor/libtensor/source/tensor_ctors.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp
index c18220ee6c..854574b69e 100644
--- a/dpctl/tensor/libtensor/source/tensor_ctors.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp
@@ -54,6 +54,7 @@
 #include "utils/memory_overlap.hpp"
 #include "utils/strided_iters.hpp"
 #include "where.hpp"
+#include "zeros_ctor.hpp"

 namespace py = pybind11;

@@ -92,6 +93,10 @@
 using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step;

 using dpctl::tensor::py_internal::usm_ndarray_full;
+/* ================ Zeros ================== */
+
+using dpctl::tensor::py_internal::usm_ndarray_zeros;
+
 /* ============== Advanced Indexing ============= */
 using dpctl::tensor::py_internal::usm_ndarray_put;
 using dpctl::tensor::py_internal::usm_ndarray_take;
@@ -142,6 +147,7 @@ void init_dispatch_vectors(void)
     init_copy_for_roll_dispatch_vectors();
     init_linear_sequences_dispatch_vectors();
     init_full_ctor_dispatch_vectors();
+    init_zeros_ctor_dispatch_vectors();
     init_eye_ctor_dispatch_vectors();
     init_triul_ctor_dispatch_vectors();

@@ -291,6 +297,10 @@ PYBIND11_MODULE(_tensor_impl, m)
           py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());

+    m.def("_zeros_usm_ndarray", &usm_ndarray_zeros,
+          "Populate usm_ndarray `dst` with zeros.", py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
     m.def("_full_usm_ndarray", &usm_ndarray_full,
           "Populate usm_ndarray `dst` with given fill_value.",
           py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"),
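With the binding registered, `_zeros_usm_ndarray` follows the same (host event, task event) return convention as the other `_tensor_impl` constructors. A minimal sketch of driving it directly, mirroring the `_ctors.py` change above (in `dpt.zeros` the event pair is handed to `SequentialOrderManager` instead of being waited on):

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    # allocate uninitialized USM memory, then zero it asynchronously
    x = dpt.usm_ndarray((1024,), dtype="f4", buffer_ctor_kwargs={"queue": q})
    hev, zeros_ev = ti._zeros_usm_ndarray(x, q)  # no dependent events
    hev.wait()  # the host task depends on the fill, so the data is ready
    assert not dpt.asnumpy(x).any()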
diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.cpp b/dpctl/tensor/libtensor/source/zeros_ctor.cpp
new file mode 100644
index 0000000000..e53a572efe
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/zeros_ctor.cpp
@@ -0,0 +1,154 @@
+//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2024 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <complex>
+#include <pybind11/complex.h>
+#include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
+#include <utility>
+#include <vector>
+
+#include "kernels/constructors.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+#include "zeros_ctor.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+using dpctl::utils::keep_args_alive;
+
+typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &,
+                                             size_t,
+                                             char *,
+                                             const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to submit kernel to fill given contiguous memory allocation
+ * with zeros.
+ *
+ * @param exec_q Sycl queue to which kernel is submitted for execution.
+ * @param nelems Length of the sequence
+ * @param dst_p Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename dstTy>
+sycl::event zeros_contig_impl(sycl::queue &exec_q,
+                              size_t nelems,
+                              char *dst_p,
+                              const std::vector<sycl::event> &depends)
+{
+
+    constexpr int memset_val(0);
+    sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
+                   nelems * sizeof(dstTy));
+    });
+
+    return fill_ev;
+}
+
+template <typename fnT, typename dstTy> struct ZerosContigFactory
+{
+    fnT get()
+    {
+        fnT f = zeros_contig_impl<dstTy>;
+        return f;
+    }
+};
+
+static zeros_contig_fn_ptr_t zeros_contig_dispatch_vector[td_ns::num_types];
+
+std::pair<sycl::event, sycl::event>
+usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
+                  sycl::queue &exec_q,
+                  const std::vector<sycl::event> &depends)
+{
+    py::ssize_t dst_nelems = dst.get_size();
+
+    if (dst_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with the allocation queue");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int dst_typenum = dst.get_typenum();
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    char *dst_data = dst.get_data();
+
+    if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) {
+        auto fn = zeros_contig_dispatch_vector[dst_typeid];
+
+        sycl::event zeros_contig_event =
+            fn(exec_q, static_cast<size_t>(dst_nelems), dst_data, depends);
+
+        return std::make_pair(
+            keep_args_alive(exec_q, {dst}, {zeros_contig_event}),
+            zeros_contig_event);
+    }
+    else {
+        throw std::runtime_error(
+            "Only population of contiguous usm_ndarray objects is supported.");
+    }
+}
+
+void init_zeros_ctor_dispatch_vectors(void)
+{
+    using namespace td_ns;
+
+    DispatchVectorBuilder<zeros_contig_fn_ptr_t, ZerosContigFactory, num_types>
+        dvb;
+    dvb.populate_dispatch_vector(zeros_contig_dispatch_vector);
+
+    return;
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.hpp b/dpctl/tensor/libtensor/source/zeros_ctor.hpp
new file mode 100644
index 0000000000..af4a992fd6
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/zeros_ctor.hpp
@@ -0,0 +1,49 @@
+//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2024 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <sycl/sycl.hpp>
+#include <utility>
+#include <vector>
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
+                  sycl::queue &exec_q,
+                  const std::vector<sycl::event> &depends = {});
+
+extern void init_zeros_ctor_dispatch_vectors(void);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
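Note that `usm_ndarray_zeros` only services C- or F-contiguous destinations (which a fresh allocation from `dpt.zeros` always is); a strided view is rejected. A quick check of that error path, assuming pybind11's default translation of `std::runtime_error` to Python `RuntimeError`:

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    x = dpt.empty((8, 8), dtype="i4", sycl_queue=q)
    ti._zeros_usm_ndarray(x, q)[0].wait()  # C-contiguous: memset path
    try:
        ti._zeros_usm_ndarray(x[::2, ::2], q)  # strided view
    except RuntimeError as e:
        print(e)  # "Only population of contiguous usm_ndarray objects ..."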
diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py
index f43f931e47..3e027e5926 100644
--- a/dpctl/tests/test_usm_ndarray_ctor.py
+++ b/dpctl/tests/test_usm_ndarray_ctor.py
@@ -1682,6 +1682,27 @@ def test_full(dtype):
     assert np.array_equal(dpt.asnumpy(X), np.full(10, 4, dtype=dtype))


+def test_full_cmplx128():
+    q = get_queue_or_skip()
+    dtype = "c16"
+    skip_if_dtype_not_supported(dtype, q)
+    fill_v = 1 + 1j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+    fill_v = 0 + 1j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+    fill_v = 0 + 0j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+
+
 def test_full_dtype_inference():
     try:
         X = dpt.full(10, 4)
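The new test exercises `dpt.full` with complex fill values around the memset cutover (nonzero, zero real part, and all-zero). A companion check for `dpt.zeros` itself could look like the following, a hypothetical test written in the style of this file and not part of the patch:

    def test_zeros_matches_numpy():
        q = get_queue_or_skip()
        for dtype in ["i1", "u2", "i4", "f4", "c8"]:
            skip_if_dtype_not_supported(dtype, q)
            X = dpt.zeros((3, 5), dtype=dtype, sycl_queue=q)
            assert np.array_equal(
                dpt.asnumpy(X), np.zeros((3, 5), dtype=dtype)
            )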