From 3f9a5bfba67cd443ce91478a4db3ab1a343150ca Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 21 Aug 2024 17:41:08 -0500 Subject: [PATCH 1/5] Introduce ti._zeros_usm_ndarray(dst, sycl_queue) This is akin to _full_usm_ndarray, but does not take fill_value, hence does not require castings. It dispatches straight to handler::memset. --- dpctl/tensor/CMakeLists.txt | 1 + .../tensor/libtensor/source/tensor_ctors.cpp | 10 ++ dpctl/tensor/libtensor/source/zeros_ctor.cpp | 156 ++++++++++++++++++ dpctl/tensor/libtensor/source/zeros_ctor.hpp | 49 ++++++ 4 files changed, 216 insertions(+) create mode 100644 dpctl/tensor/libtensor/source/zeros_ctor.cpp create mode 100644 dpctl/tensor/libtensor/source/zeros_ctor.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 7ddea50564..65b96d981e 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -129,6 +129,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp diff --git a/dpctl/tensor/libtensor/source/tensor_ctors.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp index c18220ee6c..854574b69e 100644 --- a/dpctl/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp @@ -54,6 +54,7 @@ #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" #include "where.hpp" +#include "zeros_ctor.hpp" namespace py = pybind11; @@ -92,6 +93,10 @@ using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; using dpctl::tensor::py_internal::usm_ndarray_full; +/* ================ Zeros ================== */ + +using dpctl::tensor::py_internal::usm_ndarray_zeros; + /* ============== Advanced Indexing ============= */ using dpctl::tensor::py_internal::usm_ndarray_put; using dpctl::tensor::py_internal::usm_ndarray_take; @@ -142,6 +147,7 @@ void init_dispatch_vectors(void) init_copy_for_roll_dispatch_vectors(); init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); + init_zeros_ctor_dispatch_vectors(); init_eye_ctor_dispatch_vectors(); init_triul_ctor_dispatch_vectors(); @@ -291,6 +297,10 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, + "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_full_usm_ndarray", &usm_ndarray_full, "Populate usm_ndarray `dst` with given fill_value.", py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.cpp b/dpctl/tensor/libtensor/source/zeros_ctor.cpp new file mode 100644 index 0000000000..cae5a23f3e --- /dev/null +++ b/dpctl/tensor/libtensor/source/zeros_ctor.cpp @@ -0,0 +1,156 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "zeros_ctor.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &, + size_t, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with zeros. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event zeros_contig_impl(sycl::queue &exec_q, + size_t nelems, + char *dst_p, + const std::vector &depends) +{ + + constexpr int memset_val(0); + sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + + return fill_ev; +} + +template struct ZerosContigFactory +{ + fnT get() + { + fnT f = zeros_contig_impl; + return f; + } +}; + +static zeros_contig_fn_ptr_t zeros_contig_dispatch_vector[td_ns::num_types]; + +std::pair +usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // start, end should be coercible into data type of dst + + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = zeros_contig_dispatch_vector[dst_typeid]; + + sycl::event zeros_contig_event = + fn(exec_q, static_cast(dst_nelems), dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {zeros_contig_event}), + zeros_contig_event); + } + else { + throw std::runtime_error( + "Only population of contiguous usm_ndarray objects is supported."); + } +} + +void init_zeros_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(zeros_contig_dispatch_vector); + + return; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.hpp b/dpctl/tensor/libtensor/source/zeros_ctor.hpp new file mode 100644 index 0000000000..af4a992fd6 --- /dev/null +++ b/dpctl/tensor/libtensor/source/zeros_ctor.hpp @@ -0,0 +1,49 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_zeros_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From bec95f98242e6803af3fcd42fbc372ef4ff0ace3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 21 Aug 2024 17:42:30 -0500 Subject: [PATCH 2/5] Use ti._zeros_usm_ndarray in dpctl.tensor.zeros --- dpctl/tensor/_ctors.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/_ctors.py b/dpctl/tensor/_ctors.py index 90a6208c87..e95280523e 100644 --- a/dpctl/tensor/_ctors.py +++ b/dpctl/tensor/_ctors.py @@ -945,8 +945,11 @@ def zeros( order=order, buffer_ctor_kwargs={"queue": sycl_queue}, ) - # FIXME: replace with asynchronous call to ti - res.usm_data.memset() + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, zeros_ev = ti._zeros_usm_ndarray(res, sycl_queue) + _manager.add_event_pair(hev, zeros_ev) + return res From 2e73a6c697386d9446bdb3ce7b4471a4e9cd9147 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 21 Aug 2024 17:49:32 -0500 Subject: [PATCH 3/5] Added special case for _full_usm_ndarray MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bitwise zero values, and 1-byte wide types now use memset, instead of using fill. ``` In [1]: import dpctl.tensor as dpt, dpctl.tensor._tensor_impl as ti In [2]: res = dpt.empty(10**6, dtype="i8") In [3]: %timeit -n 2000 -r 11 ti._full_usm_ndarray(0, dst=res, sycl_queue=res.sycl_queue)[0].wait() 243 µs ± 22.6 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each) In [4]: %timeit -n 2000 -r 11 ti._full_usm_ndarray(0, dst=res, sycl_queue=res.sycl_queue)[0].wait() 229 µs ± 14 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each) In [5]: %timeit -n 2000 -r 11 ti._zeros_usm_ndarray(dst=res, sycl_queue=res.sycl_queue)[0].wait() 227 µs ± 23 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each) In [6]: %timeit -n 2000 -r 11 ti._zeros_usm_ndarray(dst=res, sycl_queue=res.sycl_queue)[0].wait() 233 µs ± 25.9 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each) In [7]: %timeit -n 2000 -r 11 ti._zeros_usm_ndarray(dst=res, sycl_queue=res.sycl_queue)[0].wait() 301 µs ± 54.1 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each) In [8]: %timeit -n 2000 -r 11 ti._zeros_usm_ndarray(dst=res, sycl_queue=res.sycl_queue)[0].wait() 236 µs ± 17.2 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each) In [9]: %timeit -n 2000 -r 11 ti._full_usm_ndarray(0, dst=res, sycl_queue=res.sycl_queue)[0].wait() 240 µs ± 35.2 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each) In [10]: %timeit -n 2000 -r 11 ti._full_usm_ndarray(1, dst=res, sycl_queue=res.sycl_queue)[0].wait() 243 µs ± 17.6 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each) In [11]: %timeit -n 2000 -r 11 ti._full_usm_ndarray(1, dst=res, sycl_queue=res.sycl_queue)[0].wait() 263 µs ± 39.9 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each) In [12]: %timeit -n 2000 -r 11 ti._full_usm_ndarray(0, dst=res, sycl_queue=res.sycl_queue)[0].wait() 239 µs ± 26.4 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each) In [13]: %timeit -n 2000 -r 11 ti._zeros_usm_ndarray(dst=res, sycl_queue=res.sycl_queue)[0].wait() 224 µs ± 18.1 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each) ``` --- dpctl/tensor/libtensor/source/full_ctor.cpp | 62 +++++++++++++++++++-- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp index 41b3093652..26029e028a 100644 --- a/dpctl/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl/tensor/libtensor/source/full_ctor.cpp @@ -80,10 +80,65 @@ sycl::event full_contig_impl(sycl::queue &exec_q, { dstTy fill_v = py::cast(py_value); - using dpctl::tensor::kernels::constructors::full_contig_impl; + sycl::event fill_ev; - sycl::event fill_ev = - full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); + if constexpr (sizeof(dstTy) == sizeof(char)) { + const auto memset_val = sycl::bit_cast(fill_v); + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + bool is_zero = false; + if constexpr (sizeof(dstTy) == 1) { + is_zero = (std::uint8_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 2) { + is_zero = + (std::uint16_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 4) { + is_zero = + (std::uint32_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 8) { + is_zero = + (std::uint64_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 16) { + struct UInt128 + { + + constexpr UInt128() : v1{}, v2{} {} + UInt128(const UInt128 &) = default; + + operator bool() const { return bool(!v1) && bool(!v2); } + + std::uint64_t v1; + std::uint64_t v2; + }; + is_zero = static_cast(sycl::bit_cast(fill_v)); + } + + if (is_zero) { + constexpr int memset_val = 0; + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + using dpctl::tensor::kernels::constructors::full_contig_impl; + + fill_ev = + full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); + } + } return fill_ev; } @@ -126,7 +181,6 @@ usm_ndarray_full(const py::object &py_value, int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); char *dst_data = dst.get_data(); - sycl::event full_event; if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { auto fn = full_contig_dispatch_vector[dst_typeid]; From f0d926a273631e83ea3a6c7640cf834db0907f7b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 22 Aug 2024 06:52:09 -0500 Subject: [PATCH 4/5] Add test_full_cmplx128 --- dpctl/tests/test_usm_ndarray_ctor.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py index f43f931e47..3e027e5926 100644 --- a/dpctl/tests/test_usm_ndarray_ctor.py +++ b/dpctl/tests/test_usm_ndarray_ctor.py @@ -1682,6 +1682,27 @@ def test_full(dtype): assert np.array_equal(dpt.asnumpy(X), np.full(10, 4, dtype=dtype)) +def test_full_cmplx128(): + q = get_queue_or_skip() + dtype = "c16" + skip_if_dtype_not_supported(dtype, q) + fill_v = 1 + 1j + X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q) + assert np.array_equal( + dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype) + ) + fill_v = 0 + 1j + X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q) + assert np.array_equal( + dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype) + ) + fill_v = 0 + 0j + X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q) + assert np.array_equal( + dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype) + ) + + def test_full_dtype_inference(): try: X = dpt.full(10, 4) From 640e7061d7e550fd5c0aa582eb38d60710be8e70 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 22 Aug 2024 13:04:32 -0500 Subject: [PATCH 5/5] Update dpctl/tensor/libtensor/source/zeros_ctor.cpp Co-authored-by: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> --- dpctl/tensor/libtensor/source/zeros_ctor.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.cpp b/dpctl/tensor/libtensor/source/zeros_ctor.cpp index cae5a23f3e..e53a572efe 100644 --- a/dpctl/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl/tensor/libtensor/source/zeros_ctor.cpp @@ -102,8 +102,6 @@ usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector &depends) { - // start, end should be coercible into data type of dst - py::ssize_t dst_nelems = dst.get_size(); if (dst_nelems == 0) {