From 3f9a5bfba67cd443ce91478a4db3ab1a343150ca Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 21 Aug 2024 17:41:08 -0500
Subject: [PATCH 1/5] Introduce ti._zeros_usm_ndarray(dst, sycl_queue)

This is akin to _full_usm_ndarray, but does not take fill_value,
hence does not require castings. It dispatches straight to
handler::memset.
---
 dpctl/tensor/CMakeLists.txt                   |   1 +
 .../tensor/libtensor/source/tensor_ctors.cpp  |  10 ++
 dpctl/tensor/libtensor/source/zeros_ctor.cpp  | 156 ++++++++++++++++++
 dpctl/tensor/libtensor/source/zeros_ctor.hpp  |  49 ++++++
 4 files changed, 216 insertions(+)
 create mode 100644 dpctl/tensor/libtensor/source/zeros_ctor.cpp
 create mode 100644 dpctl/tensor/libtensor/source/zeros_ctor.hpp

diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
index 7ddea50564..65b96d981e 100644
--- a/dpctl/tensor/CMakeLists.txt
+++ b/dpctl/tensor/CMakeLists.txt
@@ -129,6 +129,7 @@ set(_tensor_impl_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp
diff --git a/dpctl/tensor/libtensor/source/tensor_ctors.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp
index c18220ee6c..854574b69e 100644
--- a/dpctl/tensor/libtensor/source/tensor_ctors.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp
@@ -54,6 +54,7 @@
 #include "utils/memory_overlap.hpp"
 #include "utils/strided_iters.hpp"
 #include "where.hpp"
+#include "zeros_ctor.hpp"
 
 namespace py = pybind11;
 
@@ -92,6 +93,10 @@ using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step;
 
 using dpctl::tensor::py_internal::usm_ndarray_full;
 
+/* ================ Zeros ================== */
+
+using dpctl::tensor::py_internal::usm_ndarray_zeros;
+
 /* ============== Advanced Indexing ============= */
 using dpctl::tensor::py_internal::usm_ndarray_put;
 using dpctl::tensor::py_internal::usm_ndarray_take;
@@ -142,6 +147,7 @@ void init_dispatch_vectors(void)
     init_copy_for_roll_dispatch_vectors();
     init_linear_sequences_dispatch_vectors();
     init_full_ctor_dispatch_vectors();
+    init_zeros_ctor_dispatch_vectors();
     init_eye_ctor_dispatch_vectors();
     init_triul_ctor_dispatch_vectors();
 
@@ -291,6 +297,10 @@ PYBIND11_MODULE(_tensor_impl, m)
           py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 
+    m.def("_zeros_usm_ndarray", &usm_ndarray_zeros,
+          "Populate usm_ndarray `dst` with zeros.", py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
     m.def("_full_usm_ndarray", &usm_ndarray_full,
           "Populate usm_ndarray `dst` with given fill_value.",
           py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"),
diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.cpp b/dpctl/tensor/libtensor/source/zeros_ctor.cpp
new file mode 100644
index 0000000000..cae5a23f3e
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/zeros_ctor.cpp
@@ -0,0 +1,156 @@
+//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2024 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <complex>
+#include <pybind11/complex.h>
+#include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
+#include <utility>
+#include <vector>
+
+#include "kernels/constructors.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+#include "zeros_ctor.hpp"
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+using dpctl::utils::keep_args_alive;
+
+typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &,
+                                             size_t,
+                                             char *,
+                                             const std::vector<sycl::event> &);
+
+/*!
+ * @brief Function to submit kernel to fill given contiguous memory allocation
+ * with zeros.
+ *
+ * @param exec_q  Sycl queue to which kernel is submitted for execution.
+ * @param nelems  Length of the sequence
+ * @param dst_p Kernel accessible USM pointer to the start of array to be
+ * populated.
+ * @param depends  List of events to wait for before starting computations, if
+ * any.
+ *
+ * @return Event to wait on to ensure that computation completes.
+ * @defgroup CtorKernels
+ */
+template <typename dstTy>
+sycl::event zeros_contig_impl(sycl::queue &exec_q,
+                              size_t nelems,
+                              char *dst_p,
+                              const std::vector<sycl::event> &depends)
+{
+
+    constexpr int memset_val(0);
+    sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
+                   nelems * sizeof(dstTy));
+    });
+
+    return fill_ev;
+}
+
+template <typename fnT, typename Ty> struct ZerosContigFactory
+{
+    fnT get()
+    {
+        fnT f = zeros_contig_impl<Ty>;
+        return f;
+    }
+};
+
+static zeros_contig_fn_ptr_t zeros_contig_dispatch_vector[td_ns::num_types];
+
+std::pair<sycl::event, sycl::event>
+usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
+                  sycl::queue &exec_q,
+                  const std::vector<sycl::event> &depends)
+{
+    // start, end should be coercible into data type of dst
+
+    py::ssize_t dst_nelems = dst.get_size();
+
+    if (dst_nelems == 0) {
+        // nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with the allocation queue");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int dst_typenum = dst.get_typenum();
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    char *dst_data = dst.get_data();
+
+    if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) {
+        auto fn = zeros_contig_dispatch_vector[dst_typeid];
+
+        sycl::event zeros_contig_event =
+            fn(exec_q, static_cast<size_t>(dst_nelems), dst_data, depends);
+
+        return std::make_pair(
+            keep_args_alive(exec_q, {dst}, {zeros_contig_event}),
+            zeros_contig_event);
+    }
+    else {
+        throw std::runtime_error(
+            "Only population of contiguous usm_ndarray objects is supported.");
+    }
+}
+
+void init_zeros_ctor_dispatch_vectors(void)
+{
+    using namespace td_ns;
+
+    DispatchVectorBuilder<zeros_contig_fn_ptr_t, ZerosContigFactory, num_types>
+        dvb;
+    dvb.populate_dispatch_vector(zeros_contig_dispatch_vector);
+
+    return;
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.hpp b/dpctl/tensor/libtensor/source/zeros_ctor.hpp
new file mode 100644
index 0000000000..af4a992fd6
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/zeros_ctor.hpp
@@ -0,0 +1,49 @@
+//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2024 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <sycl/sycl.hpp>
+#include <utility>
+#include <vector>
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
+                  sycl::queue &exec_q,
+                  const std::vector<sycl::event> &depends = {});
+
+extern void init_zeros_ctor_dispatch_vectors(void);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl

From bec95f98242e6803af3fcd42fbc372ef4ff0ace3 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 21 Aug 2024 17:42:30 -0500
Subject: [PATCH 2/5] Use ti._zeros_usm_ndarray in dpctl.tensor.zeros

---
 dpctl/tensor/_ctors.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/dpctl/tensor/_ctors.py b/dpctl/tensor/_ctors.py
index 90a6208c87..e95280523e 100644
--- a/dpctl/tensor/_ctors.py
+++ b/dpctl/tensor/_ctors.py
@@ -945,8 +945,11 @@ def zeros(
         order=order,
         buffer_ctor_kwargs={"queue": sycl_queue},
     )
-    # FIXME: replace with asynchronous call to ti
-    res.usm_data.memset()
+    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
+    # populating new allocation, no dependent events
+    hev, zeros_ev = ti._zeros_usm_ndarray(res, sycl_queue)
+    _manager.add_event_pair(hev, zeros_ev)
+
     return res
 
 

From 2e73a6c697386d9446bdb3ce7b4471a4e9cd9147 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 21 Aug 2024 17:49:32 -0500
Subject: [PATCH 3/5] Added special case for _full_usm_ndarray
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bitwise zero values, and 1-byte wide types now use memset, instead
of using fill.

```
In [1]: import dpctl.tensor as dpt, dpctl.tensor._tensor_impl as ti

In [2]: res = dpt.empty(10**6, dtype="i8")

In [3]: %timeit -n 2000 -r 11 ti._full_usm_ndarray(0, dst=res, sycl_queue=res.sycl_queue)[0].wait()
243 µs ± 22.6 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each)

In [4]: %timeit -n 2000 -r 11 ti._full_usm_ndarray(0, dst=res, sycl_queue=res.sycl_queue)[0].wait()
229 µs ± 14 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each)

In [5]: %timeit -n 2000 -r 11 ti._zeros_usm_ndarray(dst=res, sycl_queue=res.sycl_queue)[0].wait()
227 µs ± 23 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each)

In [6]: %timeit -n 2000 -r 11 ti._zeros_usm_ndarray(dst=res, sycl_queue=res.sycl_queue)[0].wait()
233 µs ± 25.9 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each)

In [7]: %timeit -n 2000 -r 11 ti._zeros_usm_ndarray(dst=res, sycl_queue=res.sycl_queue)[0].wait()
301 µs ± 54.1 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each)

In [8]: %timeit -n 2000 -r 11 ti._zeros_usm_ndarray(dst=res, sycl_queue=res.sycl_queue)[0].wait()
236 µs ± 17.2 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each)

In [9]: %timeit -n 2000 -r 11 ti._full_usm_ndarray(0, dst=res, sycl_queue=res.sycl_queue)[0].wait()
240 µs ± 35.2 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each)

In [10]: %timeit -n 2000 -r 11 ti._full_usm_ndarray(1, dst=res, sycl_queue=res.sycl_queue)[0].wait()
243 µs ± 17.6 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each)

In [11]: %timeit -n 2000 -r 11 ti._full_usm_ndarray(1, dst=res, sycl_queue=res.sycl_queue)[0].wait()
263 µs ± 39.9 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each)

In [12]: %timeit -n 2000 -r 11 ti._full_usm_ndarray(0, dst=res, sycl_queue=res.sycl_queue)[0].wait()
239 µs ± 26.4 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each)

In [13]: %timeit -n 2000 -r 11 ti._zeros_usm_ndarray(dst=res, sycl_queue=res.sycl_queue)[0].wait()
224 µs ± 18.1 µs per loop (mean ± std. dev. of 11 runs, 2,000 loops each)
```
---
 dpctl/tensor/libtensor/source/full_ctor.cpp | 62 +++++++++++++++++++--
 1 file changed, 58 insertions(+), 4 deletions(-)

diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp
index 41b3093652..26029e028a 100644
--- a/dpctl/tensor/libtensor/source/full_ctor.cpp
+++ b/dpctl/tensor/libtensor/source/full_ctor.cpp
@@ -80,10 +80,65 @@ sycl::event full_contig_impl(sycl::queue &exec_q,
 {
     dstTy fill_v = py::cast<dstTy>(py_value);
 
-    using dpctl::tensor::kernels::constructors::full_contig_impl;
+    sycl::event fill_ev;
 
-    sycl::event fill_ev =
-        full_contig_impl<dstTy>(exec_q, nelems, fill_v, dst_p, depends);
+    if constexpr (sizeof(dstTy) == sizeof(char)) {
+        const auto memset_val = sycl::bit_cast<unsigned char>(fill_v);
+        fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
+                       nelems * sizeof(dstTy));
+        });
+    }
+    else {
+        bool is_zero = false;
+        if constexpr (sizeof(dstTy) == 1) {
+            is_zero = (std::uint8_t{0} == sycl::bit_cast<std::uint8_t>(fill_v));
+        }
+        else if constexpr (sizeof(dstTy) == 2) {
+            is_zero =
+                (std::uint16_t{0} == sycl::bit_cast<std::uint16_t>(fill_v));
+        }
+        else if constexpr (sizeof(dstTy) == 4) {
+            is_zero =
+                (std::uint32_t{0} == sycl::bit_cast<std::uint32_t>(fill_v));
+        }
+        else if constexpr (sizeof(dstTy) == 8) {
+            is_zero =
+                (std::uint64_t{0} == sycl::bit_cast<std::uint64_t>(fill_v));
+        }
+        else if constexpr (sizeof(dstTy) == 16) {
+            struct UInt128
+            {
+
+                constexpr UInt128() : v1{}, v2{} {}
+                UInt128(const UInt128 &) = default;
+
+                operator bool() const { return bool(!v1) && bool(!v2); }
+
+                std::uint64_t v1;
+                std::uint64_t v2;
+            };
+            is_zero = static_cast<bool>(sycl::bit_cast<UInt128>(fill_v));
+        }
+
+        if (is_zero) {
+            constexpr int memset_val = 0;
+            fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(depends);
+
+                cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
+                           nelems * sizeof(dstTy));
+            });
+        }
+        else {
+            using dpctl::tensor::kernels::constructors::full_contig_impl;
+
+            fill_ev =
+                full_contig_impl<dstTy>(exec_q, nelems, fill_v, dst_p, depends);
+        }
+    }
 
     return fill_ev;
 }
@@ -126,7 +181,6 @@ usm_ndarray_full(const py::object &py_value,
     int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
 
     char *dst_data = dst.get_data();
-    sycl::event full_event;
 
     if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) {
         auto fn = full_contig_dispatch_vector[dst_typeid];

From f0d926a273631e83ea3a6c7640cf834db0907f7b Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 22 Aug 2024 06:52:09 -0500
Subject: [PATCH 4/5] Add test_full_cmplx128

---
 dpctl/tests/test_usm_ndarray_ctor.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py
index f43f931e47..3e027e5926 100644
--- a/dpctl/tests/test_usm_ndarray_ctor.py
+++ b/dpctl/tests/test_usm_ndarray_ctor.py
@@ -1682,6 +1682,27 @@ def test_full(dtype):
     assert np.array_equal(dpt.asnumpy(X), np.full(10, 4, dtype=dtype))
 
 
+def test_full_cmplx128():
+    q = get_queue_or_skip()
+    dtype = "c16"
+    skip_if_dtype_not_supported(dtype, q)
+    fill_v = 1 + 1j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+    fill_v = 0 + 1j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+    fill_v = 0 + 0j
+    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
+    assert np.array_equal(
+        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
+    )
+
+
 def test_full_dtype_inference():
     try:
         X = dpt.full(10, 4)

From 640e7061d7e550fd5c0aa582eb38d60710be8e70 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 22 Aug 2024 13:04:32 -0500
Subject: [PATCH 5/5] Update dpctl/tensor/libtensor/source/zeros_ctor.cpp

Co-authored-by: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com>
---
 dpctl/tensor/libtensor/source/zeros_ctor.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.cpp b/dpctl/tensor/libtensor/source/zeros_ctor.cpp
index cae5a23f3e..e53a572efe 100644
--- a/dpctl/tensor/libtensor/source/zeros_ctor.cpp
+++ b/dpctl/tensor/libtensor/source/zeros_ctor.cpp
@@ -102,8 +102,6 @@ usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
                   sycl::queue &exec_q,
                   const std::vector<sycl::event> &depends)
 {
-    // start, end should be coercible into data type of dst
-
     py::ssize_t dst_nelems = dst.get_size();
 
     if (dst_nelems == 0) {