diff --git a/CHANGELOG.md b/CHANGELOG.md index b849718807..83bab58ac9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Change -* Improved efficiency of copy-and-cast operations from `numpy.ndarray` to `tensor.usm_ndarray` for contiguous inputs [gh-1829](https://github.com/IntelPython/dpctl/pull/1829) +* Improved performance of copy-and-cast operations from `numpy.ndarray` to `tensor.usm_ndarray` for contiguous inputs [gh-1829](https://github.com/IntelPython/dpctl/pull/1829) +* Improved performance of copy operations to C-/F-contiguous arrays, with an optimization for batches of square matrices [gh-1850](https://github.com/IntelPython/dpctl/pull/1850) ### Fixed diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 797815cfed..31d4eba03d 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -123,6 +123,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp diff --git a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp new file mode 100644 index 0000000000..b8f46e0c3b --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -0,0 +1,580 @@ +//=== copy_as_contiguous.hpp - Implementation of copy-as-contiguous kernels -*-C++-*-===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for copying tensor content into a contiguous layout.
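+///
+/// Two kernel families are provided: a generic strided-to-contiguous copy
+/// (CopyAsCContigFunctor, as_c_contiguous_array_generic_impl) and a tiled
+/// kernel for batches of square matrices with a contiguous leading dimension
+/// (as_c_contiguous_batch_of_square_matrices_impl and its 1D/ND batch
+/// wrappers).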
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_as_contig +{ + +template +class CopyAsCContigFunctor +{ +private: + const size_t nelems; + const T *src_p = nullptr; + T *dst_p = nullptr; + IndexerT src_indexer; + +public: + CopyAsCContigFunctor(size_t n, + const T *src_, + T *dst_, + const IndexerT &src_indexer_) + : nelems(n), src_p(src_), dst_p(dst_), src_indexer(src_indexer_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + using dpctl::tensor::type_utils::is_complex; + if constexpr (!enable_sg_loadstore || is_complex::value) { + const std::uint32_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + const std::size_t base = + (gid / sgSize) * sgSize * n_vecs * vec_sz + (gid % sgSize); + for (size_t offset = base; + offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz)); + offset += sgSize) + { + auto src_offset = src_indexer(offset); + dst_p[offset] = src_p[src_offset]; + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint32_t sgSize = sg.get_local_range()[0]; + const size_t base = n_vecs * vec_sz * + (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + n_vecs * vec_sz * sgSize < nelems) { + sycl::vec dst_vec; + +#pragma unroll + for (std::uint32_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>( + &dst_p[base + it * sgSize]); + +#pragma unroll + for (std::uint32_t k = 0; k < vec_sz; k++) { + ssize_t src_offset = src_indexer( + base + (it + k) * sgSize + sg.get_local_id()); + dst_vec[k] = src_p[src_offset]; + } + sg.store(dst_multi_ptr, dst_vec); + } + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < nelems; + k += sgSize) + { + ssize_t src_offset = src_indexer(k); + dst_p[k] = src_p[src_offset]; + } + } + } + } +}; + +template +class as_contig_krn; + +template +sycl::event +as_c_contiguous_array_generic_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides); + + constexpr std::size_t preferred_lws = 256; + constexpr std::uint32_t n_vecs = 2; + constexpr std::uint32_t vec_sz = 4; + constexpr bool enable_sg_load = true; + using KernelName = + as_contig_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const std::size_t lws = + ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size; + + constexpr std::uint32_t nelems_per_wi = n_vecs * vec_sz; + size_t n_groups = + (nelems + nelems_per_wi * lws - 1) / 
(nelems_per_wi * lws); + + sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const sycl::range<1> gRange{n_groups * lws}; + const sycl::range<1> lRange{lws}; + + cgh.parallel_for( + sycl::nd_range<1>(gRange, lRange), + CopyAsCContigFunctor( + nelems, src_tp, dst_tp, src_indexer)); + }); + + return copy_ev; +} + +typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)( + sycl::queue &, + size_t, + int, + const ssize_t *, + const char *, + char *, + const std::vector &); + +template struct AsCContigFactory +{ + fnT get() { return as_c_contiguous_array_generic_impl; } +}; + +template +class as_contig_batch_of_square_matrices_krn; + +namespace detail +{ +/*! @brief batch of matrices (n, n), source strides (1, src_ld), destination + strides (dst_ld, 1) src and destination arrays must be disjoint memory blocks + to avoid race condition + */ +template +sycl::event as_c_contiguous_batch_of_square_matrices_impl( + sycl::queue &exec_q, + size_t batch_nelems, + const BatchIndexerT &batch_two_offsets_indexer, + size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + constexpr std::uint32_t private_tile_size = 4; + constexpr std::uint32_t n_lines = 2; + constexpr std::uint32_t block_size = + n_lines * private_tile_size * private_tile_size; + + constexpr std::uint32_t lws0 = block_size; + constexpr std::uint32_t lws1 = n_lines; + constexpr std::uint32_t nelems_per_wi = (block_size / lws1); + + static_assert(nelems_per_wi * lws1 == block_size); + static_assert(nelems_per_wi == private_tile_size * private_tile_size); + + constexpr std::uint32_t lws = lws0 * lws1; + + const std::size_t n_tiles = (n + block_size - 1) / block_size; + + const ssize_t src_stride = src_ld; + const ssize_t dst_stride = dst_ld; + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{batch_nelems * n_tiles * n_tiles * lws}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + using KernelName = + as_contig_batch_of_square_matrices_krn; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::local_accessor local_block(block_size * block_size, cgh); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> nd_it) { + // 1. 
Read block from source array into SLM + const std::uint32_t lid_lin = nd_it.get_local_linear_id(); + const size_t gr_id_lin = nd_it.get_group_linear_id(); + + const std::size_t batch_id = gr_id_lin / (n_tiles * n_tiles); + const std::size_t rem = gr_id_lin - batch_id * (n_tiles * n_tiles); + + const auto &batch_two_offsets = batch_two_offsets_indexer(batch_id); + const auto &src_batch_offset = batch_two_offsets.get_first_offset(); + const auto &dst_batch_offset = + batch_two_offsets.get_second_offset(); + + // Block id + /* 0 <= src_gr_i1 < n_groups_n1 */ + const std::size_t src_tile_i1 = rem / n_tiles; + /* 0 <= src_gr_i0 < n_groups_n0 */ + const std::size_t src_tile_i0 = rem - src_tile_i1 * n_tiles; + + // ID of element within the block + /* 0 <= src_i1 < lws1 */ + const std::uint32_t src_i1 = lid_lin / lws0; + /* 0 <= src_i0 < lws0 */ + const std::uint32_t src_i0 = lid_lin - src_i1 * lws0; + + // Matrix element ID + const std::size_t src_tile_start0 = src_tile_i0 * block_size; + const std::size_t src_tile_start1 = src_tile_i1 * block_size; + const std::size_t src_gid0 = (src_tile_start0 + src_i0); + const std::size_t src_gid1 = (src_tile_start1 + src_i1); + + // src_offset = src_gid0 * 1 + (src_gid1 + pr_id * lws1) * + // src_stride + const std::size_t src_offset0 = + src_batch_offset + src_gid0 * 1 + src_gid1 * src_stride; + const std::size_t pr_step_src = lws1 * src_stride; + + const std::uint32_t local_offset0 = src_i0 + src_i1 * block_size; + const std::uint32_t pr_step_local = lws1 * block_size; + + for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + local_block[local_offset0 + pr_step_local * pr_id] = + (src_gid0 < n && src_gid1 + pr_id * lws1 < n) + ? src_tp[src_offset0 + pr_step_src * pr_id] + : T(0); + } + + const std::uint32_t local_dim0 = static_cast( + std::min(src_tile_start0 + block_size, n) - + src_tile_start0); + const std::uint32_t local_dim1 = static_cast( + std::min(src_tile_start1 + block_size, n) - + src_tile_start1); + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 2. 
Permute the block matrix in SLM using two private arrays + std::array private_block_01 = {T(0)}; + std::array private_block_10 = {T(0)}; + + // 0 <= lid_lin < lws0 * lws1 == (block_size * block_size / + // nelems_per_wi) == (block_size/private_tile_size)**2 + constexpr std::uint32_t n_private_tiles_per_axis = + block_size / private_tile_size; + const std::uint32_t local_tile_id0 = + lid_lin / n_private_tiles_per_axis; + const std::uint32_t local_tile_id1 = + lid_lin - local_tile_id0 * n_private_tiles_per_axis; + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint32_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) + { + for (std::uint32_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) + { + const std::uint32_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint32_t t1_offset = + local_tile_id1 * private_tile_size; + + const std::uint32_t pr_offset = + pr_i1 * private_tile_size + pr_i0; + const std::uint32_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // read (local_tile_id0, local_tile_id1) + const std::uint32_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + private_block_01[pr_offset] = + local_block[local_01_offset]; + + // read (local_tile_id1, local_tile_id0) + const std::uint32_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + private_block_10[pr_offset] = + local_block[local_10_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint32_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) + { + for (std::uint32_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) + { + const std::uint32_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint32_t t1_offset = + local_tile_id1 * private_tile_size; + const std::uint32_t pr_offset = + pr_i0 * private_tile_size + pr_i1; + + const std::uint32_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // write back permuted private blocks + const std::uint32_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + local_block[local_01_offset] = + private_block_10[pr_offset]; + + const std::uint32_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + local_block[local_10_offset] = + private_block_01[pr_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 3. 
Write out permuted SLM to destination array + + const std::size_t dst_tile_start0 = src_tile_start0; + const std::size_t dst_tile_start1 = src_tile_start1; + + if (local_dim0 == block_size && local_dim1 == block_size) { + const std::uint32_t dst_i0 = src_i1; + const std::uint32_t dst_i1 = src_i0; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset0 = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::size_t pr_step_dst = lws1 * dst_stride; + + const std::uint32_t _local_offset0 = + dst_i0 * block_size + dst_i1; + const std::uint32_t _pr_step_local = lws1 * block_size; + + for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + if ((dst_gid1 < n) && ((dst_gid0 + pr_id * lws1) < n)) { + dst_tp[dst_offset0 + pr_step_dst * pr_id] = + local_block[_local_offset0 + + _pr_step_local * pr_id]; + } + } + } + else { + // map local_linear_id into (local_dim0, local_dim1) + for (std::uint32_t el_id = lid_lin; + el_id < local_dim0 * local_dim1; el_id += lws0 * lws1) + { + + // 0 <= local_i0 < local_dim0 + const std::uint32_t loc_i0 = el_id / local_dim1; + // 0 <= local_i1 < local_dim1 + const std::uint32_t loc_i1 = el_id - loc_i0 * local_dim1; + + const std::uint32_t dst_i0 = loc_i0; + const std::uint32_t dst_i1 = loc_i1; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::uint32_t local_offset = + loc_i0 * block_size + loc_i1; + + if ((dst_gid1 < n) && (dst_gid0 < n)) { + dst_tp[dst_offset] = local_block[local_offset]; + } + } + } + }); + }); + + return e; +} + +} // end of namespace detail + +template +sycl::event as_c_contiguous_1d_batch_of_square_matrices_impl( + sycl::queue &exec_q, + size_t batch_nelems, + ssize_t src_batch_step, + ssize_t dst_batch_step, + size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = + TwoOffsets_CombinedIndexer; + + const auto &src_batch_indexer = + Strided1DIndexer(batch_nelems, src_batch_step); + const auto &dst_batch_indexer = + Strided1DIndexer(batch_nelems, dst_batch_step); + + const BatchIndexerT batch_two_indexer{src_batch_indexer, dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_indexer, n, src_p, src_ld, dst_p, + dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + size_t, /* number of batch elements */ + ssize_t, /* distance between batches in source array */ + ssize_t, /* distance between batches in destination array */ + size_t, /* size of square matrices in the batch */ + const char *, + ssize_t, /* untyped pointer to F-contig source array, and matrix leading + dimension */ + char *, + ssize_t, /* untyped pointer to C-contig destination array, and matrix + leading dimension */ + const std::vector &); + +template +struct AsCContig1DBatchOfSquareMatricesFactory +{ + fnT get() { return as_c_contiguous_1d_batch_of_square_matrices_impl; } +}; + +template +sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl( + sycl::queue &exec_q, + size_t batch_nelems, + int batch_nd, + const ssize_t 
*src_batch_shape_strides, + const ssize_t dst_batch_step, + size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using SrcIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using DstIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = TwoOffsets_CombinedIndexer; + + constexpr ssize_t zero_offset{0}; + + const SrcIndexerT src_batch_indexer{batch_nd, zero_offset, + src_batch_shape_strides}; + const DstIndexerT dst_batch_indexer{/* size */ batch_nelems, + /* step */ dst_batch_step}; + + const BatchIndexerT batch_two_offsets_indexer{src_batch_indexer, + dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_offsets_indexer, n, src_p, src_ld, + dst_p, dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + size_t, /* number of matrices in the batch */ + int, + const ssize_t *, /* dimensionality, and packed [shape, src_strides] + describing iteration over batch in source array */ + ssize_t, /* distance between batches in destination array */ + size_t, /* matrix size */ + const char *, + ssize_t, /* untyped pointer to source array of F-contig matrices, and + leading dimension of the matrix */ + char *, + ssize_t, /* untyped pointer to destination array of F-contig matrices, and + leading dimension of the matrix */ + const std::vector &); + +template +struct AsCContigNDBatchOfSquareMatricesFactory +{ + fnT get() { return as_c_contiguous_nd_batch_of_square_matrices_impl; } +}; + +} // namespace copy_as_contig +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp index 801dbe72f8..6b10851869 100644 --- a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp +++ b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -35,6 +35,7 @@ #include #include "dpctl4pybind11.hpp" + #include "kernels/copy_and_cast.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" @@ -43,6 +44,7 @@ #include "utils/type_dispatch.hpp" #include "utils/type_utils.hpp" +#include "copy_as_contig.hpp" #include "simplify_iteration_space.hpp" namespace dpctl @@ -90,7 +92,7 @@ copy_usm_ndarray_into_usm_ndarray(const dpctl::tensor::usm_ndarray &src, bool shapes_equal(true); size_t src_nelems(1); - for (int i = 0; i < src_nd; ++i) { + for (int i = 0; shapes_equal && (i < src_nd); ++i) { src_nelems *= static_cast(src_shape[i]); shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); } @@ -162,6 +164,15 @@ copy_usm_ndarray_into_usm_ndarray(const dpctl::tensor::usm_ndarray &src, copy_ev); } + if ((src_type_id == dst_type_id) && (src_nd > 1)) { + if (is_dst_c_contig) { + return py_as_c_contig(src, dst, exec_q, depends); + } + else if (is_dst_f_contig) { + return py_as_f_contig(src, dst, exec_q, depends); + } + } + auto const &src_strides = src.get_strides_vector(); auto const &dst_strides = dst.get_strides_vector(); diff --git a/dpctl/tensor/libtensor/source/copy_as_contig.cpp b/dpctl/tensor/libtensor/source/copy_as_contig.cpp new file mode 100644 index 0000000000..1a186d6ce7 --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_as_contig.cpp @@ -0,0 +1,776 @@ +//==- copy_ascontig.cpp - Implementation of _tensor_impl module 
-*-C++-*-/==// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "kernels/copy_as_contiguous.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_array_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +static as_c_contiguous_array_impl_fn_ptr_t + as_c_contig_array_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_1d_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +void init_copy_as_contig_dispatch_vectors(void) +{ + + using dpctl::tensor::kernels::copy_as_contig:: + AsCContig1DBatchOfSquareMatricesFactory; + using dpctl::tensor::kernels::copy_as_contig::AsCContigFactory; + using dpctl::tensor::kernels::copy_as_contig:: + AsCContigNDBatchOfSquareMatricesFactory; + using td_ns::DispatchVectorBuilder; + + // Generic to c-contig + DispatchVectorBuilder + dtv_as_c_contig_array; + + dtv_as_c_contig_array.populate_dispatch_vector( + as_c_contig_array_dispatch_vector); + + // 1D batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t, + AsCContig1DBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_1d_batch_of_square_matrices; + + dtv_as_c_contig_1d_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_1d_batch_of_square_matrices_dispatch_vector); + + // ND batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t, + AsCContigNDBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_nd_batch_of_square_matrices; + + dtv_as_c_contig_nd_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_nd_batch_of_square_matrices_dispatch_vector); +} + +namespace +{ + +template std::size_t get_nelems(const std::vector 
&shape) +{ + auto mult_fn = [](std::size_t prod, const dimT &term) -> std::size_t { + return prod * static_cast(term); + }; + + constexpr std::size_t unit{1}; + + const std::size_t nelems = + std::accumulate(std::begin(shape), std::end(shape), unit, mult_fn); + return nelems; +} + +} // end of anonymous namespace + +std::pair +py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair +py_as_c_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. + */ + const int src_nd = src.get_ndim(); + const int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.back(); + if (n == dst_shape_vec[src_nd - 2]) { + constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[src_nd - 2] == unit_stride) { + return py_as_c_contig_f2c(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + const auto &ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + + py::ssize_t *shape_stride = std::get<0>(ptr_size_event_tuple); + if (shape_stride == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + const sycl::event 
©_shape_ev = std::get<2>(ptr_size_event_tuple); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &ctx = exec_q.get_context(); + const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(ascontig_ev); + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task( + [ctx, shape_stride]() { sycl_free_noexcept(shape_stride, ctx); }); + }); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair +py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair +py_as_f_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. + */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be F-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.front(); + if (n == dst_shape_vec[1]) { + constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[1] == unit_stride) { + return py_as_f_contig_c2f(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, 
src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + const auto &ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + + py::ssize_t *shape_stride = std::get<0>(ptr_size_event_tuple); + if (shape_stride == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &ctx = exec_q.get_context(); + const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(ascontig_ev); + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task( + [ctx, shape_stride]() { sycl_free_noexcept(shape_stride, ctx); }); + }); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair +py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. 
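+     * The trailing two axes of src form square matrices stored with unit
+     * stride along the second-to-last axis; the copy is dispatched to the
+     * batch-of-square-matrices kernels (1D or ND batch indexing).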
+ */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = src_shape_vec.back(); + if (src_shape_vec[src_nd - 2] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[src_nd - 2] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? 
py::ssize_t(0) : dst_strides_vec[src_nd - 3]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec), + std::end(src_shape_vec) - 2); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec), + std::end(src_strides_vec) - 2); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec), + std::end(dst_strides_vec) - 2); + } + + // simplify batch iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); + if (nullptr == packed_shape_strides) { + throw std::runtime_error("Unable to allocate device memory"); + } + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], all_depends); + + // async free of shape_strides temporary + const auto &ctx = exec_q.get_context(); + const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(ascontig_ev); + + cgh.host_task([ctx, packed_shape_strides]() { + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + sycl_free_noexcept(packed_shape_strides, ctx); + }); + }); + 
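+    // Track the cleanup task so that keep_args_alive below keeps src and dst
+    // alive until the asynchronous deallocation of the packed shape/strides
+    // buffer has completed.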
host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair +py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. + */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = dst_shape_vec.front(); + if (dst_shape_vec[1] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[1] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? 
py::ssize_t(0) : dst_strides_vec[2]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec) + 2, + std::end(src_shape_vec)); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec) + 2, + std::end(src_strides_vec)); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec) + 2, + std::end(dst_strides_vec)); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); + if (nullptr == packed_shape_strides) { + throw std::runtime_error("Unable to allocate device memory"); + } + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], all_depends); + + // async free of shape_strides temporary + const auto &ctx = exec_q.get_context(); + const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(ascontig_ev); + + cgh.host_task([ctx, packed_shape_strides]() { + using 
dpctl::tensor::alloc_utils::sycl_free_noexcept; + sycl_free_noexcept(packed_shape_strides, ctx); + }); + }); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_as_contig.hpp b/dpctl/tensor/libtensor/source/copy_as_contig.hpp new file mode 100644 index 0000000000..f2957593e0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_as_contig.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include +#include + +#include "dpctl4pybind11.hpp" +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +std::pair +py_as_c_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +std::pair +py_as_f_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +void init_copy_as_contig_dispatch_vectors(void); + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp index 8e5857f48b..b3d04a252a 100644 --- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp +++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -84,7 +84,7 @@ void copy_numpy_ndarray_into_usm_ndarray( const py::ssize_t *dst_shape = dst.get_shape_raw(); bool shapes_equal(true); size_t src_nelems(1); - for (int i = 0; i < src_ndim; ++i) { + for (int i = 0; shapes_equal && (i < src_ndim); ++i) { shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); src_nelems *= static_cast(src_shape[i]); } @@ -114,7 +114,7 @@ void copy_numpy_ndarray_into_usm_ndarray( py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num; int dst_typenum = dst.get_typenum(); - auto array_types = td_ns::usm_ndarray_types(); + const auto &array_types = td_ns::usm_ndarray_types(); int src_type_id = array_types.typenum_to_lookup_id(src_typenum); int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); diff --git a/dpctl/tensor/libtensor/source/tensor_ctors.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp index 854574b69e..471207dea9 100644 --- a/dpctl/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp @@ -39,6 +39,7 @@ #include "boolean_advanced_indexing.hpp" #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" +#include "copy_as_contig.hpp" #include "copy_for_reshape.hpp" #include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" @@ -70,6 +71,8 @@ using dpctl::tensor::overlap::MemoryOverlap; using dpctl::tensor::overlap::SameLogicalTensors; using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray; +using dpctl::tensor::py_internal::py_as_c_contig; +using dpctl::tensor::py_internal::py_as_f_contig; /* =========================== Copy for reshape ============================= */ @@ -143,6 +146,7 @@ void init_dispatch_vectors(void) { using namespace dpctl::tensor::py_internal; + init_copy_as_contig_dispatch_vectors(); init_copy_for_reshape_dispatch_vectors(); init_copy_for_roll_dispatch_vectors(); init_linear_sequences_dispatch_vectors(); @@ -187,6 +191,20 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("src"), py::arg("dst"), 
py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_as_c_contig", &py_as_c_contig, + "Copies from usm_ndarray `src` into C-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_as_f_contig", &py_as_f_contig, + "Copies from usm_ndarray `src` into F-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + using dpctl::tensor::strides::contract_iter2; m.def( "_contract_iter2", &contract_iter2, diff --git a/dpctl/tests/test_tensor_asarray.py b/dpctl/tests/test_tensor_asarray.py index 939167126a..6a3005f336 100644 --- a/dpctl/tests/test_tensor_asarray.py +++ b/dpctl/tests/test_tensor_asarray.py @@ -20,7 +20,7 @@ import dpctl import dpctl.tensor as dpt -from .helper import get_queue_or_skip +from .helper import get_queue_or_skip, skip_if_dtype_not_supported @pytest.mark.parametrize( @@ -411,3 +411,148 @@ def test_orderK_gh_1350(): assert c.strides == b.strides assert c._element_offset == 0 assert not c._pointer == b._pointer + + +def _typesafe_arange(n: int, dtype_: dpt.dtype, device: object): + n_half = n // 2 + if dtype_.kind in "ui": + ii = dpt.iinfo(dtype_) + m0 = max(ii.min, -n_half) + m1 = min(m0 + n, ii.max) + n_tiles = (n + m1 - m0 - 1) // (m1 - m0) + res = dpt.arange(m0, m1, dtype=dtype_, device=device) + elif dtype_.kind == "b": + n_tiles = (n + 1) // 2 + res = dpt.asarray([False, True], dtype=dtype_, device=device) + else: + m0 = -n_half + m1 = m0 + n + n_tiles = 1 + res = dpt.linspace(m0, m1, num=n, dtype=dtype_, device=device) + if n_tiles > 1: + res = dpt.tile(res, n_tiles)[:n] + return res + + +_all_dtypes = [ + "b1", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_c_contig_rect(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1, n2 = 6, 35, 37 + + arr_flat = _typesafe_arange(n0 * n1 * n2, dtype_, q) + x = dpt.reshape(arr_flat, (n0, n1, n2)).mT + + y = dpt.asarray(x, order="C") + assert dpt.all(x == y) + + x2 = x[0] + y2 = dpt.asarray(x2, order="C") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="C") + assert dpt.all(x3 == y3) + + x4 = dpt.reshape(arr_flat, (2, 3, n1, n2)).mT + x5 = x4[:, :2] + y5 = dpt.asarray(x5, order="C") + assert dpt.all(x5 == y5) + + x6 = dpt.reshape(arr_flat, (n0, n1, n2), order="F") + y6 = dpt.asarray(x6, order="C") + assert dpt.all(x6 == y6) + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_f_contig_rect(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1, n2 = 6, 35, 37 + + arr_flat = _typesafe_arange(n0 * n1 * n2, dtype_, q) + x = dpt.reshape(arr_flat, (n0, n1, n2)) + + y = dpt.asarray(x, order="F") + assert dpt.all(x == y) + + x2 = x[0] + y2 = dpt.asarray(x2, order="F") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="F") + assert dpt.all(x3 == y3) + + x4 = dpt.reshape(arr_flat, (2, 3, n1, n2)) + x5 = dpt.moveaxis(x4[:, :2], (2, 3), (0, 1)) + y5 = dpt.asarray(x5, order="F") + assert dpt.all(x5 == y5) + + +@pytest.mark.parametrize("dt", _all_dtypes) 
+def test_as_c_contig_square(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1 = 4, 53 + + arr_flat = _typesafe_arange(n0 * n1 * n1, dtype_, q) + x = dpt.reshape(arr_flat, (n0, n1, n1)).mT + + y = dpt.asarray(x, order="C") + assert dpt.all(x == y) + + x2 = x[0] + y2 = dpt.asarray(x2, order="C") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="C") + assert dpt.all(x3 == y3) + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_f_contig_square(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1 = 6, 53 + + arr_flat = _typesafe_arange(n0 * n1 * n1, dtype_, q) + x = dpt.moveaxis(dpt.reshape(arr_flat, (n0, n1, n1)), (1, 2), (0, 1)) + + y = dpt.asarray(x, order="F") + assert dpt.all(x == y) + + x2 = x[..., 0] + y2 = dpt.asarray(x2, order="F") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="F") + assert dpt.all(x3 == y3)
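For illustration only (not part of the patch): the tests above reach the new code paths through `dpt.asarray` with an explicit `order=` argument, since `copy_usm_ndarray_into_usm_ndarray` now dispatches same-dtype copies into a C-/F-contiguous destination to `py_as_c_contig`/`py_as_f_contig`. A minimal sketch, with arbitrary example shapes and dtype:

import dpctl.tensor as dpt

# Transposed trailing axes give a batch of square matrices with unit stride
# along the second-to-last axis, i.e. the layout handled by the f2c fast path.
x = dpt.reshape(dpt.arange(8 * 16 * 16, dtype="f4"), (8, 16, 16)).mT
# order="C" requests a C-contiguous copy of the same dtype, which is handled
# by the new contiguous-copy kernels.
y = dpt.asarray(x, order="C")
assert dpt.all(x == y)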