diff --git a/CHANGELOG.md b/CHANGELOG.md index b849718807..83bab58ac9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Change -* Improved efficiency of copy-and-cast operations from `numpy.ndarray` to `tensor.usm_ndarray` for contiguous inputs [gh-1829](https://github.com/IntelPython/dpctl/pull/1829) +* Improved performance of copy-and-cast operations from `numpy.ndarray` to `tensor.usm_ndarray` for contiguous inputs [gh-1829](https://github.com/IntelPython/dpctl/pull/1829) +* Improved performance of copy operations to C-/F-contiguous arrays, with an optimization for batches of square matrices [gh-1850](https://github.com/IntelPython/dpctl/pull/1850) ### Fixed diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 797815cfed..31d4eba03d 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -123,6 +123,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp diff --git a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp new file mode 100644 index 0000000000..b8f46e0c3b --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -0,0 +1,580 @@ +//=== copy_as_contiguous.hpp - Implementation of copy-as-contiguous kernels -*-C++-*-===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for copying tensor content into a contiguous layout.
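+///
+/// Two kernel families are provided: a generic strided-to-contiguous copy
+/// (CopyAsCContigFunctor, as_c_contiguous_array_generic_impl) and a tiled
+/// kernel for batches of square matrices with a contiguous leading dimension
+/// (as_c_contiguous_batch_of_square_matrices_impl and its 1D/ND batch
+/// wrappers).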
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_as_contig +{ + +template +class CopyAsCContigFunctor +{ +private: + const size_t nelems; + const T *src_p = nullptr; + T *dst_p = nullptr; + IndexerT src_indexer; + +public: + CopyAsCContigFunctor(size_t n, + const T *src_, + T *dst_, + const IndexerT &src_indexer_) + : nelems(n), src_p(src_), dst_p(dst_), src_indexer(src_indexer_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + using dpctl::tensor::type_utils::is_complex; + if constexpr (!enable_sg_loadstore || is_complex::value) { + const std::uint32_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + const std::size_t base = + (gid / sgSize) * sgSize * n_vecs * vec_sz + (gid % sgSize); + for (size_t offset = base; + offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz)); + offset += sgSize) + { + auto src_offset = src_indexer(offset); + dst_p[offset] = src_p[src_offset]; + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint32_t sgSize = sg.get_local_range()[0]; + const size_t base = n_vecs * vec_sz * + (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + n_vecs * vec_sz * sgSize < nelems) { + sycl::vec dst_vec; + +#pragma unroll + for (std::uint32_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>( + &dst_p[base + it * sgSize]); + +#pragma unroll + for (std::uint32_t k = 0; k < vec_sz; k++) { + ssize_t src_offset = src_indexer( + base + (it + k) * sgSize + sg.get_local_id()); + dst_vec[k] = src_p[src_offset]; + } + sg.store(dst_multi_ptr, dst_vec); + } + } + else { + for (size_t k = base + sg.get_local_id()[0]; k < nelems; + k += sgSize) + { + ssize_t src_offset = src_indexer(k); + dst_p[k] = src_p[src_offset]; + } + } + } + } +}; + +template +class as_contig_krn; + +template +sycl::event +as_c_contiguous_array_generic_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides); + + constexpr std::size_t preferred_lws = 256; + constexpr std::uint32_t n_vecs = 2; + constexpr std::uint32_t vec_sz = 4; + constexpr bool enable_sg_load = true; + using KernelName = + as_contig_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const std::size_t lws = + ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size; + + constexpr std::uint32_t nelems_per_wi = n_vecs * vec_sz; + size_t n_groups = + (nelems + nelems_per_wi * lws - 1) / 
(nelems_per_wi * lws); + + sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const sycl::range<1> gRange{n_groups * lws}; + const sycl::range<1> lRange{lws}; + + cgh.parallel_for( + sycl::nd_range<1>(gRange, lRange), + CopyAsCContigFunctor( + nelems, src_tp, dst_tp, src_indexer)); + }); + + return copy_ev; +} + +typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)( + sycl::queue &, + size_t, + int, + const ssize_t *, + const char *, + char *, + const std::vector &); + +template struct AsCContigFactory +{ + fnT get() { return as_c_contiguous_array_generic_impl; } +}; + +template +class as_contig_batch_of_square_matrices_krn; + +namespace detail +{ +/*! @brief batch of matrices (n, n), source strides (1, src_ld), destination + strides (dst_ld, 1) src and destination arrays must be disjoint memory blocks + to avoid race condition + */ +template +sycl::event as_c_contiguous_batch_of_square_matrices_impl( + sycl::queue &exec_q, + size_t batch_nelems, + const BatchIndexerT &batch_two_offsets_indexer, + size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + constexpr std::uint32_t private_tile_size = 4; + constexpr std::uint32_t n_lines = 2; + constexpr std::uint32_t block_size = + n_lines * private_tile_size * private_tile_size; + + constexpr std::uint32_t lws0 = block_size; + constexpr std::uint32_t lws1 = n_lines; + constexpr std::uint32_t nelems_per_wi = (block_size / lws1); + + static_assert(nelems_per_wi * lws1 == block_size); + static_assert(nelems_per_wi == private_tile_size * private_tile_size); + + constexpr std::uint32_t lws = lws0 * lws1; + + const std::size_t n_tiles = (n + block_size - 1) / block_size; + + const ssize_t src_stride = src_ld; + const ssize_t dst_stride = dst_ld; + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{batch_nelems * n_tiles * n_tiles * lws}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + using KernelName = + as_contig_batch_of_square_matrices_krn; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::local_accessor local_block(block_size * block_size, cgh); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> nd_it) { + // 1. 
Read block from source array into SLM + const std::uint32_t lid_lin = nd_it.get_local_linear_id(); + const size_t gr_id_lin = nd_it.get_group_linear_id(); + + const std::size_t batch_id = gr_id_lin / (n_tiles * n_tiles); + const std::size_t rem = gr_id_lin - batch_id * (n_tiles * n_tiles); + + const auto &batch_two_offsets = batch_two_offsets_indexer(batch_id); + const auto &src_batch_offset = batch_two_offsets.get_first_offset(); + const auto &dst_batch_offset = + batch_two_offsets.get_second_offset(); + + // Block id + /* 0 <= src_gr_i1 < n_groups_n1 */ + const std::size_t src_tile_i1 = rem / n_tiles; + /* 0 <= src_gr_i0 < n_groups_n0 */ + const std::size_t src_tile_i0 = rem - src_tile_i1 * n_tiles; + + // ID of element within the block + /* 0 <= src_i1 < lws1 */ + const std::uint32_t src_i1 = lid_lin / lws0; + /* 0 <= src_i0 < lws0 */ + const std::uint32_t src_i0 = lid_lin - src_i1 * lws0; + + // Matrix element ID + const std::size_t src_tile_start0 = src_tile_i0 * block_size; + const std::size_t src_tile_start1 = src_tile_i1 * block_size; + const std::size_t src_gid0 = (src_tile_start0 + src_i0); + const std::size_t src_gid1 = (src_tile_start1 + src_i1); + + // src_offset = src_gid0 * 1 + (src_gid1 + pr_id * lws1) * + // src_stride + const std::size_t src_offset0 = + src_batch_offset + src_gid0 * 1 + src_gid1 * src_stride; + const std::size_t pr_step_src = lws1 * src_stride; + + const std::uint32_t local_offset0 = src_i0 + src_i1 * block_size; + const std::uint32_t pr_step_local = lws1 * block_size; + + for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + local_block[local_offset0 + pr_step_local * pr_id] = + (src_gid0 < n && src_gid1 + pr_id * lws1 < n) + ? src_tp[src_offset0 + pr_step_src * pr_id] + : T(0); + } + + const std::uint32_t local_dim0 = static_cast( + std::min(src_tile_start0 + block_size, n) - + src_tile_start0); + const std::uint32_t local_dim1 = static_cast( + std::min(src_tile_start1 + block_size, n) - + src_tile_start1); + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 2. 
Permute the block matrix in SLM using two private arrays + std::array private_block_01 = {T(0)}; + std::array private_block_10 = {T(0)}; + + // 0 <= lid_lin < lws0 * lws1 == (block_size * block_size / + // nelems_per_wi) == (block_size/private_tile_size)**2 + constexpr std::uint32_t n_private_tiles_per_axis = + block_size / private_tile_size; + const std::uint32_t local_tile_id0 = + lid_lin / n_private_tiles_per_axis; + const std::uint32_t local_tile_id1 = + lid_lin - local_tile_id0 * n_private_tiles_per_axis; + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint32_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) + { + for (std::uint32_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) + { + const std::uint32_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint32_t t1_offset = + local_tile_id1 * private_tile_size; + + const std::uint32_t pr_offset = + pr_i1 * private_tile_size + pr_i0; + const std::uint32_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // read (local_tile_id0, local_tile_id1) + const std::uint32_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + private_block_01[pr_offset] = + local_block[local_01_offset]; + + // read (local_tile_id1, local_tile_id0) + const std::uint32_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + private_block_10[pr_offset] = + local_block[local_10_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint32_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) + { + for (std::uint32_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) + { + const std::uint32_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint32_t t1_offset = + local_tile_id1 * private_tile_size; + const std::uint32_t pr_offset = + pr_i0 * private_tile_size + pr_i1; + + const std::uint32_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // write back permuted private blocks + const std::uint32_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + local_block[local_01_offset] = + private_block_10[pr_offset]; + + const std::uint32_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + local_block[local_10_offset] = + private_block_01[pr_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 3. 
Write out permuted SLM to destination array + + const std::size_t dst_tile_start0 = src_tile_start0; + const std::size_t dst_tile_start1 = src_tile_start1; + + if (local_dim0 == block_size && local_dim1 == block_size) { + const std::uint32_t dst_i0 = src_i1; + const std::uint32_t dst_i1 = src_i0; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset0 = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::size_t pr_step_dst = lws1 * dst_stride; + + const std::uint32_t _local_offset0 = + dst_i0 * block_size + dst_i1; + const std::uint32_t _pr_step_local = lws1 * block_size; + + for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + if ((dst_gid1 < n) && ((dst_gid0 + pr_id * lws1) < n)) { + dst_tp[dst_offset0 + pr_step_dst * pr_id] = + local_block[_local_offset0 + + _pr_step_local * pr_id]; + } + } + } + else { + // map local_linear_id into (local_dim0, local_dim1) + for (std::uint32_t el_id = lid_lin; + el_id < local_dim0 * local_dim1; el_id += lws0 * lws1) + { + + // 0 <= local_i0 < local_dim0 + const std::uint32_t loc_i0 = el_id / local_dim1; + // 0 <= local_i1 < local_dim1 + const std::uint32_t loc_i1 = el_id - loc_i0 * local_dim1; + + const std::uint32_t dst_i0 = loc_i0; + const std::uint32_t dst_i1 = loc_i1; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::uint32_t local_offset = + loc_i0 * block_size + loc_i1; + + if ((dst_gid1 < n) && (dst_gid0 < n)) { + dst_tp[dst_offset] = local_block[local_offset]; + } + } + } + }); + }); + + return e; +} + +} // end of namespace detail + +template +sycl::event as_c_contiguous_1d_batch_of_square_matrices_impl( + sycl::queue &exec_q, + size_t batch_nelems, + ssize_t src_batch_step, + ssize_t dst_batch_step, + size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = + TwoOffsets_CombinedIndexer; + + const auto &src_batch_indexer = + Strided1DIndexer(batch_nelems, src_batch_step); + const auto &dst_batch_indexer = + Strided1DIndexer(batch_nelems, dst_batch_step); + + const BatchIndexerT batch_two_indexer{src_batch_indexer, dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_indexer, n, src_p, src_ld, dst_p, + dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + size_t, /* number of batch elements */ + ssize_t, /* distance between batches in source array */ + ssize_t, /* distance between batches in destination array */ + size_t, /* size of square matrices in the batch */ + const char *, + ssize_t, /* untyped pointer to F-contig source array, and matrix leading + dimension */ + char *, + ssize_t, /* untyped pointer to C-contig destination array, and matrix + leading dimension */ + const std::vector &); + +template +struct AsCContig1DBatchOfSquareMatricesFactory +{ + fnT get() { return as_c_contiguous_1d_batch_of_square_matrices_impl; } +}; + +template +sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl( + sycl::queue &exec_q, + size_t batch_nelems, + int batch_nd, + const ssize_t 
*src_batch_shape_strides, + const ssize_t dst_batch_step, + size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using SrcIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using DstIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = TwoOffsets_CombinedIndexer; + + constexpr ssize_t zero_offset{0}; + + const SrcIndexerT src_batch_indexer{batch_nd, zero_offset, + src_batch_shape_strides}; + const DstIndexerT dst_batch_indexer{/* size */ batch_nelems, + /* step */ dst_batch_step}; + + const BatchIndexerT batch_two_offsets_indexer{src_batch_indexer, + dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_offsets_indexer, n, src_p, src_ld, + dst_p, dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + size_t, /* number of matrices in the batch */ + int, + const ssize_t *, /* dimensionality, and packed [shape, src_strides] + describing iteration over batch in source array */ + ssize_t, /* distance between batches in destination array */ + size_t, /* matrix size */ + const char *, + ssize_t, /* untyped pointer to source array of F-contig matrices, and + leading dimension of the matrix */ + char *, + ssize_t, /* untyped pointer to destination array of F-contig matrices, and + leading dimension of the matrix */ + const std::vector &); + +template +struct AsCContigNDBatchOfSquareMatricesFactory +{ + fnT get() { return as_c_contiguous_nd_batch_of_square_matrices_impl; } +}; + +} // namespace copy_as_contig +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp index 801dbe72f8..6b10851869 100644 --- a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp +++ b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -35,6 +35,7 @@ #include #include "dpctl4pybind11.hpp" + #include "kernels/copy_and_cast.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" @@ -43,6 +44,7 @@ #include "utils/type_dispatch.hpp" #include "utils/type_utils.hpp" +#include "copy_as_contig.hpp" #include "simplify_iteration_space.hpp" namespace dpctl @@ -90,7 +92,7 @@ copy_usm_ndarray_into_usm_ndarray(const dpctl::tensor::usm_ndarray &src, bool shapes_equal(true); size_t src_nelems(1); - for (int i = 0; i < src_nd; ++i) { + for (int i = 0; shapes_equal && (i < src_nd); ++i) { src_nelems *= static_cast(src_shape[i]); shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); } @@ -162,6 +164,15 @@ copy_usm_ndarray_into_usm_ndarray(const dpctl::tensor::usm_ndarray &src, copy_ev); } + if ((src_type_id == dst_type_id) && (src_nd > 1)) { + if (is_dst_c_contig) { + return py_as_c_contig(src, dst, exec_q, depends); + } + else if (is_dst_f_contig) { + return py_as_f_contig(src, dst, exec_q, depends); + } + } + auto const &src_strides = src.get_strides_vector(); auto const &dst_strides = dst.get_strides_vector(); diff --git a/dpctl/tensor/libtensor/source/copy_as_contig.cpp b/dpctl/tensor/libtensor/source/copy_as_contig.cpp new file mode 100644 index 0000000000..1a186d6ce7 --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_as_contig.cpp @@ -0,0 +1,776 @@ +//==- copy_ascontig.cpp - Implementation of _tensor_impl module 
-*-C++-*-/==// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "kernels/copy_as_contiguous.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_array_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +static as_c_contiguous_array_impl_fn_ptr_t + as_c_contig_array_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_1d_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +void init_copy_as_contig_dispatch_vectors(void) +{ + + using dpctl::tensor::kernels::copy_as_contig:: + AsCContig1DBatchOfSquareMatricesFactory; + using dpctl::tensor::kernels::copy_as_contig::AsCContigFactory; + using dpctl::tensor::kernels::copy_as_contig:: + AsCContigNDBatchOfSquareMatricesFactory; + using td_ns::DispatchVectorBuilder; + + // Generic to c-contig + DispatchVectorBuilder + dtv_as_c_contig_array; + + dtv_as_c_contig_array.populate_dispatch_vector( + as_c_contig_array_dispatch_vector); + + // 1D batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t, + AsCContig1DBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_1d_batch_of_square_matrices; + + dtv_as_c_contig_1d_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_1d_batch_of_square_matrices_dispatch_vector); + + // ND batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t, + AsCContigNDBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_nd_batch_of_square_matrices; + + dtv_as_c_contig_nd_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_nd_batch_of_square_matrices_dispatch_vector); +} + +namespace +{ + +template std::size_t get_nelems(const std::vector 
&shape) +{ + auto mult_fn = [](std::size_t prod, const dimT &term) -> std::size_t { + return prod * static_cast(term); + }; + + constexpr std::size_t unit{1}; + + const std::size_t nelems = + std::accumulate(std::begin(shape), std::end(shape), unit, mult_fn); + return nelems; +} + +} // end of anonymous namespace + +std::pair +py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair +py_as_c_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. + */ + const int src_nd = src.get_ndim(); + const int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.back(); + if (n == dst_shape_vec[src_nd - 2]) { + constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[src_nd - 2] == unit_stride) { + return py_as_c_contig_f2c(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + const auto &ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + + py::ssize_t *shape_stride = std::get<0>(ptr_size_event_tuple); + if (shape_stride == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + const sycl::event 
©_shape_ev = std::get<2>(ptr_size_event_tuple); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &ctx = exec_q.get_context(); + const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(ascontig_ev); + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task( + [ctx, shape_stride]() { sycl_free_noexcept(shape_stride, ctx); }); + }); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair +py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair +py_as_f_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. + */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be F-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.front(); + if (n == dst_shape_vec[1]) { + constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[1] == unit_stride) { + return py_as_f_contig_c2f(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, 
src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + const auto &ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + + py::ssize_t *shape_stride = std::get<0>(ptr_size_event_tuple); + if (shape_stride == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &ctx = exec_q.get_context(); + const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(ascontig_ev); + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task( + [ctx, shape_stride]() { sycl_free_noexcept(shape_stride, ctx); }); + }); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair +py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. 
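+     * The trailing two axes of src form square matrices stored with unit
+     * stride along the second-to-last axis; the copy is dispatched to the
+     * batch-of-square-matrices kernels (1D or ND batch indexing).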
+ */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = src_shape_vec.back(); + if (src_shape_vec[src_nd - 2] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[src_nd - 2] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? 
py::ssize_t(0) : dst_strides_vec[src_nd - 3]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec), + std::end(src_shape_vec) - 2); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec), + std::end(src_strides_vec) - 2); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec), + std::end(dst_strides_vec) - 2); + } + + // simplify batch iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); + if (nullptr == packed_shape_strides) { + throw std::runtime_error("Unable to allocate device memory"); + } + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], all_depends); + + // async free of shape_strides temporary + const auto &ctx = exec_q.get_context(); + const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(ascontig_ev); + + cgh.host_task([ctx, packed_shape_strides]() { + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + sycl_free_noexcept(packed_shape_strides, ctx); + }); + }); + 
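+    // Track the cleanup task so that keep_args_alive below keeps src and dst
+    // alive until the asynchronous deallocation of the packed shape/strides
+    // buffer has completed.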
host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair +py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. + */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = dst_shape_vec.front(); + if (dst_shape_vec[1] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[1] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? 
py::ssize_t(0) : dst_strides_vec[2]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec) + 2, + std::end(src_shape_vec)); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec) + 2, + std::end(src_strides_vec)); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec) + 2, + std::end(dst_strides_vec)); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple); + if (nullptr == packed_shape_strides) { + throw std::runtime_error("Unable to allocate device memory"); + } + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], all_depends); + + // async free of shape_strides temporary + const auto &ctx = exec_q.get_context(); + const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(ascontig_ev); + + cgh.host_task([ctx, packed_shape_strides]() { + using 
dpctl::tensor::alloc_utils::sycl_free_noexcept; + sycl_free_noexcept(packed_shape_strides, ctx); + }); + }); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_as_contig.hpp b/dpctl/tensor/libtensor/source/copy_as_contig.hpp new file mode 100644 index 0000000000..f2957593e0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_as_contig.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include +#include + +#include "dpctl4pybind11.hpp" +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +std::pair +py_as_c_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +std::pair +py_as_f_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +void init_copy_as_contig_dispatch_vectors(void); + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp index 8e5857f48b..b3d04a252a 100644 --- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp +++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -84,7 +84,7 @@ void copy_numpy_ndarray_into_usm_ndarray( const py::ssize_t *dst_shape = dst.get_shape_raw(); bool shapes_equal(true); size_t src_nelems(1); - for (int i = 0; i < src_ndim; ++i) { + for (int i = 0; shapes_equal && (i < src_ndim); ++i) { shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); src_nelems *= static_cast(src_shape[i]); } @@ -114,7 +114,7 @@ void copy_numpy_ndarray_into_usm_ndarray( py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num; int dst_typenum = dst.get_typenum(); - auto array_types = td_ns::usm_ndarray_types(); + const auto &array_types = td_ns::usm_ndarray_types(); int src_type_id = array_types.typenum_to_lookup_id(src_typenum); int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); diff --git a/dpctl/tensor/libtensor/source/tensor_ctors.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp index 854574b69e..471207dea9 100644 --- a/dpctl/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp @@ -39,6 +39,7 @@ #include "boolean_advanced_indexing.hpp" #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" +#include "copy_as_contig.hpp" #include "copy_for_reshape.hpp" #include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" @@ -70,6 +71,8 @@ using dpctl::tensor::overlap::MemoryOverlap; using dpctl::tensor::overlap::SameLogicalTensors; using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray; +using dpctl::tensor::py_internal::py_as_c_contig; +using dpctl::tensor::py_internal::py_as_f_contig; /* =========================== Copy for reshape ============================= */ @@ -143,6 +146,7 @@ void init_dispatch_vectors(void) { using namespace dpctl::tensor::py_internal; + init_copy_as_contig_dispatch_vectors(); init_copy_for_reshape_dispatch_vectors(); init_copy_for_roll_dispatch_vectors(); init_linear_sequences_dispatch_vectors(); @@ -187,6 +191,20 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("src"), py::arg("dst"), 
py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_as_c_contig", &py_as_c_contig, + "Copies from usm_ndarray `src` into C-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_as_f_contig", &py_as_f_contig, + "Copies from usm_ndarray `src` into F-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + using dpctl::tensor::strides::contract_iter2; m.def( "_contract_iter2", &contract_iter2, diff --git a/dpctl/tests/test_tensor_asarray.py b/dpctl/tests/test_tensor_asarray.py index 939167126a..6a3005f336 100644 --- a/dpctl/tests/test_tensor_asarray.py +++ b/dpctl/tests/test_tensor_asarray.py @@ -20,7 +20,7 @@ import dpctl import dpctl.tensor as dpt -from .helper import get_queue_or_skip +from .helper import get_queue_or_skip, skip_if_dtype_not_supported @pytest.mark.parametrize( @@ -411,3 +411,148 @@ def test_orderK_gh_1350(): assert c.strides == b.strides assert c._element_offset == 0 assert not c._pointer == b._pointer + + +def _typesafe_arange(n: int, dtype_: dpt.dtype, device: object): + n_half = n // 2 + if dtype_.kind in "ui": + ii = dpt.iinfo(dtype_) + m0 = max(ii.min, -n_half) + m1 = min(m0 + n, ii.max) + n_tiles = (n + m1 - m0 - 1) // (m1 - m0) + res = dpt.arange(m0, m1, dtype=dtype_, device=device) + elif dtype_.kind == "b": + n_tiles = (n + 1) // 2 + res = dpt.asarray([False, True], dtype=dtype_, device=device) + else: + m0 = -n_half + m1 = m0 + n + n_tiles = 1 + res = dpt.linspace(m0, m1, num=n, dtype=dtype_, device=device) + if n_tiles > 1: + res = dpt.tile(res, n_tiles)[:n] + return res + + +_all_dtypes = [ + "b1", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", +] + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_c_contig_rect(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1, n2 = 6, 35, 37 + + arr_flat = _typesafe_arange(n0 * n1 * n2, dtype_, q) + x = dpt.reshape(arr_flat, (n0, n1, n2)).mT + + y = dpt.asarray(x, order="C") + assert dpt.all(x == y) + + x2 = x[0] + y2 = dpt.asarray(x2, order="C") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="C") + assert dpt.all(x3 == y3) + + x4 = dpt.reshape(arr_flat, (2, 3, n1, n2)).mT + x5 = x4[:, :2] + y5 = dpt.asarray(x5, order="C") + assert dpt.all(x5 == y5) + + x6 = dpt.reshape(arr_flat, (n0, n1, n2), order="F") + y6 = dpt.asarray(x6, order="C") + assert dpt.all(x6 == y6) + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_f_contig_rect(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1, n2 = 6, 35, 37 + + arr_flat = _typesafe_arange(n0 * n1 * n2, dtype_, q) + x = dpt.reshape(arr_flat, (n0, n1, n2)) + + y = dpt.asarray(x, order="F") + assert dpt.all(x == y) + + x2 = x[0] + y2 = dpt.asarray(x2, order="F") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="F") + assert dpt.all(x3 == y3) + + x4 = dpt.reshape(arr_flat, (2, 3, n1, n2)) + x5 = dpt.moveaxis(x4[:, :2], (2, 3), (0, 1)) + y5 = dpt.asarray(x5, order="F") + assert dpt.all(x5 == y5) + + +@pytest.mark.parametrize("dt", _all_dtypes) 
+def test_as_c_contig_square(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1 = 4, 53 + + arr_flat = _typesafe_arange(n0 * n1 * n1, dtype_, q) + x = dpt.reshape(arr_flat, (n0, n1, n1)).mT + + y = dpt.asarray(x, order="C") + assert dpt.all(x == y) + + x2 = x[0] + y2 = dpt.asarray(x2, order="C") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="C") + assert dpt.all(x3 == y3) + + +@pytest.mark.parametrize("dt", _all_dtypes) +def test_as_f_contig_square(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + dtype_ = dpt.dtype(dt) + n0, n1 = 6, 53 + + arr_flat = _typesafe_arange(n0 * n1 * n1, dtype_, q) + x = dpt.moveaxis(dpt.reshape(arr_flat, (n0, n1, n1)), (1, 2), (0, 1)) + + y = dpt.asarray(x, order="F") + assert dpt.all(x == y) + + x2 = x[..., 0] + y2 = dpt.asarray(x2, order="F") + assert dpt.all(x2 == y2) + + x3 = dpt.flip(x, axis=1) + y3 = dpt.asarray(x3, order="F") + assert dpt.all(x3 == y3)
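For illustration only (not part of the patch): the tests above reach the new code paths through `dpt.asarray` with an explicit `order=` argument, since `copy_usm_ndarray_into_usm_ndarray` now dispatches same-dtype copies into a C-/F-contiguous destination to `py_as_c_contig`/`py_as_f_contig`. A minimal sketch, with arbitrary example shapes and dtype:

import dpctl.tensor as dpt

# Transposed trailing axes give a batch of square matrices with unit stride
# along the second-to-last axis, i.e. the layout handled by the f2c fast path.
x = dpt.reshape(dpt.arange(8 * 16 * 16, dtype="f4"), (8, 16, 16)).mT
# order="C" requests a C-contiguous copy of the same dtype, which is handled
# by the new contiguous-copy kernels.
y = dpt.asarray(x, order="C")
assert dpt.all(x == y)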