Skip to content

Commit

Permalink
Merge pull request #1950 from IntelPython/use-std-size-t-cstddef
Browse files Browse the repository at this point in the history
[MAINT] Use `std::size_t` from `cstddef` and use `dpctl::tensor::ssize_t` where `ssize_t` is used
  • Loading branch information
ndgrigorian authored Jan 7, 2025
2 parents 3a1a7c5 + 919d772 commit 8edf7f3
Show file tree
Hide file tree
Showing 154 changed files with 2,124 additions and 1,860 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* Add support of CV-qualifiers in `is_complex<T>` helper [gh-1900](https://github.com/IntelPython/dpctl/pull/1900)
* Tuning work for elementwise functions with modest performance gains (under 10%) [gh-1889](https://github.com/IntelPython/dpctl/pull/1889)
* Support for Python 3.13 for `dpctl` [gh-1941](https://github.com/IntelPython/dpctl/pull/1941)
* Change libtensor to use `std::size_t` and `dpctl::tensor::ssize_t` throughout and fix missing includes for `std::size_t` and `size_t` [gh-1950](https://github.com/IntelPython/dpctl/pull/1950)

## [0.18.3] - Dec. 07, 2024

Expand Down
1 change: 1 addition & 0 deletions dpctl/_host_task_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

#pragma once
#include <exception>
#include <stddef.h>
#include <sycl/sycl.hpp>

#include "Python.h"
Expand Down
6 changes: 4 additions & 2 deletions dpctl/apis/include/dpctl4pybind11.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@

#include "dpctl_capi.h"
#include <complex>
#include <cstddef> // for std::size_t for C++ linkage
#include <memory>
#include <pybind11/pybind11.h>
#include <stddef.h> // for size_t for C linkage
#include <stdexcept>
#include <sycl/sycl.hpp>
#include <utility>
Expand Down Expand Up @@ -759,7 +761,7 @@ class usm_memory : public py::object
* lifetime of the USM allocation.
*/
usm_memory(void *usm_ptr,
size_t nbytes,
std::size_t nbytes,
const sycl::queue &q,
std::shared_ptr<void> shptr)
{
Expand Down Expand Up @@ -819,7 +821,7 @@ class usm_memory : public py::object
return reinterpret_cast<char *>(MRef);
}

size_t get_nbytes() const
std::size_t get_nbytes() const
{
auto const &api = ::dpctl::detail::dpctl_capi::get();
Py_MemoryObject *mem_obj = reinterpret_cast<Py_MemoryObject *>(m_ptr);
Expand Down
2 changes: 2 additions & 0 deletions dpctl/tensor/libtensor/include/kernels/accumulators.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#pragma once
#include <array>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <new>
Expand All @@ -47,6 +48,7 @@ namespace kernels
namespace accumulators
{

using dpctl::tensor::ssize_t;
using namespace dpctl::tensor::offset_utils;

template <typename T> T ceiling_quotient(T n, T m) { return (n + m - 1) / m; }
Expand Down
2 changes: 1 addition & 1 deletion dpctl/tensor/libtensor/include/kernels/alignment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ namespace kernels
namespace alignment_utils
{

static constexpr size_t required_alignment = 64UL;
static constexpr std::size_t required_alignment = 64UL;

template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
//===---------------------------------------------------------------------===//

#pragma once
#include <cstddef>
#include <cstdint>
#include <limits>
#include <sycl/sycl.hpp>
Expand All @@ -42,6 +43,7 @@ namespace kernels
namespace indexing
{

using dpctl::tensor::ssize_t;
using namespace dpctl::tensor::offset_utils;

template <typename OrthogIndexerT,
Expand All @@ -55,7 +57,7 @@ struct MaskedExtractStridedFunctor
MaskedExtractStridedFunctor(const dataT *src_data_p,
const indT *cumsum_data_p,
dataT *dst_data_p,
size_t masked_iter_size,
std::size_t masked_iter_size,
const OrthogIndexerT &orthog_src_dst_indexer_,
const MaskedSrcIndexerT &masked_src_indexer_,
const MaskedDstIndexerT &masked_dst_indexer_,
Expand All @@ -81,7 +83,7 @@ struct MaskedExtractStridedFunctor

const std::size_t max_offset = masked_nelems + 1;
for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
const size_t offset = masked_block_start + i;
const std::size_t offset = masked_block_start + i;
lacc[i] = (offset == 0) ? indT(0)
: (offset < max_offset) ? cumsum[offset - 1]
: cumsum[masked_nelems - 1] + 1;
Expand All @@ -99,9 +101,10 @@ struct MaskedExtractStridedFunctor
if (mask_set && (masked_i < masked_nelems)) {
const auto &orthog_offsets = orthog_src_dst_indexer(orthog_i);

const size_t total_src_offset = masked_src_indexer(masked_i) +
orthog_offsets.get_first_offset();
const size_t total_dst_offset =
const std::size_t total_src_offset =
masked_src_indexer(masked_i) +
orthog_offsets.get_first_offset();
const std::size_t total_dst_offset =
masked_dst_indexer(current_running_count - 1) +
orthog_offsets.get_second_offset();

Expand All @@ -113,7 +116,7 @@ struct MaskedExtractStridedFunctor
const dataT *src = nullptr;
const indT *cumsum = nullptr;
dataT *dst = nullptr;
const size_t masked_nelems = 0;
const std::size_t masked_nelems = 0;
// has nd, shape, src_strides, dst_strides for
// dimensions that ARE NOT masked
const OrthogIndexerT orthog_src_dst_indexer;
Expand All @@ -136,7 +139,7 @@ struct MaskedPlaceStridedFunctor
MaskedPlaceStridedFunctor(dataT *dst_data_p,
const indT *cumsum_data_p,
const dataT *rhs_data_p,
size_t masked_iter_size,
std::size_t masked_iter_size,
const OrthogIndexerT &orthog_dst_rhs_indexer_,
const MaskedDstIndexerT &masked_dst_indexer_,
const MaskedRhsIndexerT &masked_rhs_indexer_,
Expand All @@ -157,12 +160,12 @@ struct MaskedPlaceStridedFunctor
const std::uint32_t l_i = ndit.get_local_id(1);
const std::uint32_t lws = ndit.get_local_range(1);

const size_t masked_i = ndit.get_global_id(1);
const size_t masked_block_start = masked_i - l_i;
const std::size_t masked_i = ndit.get_global_id(1);
const std::size_t masked_block_start = masked_i - l_i;

const std::size_t max_offset = masked_nelems + 1;
for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
const size_t offset = masked_block_start + i;
const std::size_t offset = masked_block_start + i;
lacc[i] = (offset == 0) ? indT(0)
: (offset < max_offset) ? cumsum[offset - 1]
: cumsum[masked_nelems - 1] + 1;
Expand All @@ -180,9 +183,10 @@ struct MaskedPlaceStridedFunctor
if (mask_set && (masked_i < masked_nelems)) {
const auto &orthog_offsets = orthog_dst_rhs_indexer(orthog_i);

const size_t total_dst_offset = masked_dst_indexer(masked_i) +
orthog_offsets.get_first_offset();
const size_t total_rhs_offset =
const std::size_t total_dst_offset =
masked_dst_indexer(masked_i) +
orthog_offsets.get_first_offset();
const std::size_t total_rhs_offset =
masked_rhs_indexer(current_running_count - 1) +
orthog_offsets.get_second_offset();

Expand All @@ -194,7 +198,7 @@ struct MaskedPlaceStridedFunctor
dataT *dst = nullptr;
const indT *cumsum = nullptr;
const dataT *rhs = nullptr;
const size_t masked_nelems = 0;
const std::size_t masked_nelems = 0;
// has nd, shape, dst_strides, rhs_strides for
// dimensions that ARE NOT masked
const OrthogIndexerT orthog_dst_rhs_indexer;
Expand Down Expand Up @@ -450,8 +454,8 @@ sycl::event masked_extract_some_slices_strided_impl(

const std::size_t lws = get_lws(masked_extent);

const size_t n_groups = ((masked_extent + lws - 1) / lws);
const size_t orthog_extent = static_cast<size_t>(orthog_nelems);
const std::size_t n_groups = ((masked_extent + lws - 1) / lws);
const std::size_t orthog_extent = static_cast<std::size_t>(orthog_nelems);

sycl::range<2> gRange{orthog_extent, n_groups * lws};
sycl::range<2> lRange{1, lws};
Expand Down Expand Up @@ -809,7 +813,7 @@ sycl::event non_zero_indexes_impl(sycl::queue &exec_q,
const std::size_t masked_block_start = group_i * lws;

for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
const size_t offset = masked_block_start + i;
const std::size_t offset = masked_block_start + i;
lacc[i] = (offset == 0) ? indT1(0)
: (offset - 1 < masked_extent)
? cumsum_data[offset - 1]
Expand Down
38 changes: 20 additions & 18 deletions dpctl/tensor/libtensor/include/kernels/clip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#pragma once
#include <algorithm>
#include <complex>
#include <cstddef>
#include <cstdint>
#include <sycl/sycl.hpp>
#include <type_traits>
Expand All @@ -45,6 +46,7 @@ namespace kernels
namespace clip
{

using dpctl::tensor::ssize_t;
using namespace dpctl::tensor::offset_utils;

using dpctl::tensor::kernels::alignment_utils::
Expand Down Expand Up @@ -85,14 +87,14 @@ template <typename T,
class ClipContigFunctor
{
private:
size_t nelems = 0;
std::size_t nelems = 0;
const T *x_p = nullptr;
const T *min_p = nullptr;
const T *max_p = nullptr;
T *dst_p = nullptr;

public:
ClipContigFunctor(size_t nelems_,
ClipContigFunctor(std::size_t nelems_,
const T *x_p_,
const T *min_p_,
const T *max_p_,
Expand All @@ -110,30 +112,30 @@ class ClipContigFunctor
if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
const std::uint16_t sgSize =
ndit.get_sub_group().get_local_range()[0];
const size_t gid = ndit.get_global_linear_id();
const uint16_t nelems_per_sg = sgSize * nelems_per_wi;
const std::size_t gid = ndit.get_global_linear_id();
const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi;

const size_t start =
const std::size_t start =
(gid / sgSize) * (nelems_per_sg - sgSize) + gid;
const size_t end = std::min(nelems, start + nelems_per_sg);
const std::size_t end = std::min(nelems, start + nelems_per_sg);

for (size_t offset = start; offset < end; offset += sgSize) {
for (std::size_t offset = start; offset < end; offset += sgSize) {
dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]);
}
}
else {
auto sg = ndit.get_sub_group();
const std::uint16_t sgSize = sg.get_max_local_range()[0];

const size_t base =
const std::size_t base =
nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
sg.get_group_id()[0] * sgSize);

if (base + nelems_per_wi * sgSize < nelems) {
sycl::vec<T, vec_sz> dst_vec;
#pragma unroll
for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
const size_t idx = base + it * sgSize;
const std::size_t idx = base + it * sgSize;
auto x_multi_ptr = sycl::address_space_cast<
sycl::access::address_space::global_space,
sycl::access::decorated::yes>(&x_p[idx]);
Expand Down Expand Up @@ -162,8 +164,8 @@ class ClipContigFunctor
}
}
else {
const size_t lane_id = sg.get_local_id()[0];
for (size_t k = base + lane_id; k < nelems; k += sgSize) {
const std::size_t lane_id = sg.get_local_id()[0];
for (std::size_t k = base + lane_id; k < nelems; k += sgSize) {
dst_p[k] = clip(x_p[k], min_p[k], max_p[k]);
}
}
Expand All @@ -175,7 +177,7 @@ template <typename T, int vec_sz, int n_vecs> class clip_contig_kernel;

typedef sycl::event (*clip_contig_impl_fn_ptr_t)(
sycl::queue &,
size_t,
std::size_t,
const char *,
const char *,
const char *,
Expand All @@ -184,7 +186,7 @@ typedef sycl::event (*clip_contig_impl_fn_ptr_t)(

template <typename T>
sycl::event clip_contig_impl(sycl::queue &q,
size_t nelems,
std::size_t nelems,
const char *x_cp,
const char *min_cp,
const char *max_cp,
Expand All @@ -199,10 +201,10 @@ sycl::event clip_contig_impl(sycl::queue &q,
sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);

size_t lws = 64;
std::size_t lws = 64;
constexpr std::uint8_t vec_sz = 4;
constexpr std::uint8_t n_vecs = 2;
const size_t n_groups =
const std::size_t n_groups =
((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
const auto gws_range = sycl::range<1>(n_groups * lws);
const auto lws_range = sycl::range<1>(lws);
Expand Down Expand Up @@ -258,7 +260,7 @@ template <typename T, typename IndexerT> class ClipStridedFunctor

void operator()(sycl::id<1> id) const
{
size_t gid = id[0];
std::size_t gid = id[0];
auto offsets = indexer(static_cast<ssize_t>(gid));
dst_p[offsets.get_fourth_offset()] = clip(
x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()],
Expand All @@ -270,7 +272,7 @@ template <typename T, typename IndexerT> class clip_strided_kernel;

typedef sycl::event (*clip_strided_impl_fn_ptr_t)(
sycl::queue &,
size_t,
std::size_t,
int,
const char *,
const char *,
Expand All @@ -285,7 +287,7 @@ typedef sycl::event (*clip_strided_impl_fn_ptr_t)(

template <typename T>
sycl::event clip_strided_impl(sycl::queue &q,
size_t nelems,
std::size_t nelems,
int nd,
const char *x_cp,
const char *min_cp,
Expand Down
Loading

0 comments on commit 8edf7f3

Please sign in to comment.