Add aten::range #553

Merged · 12 commits · Jul 21, 2024
96 changes: 96 additions & 0 deletions src/ATen/native/xpu/RangeFactories.cpp
@@ -1,4 +1,6 @@
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/ScalarOps.h>
#include <ATen/core/Tensor.h>
@@ -14,7 +16,101 @@ Tensor& XPUNativeFunctions::arange_out(
const Scalar& end,
const Scalar& step,
Tensor& out) {
AT_DISPATCH_ALL_TYPES_AND2(
at::ScalarType::Half,
at::ScalarType::BFloat16,
out.scalar_type(),
"arange_xpu_preprocess",
[&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
auto xstart = start.to<accscalar_t>();
auto xend = end.to<accscalar_t>();
auto xstep = step.to<accscalar_t>();

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(xstart) && std::isfinite(xend),
"unsupported range: ",
xstart,
" -> ",
xend);
TORCH_CHECK(
((xstep > 0) && (xend >= xstart)) ||
((xstep < 0) && (xend <= xstart)),
"upper bound and larger bound inconsistent with step sign");

// we use double precision for (start - end) / step
// to compute size_d for consistency across devices.
// The problem with using accscalar_t is that accscalar_t might be
// float32 on gpu for a float32 scalar_t, but double on cpu for the
// same, and the effective output size starts differing on CPU vs GPU
// because of precision issues, which we dont want. the corner-case we
// do want to take into account is int64_t, which has higher precision
// than double
double size_d;
if constexpr (std::is_same_v<scalar_t, int64_t>) {
int64_t sgn = (xstep > 0) - (xstep < 0);
size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
} else {
size_d = std::ceil(
static_cast<double>(end.to<double>() - start.to<double>()) /
step.to<double>());
}

TORCH_CHECK(
size_d >= 0 &&
size_d <=
static_cast<double>(std::numeric_limits<int64_t>::max()),
"invalid size, possible overflow?");
int64_t size = static_cast<int64_t>(size_d);
int64_t numel = out.numel();

if (numel != size) {
if (numel > 0) {
TORCH_WARN(
"The number of elements in the out tensor of shape ",
out.sizes(),
" is ",
numel,
" which does not match the computed number of elements ",
size,
". Note that this may occur as a result of rounding error. "
"The out tensor will be resized to a tensor of shape (",
size,
",).");
}
out.resize_({size});
}
});

return at::native::xpu::arange_kernel(start, end, step, out);
}

Tensor& XPUNativeFunctions::range_out(
const Scalar& start,
const Scalar& end,
const Scalar& step,
Tensor& out) {
auto xstart = start.to<double>();
auto xend = end.to<double>();
auto xstep = step.to<double>();

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(xstart) && std::isfinite(xend),
"unsupported range: ",
xstart,
" -> ",
xend);
TORCH_CHECK(
((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
"upper bound and larger bound inconsistent with step sign");
int64_t size = static_cast<int64_t>(((xend - xstart) / xstep) + 1);
if (out.numel() != size) {
out.resize_({size});
}

return at::native::xpu::range_kernel(start, end, step, out);
}

} // namespace at
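
The two out variants above size the output differently: arange_out treats the interval as half-open, [start, end), and uses ceil((end - start) / step) (with the double-precision and int64_t handling shown in the dispatch body), while range_out treats it as closed, [start, end], and uses trunc((end - start) / step) + 1. A minimal host-side sketch of just these two size rules, independent of the ATen dispatch machinery (the int64_t corner case and the overflow/resize checks are omitted here):

#include <cmath>
#include <cstdint>
#include <iostream>

// Size of arange(start, end, step): half-open interval [start, end).
int64_t arange_size(double start, double end, double step) {
  return static_cast<int64_t>(std::ceil((end - start) / step));
}

// Size of range(start, end, step): closed interval [start, end].
// Truncation equals floor here because the checks above guarantee
// (end - start) / step >= 0.
int64_t range_size(double start, double end, double step) {
  return static_cast<int64_t>((end - start) / step) + 1;
}

int main() {
  std::cout << arange_size(0, 10, 3) << '\n'; // 4 -> {0, 3, 6, 9}
  std::cout << range_size(0, 10, 3) << '\n';  // 4 -> {0, 3, 6, 9}
  std::cout << arange_size(0, 9, 3) << '\n';  // 3 -> {0, 3, 6}    (9 excluded)
  std::cout << range_size(0, 9, 3) << '\n';   // 4 -> {0, 3, 6, 9} (9 included)
  return 0;
}
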
98 changes: 42 additions & 56 deletions src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
@@ -83,64 +83,8 @@ Tensor& arange_kernel(
[&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
auto xstart = start.to<accscalar_t>();
auto xend = end.to<accscalar_t>();
auto xstep = step.to<accscalar_t>();

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(static_cast<double>(xstart)) &&
std::isfinite(static_cast<double>(xend)),
"unsupported range: ",
xstart,
" -> ",
xend);
TORCH_CHECK(
((xstep > 0) && (xend >= xstart)) ||
((xstep < 0) && (xend <= xstart)),
"upper bound and larger bound inconsistent with step sign");

// we use double precision for (start - end) / step
// to compute size_d for consistency across devices.
// The problem with using accscalar_t is that accscalar_t might be
// float32 on gpu for a float32 scalar_t, but double on cpu for the
// same, and the effective output size starts differing on CPU vs GPU
// because of precision issues, which we dont want. the corner-case we
// do want to take into account is int64_t, which has higher precision
// than double
double size_d;
if constexpr (std::is_same_v<scalar_t, int64_t>) {
int64_t sgn = (xstep > 0) - (xstep < 0);
size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
} else {
size_d = std::ceil(
static_cast<double>(end.to<double>() - start.to<double>()) /
step.to<double>());
}

TORCH_CHECK(
size_d >= 0 &&
size_d <=
static_cast<double>(std::numeric_limits<int64_t>::max()),
"invalid size, possible overflow?");
int64_t size = static_cast<int64_t>(size_d);
int64_t numel = result.numel();

if (numel != size) {
if (numel > 0) {
TORCH_WARN(
"The number of elements in the out tensor of shape ",
result.sizes(),
" is ",
numel,
" which does not match the computed number of elements ",
size,
". Note that this may occur as a result of rounding error. "
"The out tensor will be resized to a tensor of shape (",
size,
",).");
}
result.resize_({size});
}
bool is_contiguous = result.is_contiguous();
Tensor r = !is_contiguous
? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
@@ -157,6 +101,48 @@
return result;
}

template <typename scalar_t, typename accscalar_t>
struct RangeFunctor {
scalar_t operator()(int64_t ind) const {
accscalar_t inc = xstep_ * static_cast<accscalar_t>(ind);
accscalar_t val = xstart_ + inc;
return static_cast<scalar_t>(val);
}
RangeFunctor(accscalar_t xstart, accscalar_t xstep)
: xstart_(xstart), xstep_(xstep) {}

private:
accscalar_t xstart_;
accscalar_t xstep_;
};

Tensor& range_kernel(
const Scalar& start,
const Scalar& end,
const Scalar& step,
Tensor& result) {
AT_DISPATCH_ALL_TYPES_AND(
at::ScalarType::Half, result.scalar_type(), "range_xpu", [&]() {
using accscalar_t = acc_type<scalar_t, true>;
auto xstart = start.to<accscalar_t>();
auto xstep = step.to<accscalar_t>();

bool is_contiguous = result.is_contiguous();
Tensor r = !is_contiguous
? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
: result;
auto f = RangeFunctor<scalar_t, accscalar_t>(xstart, xstep);

gpu_kernel_with_index(r, f);

if (!result.is_contiguous()) {
result.copy_(r);
}
});

return result;
}

} // namespace xpu
} // namespace native
} // namespace at
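
range_kernel fills the output by mapping each flat index ind to start + ind * step via RangeFunctor, launched through gpu_kernel_with_index; when result is not contiguous, the kernel writes into a contiguous temporary r and copies it back. A host-side analogue of that index-to-value mapping (a sketch only, not the SYCL launch path):

#include <cstdint>
#include <vector>

// Host-side analogue of RangeFunctor applied over indices 0..size-1.
// acc_t stands in for accscalar_t, scalar_t for the output element type.
template <typename scalar_t, typename acc_t>
std::vector<scalar_t> fill_range(acc_t xstart, acc_t xstep, int64_t size) {
  std::vector<scalar_t> out(static_cast<size_t>(size));
  for (int64_t ind = 0; ind < size; ++ind) {
    // Same arithmetic as RangeFunctor::operator(): inc = step * ind, val = start + inc.
    acc_t val = xstart + xstep * static_cast<acc_t>(ind);
    out[static_cast<size_t>(ind)] = static_cast<scalar_t>(val);
  }
  return out;
}

// Example: fill_range<float, float>(0.f, 2.5f, 5) yields {0, 2.5, 5, 7.5, 10}.
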
6 changes: 6 additions & 0 deletions src/ATen/native/xpu/sycl/RangeFactoriesKernel.h
@@ -12,6 +12,12 @@ Tensor& arange_kernel(
const Scalar& step,
Tensor& result);

Tensor& range_kernel(
const Scalar& start,
const Scalar& end,
const Scalar& step,
Tensor& result);

} // namespace xpu
} // namespace native
} // namespace at
1 change: 1 addition & 0 deletions yaml/xpu_functions.yaml
@@ -629,3 +629,4 @@ supported:
- ceil_
- ceil.out
- nan_to_num.out
- range.out
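
Adding range.out to the supported list registers the new XPUNativeFunctions::range_out implementation with the dispatcher, so aten::range.out calls on XPU tensors route to the kernel above. A hedged usage sketch in the ATen C++ API (assumes an XPU-enabled build; the exact generated overload at::range_out(out, start, end, step) is an assumption and may differ across ATen versions):

#include <ATen/ATen.h>

void range_out_example() {
  // Empty out tensor on the XPU device; range_out resizes it to the computed size.
  auto out = at::empty({0}, at::TensorOptions().dtype(at::kFloat).device(at::kXPU));
  // Hypothetical call, following the usual generated out-variant convention:
  at::range_out(out, /*start=*/0, /*end=*/10, /*step=*/2);
  // Expected contents: {0, 2, 4, 6, 8, 10} -> size = (10 - 0) / 2 + 1 = 6.
}
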