diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index 9373d9e55..60190863b 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -1,4 +1,6 @@
 #include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/Dispatch.h>
 #include <ATen/XPUNativeFunctions.h>
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/xpu/sycl/RangeFactoriesKernel.h>
@@ -14,7 +16,101 @@ Tensor& XPUNativeFunctions::arange_out(
     const Scalar& end,
     const Scalar& step,
     Tensor& out) {
+  AT_DISPATCH_ALL_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      out.scalar_type(),
+      "arange_xpu_preprocess",
+      [&]() {
+        using accscalar_t = at::acc_type<scalar_t, true>;
+        auto xstart = start.to<accscalar_t>();
+        auto xend = end.to<accscalar_t>();
+        auto xstep = step.to<accscalar_t>();
+
+        TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+        TORCH_CHECK(
+            std::isfinite(xstart) && std::isfinite(xend),
+            "unsupported range: ",
+            xstart,
+            " -> ",
+            xend);
+        TORCH_CHECK(
+            ((xstep > 0) && (xend >= xstart)) ||
+                ((xstep < 0) && (xend <= xstart)),
+            "upper bound and larger bound inconsistent with step sign");
+
+        // we use double precision for (start - end) / step
+        // to compute size_d for consistency across devices.
+        // The problem with using accscalar_t is that accscalar_t might be
+        // float32 on gpu for a float32 scalar_t, but double on cpu for the
+        // same, and the effective output size starts differing on CPU vs GPU
+        // because of precision issues, which we dont want. the corner-case we
+        // do want to take into account is int64_t, which has higher precision
+        // than double
+        double size_d;
+        if constexpr (std::is_same_v<scalar_t, int64_t>) {
+          int64_t sgn = (xstep > 0) - (xstep < 0);
+          size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
+        } else {
+          size_d = std::ceil(
+              static_cast<double>(end.to<double>() - start.to<double>()) /
+              step.to<double>());
+        }
+
+        TORCH_CHECK(
+            size_d >= 0 &&
+                size_d <=
+                    static_cast<double>(std::numeric_limits<int64_t>::max()),
+            "invalid size, possible overflow?");
+        int64_t size = static_cast<int64_t>(size_d);
+        int64_t numel = out.numel();
+
+        if (numel != size) {
+          if (numel > 0) {
+            TORCH_WARN(
+                "The number of elements in the out tensor of shape ",
+                out.sizes(),
+                " is ",
+                numel,
+                " which does not match the computed number of elements ",
+                size,
+                ". Note that this may occur as a result of rounding error. "
" + "The out tensor will be resized to a tensor of shape (", + size, + ",)."); + } + out.resize_({size}); + } + }); + return at::native::xpu::arange_kernel(start, end, step, out); } +Tensor& XPUNativeFunctions::range_out( + const Scalar& start, + const Scalar& end, + const Scalar& step, + Tensor& out) { + auto xstart = start.to(); + auto xend = end.to(); + auto xstep = step.to(); + + TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); + TORCH_CHECK( + std::isfinite(xstart) && std::isfinite(xend), + "unsupported range: ", + xstart, + " -> ", + xend); + TORCH_CHECK( + ((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)), + "upper bound and larger bound inconsistent with step sign"); + int64_t size = static_cast(((xend - xstart) / xstep) + 1); + if (out.numel() != size) { + out.resize_({size}); + } + + return at::native::xpu::range_kernel(start, end, step, out); +} + } // namespace at diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp index 0cecbdb36..e62c21ed9 100644 --- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp +++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp @@ -83,64 +83,8 @@ Tensor& arange_kernel( [&]() { using accscalar_t = at::acc_type; auto xstart = start.to(); - auto xend = end.to(); auto xstep = step.to(); - TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero"); - TORCH_CHECK( - std::isfinite(static_cast(xstart)) && - std::isfinite(static_cast(xend)), - "unsupported range: ", - xstart, - " -> ", - xend); - TORCH_CHECK( - ((xstep > 0) && (xend >= xstart)) || - ((xstep < 0) && (xend <= xstart)), - "upper bound and larger bound inconsistent with step sign"); - - // we use double precision for (start - end) / step - // to compute size_d for consistency across devices. - // The problem with using accscalar_t is that accscalar_t might be - // float32 on gpu for a float32 scalar_t, but double on cpu for the - // same, and the effective output size starts differing on CPU vs GPU - // because of precision issues, which we dont want. the corner-case we - // do want to take into account is int64_t, which has higher precision - // than double - double size_d; - if constexpr (std::is_same_v) { - int64_t sgn = (xstep > 0) - (xstep < 0); - size_d = std::ceil((xend - xstart + xstep - sgn) / xstep); - } else { - size_d = std::ceil( - static_cast(end.to() - start.to()) / - step.to()); - } - - TORCH_CHECK( - size_d >= 0 && - size_d <= - static_cast(std::numeric_limits::max()), - "invalid size, possible overflow?"); - int64_t size = static_cast(size_d); - int64_t numel = result.numel(); - - if (numel != size) { - if (numel > 0) { - TORCH_WARN( - "The number of elements in the out tensor of shape ", - result.sizes(), - " is ", - numel, - " which does not match the computed number of elements ", - size, - ". Note that this may occur as a result of rounding error. " - "The out tensor will be resized to a tensor of shape (", - size, - ",)."); - } - result.resize_({size}); - } bool is_contiguous = result.is_contiguous(); Tensor r = !is_contiguous ? 
             ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
@@ -157,6 +101,48 @@ Tensor& arange_kernel(
   return result;
 }
 
+template <typename scalar_t, typename accscalar_t>
+struct RangeFunctor {
+  scalar_t operator()(int64_t ind) const {
+    accscalar_t inc = xstep_ * static_cast<accscalar_t>(ind);
+    accscalar_t val = xstart_ + inc;
+    return static_cast<scalar_t>(val);
+  }
+  RangeFunctor(accscalar_t xstart, accscalar_t xstep)
+      : xstart_(xstart), xstep_(xstep) {}
+
+ private:
+  accscalar_t xstart_;
+  accscalar_t xstep_;
+};
+
+Tensor& range_kernel(
+    const Scalar& start,
+    const Scalar& end,
+    const Scalar& step,
+    Tensor& result) {
+  AT_DISPATCH_ALL_TYPES_AND(
+      at::ScalarType::Half, result.scalar_type(), "range_xpu", [&]() {
+        using accscalar_t = acc_type<scalar_t, true>;
+        auto xstart = start.to<accscalar_t>();
+        auto xstep = step.to<accscalar_t>();
+
+        bool is_contiguous = result.is_contiguous();
+        Tensor r = !is_contiguous
+            ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
+            : result;
+        auto f = RangeFunctor<scalar_t, accscalar_t>(xstart, xstep);
+
+        gpu_kernel_with_index(r, f);
+
+        if (!result.is_contiguous()) {
+          result.copy_(r);
+        }
+      });
+
+  return result;
+}
+
 } // namespace xpu
 } // namespace native
 } // namespace at
diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h
index 26ca6197d..3cf08ca5d 100644
--- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h
+++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h
@@ -12,6 +12,12 @@ Tensor& arange_kernel(
     const Scalar& step,
     Tensor& result);
 
+Tensor& range_kernel(
+    const Scalar& start,
+    const Scalar& end,
+    const Scalar& step,
+    Tensor& result);
+
 } // namespace xpu
 } // namespace native
 } // namespace at
diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
index 15d371e52..3d50c09b6 100644
--- a/yaml/xpu_functions.yaml
+++ b/yaml/xpu_functions.yaml
@@ -629,3 +629,4 @@ supported:
   - ceil_
   - ceil.out
   - nan_to_num.out
+  - range.out
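
For reviewers skimming the size logic in the patch: `arange_out` excludes the endpoint and rounds the element count up in double precision (with a separate exact branch for `int64_t`), while the new `range_out` includes the endpoint and simply adds one before truncating. The standalone sketch below is not part of the patch; the helper names are hypothetical and only the floating-point case is shown.

```cpp
// Sketch of the two size formulas from the diff (floating-point branch only;
// arange_out's int64_t branch uses the sign-corrected ceil instead).
#include <cmath>
#include <cstdint>
#include <iostream>

// arange: endpoint excluded, count rounded up in double precision.
int64_t arange_size(double start, double end, double step) {
  return static_cast<int64_t>(std::ceil((end - start) / step));
}

// range: endpoint included, hence the "+ 1" before truncation.
int64_t range_size(double start, double end, double step) {
  return static_cast<int64_t>(((end - start) / step) + 1);
}

int main() {
  std::cout << arange_size(0, 10, 3) << "\n"; // 4 -> {0, 3, 6, 9}
  std::cout << arange_size(0, 9, 3) << "\n";  // 3 -> {0, 3, 6}
  std::cout << range_size(0, 9, 3) << "\n";   // 4 -> {0, 3, 6, 9}, endpoint kept
  return 0;
}
```

As in the patch itself, only the arange path warns before resizing a non-empty `out`; `range_out` resizes silently.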
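
On the kernel side, `RangeFunctor` computes each output element purely from its flat index, which is what lets `gpu_kernel_with_index` fill the tensor with no inter-element dependency. The serial host-side stand-in below is only an illustration (it assumes the launcher applies the functor once per flat output index; the actual SYCL plumbing lives in `gpu_kernel_with_index`).

```cpp
// Host-side illustration of the index -> value mapping used by RangeFunctor.
// The loop stands in for the device launch; it is not the real SYCL kernel.
#include <cstdint>
#include <vector>

template <typename scalar_t, typename accscalar_t>
struct RangeFunctor {
  scalar_t operator()(int64_t ind) const {
    // Accumulate in accscalar_t (e.g. float for Half) before narrowing back.
    return static_cast<scalar_t>(xstart_ + xstep_ * static_cast<accscalar_t>(ind));
  }
  RangeFunctor(accscalar_t xstart, accscalar_t xstep)
      : xstart_(xstart), xstep_(xstep) {}

 private:
  accscalar_t xstart_;
  accscalar_t xstep_;
};

// Hypothetical serial fill, one independent functor evaluation per index.
template <typename scalar_t, typename Functor>
void fill_with_index(std::vector<scalar_t>& out, const Functor& f) {
  for (int64_t i = 0; i < static_cast<int64_t>(out.size()); ++i) {
    out[i] = f(i);
  }
}

int main() {
  std::vector<float> out(4);
  fill_with_index(out, RangeFunctor<float, float>(0.f, 3.f)); // {0, 3, 6, 9}
  return 0;
}
```

The contiguity handling around the launch writes into a contiguous scratch tensor and copies back afterwards, presumably because an index-based fill assumes a dense layout.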