From 0815e6230f59c2db5324dacb1477bcd6562a6fa0 Mon Sep 17 00:00:00 2001
From: chunhuanMeng
Date: Tue, 9 Jul 2024 09:00:25 +0000
Subject: [PATCH 1/7] enable op aten::range

---
 src/ATen/native/xpu/RangeFactories.cpp        |  8 +++
 .../native/xpu/sycl/RangeFactoriesKernel.cpp  | 60 +++++++++++++++++++
 .../native/xpu/sycl/RangeFactoriesKernel.h    |  6 ++
 yaml/xpu_functions.yaml                       |  1 +
 4 files changed, 75 insertions(+)

diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index 9373d9e55..728098755 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -17,4 +17,12 @@ Tensor& XPUNativeFunctions::arange_out(
   return at::native::xpu::arange_kernel(start, end, step, out);
 }
 
+Tensor& XPUNativeFunctions::range_out(
+    const Scalar& start,
+    const Scalar& end,
+    const Scalar& step,
+    Tensor& out) {
+  return at::native::xpu::range_kernel(start, end, step, out);
+}
+
 } // namespace at

diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
index 0cecbdb36..04efc21a6 100644
--- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
+++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
@@ -157,6 +157,66 @@ Tensor& arange_kernel(
   return result;
 }
 
+template <typename scalar_t, typename accscalar_t>
+struct RangeFunctor {
+  scalar_t operator()(int64_t ind) const {
+    accscalar_t inc = xstep_ * static_cast<accscalar_t>(ind);
+    accscalar_t val = xstart_ + inc;
+    return static_cast<scalar_t>(val);
+  }
+  RangeFunctor(accscalar_t xstart, accscalar_t xstep)
+      : xstart_(xstart), xstep_(xstep) {}
+
+ private:
+  accscalar_t xstart_;
+  accscalar_t xstep_;
+};
+
+Tensor& range_kernel(
+    const Scalar& start,
+    const Scalar& end,
+    const Scalar& step,
+    Tensor& result) {
+  printf("in range kernel\n");
+  AT_DISPATCH_ALL_TYPES_AND(
+      at::ScalarType::Half, result.scalar_type(), "range_xpu", [&]() {
+        using accscalar_t = acc_type<scalar_t, true>;
+        auto xstart = start.to<accscalar_t>();
+        auto xend = end.to<accscalar_t>();
+        auto xstep = step.to<accscalar_t>();
+
+        TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+        TORCH_CHECK(
+            std::isfinite(static_cast<double>(xstart)) &&
+                std::isfinite(static_cast<double>(xend)),
+            "unsupported range: ",
+            xstart,
+            " -> ",
+            xend);
+        TORCH_CHECK(
+            ((xstep > 0) && (xend >= xstart)) ||
+                ((xstep < 0) && (xend <= xstart)),
+            "upper bound and larger bound inconsistent with step sign");
+        int64_t size = static_cast<int64_t>(((xend - xstart) / xstep) + 1);
+        if (result.numel() != size) {
+          result.resize_({size});
+        }
+        bool is_contiguous = result.is_contiguous();
+        Tensor r = !is_contiguous
+            ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
+            : result;
+        auto f = RangeFunctor<scalar_t, accscalar_t>(xstart, xstep);
+
+        gpu_kernel_with_index(r, f);
+
+        if (!result.is_contiguous()) {
+          result.copy_(r);
+        }
+      });
+
+  return result;
+}
+
 } // namespace xpu
 } // namespace native
 } // namespace at

diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h
index 26ca6197d..3cf08ca5d 100644
--- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h
+++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h
@@ -12,6 +12,12 @@ Tensor& arange_kernel(
     const Scalar& step,
     Tensor& result);
 
+Tensor& range_kernel(
+    const Scalar& start,
+    const Scalar& end,
+    const Scalar& step,
+    Tensor& result);
+
 } // namespace xpu
 } // namespace native
 } // namespace at

diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
index 2ecc6790b..757e0d5bf 100644
--- a/yaml/xpu_functions.yaml
+++ b/yaml/xpu_functions.yaml
@@ -511,3 +511,4 @@ supported:
   - ceil
   - ceil_
   - ceil.out
+  - range.out

From 02294f35a055ff68c41e1a3d573272d2be56f7ac Mon Sep 17 00:00:00 2001
From: chunhuanMeng
Date: Wed, 10 Jul 2024 01:50:08 +0000
Subject: [PATCH 2/7] delete printf log

---
 src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
index 04efc21a6..4d54a6daf 100644
--- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
+++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
@@ -177,7 +177,6 @@ Tensor& range_kernel(
     const Scalar& end,
     const Scalar& step,
     Tensor& result) {
-  printf("in range kernel\n");
   AT_DISPATCH_ALL_TYPES_AND(
       at::ScalarType::Half, result.scalar_type(), "range_xpu", [&]() {
         using accscalar_t = acc_type<scalar_t, true>;

From 44f09c1b3707af6b79dd1f19ff04b4eff3f09294 Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Sat, 20 Jul 2024 21:36:18 +0800
Subject: [PATCH 3/7] Move host only code to operator level

---
 src/ATen/native/xpu/RangeFactories.cpp        | 76 +++++++++++++++++++
 .../native/xpu/sycl/RangeFactoriesKernel.cpp  | 71 -----------------
 2 files changed, 76 insertions(+), 71 deletions(-)

diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index 728098755..2797db6e2 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -14,6 +14,63 @@ Tensor& XPUNativeFunctions::arange_out(
     const Scalar& end,
     const Scalar& step,
     Tensor& out) {
+  auto xstart = start.to<double>();
+  auto xend = end.to<double>();
+  auto xstep = step.to<double>();
+
+  TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+  TORCH_CHECK(
+      std::isfinite(xstart) && std::isfinite(xend),
+      "unsupported range: ",
+      xstart,
+      " -> ",
+      xend);
+  TORCH_CHECK(
+      ((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
+      "upper bound and larger bound inconsistent with step sign");
+
+  // we use double precision for (start - end) / step
+  // to compute size_d for consistency across devices.
+  // The problem with using accscalar_t is that accscalar_t might be
+  // float32 on gpu for a float32 scalar_t, but double on cpu for the
+  // same, and the effective output size starts differing on CPU vs GPU
+  // because of precision issues, which we dont want. the corner-case we
+  // do want to take into account is int64_t, which has higher precision
+  // than double
+  double size_d;
+  if constexpr (std::is_same_v<scalar_t, int64_t>) {
+    int64_t sgn = (xstep > 0) - (xstep < 0);
+    size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
+  } else {
+    size_d = std::ceil(
+        static_cast<double>(end.to<double>() - start.to<double>()) /
+        step.to<double>());
+  }
+
+  TORCH_CHECK(
+      size_d >= 0 &&
+          size_d <= static_cast<double>(std::numeric_limits<int64_t>::max()),
+      "invalid size, possible overflow?");
+  int64_t size = static_cast<int64_t>(size_d);
+  int64_t numel = out.numel();
+
+  if (numel != size) {
+    if (numel > 0) {
+      TORCH_WARN(
+          "The number of elements in the out tensor of shape ",
+          out.sizes(),
+          " is ",
+          numel,
+          " which does not match the computed number of elements ",
+          size,
+          ". Note that this may occur as a result of rounding error. "
+          "The out tensor will be resized to a tensor of shape (",
+          size,
+          ",).");
+    }
+    out.resize_({size});
+  }
+
   return at::native::xpu::arange_kernel(start, end, step, out);
 }
 
@@ -22,6 +79,25 @@ Tensor& XPUNativeFunctions::range_out(
     const Scalar& end,
     const Scalar& step,
     Tensor& out) {
+  auto xstart = start.to<double>();
+  auto xend = end.to<double>();
+  auto xstep = step.to<double>();
+
+  TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+  TORCH_CHECK(
+      std::isfinite(xstart) && std::isfinite(xend),
+      "unsupported range: ",
+      xstart,
+      " -> ",
+      xend);
+  TORCH_CHECK(
+      ((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
+      "upper bound and larger bound inconsistent with step sign");
+  int64_t size = static_cast<int64_t>(((xend - xstart) / xstep) + 1);
+  if (out.numel() != size) {
+    out.resize_({size});
+  }
+
   return at::native::xpu::range_kernel(start, end, step, out);
 }
 
diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
index 4d54a6daf..f48daa1f2 100644
--- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
+++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
@@ -86,61 +86,6 @@ Tensor& arange_kernel(
         auto xend = end.to<accscalar_t>();
         auto xstep = step.to<accscalar_t>();
 
-        TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
-        TORCH_CHECK(
-            std::isfinite(static_cast<double>(xstart)) &&
-                std::isfinite(static_cast<double>(xend)),
-            "unsupported range: ",
-            xstart,
-            " -> ",
-            xend);
-        TORCH_CHECK(
-            ((xstep > 0) && (xend >= xstart)) ||
-                ((xstep < 0) && (xend <= xstart)),
-            "upper bound and larger bound inconsistent with step sign");
-
-        // we use double precision for (start - end) / step
-        // to compute size_d for consistency across devices.
-        // The problem with using accscalar_t is that accscalar_t might be
-        // float32 on gpu for a float32 scalar_t, but double on cpu for the
-        // same, and the effective output size starts differing on CPU vs GPU
-        // because of precision issues, which we dont want. the corner-case we
-        // do want to take into account is int64_t, which has higher precision
-        // than double
-        double size_d;
-        if constexpr (std::is_same_v<scalar_t, int64_t>) {
-          int64_t sgn = (xstep > 0) - (xstep < 0);
-          size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
-        } else {
-          size_d = std::ceil(
-              static_cast<double>(end.to<accscalar_t>() - start.to<accscalar_t>()) /
-              step.to<accscalar_t>());
-        }
-
-        TORCH_CHECK(
-            size_d >= 0 &&
-                size_d <=
-                    static_cast<double>(std::numeric_limits<int64_t>::max()),
-            "invalid size, possible overflow?");
-        int64_t size = static_cast<int64_t>(size_d);
-        int64_t numel = result.numel();
-
-        if (numel != size) {
-          if (numel > 0) {
-            TORCH_WARN(
-                "The number of elements in the out tensor of shape ",
-                result.sizes(),
-                " is ",
-                numel,
-                " which does not match the computed number of elements ",
-                size,
-                ". Note that this may occur as a result of rounding error. "
-                "The out tensor will be resized to a tensor of shape (",
-                size,
-                ",).");
-          }
-          result.resize_({size});
-        }
         bool is_contiguous = result.is_contiguous();
         Tensor r = !is_contiguous
             ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
             : result;
@@ -184,22 +129,6 @@ Tensor& range_kernel(
         auto xend = end.to<accscalar_t>();
         auto xstep = step.to<accscalar_t>();
 
-        TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
-        TORCH_CHECK(
-            std::isfinite(static_cast<double>(xstart)) &&
-                std::isfinite(static_cast<double>(xend)),
-            "unsupported range: ",
-            xstart,
-            " -> ",
-            xend);
-        TORCH_CHECK(
-            ((xstep > 0) && (xend >= xstart)) ||
-                ((xstep < 0) && (xend <= xstart)),
-            "upper bound and larger bound inconsistent with step sign");
-        int64_t size = static_cast<int64_t>(((xend - xstart) / xstep) + 1);
-        if (result.numel() != size) {
-          result.resize_({size});
-        }
         bool is_contiguous = result.is_contiguous();
         Tensor r = !is_contiguous
             ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
             : result;

From d1b9b83d0cc29a68acee806f718bdd5e895c01ef Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Sat, 20 Jul 2024 21:53:42 +0800
Subject: [PATCH 4/7] Fixing compilation error

---
 src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
index f48daa1f2..e62c21ed9 100644
--- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
+++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
@@ -83,7 +83,6 @@ Tensor& arange_kernel(
       [&]() {
         using accscalar_t = at::acc_type<scalar_t, true>;
         auto xstart = start.to<accscalar_t>();
-        auto xend = end.to<accscalar_t>();
         auto xstep = step.to<accscalar_t>();
 
         bool is_contiguous = result.is_contiguous();
@@ -126,7 +125,6 @@ Tensor& range_kernel(
       at::ScalarType::Half, result.scalar_type(), "range_xpu", [&]() {
         using accscalar_t = acc_type<scalar_t, true>;
         auto xstart = start.to<accscalar_t>();
-        auto xend = end.to<accscalar_t>();
         auto xstep = step.to<accscalar_t>();
 
         bool is_contiguous = result.is_contiguous();

From 1d241954dda1a5696c164bdf38ec68c096529ea1 Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Sat, 20 Jul 2024 22:23:44 +0800
Subject: [PATCH 5/7] Fixing compilation error

---
 src/ATen/native/xpu/RangeFactories.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index 2797db6e2..2b5f0445e 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -38,7 +38,7 @@ Tensor& XPUNativeFunctions::arange_out(
   // do want to take into account is int64_t, which has higher precision
   // than double
   double size_d;
-  if constexpr (std::is_same_v<scalar_t, int64_t>) {
+  if (out.scalar_type() == at::ScalarType::Long) {
     int64_t sgn = (xstep > 0) - (xstep < 0);
     size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
   } else {

From d4d1160ded00a293e352e6284f49d76b6b7432b5 Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Sun, 21 Jul 2024 17:17:34 +0800
Subject: [PATCH 6/7] Fix ut

---
 src/ATen/native/xpu/RangeFactories.cpp | 116 ++++++++++++++-----------
 1 file changed, 64 insertions(+), 52 deletions(-)

diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index 2b5f0445e..9b0d58e8a 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -1,4 +1,6 @@
 #include
+#include
+#include
 #include
 #include
 #include
@@ -14,62 +16,72 @@ Tensor& XPUNativeFunctions::arange_out(
     const Scalar& end,
     const Scalar& step,
     Tensor& out) {
-  auto xstart = start.to<double>();
-  auto xend = end.to<double>();
-  auto xstep = step.to<double>();
+  AT_DISPATCH_ALL_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      result.scalar_type(),
+      "arange_xpu_preprocess",
+      [&]() {
+        using accscalar_t = at::acc_type<scalar_t, true>;
+        auto xstart = start.to<accscalar_t>();
+        auto xend = end.to<accscalar_t>();
+        auto xstep = step.to<accscalar_t>();
 
-  TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
-  TORCH_CHECK(
-      std::isfinite(xstart) && std::isfinite(xend),
-      "unsupported range: ",
-      xstart,
-      " -> ",
-      xend);
-  TORCH_CHECK(
-      ((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
-      "upper bound and larger bound inconsistent with step sign");
+        TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+        TORCH_CHECK(
+            std::isfinite(xstart) && std::isfinite(xend),
+            "unsupported range: ",
+            xstart,
+            " -> ",
+            xend);
+        TORCH_CHECK(
+            ((xstep > 0) && (xend >= xstart)) ||
+                ((xstep < 0) && (xend <= xstart)),
+            "upper bound and larger bound inconsistent with step sign");
 
-  // we use double precision for (start - end) / step
-  // to compute size_d for consistency across devices.
-  // The problem with using accscalar_t is that accscalar_t might be
-  // float32 on gpu for a float32 scalar_t, but double on cpu for the
-  // same, and the effective output size starts differing on CPU vs GPU
-  // because of precision issues, which we dont want. the corner-case we
-  // do want to take into account is int64_t, which has higher precision
-  // than double
-  double size_d;
-  if (out.scalar_type() == at::ScalarType::Long) {
-    int64_t sgn = (xstep > 0) - (xstep < 0);
-    size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
-  } else {
-    size_d = std::ceil(
-        static_cast<double>(end.to<double>() - start.to<double>()) /
-        step.to<double>());
-  }
+        // we use double precision for (start - end) / step
+        // to compute size_d for consistency across devices.
+        // The problem with using accscalar_t is that accscalar_t might be
+        // float32 on gpu for a float32 scalar_t, but double on cpu for the
+        // same, and the effective output size starts differing on CPU vs GPU
+        // because of precision issues, which we dont want. the corner-case we
+        // do want to take into account is int64_t, which has higher precision
+        // than double
+        double size_d;
+        if constexpr (std::is_same_v<scalar_t, int64_t>) {
+          int64_t sgn = (xstep > 0) - (xstep < 0);
+          size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
+        } else {
+          size_d = std::ceil(
+              static_cast<double>(end.to<accscalar_t>() - start.to<accscalar_t>()) /
+              step.to<accscalar_t>());
+        }
 
-  TORCH_CHECK(
-      size_d >= 0 &&
-          size_d <= static_cast<double>(std::numeric_limits<int64_t>::max()),
-      "invalid size, possible overflow?");
-  int64_t size = static_cast<int64_t>(size_d);
-  int64_t numel = out.numel();
+        TORCH_CHECK(
+            size_d >= 0 &&
+                size_d <=
+                    static_cast<double>(std::numeric_limits<int64_t>::max()),
+            "invalid size, possible overflow?");
+        int64_t size = static_cast<int64_t>(size_d);
+        int64_t numel = out.numel();
 
-  if (numel != size) {
-    if (numel > 0) {
-      TORCH_WARN(
-          "The number of elements in the out tensor of shape ",
-          out.sizes(),
-          " is ",
-          numel,
-          " which does not match the computed number of elements ",
-          size,
-          ". Note that this may occur as a result of rounding error. "
-          "The out tensor will be resized to a tensor of shape (",
-          size,
-          ",).");
-    }
-    out.resize_({size});
-  }
+        if (numel != size) {
+          if (numel > 0) {
+            TORCH_WARN(
+                "The number of elements in the out tensor of shape ",
+                out.sizes(),
+                " is ",
+                numel,
+                " which does not match the computed number of elements ",
+                size,
+                ". Note that this may occur as a result of rounding error. "
+                "The out tensor will be resized to a tensor of shape (",
+                size,
+                ",).");
+          }
+          out.resize_({size});
+        }
+      });
 
   return at::native::xpu::arange_kernel(start, end, step, out);
 }

From 0e0b1706606885aadfddd297514b73f5476d51ed Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Sun, 21 Jul 2024 17:32:39 +0800
Subject: [PATCH 7/7] Compilation

---
 src/ATen/native/xpu/RangeFactories.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index 9b0d58e8a..60190863b 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -19,7 +19,7 @@ Tensor& XPUNativeFunctions::arange_out(
   AT_DISPATCH_ALL_TYPES_AND2(
       at::ScalarType::Half,
       at::ScalarType::BFloat16,
-      result.scalar_type(),
+      out.scalar_type(),
       "arange_xpu_preprocess",
      [&]() {
         using accscalar_t = at::acc_type<scalar_t, true>;
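
A minimal usage sketch, not part of the patch series above: it only illustrates how the newly registered range.out could be exercised from Python once the patches are applied. It assumes a PyTorch build with this XPU backend available (so the "xpu" device string resolves) and uses only public torch APIs; the specific start/end/step values are arbitrary.

    import torch

    # torch.range is inclusive of `end`; the XPU result should match the
    # CPU reference computed by the existing CPU implementation.
    cpu_ref = torch.range(1, 10, 0.5)
    xpu_res = torch.range(1, 10, 0.5, device="xpu")
    torch.testing.assert_close(xpu_res.cpu(), cpu_ref)

    # The out= overload goes through XPUNativeFunctions::range_out directly,
    # including the resize of `out` to the computed number of elements.
    out = torch.empty(0, device="xpu")
    torch.range(1, 10, 0.5, out=out)
    torch.testing.assert_close(out.cpu(), cpu_ref)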