From 0815e6230f59c2db5324dacb1477bcd6562a6fa0 Mon Sep 17 00:00:00 2001
From: chunhuanMeng
Date: Tue, 9 Jul 2024 09:00:25 +0000
Subject: [PATCH 1/7] enable op aten::range

---
 src/ATen/native/xpu/RangeFactories.cpp        |  8 +++
 .../native/xpu/sycl/RangeFactoriesKernel.cpp  | 60 +++++++++++++++++++
 .../native/xpu/sycl/RangeFactoriesKernel.h    |  6 ++
 yaml/xpu_functions.yaml                       |  1 +
 4 files changed, 75 insertions(+)

diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index 9373d9e55..728098755 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -17,4 +17,12 @@ Tensor& XPUNativeFunctions::arange_out(
   return at::native::xpu::arange_kernel(start, end, step, out);
 }
 
+Tensor& XPUNativeFunctions::range_out(
+    const Scalar& start,
+    const Scalar& end,
+    const Scalar& step,
+    Tensor& out) {
+  return at::native::xpu::range_kernel(start, end, step, out);
+}
+
 } // namespace at

diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
index 0cecbdb36..04efc21a6 100644
--- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
+++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
@@ -157,6 +157,66 @@ Tensor& arange_kernel(
   return result;
 }
 
+template <typename scalar_t, typename accscalar_t>
+struct RangeFunctor {
+  scalar_t operator()(int64_t ind) const {
+    accscalar_t inc = xstep_ * static_cast<accscalar_t>(ind);
+    accscalar_t val = xstart_ + inc;
+    return static_cast<scalar_t>(val);
+  }
+  RangeFunctor(accscalar_t xstart, accscalar_t xstep)
+      : xstart_(xstart), xstep_(xstep) {}
+
+ private:
+  accscalar_t xstart_;
+  accscalar_t xstep_;
+};
+
+Tensor& range_kernel(
+    const Scalar& start,
+    const Scalar& end,
+    const Scalar& step,
+    Tensor& result) {
+  printf("in range kernel\n");
+  AT_DISPATCH_ALL_TYPES_AND(
+      at::ScalarType::Half, result.scalar_type(), "range_xpu", [&]() {
+        using accscalar_t = acc_type<scalar_t, true>;
+        auto xstart = start.to<accscalar_t>();
+        auto xend = end.to<accscalar_t>();
+        auto xstep = step.to<accscalar_t>();
+
+        TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+        TORCH_CHECK(
+            std::isfinite(static_cast<double>(xstart)) &&
+                std::isfinite(static_cast<double>(xend)),
+            "unsupported range: ",
+            xstart,
+            " -> ",
+            xend);
+        TORCH_CHECK(
+            ((xstep > 0) && (xend >= xstart)) ||
+                ((xstep < 0) && (xend <= xstart)),
+            "upper bound and larger bound inconsistent with step sign");
+        int64_t size = static_cast<int64_t>(((xend - xstart) / xstep) + 1);
+        if (result.numel() != size) {
+          result.resize_({size});
+        }
+        bool is_contiguous = result.is_contiguous();
+        Tensor r = !is_contiguous
+            ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
+            : result;
+        auto f = RangeFunctor<scalar_t, accscalar_t>(xstart, xstep);
+
+        gpu_kernel_with_index(r, f);
+
+        if (!result.is_contiguous()) {
+          result.copy_(r);
+        }
+      });
+
+  return result;
+}
+
 } // namespace xpu
 } // namespace native
 } // namespace at

diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h
index 26ca6197d..3cf08ca5d 100644
--- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h
+++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.h
@@ -12,6 +12,12 @@ Tensor& arange_kernel(
     const Scalar& step,
     Tensor& result);
 
+Tensor& range_kernel(
+    const Scalar& start,
+    const Scalar& end,
+    const Scalar& step,
+    Tensor& result);
+
 } // namespace xpu
 } // namespace native
 } // namespace at

diff --git a/yaml/xpu_functions.yaml b/yaml/xpu_functions.yaml
index 2ecc6790b..757e0d5bf 100644
--- a/yaml/xpu_functions.yaml
+++ b/yaml/xpu_functions.yaml
@@ -511,3 +511,4 @@ supported:
   - ceil
   - ceil_
   - ceil.out
+  - range.out

From 02294f35a055ff68c41e1a3d573272d2be56f7ac Mon Sep 17 00:00:00 2001
From: chunhuanMeng
Date: Wed, 10 Jul 2024 01:50:08 +0000
Subject: [PATCH 2/7] delete printf log

---
 src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
index 04efc21a6..4d54a6daf 100644
--- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
+++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
@@ -177,7 +177,6 @@ Tensor& range_kernel(
     const Scalar& end,
     const Scalar& step,
     Tensor& result) {
-  printf("in range kernel\n");
   AT_DISPATCH_ALL_TYPES_AND(
       at::ScalarType::Half, result.scalar_type(), "range_xpu", [&]() {
         using accscalar_t = acc_type<scalar_t, true>;

From 44f09c1b3707af6b79dd1f19ff04b4eff3f09294 Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Sat, 20 Jul 2024 21:36:18 +0800
Subject: [PATCH 3/7] Move host only code to operator level

---
 src/ATen/native/xpu/RangeFactories.cpp        | 76 +++++++++++++++++++
 .../native/xpu/sycl/RangeFactoriesKernel.cpp  | 71 -----------------
 2 files changed, 76 insertions(+), 71 deletions(-)

diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index 728098755..2797db6e2 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -14,6 +14,63 @@ Tensor& XPUNativeFunctions::arange_out(
     const Scalar& end,
     const Scalar& step,
     Tensor& out) {
+  auto xstart = start.to<double>();
+  auto xend = end.to<double>();
+  auto xstep = step.to<double>();
+
+  TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+  TORCH_CHECK(
+      std::isfinite(xstart) && std::isfinite(xend),
+      "unsupported range: ",
+      xstart,
+      " -> ",
+      xend);
+  TORCH_CHECK(
+      ((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
+      "upper bound and larger bound inconsistent with step sign");
+
+  // we use double precision for (start - end) / step
+  // to compute size_d for consistency across devices.
+  // The problem with using accscalar_t is that accscalar_t might be
+  // float32 on gpu for a float32 scalar_t, but double on cpu for the
+  // same, and the effective output size starts differing on CPU vs GPU
+  // because of precision issues, which we dont want. the corner-case we
+  // do want to take into account is int64_t, which has higher precision
+  // than double
+  double size_d;
+  if constexpr (std::is_same_v<scalar_t, int64_t>) {
+    int64_t sgn = (xstep > 0) - (xstep < 0);
+    size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
+  } else {
+    size_d = std::ceil(
+        static_cast<double>(end.to<double>() - start.to<double>()) /
+        step.to<double>());
+  }
+
+  TORCH_CHECK(
+      size_d >= 0 &&
+          size_d <= static_cast<double>(std::numeric_limits<int64_t>::max()),
+      "invalid size, possible overflow?");
+  int64_t size = static_cast<int64_t>(size_d);
+  int64_t numel = out.numel();
+
+  if (numel != size) {
+    if (numel > 0) {
+      TORCH_WARN(
+          "The number of elements in the out tensor of shape ",
+          out.sizes(),
+          " is ",
+          numel,
+          " which does not match the computed number of elements ",
+          size,
+          ". Note that this may occur as a result of rounding error. "
+          "The out tensor will be resized to a tensor of shape (",
+          size,
+          ",).");
+    }
+    out.resize_({size});
+  }
+
   return at::native::xpu::arange_kernel(start, end, step, out);
 }
 
@@ -22,6 +79,25 @@ Tensor& XPUNativeFunctions::range_out(
     const Scalar& end,
     const Scalar& step,
     Tensor& out) {
+  auto xstart = start.to<double>();
+  auto xend = end.to<double>();
+  auto xstep = step.to<double>();
+
+  TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+  TORCH_CHECK(
+      std::isfinite(xstart) && std::isfinite(xend),
+      "unsupported range: ",
+      xstart,
+      " -> ",
+      xend);
+  TORCH_CHECK(
+      ((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
+      "upper bound and larger bound inconsistent with step sign");
+  int64_t size = static_cast<int64_t>(((xend - xstart) / xstep) + 1);
+  if (out.numel() != size) {
+    out.resize_({size});
+  }
+
   return at::native::xpu::range_kernel(start, end, step, out);
 }
 
diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
index 4d54a6daf..f48daa1f2 100644
--- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
+++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
@@ -86,61 +86,6 @@ Tensor& arange_kernel(
         auto xend = end.to<accscalar_t>();
         auto xstep = step.to<accscalar_t>();
 
-        TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
-        TORCH_CHECK(
-            std::isfinite(static_cast<double>(xstart)) &&
-                std::isfinite(static_cast<double>(xend)),
-            "unsupported range: ",
-            xstart,
-            " -> ",
-            xend);
-        TORCH_CHECK(
-            ((xstep > 0) && (xend >= xstart)) ||
-                ((xstep < 0) && (xend <= xstart)),
-            "upper bound and larger bound inconsistent with step sign");
-
-        // we use double precision for (start - end) / step
-        // to compute size_d for consistency across devices.
-        // The problem with using accscalar_t is that accscalar_t might be
-        // float32 on gpu for a float32 scalar_t, but double on cpu for the
-        // same, and the effective output size starts differing on CPU vs GPU
-        // because of precision issues, which we dont want. the corner-case we
-        // do want to take into account is int64_t, which has higher precision
-        // than double
-        double size_d;
-        if constexpr (std::is_same_v<scalar_t, int64_t>) {
-          int64_t sgn = (xstep > 0) - (xstep < 0);
-          size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
-        } else {
-          size_d = std::ceil(
-              static_cast<double>(end.to<accscalar_t>() - start.to<accscalar_t>()) /
-              step.to<accscalar_t>());
-        }
-
-        TORCH_CHECK(
-            size_d >= 0 &&
-                size_d <=
-                    static_cast<double>(std::numeric_limits<int64_t>::max()),
-            "invalid size, possible overflow?");
-        int64_t size = static_cast<int64_t>(size_d);
-        int64_t numel = result.numel();
-
-        if (numel != size) {
-          if (numel > 0) {
-            TORCH_WARN(
-                "The number of elements in the out tensor of shape ",
-                result.sizes(),
-                " is ",
-                numel,
-                " which does not match the computed number of elements ",
-                size,
-                ". Note that this may occur as a result of rounding error. "
-                "The out tensor will be resized to a tensor of shape (",
-                size,
-                ",).");
-          }
-          result.resize_({size});
-        }
         bool is_contiguous = result.is_contiguous();
         Tensor r = !is_contiguous
             ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
             : result;
@@ -184,22 +129,6 @@ Tensor& range_kernel(
         auto xend = end.to<accscalar_t>();
         auto xstep = step.to<accscalar_t>();
 
-        TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
-        TORCH_CHECK(
-            std::isfinite(static_cast<double>(xstart)) &&
-                std::isfinite(static_cast<double>(xend)),
-            "unsupported range: ",
-            xstart,
-            " -> ",
-            xend);
-        TORCH_CHECK(
-            ((xstep > 0) && (xend >= xstart)) ||
-                ((xstep < 0) && (xend <= xstart)),
-            "upper bound and larger bound inconsistent with step sign");
-        int64_t size = static_cast<int64_t>(((xend - xstart) / xstep) + 1);
-        if (result.numel() != size) {
-          result.resize_({size});
-        }
         bool is_contiguous = result.is_contiguous();
         Tensor r = !is_contiguous
             ? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
             : result;

From d1b9b83d0cc29a68acee806f718bdd5e895c01ef Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Sat, 20 Jul 2024 21:53:42 +0800
Subject: [PATCH 4/7] Fixing compilation error

---
 src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
index f48daa1f2..e62c21ed9 100644
--- a/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
+++ b/src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
@@ -83,7 +83,6 @@ Tensor& arange_kernel(
       [&]() {
         using accscalar_t = at::acc_type<scalar_t, true>;
         auto xstart = start.to<accscalar_t>();
-        auto xend = end.to<accscalar_t>();
         auto xstep = step.to<accscalar_t>();
 
         bool is_contiguous = result.is_contiguous();
@@ -126,7 +125,6 @@ Tensor& range_kernel(
       at::ScalarType::Half, result.scalar_type(), "range_xpu", [&]() {
         using accscalar_t = acc_type<scalar_t, true>;
         auto xstart = start.to<accscalar_t>();
-        auto xend = end.to<accscalar_t>();
         auto xstep = step.to<accscalar_t>();
 
         bool is_contiguous = result.is_contiguous();

From 1d241954dda1a5696c164bdf38ec68c096529ea1 Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Sat, 20 Jul 2024 22:23:44 +0800
Subject: [PATCH 5/7] Fixing compilation error

---
 src/ATen/native/xpu/RangeFactories.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index 2797db6e2..2b5f0445e 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -38,7 +38,7 @@ Tensor& XPUNativeFunctions::arange_out(
   // do want to take into account is int64_t, which has higher precision
   // than double
   double size_d;
-  if constexpr (std::is_same_v<scalar_t, int64_t>) {
+  if (out.scalar_type() == at::ScalarType::Long) {
     int64_t sgn = (xstep > 0) - (xstep < 0);
     size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
   } else {

From d4d1160ded00a293e352e6284f49d76b6b7432b5 Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Sun, 21 Jul 2024 17:17:34 +0800
Subject: [PATCH 6/7] Fix ut

---
 src/ATen/native/xpu/RangeFactories.cpp | 116 ++++++++++++++-----------
 1 file changed, 64 insertions(+), 52 deletions(-)

diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index 2b5f0445e..9b0d58e8a 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -1,4 +1,6 @@
 #include
+#include
+#include
 #include
 #include
 #include
@@ -14,62 +16,72 @@ Tensor& XPUNativeFunctions::arange_out(
     const Scalar& end,
     const Scalar& step,
     Tensor& out) {
-  auto xstart = start.to<double>();
-  auto xend = end.to<double>();
-  auto xstep = step.to<double>();
+  AT_DISPATCH_ALL_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      result.scalar_type(),
+      "arange_xpu_preprocess",
+      [&]() {
+        using accscalar_t = at::acc_type<scalar_t, true>;
+        auto xstart = start.to<accscalar_t>();
+        auto xend = end.to<accscalar_t>();
+        auto xstep = step.to<accscalar_t>();
 
-  TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
-  TORCH_CHECK(
-      std::isfinite(xstart) && std::isfinite(xend),
-      "unsupported range: ",
-      xstart,
-      " -> ",
-      xend);
-  TORCH_CHECK(
-      ((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
-      "upper bound and larger bound inconsistent with step sign");
+        TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
+        TORCH_CHECK(
+            std::isfinite(xstart) && std::isfinite(xend),
+            "unsupported range: ",
+            xstart,
+            " -> ",
+            xend);
+        TORCH_CHECK(
+            ((xstep > 0) && (xend >= xstart)) ||
+                ((xstep < 0) && (xend <= xstart)),
+            "upper bound and larger bound inconsistent with step sign");
 
-  // we use double precision for (start - end) / step
-  // to compute size_d for consistency across devices.
-  // The problem with using accscalar_t is that accscalar_t might be
-  // float32 on gpu for a float32 scalar_t, but double on cpu for the
-  // same, and the effective output size starts differing on CPU vs GPU
-  // because of precision issues, which we dont want. the corner-case we
-  // do want to take into account is int64_t, which has higher precision
-  // than double
-  double size_d;
-  if (out.scalar_type() == at::ScalarType::Long) {
-    int64_t sgn = (xstep > 0) - (xstep < 0);
-    size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
-  } else {
-    size_d = std::ceil(
-        static_cast<double>(end.to<double>() - start.to<double>()) /
-        step.to<double>());
-  }
+        // we use double precision for (start - end) / step
+        // to compute size_d for consistency across devices.
+        // The problem with using accscalar_t is that accscalar_t might be
+        // float32 on gpu for a float32 scalar_t, but double on cpu for the
+        // same, and the effective output size starts differing on CPU vs GPU
+        // because of precision issues, which we dont want. the corner-case we
+        // do want to take into account is int64_t, which has higher precision
+        // than double
+        double size_d;
+        if constexpr (std::is_same_v<scalar_t, int64_t>) {
+          int64_t sgn = (xstep > 0) - (xstep < 0);
+          size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
+        } else {
+          size_d = std::ceil(
+              static_cast<double>(end.to<accscalar_t>() - start.to<accscalar_t>()) /
+              step.to<accscalar_t>());
+        }
 
-  TORCH_CHECK(
-      size_d >= 0 &&
-          size_d <= static_cast<double>(std::numeric_limits<int64_t>::max()),
-      "invalid size, possible overflow?");
-  int64_t size = static_cast<int64_t>(size_d);
-  int64_t numel = out.numel();
+        TORCH_CHECK(
+            size_d >= 0 &&
+                size_d <=
+                    static_cast<double>(std::numeric_limits<int64_t>::max()),
+            "invalid size, possible overflow?");
+        int64_t size = static_cast<int64_t>(size_d);
+        int64_t numel = out.numel();
 
-  if (numel != size) {
-    if (numel > 0) {
-      TORCH_WARN(
-          "The number of elements in the out tensor of shape ",
-          out.sizes(),
-          " is ",
-          numel,
-          " which does not match the computed number of elements ",
-          size,
-          ". Note that this may occur as a result of rounding error. "
-          "The out tensor will be resized to a tensor of shape (",
-          size,
-          ",).");
-    }
-    out.resize_({size});
-  }
+        if (numel != size) {
+          if (numel > 0) {
+            TORCH_WARN(
+                "The number of elements in the out tensor of shape ",
+                out.sizes(),
+                " is ",
+                numel,
+                " which does not match the computed number of elements ",
+                size,
+                ". Note that this may occur as a result of rounding error. "
+                "The out tensor will be resized to a tensor of shape (",
+                size,
+                ",).");
+          }
+          out.resize_({size});
+        }
+      });
 
   return at::native::xpu::arange_kernel(start, end, step, out);
 }

From 0e0b1706606885aadfddd297514b73f5476d51ed Mon Sep 17 00:00:00 2001
From: Feng Yuan
Date: Sun, 21 Jul 2024 17:32:39 +0800
Subject: [PATCH 7/7] Compilation

---
 src/ATen/native/xpu/RangeFactories.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index 9b0d58e8a..60190863b 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -19,7 +19,7 @@ Tensor& XPUNativeFunctions::arange_out(
   AT_DISPATCH_ALL_TYPES_AND2(
       at::ScalarType::Half,
       at::ScalarType::BFloat16,
-      result.scalar_type(),
+      out.scalar_type(),
       "arange_xpu_preprocess",
      [&]() {
         using accscalar_t = at::acc_type<scalar_t, true>;
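
A minimal usage sketch, not part of the patch series above: it only illustrates how the newly registered range.out could be exercised from Python once the patches are applied. It assumes a PyTorch build with this XPU backend available (so the "xpu" device string resolves) and uses only public torch APIs; the specific start/end/step values are arbitrary.

    import torch

    # torch.range is inclusive of `end`; the XPU result should match the
    # CPU reference computed by the existing CPU implementation.
    cpu_ref = torch.range(1, 10, 0.5)
    xpu_res = torch.range(1, 10, 0.5, device="xpu")
    torch.testing.assert_close(xpu_res.cpu(), cpu_ref)

    # The out= overload goes through XPUNativeFunctions::range_out directly,
    # including the resize of `out` to the computed number of elements.
    out = torch.empty(0, device="xpu")
    torch.range(1, 10, 0.5, out=out)
    torch.testing.assert_close(out.cpu(), cpu_ref)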