Add aten::range #553

Merged · 12 commits · Jul 21, 2024
96 changes: 96 additions & 0 deletions src/ATen/native/xpu/RangeFactories.cpp
@@ -1,4 +1,6 @@
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/ScalarOps.h>
#include <ATen/core/Tensor.h>
@@ -14,7 +16,101 @@ Tensor& XPUNativeFunctions::arange_out(
const Scalar& end,
const Scalar& step,
Tensor& out) {
AT_DISPATCH_ALL_TYPES_AND2(
at::ScalarType::Half,
at::ScalarType::BFloat16,
out.scalar_type(),
"arange_xpu_preprocess",
[&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
auto xstart = start.to<accscalar_t>();
auto xend = end.to<accscalar_t>();
auto xstep = step.to<accscalar_t>();

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(xstart) && std::isfinite(xend),
"unsupported range: ",
xstart,
" -> ",
xend);
TORCH_CHECK(
((xstep > 0) && (xend >= xstart)) ||
((xstep < 0) && (xend <= xstart)),
"upper bound and larger bound inconsistent with step sign");

// we use double precision for (start - end) / step
// to compute size_d for consistency across devices.
// The problem with using accscalar_t is that accscalar_t might be
// float32 on gpu for a float32 scalar_t, but double on cpu for the
// same, and the effective output size starts differing on CPU vs GPU
// because of precision issues, which we dont want. the corner-case we
// do want to take into account is int64_t, which has higher precision
// than double
double size_d;
if constexpr (std::is_same_v<scalar_t, int64_t>) {
int64_t sgn = (xstep > 0) - (xstep < 0);
size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
} else {
size_d = std::ceil(
static_cast<double>(end.to<double>() - start.to<double>()) /
step.to<double>());
}

TORCH_CHECK(
size_d >= 0 &&
size_d <=
static_cast<double>(std::numeric_limits<int64_t>::max()),
"invalid size, possible overflow?");
int64_t size = static_cast<int64_t>(size_d);
int64_t numel = out.numel();

if (numel != size) {
if (numel > 0) {
TORCH_WARN(
"The number of elements in the out tensor of shape ",
out.sizes(),
" is ",
numel,
" which does not match the computed number of elements ",
size,
". Note that this may occur as a result of rounding error. "
"The out tensor will be resized to a tensor of shape (",
size,
",).");
}
out.resize_({size});
}
});

return at::native::xpu::arange_kernel(start, end, step, out);
}

Tensor& XPUNativeFunctions::range_out(
const Scalar& start,
const Scalar& end,
const Scalar& step,
Tensor& out) {
auto xstart = start.to<double>();
auto xend = end.to<double>();
auto xstep = step.to<double>();

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(xstart) && std::isfinite(xend),
"unsupported range: ",
xstart,
" -> ",
xend);
TORCH_CHECK(
((xstep > 0) && (xend >= xstart)) || ((xstep < 0) && (xend <= xstart)),
"upper bound and larger bound inconsistent with step sign");
int64_t size = static_cast<int64_t>(((xend - xstart) / xstep) + 1);
if (out.numel() != size) {
out.resize_({size});
}

return at::native::xpu::range_kernel(start, end, step, out);
}

} // namespace at
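
The two out variants above size the output differently: arange_out treats the interval as half-open, [start, end), and uses ceil((end - start) / step) (with the double-precision and int64_t handling shown in the dispatch body), while range_out treats it as closed, [start, end], and uses trunc((end - start) / step) + 1. A minimal host-side sketch of just these two size rules, independent of the ATen dispatch machinery (the int64_t corner case and the overflow/resize checks are omitted here):

#include <cmath>
#include <cstdint>
#include <iostream>

// Size of arange(start, end, step): half-open interval [start, end).
int64_t arange_size(double start, double end, double step) {
  return static_cast<int64_t>(std::ceil((end - start) / step));
}

// Size of range(start, end, step): closed interval [start, end].
// Truncation equals floor here because the checks above guarantee
// (end - start) / step >= 0.
int64_t range_size(double start, double end, double step) {
  return static_cast<int64_t>((end - start) / step) + 1;
}

int main() {
  std::cout << arange_size(0, 10, 3) << '\n'; // 4 -> {0, 3, 6, 9}
  std::cout << range_size(0, 10, 3) << '\n';  // 4 -> {0, 3, 6, 9}
  std::cout << arange_size(0, 9, 3) << '\n';  // 3 -> {0, 3, 6}    (9 excluded)
  std::cout << range_size(0, 9, 3) << '\n';   // 4 -> {0, 3, 6, 9} (9 included)
  return 0;
}
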
98 changes: 42 additions & 56 deletions src/ATen/native/xpu/sycl/RangeFactoriesKernel.cpp
@@ -83,64 +83,8 @@ Tensor& arange_kernel(
[&]() {
using accscalar_t = at::acc_type<scalar_t, true>;
auto xstart = start.to<accscalar_t>();
auto xend = end.to<accscalar_t>();
auto xstep = step.to<accscalar_t>();

TORCH_CHECK(xstep > 0 || xstep < 0, "step must be nonzero");
TORCH_CHECK(
std::isfinite(static_cast<double>(xstart)) &&
std::isfinite(static_cast<double>(xend)),
"unsupported range: ",
xstart,
" -> ",
xend);
TORCH_CHECK(
((xstep > 0) && (xend >= xstart)) ||
((xstep < 0) && (xend <= xstart)),
"upper bound and larger bound inconsistent with step sign");

// we use double precision for (start - end) / step
// to compute size_d for consistency across devices.
// The problem with using accscalar_t is that accscalar_t might be
// float32 on gpu for a float32 scalar_t, but double on cpu for the
// same, and the effective output size starts differing on CPU vs GPU
// because of precision issues, which we dont want. the corner-case we
// do want to take into account is int64_t, which has higher precision
// than double
double size_d;
if constexpr (std::is_same_v<scalar_t, int64_t>) {
int64_t sgn = (xstep > 0) - (xstep < 0);
size_d = std::ceil((xend - xstart + xstep - sgn) / xstep);
} else {
size_d = std::ceil(
static_cast<double>(end.to<double>() - start.to<double>()) /
step.to<double>());
}

TORCH_CHECK(
size_d >= 0 &&
size_d <=
static_cast<double>(std::numeric_limits<int64_t>::max()),
"invalid size, possible overflow?");
int64_t size = static_cast<int64_t>(size_d);
int64_t numel = result.numel();

if (numel != size) {
if (numel > 0) {
TORCH_WARN(
"The number of elements in the out tensor of shape ",
result.sizes(),
" is ",
numel,
" which does not match the computed number of elements ",
size,
". Note that this may occur as a result of rounding error. "
"The out tensor will be resized to a tensor of shape (",
size,
",).");
}
result.resize_({size});
}
bool is_contiguous = result.is_contiguous();
Tensor r = !is_contiguous
? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
@@ -157,6 +101,48 @@
return result;
}

template <typename scalar_t, typename accscalar_t>
struct RangeFunctor {
scalar_t operator()(int64_t ind) const {
accscalar_t inc = xstep_ * static_cast<accscalar_t>(ind);
accscalar_t val = xstart_ + inc;
return static_cast<scalar_t>(val);
}
RangeFunctor(accscalar_t xstart, accscalar_t xstep)
: xstart_(xstart), xstep_(xstep) {}

private:
accscalar_t xstart_;
accscalar_t xstep_;
};

Tensor& range_kernel(
const Scalar& start,
const Scalar& end,
const Scalar& step,
Tensor& result) {
AT_DISPATCH_ALL_TYPES_AND(
at::ScalarType::Half, result.scalar_type(), "range_xpu", [&]() {
using accscalar_t = acc_type<scalar_t, true>;
auto xstart = start.to<accscalar_t>();
auto xstep = step.to<accscalar_t>();

bool is_contiguous = result.is_contiguous();
Tensor r = !is_contiguous
? at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT)
: result;
auto f = RangeFunctor<scalar_t, accscalar_t>(xstart, xstep);

gpu_kernel_with_index(r, f);

if (!result.is_contiguous()) {
result.copy_(r);
}
});

return result;
}

} // namespace xpu
} // namespace native
} // namespace at
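
range_kernel fills the output by mapping each flat index ind to start + ind * step via RangeFunctor, launched through gpu_kernel_with_index; when result is not contiguous, the kernel writes into a contiguous temporary r and copies it back. A host-side analogue of that index-to-value mapping (a sketch only, not the SYCL launch path):

#include <cstdint>
#include <vector>

// Host-side analogue of RangeFunctor applied over indices 0..size-1.
// acc_t stands in for accscalar_t, scalar_t for the output element type.
template <typename scalar_t, typename acc_t>
std::vector<scalar_t> fill_range(acc_t xstart, acc_t xstep, int64_t size) {
  std::vector<scalar_t> out(static_cast<size_t>(size));
  for (int64_t ind = 0; ind < size; ++ind) {
    // Same arithmetic as RangeFunctor::operator(): inc = step * ind, val = start + inc.
    acc_t val = xstart + xstep * static_cast<acc_t>(ind);
    out[static_cast<size_t>(ind)] = static_cast<scalar_t>(val);
  }
  return out;
}

// Example: fill_range<float, float>(0.f, 2.5f, 5) yields {0, 2.5, 5, 7.5, 10}.
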
6 changes: 6 additions & 0 deletions src/ATen/native/xpu/sycl/RangeFactoriesKernel.h
@@ -12,6 +12,12 @@ Tensor& arange_kernel(
const Scalar& step,
Tensor& result);

Tensor& range_kernel(
const Scalar& start,
const Scalar& end,
const Scalar& step,
Tensor& result);

} // namespace xpu
} // namespace native
} // namespace at
1 change: 1 addition & 0 deletions yaml/xpu_functions.yaml
@@ -629,3 +629,4 @@ supported:
- ceil_
- ceil.out
- nan_to_num.out
- range.out
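
Adding range.out to the supported list registers the new XPUNativeFunctions::range_out implementation with the dispatcher, so aten::range.out calls on XPU tensors route to the kernel above. A hedged usage sketch in the ATen C++ API (assumes an XPU-enabled build; the exact generated overload at::range_out(out, start, end, step) is an assumption and may differ across ATen versions):

#include <ATen/ATen.h>

void range_out_example() {
  // Empty out tensor on the XPU device; range_out resizes it to the computed size.
  auto out = at::empty({0}, at::TensorOptions().dtype(at::kFloat).device(at::kXPU));
  // Hypothetical call, following the usual generated out-variant convention:
  at::range_out(out, /*start=*/0, /*end=*/10, /*step=*/2);
  // Expected contents: {0, 2, 4, 6, 8, 10} -> size = (10 - 0) / 2 + 1 = 6.
}
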