Oneflow-Inc · mergify · Apr 9, 2022 · Apr 2, 2022 · Apr 2, 2022 · Apr 2, 2022
diff --git a/oneflow/user/kernels/nd_index_slice_kernels.cpp b/oneflow/user/kernels/nd_index_slice_kernels.cpp
@@ -35,6 +35,15 @@ struct ScatterNdAddFunctor<DeviceType::kCPU, T, I> final {
   }
 };
 
+template<typename T, typename I>  // CPU specialization: forwards straight to DoScatterNdUpdate<kCPU>.
+struct ScatterNdUpdateFunctor<DeviceType::kCPU, T, I> final {
+  void operator()(ep::Stream* stream, const NdIndexSliceArgs<T, I>& args, const I* indices,
+                  const T* slices, T* dense) const {  // `stream` is not referenced in this body
+    DoScatterNdUpdate<DeviceType::kCPU>(args.num_slices * args.slice_size, args.slice_size,
+                                        args.index_ndims, args.dense_shape, indices, slices, dense);
+  }  // first argument above (num_slices * slice_size) is passed as DoScatterNdUpdate's elem_cnt
+};
+
 template<typename T, typename I>
 struct FillByNdIndexFunctor<DeviceType::kCPU, T, I> final {
   void operator()(ep::Stream* stream, const NdIndexSliceArgs<T, I>& args, const I* indices,

@@ -34,6 +34,13 @@ __global__ void CudaScatterNdAdd(NdIndexSliceArgs<T, I> args, const I* indices,
                                     args.index_ndims, args.dense_shape, indices, slices, dense);
 }
 
+template<typename T, typename I>  // __global__ entry point that forwards to DoScatterNdUpdate<kCUDA>.
+__global__ void CudaScatterNdUpdate(NdIndexSliceArgs<T, I> args, const I* indices, const T* slices,
+                                    T* dense) {  // note: `args` is taken by value, unlike the functors
+  DoScatterNdUpdate<DeviceType::kCUDA>(args.num_slices * args.slice_size, args.slice_size,
+                                       args.index_ndims, args.dense_shape, indices, slices, dense);
+}  // first argument above (num_slices * slice_size) is passed as DoScatterNdUpdate's elem_cnt
+
 template<typename T, typename I>
 __global__ void CudaFillByNdIndex(NdIndexSliceArgs<T, I> args, const I* indices, T* dense,
                                   T value) {
@@ -61,6 +68,15 @@ struct ScatterNdAddFunctor<DeviceType::kCUDA, T, I> final {
   }
 };
 
+template<typename T, typename I>  // CUDA specialization: hands CudaScatterNdUpdate to RUN_CUDA_KERNEL.
+struct ScatterNdUpdateFunctor<DeviceType::kCUDA, T, I> final {
+  void operator()(ep::Stream* stream, const NdIndexSliceArgs<T, I>& args, const I* indices,
+                  const T* slices, T* dense) const {
+    RUN_CUDA_KERNEL((CudaScatterNdUpdate<T, I>), stream, args.num_slices * args.slice_size, args,
+                    indices, slices, dense);  // NOTE(review): 3rd macro arg presumably sizes the launch -- confirm
+  }
+};
+
 template<typename T, typename I>
 struct FillByNdIndexFunctor<DeviceType::kCUDA, T, I> final {
   void operator()(ep::Stream* stream, const NdIndexSliceArgs<T, I>& args, const I* indices,

diff --git a/oneflow/user/kernels/nd_index_slice_kernels.h b/oneflow/user/kernels/nd_index_slice_kernels.h
@@ -103,10 +103,8 @@ void TensorScatterNdUpdateKernel<device_type, T, I>::Compute(
   Memcpy<device_type>(ctx->stream(), out->mut_dptr<T>(), params->dptr<T>(), out_bytes_size);
   if (indices->shape().elem_cnt() == 0) { return; }
   auto args = ConstructNdIndexSliceArgs<T, I>(*params, *updates, *indices);
-  FillByNdIndexFunctor<device_type, T, I>()(ctx->stream(), args, indices->dptr<I>(),
-                                            out->mut_dptr<T>(), static_cast<T>(0));
-  ScatterNdAddFunctor<device_type, T, I>()(ctx->stream(), args, indices->dptr<I>(),
-                                           updates->dptr<T>(), out->mut_dptr<T>());
+  ScatterNdUpdateFunctor<device_type, T, I>()(ctx->stream(), args, indices->dptr<I>(),
+                                              updates->dptr<T>(), out->mut_dptr<T>());
 }
 
 template<DeviceType device_type, typename T, typename I>

diff --git a/oneflow/user/kernels/nd_index_slice_util.h b/oneflow/user/kernels/nd_index_slice_util.h
@@ -55,6 +55,12 @@ struct ScatterNdAddFunctor final {
                   const T* slices, T* dense) const;
 };
 
+template<DeviceType device_type, typename T, typename I>  // Primary template for the scatter-update functor.
+struct ScatterNdUpdateFunctor final {
+  void operator()(ep::Stream* stream, const NdIndexSliceArgs<T, I>& args, const I* indices,
+                  const T* slices, T* dense) const;  // declared only; defined by per-device specializations
+};
+
 template<DeviceType device_type, typename T, typename I>
 struct FillByNdIndexFunctor final {
   void operator()(ep::Stream* stream, const NdIndexSliceArgs<T, I>& args, const I* indices,
@@ -101,6 +107,16 @@ OF_DEVICE_FUNC void DoScatterNdAdd(int64_t elem_cnt, int64_t slice_size, int64_t
   }
 }
 
+template<DeviceType device_type, typename T, typename I>  // Copies slices[i] into dense at a computed offset.
+OF_DEVICE_FUNC void DoScatterNdUpdate(int64_t elem_cnt, int64_t slice_size, int64_t index_ndims,
+                                      const int64_t* dense_shape, const I* indices, const T* slices,
+                                      T* dense) {
+  XPU_1D_KERNEL_LOOP(i, elem_cnt) {  // one iteration per element of `slices`
+    int64_t offset = OffsetInSliceToOffsetInDense(slice_size, index_ndims, dense_shape, indices, i);
+    dense[offset] = slices[i];  // plain overwrite, no accumulation. NOTE(review): if several i share an offset the surviving value depends on write order -- confirm callers accept that
+  }
+}
+
 template<typename T, typename I>
 OF_DEVICE_FUNC void DoFillByNdIndex(int64_t elem_cnt, int64_t slice_size, int64_t index_ndims,
                                     const int64_t* dense_shape, const I* indices, T* dense,

diff --git a/python/oneflow/test/tensor/test_tensor_part_1.py b/python/oneflow/test/tensor/test_tensor_part_1.py
@@ -483,6 +483,19 @@ def compare_setitem_with_numpy(tensor, slices, value):
         x = flow.Tensor(2, 3, 4)
         compare_setitem_with_numpy(x, se[1, :, 2], v)
 
+    @flow.unittest.skip_unless_1n1d()
+    @autotest(check_graph=True)
+    def test_setitem_with_random_data(test_case):  # index-assignment test; the assigned tensor is returned to @autotest
+        device = random_device()
+        x = random_tensor(low=0, high=0, ndim=1, dim0=16).to(device)  # low == high == 0: presumably all zeros -- confirm
+        y = random_tensor(low=-2, high=2, ndim=1, dim0=16).to(device)
+        idx = random_tensor(
+            low=0, high=15, ndim=1, dim0=20, dtype=int, requires_grad=False  # 20 indices for 16-element tensors, so repeats are expected
+        ).to(device)
+        z = y[idx]  # gathered from y, so repeated indices carry identical values
+        x[idx] = z  # the assignment under test; repeated indices write the same value, keeping the result order-independent
+        return x
+
     @flow.unittest.skip_unless_1n1d()
     def test_div(test_case):
         x = flow.Tensor(np.random.randn(1, 1))