Oneflow-Inc · oneflow-ci-bot · Jan 1, 2022 · Dec 31, 2021 · Dec 31, 2021 · Dec 31, 2021
@@ -389,6 +389,10 @@
   signature: "Tensor (Tensor x, Tensor dy) => SquareGrad"
   bind_python: False
 
+- name: "sqrt_square_sum"
+  signature: "Tensor (Tensor x) => SqrtSquareSum"
+  bind_python: True
+
 - name: "std"
   signature: "Tensor (Tensor x, Int32List[1] dim=None, Bool unbiased=None, Bool keepdim=None) => StandardDeviation"
   bind_python: True

@@ -810,6 +810,19 @@ class ClampFunctor {
   std::shared_ptr<OpExpr> clip_max_op_;
 };
 
+class SqrtSquareSumFunctor {
+ public:
+  SqrtSquareSumFunctor() {
+    op_ = CHECK_JUST(one::OpBuilder("sqrt_square_sum").Input("x").Output("y").Build());
+  }
+  Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x) const {
+    return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, {});
+  }
+
+ private:
+  std::shared_ptr<OpExpr> op_;
+};
+
 class VectorNormFunctor {
  public:
   VectorNormFunctor() {}
@@ -834,6 +847,7 @@ class VectorNormFunctor {
       }
       dtype_val = x->dtype();
     }
+    bool full_dim_flag = true;
     std::vector<int32_t> dim;
     if (!input_dim.has_value()) {
       std::vector<int32_t> reduce_axis(x->shape()->NumAxes());
@@ -848,7 +862,9 @@ class VectorNormFunctor {
         } else {
           dim.emplace_back(dim_check[i] + x->shape()->NumAxes());
         }
+        if (dim[i] != i) { full_dim_flag = false; }
       }
+      if ((int)dim.size() < x->shape()->NumAxes()) { full_dim_flag = false; }
     }
     if (ord.IsIntegral() || ord.IsFloatingPoint()) {
       double ord_val = JUST(ord.As<double>());
@@ -859,6 +875,9 @@ class VectorNormFunctor {
         res = JUST(ReduceMax(JUST(Abs(x)), dim, keepdim));
       } else if (ord_val == -INFINITY) {
         res = JUST(ReduceMin(JUST(Abs(x)), dim, keepdim));
+      } else if (ord_val == 2.0 && keepdim == false && full_dim_flag
+                 && x->requires_grad() == false) {
+        res = JUST(SqrtSquareSum(x));
       } else {
         res =
             JUST(ScalarPow(JUST(ReduceSum(JUST(ScalarPow(JUST(Abs(x)), ord, false)), dim, keepdim)),
@@ -1738,6 +1757,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
   m.add_functor<ConsistentArangeFunctor, ConsistentArange2Functor>("ConsistentArange");
   m.add_functor<CastFunctor>("Cast");
   m.add_functor<ClampFunctor>("Clamp");
+  m.add_functor<SqrtSquareSumFunctor>("SqrtSquareSum");
   m.add_functor<VectorNormFunctor, ScalarVectorNormFunctor>("VectorNorm");
   m.add_functor<ScalarMatrixNormFunctor, MatrixNormFunctor>("MatrixNorm");
   m.add_functor<NormFunctor, Norm2Functor>("Norm");

@@ -8245,6 +8245,19 @@ def OneFlow_SquareSumOp : OneFlow_BaseOp<"square_sum", [NoSideEffect, DeclareOpI
   let has_data_type_infer_fn = 1;
 }
 
+def OneFlow_SqrtSquareSumOp : OneFlow_BaseOp<"sqrt_square_sum", [NoSideEffect, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
+  let input = (ins
+    OneFlow_Tensor:$x
+  );
+  let output = (outs
+    OneFlow_Tensor:$y
+  );
+  let has_logical_tensor_desc_infer_fn = 1;
+  let has_physical_tensor_desc_infer_fn = 1;
+  let has_get_sbp_fn = 1;
+  let has_data_type_infer_fn = 1;
+}
+
 def OneFlow_SqueezeOp : OneFlow_BaseOp<"squeeze", [NoSideEffect, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
   let input = (ins
     OneFlow_Tensor:$in

diff --git a/oneflow/user/kernels/sqrt_square_sum_kernel.cpp b/oneflow/user/kernels/sqrt_square_sum_kernel.cpp
@@ -0,0 +1,69 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <cstdint>
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/user/kernels/sqrt_square_sum_kernel_util.h"
+#include "oneflow/core/common/balanced_splitter.h"
+#include "oneflow/core/kernel/cuda_graph_support.h"
+
+namespace oneflow {
+
+namespace user_op {
+
+int64_t getThreadNumBlocks(int64_t n) {
+  int64_t num_blocks = 1;
+#ifdef WITH_CUDA
+  num_blocks = BlocksNum4ThreadsNum(n);
+#endif
+  return num_blocks;
+}
+
+template<DeviceType device_type, typename T>
+class SqrtSquareSumKernel final : public user_op::OpKernel, public user_op::CudaGraphSupport {
+ public:
+  SqrtSquareSumKernel() = default;
+  ~SqrtSquareSumKernel() override = default;
+
+ private:
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    const user_op::Tensor* x = ctx->Tensor4ArgNameAndIndex("x", 0);
+    user_op::Tensor* y = ctx->Tensor4ArgNameAndIndex("y", 0);
+    user_op::Tensor* tmp = ctx->Tensor4ArgNameAndIndex("tmp_buffer", 0);
+
+    SqrtSquareSumKernelUtil<device_type, T>::SqrtSquareSum(
+        ctx->stream(), x->shape().elem_cnt(), x->dptr<T>(), y->mut_dptr<T>(), tmp->mut_dptr<T>());
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+#define REGISTER_SQUARE_SUM_KERNEL(device, dtype)                                     \
+  REGISTER_USER_KERNEL("sqrt_square_sum")                                             \
+      .SetCreateFn<SqrtSquareSumKernel<device, OF_PP_PAIR_FIRST(dtype)>>()            \
+      .SetIsMatchedHob((user_op::HobDeviceType() == device)                           \
+                       && (user_op::HobDataType("y", 0) == OF_PP_PAIR_SECOND(dtype))) \
+      .SetInferTmpSizeFn([](user_op::InferContext* ctx) -> size_t {                   \
+        const auto& x_shape = ctx->InputTensorDesc("x", 0).shape();                   \
+        const int32_t num_blocks = getThreadNumBlocks(x_shape.Count(0));              \
+        int64_t tmp_buffer_size = num_blocks;                                         \
+        return tmp_buffer_size * sizeof(OF_PP_PAIR_FIRST(dtype));                     \
+      });
+
+OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(REGISTER_SQUARE_SUM_KERNEL, DEVICE_TYPE_SEQ,
+                                 FLOATING_DATA_TYPE_SEQ)
+
+}  // namespace user_op
+
+}  // namespace oneflow
diff --git a/oneflow/user/kernels/sqrt_square_sum_kernel_util.cpp b/oneflow/user/kernels/sqrt_square_sum_kernel_util.cpp
@@ -0,0 +1,34 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "oneflow/user/kernels/sqrt_square_sum_kernel_util.h"
+
+namespace oneflow {
+
+template<typename T>
+struct SqrtSquareSumKernelUtil<DeviceType::kCPU, T> {
+  static void SqrtSquareSum(ep::Stream* stream, int64_t n, const T* x, T* y, T* tmp) {
+    T sum = 0;
+    FOR_RANGE(int64_t, i, 0, n) { sum += x[i] * x[i]; }
+    *y = std::sqrt(sum);
+  }
+};
+
+#define INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CPU(type_cpp, type_proto) \
+  template struct SqrtSquareSumKernelUtil<DeviceType::kCPU, type_cpp>;
+OF_PP_FOR_EACH_TUPLE(INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CPU, FLOATING_DATA_TYPE_SEQ);
+#undef INSTANTIATE_SQUARE_SUM_KERNEL_UTIL_CPU
+
+}  // namespace oneflow
@@ -0,0 +1,82 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "oneflow/user/kernels/sqrt_square_sum_kernel_util.h"
+#include "oneflow/core/cuda/atomic.cuh"
+#include "oneflow/core/ep/cuda/cuda_stream.h"
+#include <cub/cub.cuh>
+
+namespace oneflow {
+
+namespace {
+
+template<typename T>
+__global__ void SqrtSquareSumForOneThreadBlock(int64_t n, const T* x, T* y) {
+  T t_sum = 0;
+  CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; }
+  typedef cub::BlockReduce<T, kCudaThreadsNumPerBlock> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  T b_sum = BlockReduce(temp_storage).Sum(t_sum);
+  if (threadIdx.x == 0) { *y = sqrt(b_sum); }
+}
+
+template<typename T>
+__global__ void SqrtSumForMultiThreadBlock(int64_t n, const T* x, T* y) {
+  T t_sum = 0;
+  CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i]; }
+  typedef cub::BlockReduce<T, kCudaThreadsNumPerBlock> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  T b_sum = BlockReduce(temp_storage).Sum(t_sum);
+  if (threadIdx.x == 0) { *y = sqrt(b_sum); }
+}
+
+template<typename T>
+__global__ void SquareSumForMultiThreadBlock(int64_t n, const T* x, T* tmp) {
+  T t_sum = 0;
+  CUDA_1D_KERNEL_LOOP(i, n) { t_sum += x[i] * x[i]; }
+  typedef cub::BlockReduce<T, kCudaThreadsNumPerBlock> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  T b_sum = BlockReduce(temp_storage).Sum(t_sum);
+  if (threadIdx.x == 0) { tmp[blockIdx.x] = b_sum; }
+}
+
+}  // namespace
+
+template<typename T>
+struct SqrtSquareSumKernelUtil<DeviceType::kCUDA, T> {
+  static void SqrtSquareSum(ep::Stream* stream, int64_t n, const T* x, T* y, T* tmp) {
+    const int32_t num_blocks = BlocksNum4ThreadsNum(n);
+    CHECK_GE(num_blocks, 0);
+    if (num_blocks == 1) {
+      SqrtSquareSumForOneThreadBlock<T>
+          <<<1, kCudaThreadsNumPerBlock, 0, stream->As<ep::CudaStream>()->cuda_stream()>>>(n, x, y);
+    } else {
+      Memset<DeviceType::kCUDA>(stream, y, 0, sizeof(T));
+      SquareSumForMultiThreadBlock<T>
+          <<<num_blocks, kCudaThreadsNumPerBlock, 0, stream->As<ep::CudaStream>()->cuda_stream()>>>(
+              n, x, tmp);
+      SqrtSumForMultiThreadBlock<T>
+          <<<1, kCudaThreadsNumPerBlock, 0, stream->As<ep::CudaStream>()->cuda_stream()>>>(
+              num_blocks, tmp, y);
+    }
+  }
+};
+
+#define INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA(type_cpp, type_proto) \
+  template struct SqrtSquareSumKernelUtil<DeviceType::kCUDA, type_cpp>;
+OF_PP_FOR_EACH_TUPLE(INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA, FLOATING_DATA_TYPE_SEQ);
+#undef INSTANTIATE_SQRT_SQUARE_SUM_KERNEL_UTIL_CUDA
+
+}  // namespace oneflow
diff --git a/oneflow/user/kernels/sqrt_square_sum_kernel_util.h b/oneflow/user/kernels/sqrt_square_sum_kernel_util.h
@@ -0,0 +1,30 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#ifndef ONEFLOW_USER_KERNELS_SQUARE_SUM_KERNEL_UTIL_H_
+#define ONEFLOW_USER_KERNELS_SQUARE_SUM_KERNEL_UTIL_H_
+
+#include "oneflow/core/kernel/kernel_util.h"
+
+namespace oneflow {
+
+template<DeviceType device_type, typename T>
+struct SqrtSquareSumKernelUtil {
+  static void SqrtSquareSum(ep::Stream* stream, int64_t n, const T* x, T* y, T* tmp);
+};
+
+}  // namespace oneflow
+
+#endif  // ONEFLOW_USER_KERNELS_SQUARE_SUM_KERNEL_UTIL_H_
diff --git a/oneflow/user/ops/sqrt_square_sum_op.cpp b/oneflow/user/ops/sqrt_square_sum_op.cpp
@@ -0,0 +1,41 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/framework/op_generated.h"
+
+namespace oneflow {
+
+/*static*/ Maybe<void> SqrtSquareSumOp::GetSbp(user_op::SbpContext* ctx) {
+  const int64_t num_x_axes = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0).shape().NumAxes();
+  FOR_RANGE(int64_t, i, 0, num_x_axes) {
+    ctx->NewBuilder().Split(user_op::OpArg("x", 0), i).PartialSum(user_op::OpArg("y", 0)).Build();
+  }
+  return Maybe<void>::Ok();
+}
+/*static*/ Maybe<void> SqrtSquareSumOp::InferLogicalTensorDesc(user_op::InferContext* ctx) {
+  user_op::TensorDesc* y = ctx->OutputTensorDesc("y", 0);
+  *y->mut_shape() = Shape({1});
+  return Maybe<void>::Ok();
+}
+/*static*/ Maybe<void> SqrtSquareSumOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) {
+  return InferLogicalTensorDesc(ctx);
+}
+/*static*/ Maybe<void> SqrtSquareSumOp::InferDataType(user_op::InferContext* ctx) {
+  *ctx->OutputDType("y", 0) = ctx->InputDType("x", 0);
+  return Maybe<void>::Ok();
+}
+
+}  // namespace oneflow
diff --git a/python/oneflow/nn/utils/clip_grad.py b/python/oneflow/nn/utils/clip_grad.py
@@ -76,7 +76,7 @@ def clip_grad_norm_(
         >>> out2.backward()
         >>> norm2 = flow.nn.utils.clip_grad_norm_(x2, 0.5)
         >>> norm2
-        tensor(1.0394, dtype=oneflow.float32)
+        tensor([1.0394], dtype=oneflow.float32)
         >>> x2.grad
         tensor([[0.0962, 0.0481, 0.0283],
                 [0.0663, 0.4810, 0.0428]], dtype=oneflow.float32)