Oneflow-Inc · mergify · Mar 3, 2023 · Mar 2, 2023 · Mar 2, 2023 · Mar 2, 2023
@@ -1784,6 +1784,10 @@
   signature: "TensorTuple (Tensor dy ,Tensor alpha, Tensor rho2, Tensor c2) => FusedGetCiouResultGrad"
   bind_python: False
 
+- name: "fused_codegeex_qkv_reshape"
+  signature: "TensorTuple (Tensor query, Tensor key, Tensor value, Int32 num_attention_heads) => FusedCodegeexQkvReshape"
+  bind_python: True
+
 - name: "fused_get_iou"
   signature: "Tensor (Tensor w1, Tensor h1, Tensor w2, Tensor h2, Tensor inter, Float eps) => FusedGetIou"
   bind_python: True

@@ -3914,6 +3914,32 @@ class CloneFunctor {
   Maybe<Tensor> operator()(const std::shared_ptr<Tensor>& input) const { return input->clone(); }
 };
 
+class FusedCodegeexQkvReshapeFunctor {
+ public:
+  FusedCodegeexQkvReshapeFunctor() {
+    op_ = CHECK_JUST(one::OpBuilder("fused_codegeex_qkv_reshape")
+                         .Input("query")
+                         .Input("key")
+                         .Input("value")
+                         .Output("new_query")
+                         .Output("new_key")
+                         .Output("new_value")
+                         .Build());
+  }
+
+  Maybe<TensorTuple> operator()(const std::shared_ptr<one::Tensor>& query,
+                                const std::shared_ptr<one::Tensor>& key,
+                                const std::shared_ptr<one::Tensor>& value,
+                                const int32_t num_attention_heads) const {
+    auto& attrs = THREAD_CACHED_MUTABLE_ATTR_MAP("num_attention_heads");
+    attrs.SetAllAttrs(num_attention_heads);
+    return OpInterpUtil::Dispatch<TensorTuple>(*op_, {query, key, value}, attrs);
+  }
+
+ private:
+  std::shared_ptr<OpExpr> op_;
+};
+
 }  // namespace impl
 
 ONEFLOW_FUNCTION_LIBRARY(m) {
@@ -4070,6 +4096,7 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
   m.add_functor<impl::BaddBmmFunctor>("BaddBmm");
   m.add_functor<impl::SortFunctor>("Sort");
   m.add_functor<impl::CloneFunctor>("Clone");
+  m.add_functor<impl::FusedCodegeexQkvReshapeFunctor>("FusedCodegeexQkvReshape");
 };
 
 }  // namespace functional

@@ -3234,6 +3234,26 @@ def OneFlow_FusedGetConvexDiagonalSquaredGradOp : OneFlow_BaseOp<"fused_get_conv
   let has_data_type_infer_fn = 1;
 }
 
+def OneFlow_FusedCodegeexQkvReshapeOp : OneFlow_BaseOp<"fused_codegeex_qkv_reshape", [NoSideEffect, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
+  let input = (ins
+    OneFlow_Tensor:$query,
+    OneFlow_Tensor:$key,
+    OneFlow_Tensor:$value
+  );
+  let output = (outs
+    OneFlow_Tensor:$new_query,
+    OneFlow_Tensor:$new_key,
+    OneFlow_Tensor:$new_value
+  );
+  let attrs = (ins
+    DefaultValuedAttr<SI32Attr, "1">:$num_attention_heads
+  );
+  let has_logical_tensor_desc_infer_fn = 1;
+  let has_physical_tensor_desc_infer_fn = 1;
+  let has_get_sbp_fn = 1;
+  let has_data_type_infer_fn = 1;
+}
+
 #endif // GET_ONEFLOW_FUSED_OP_DEFINITIONS
 
 

@@ -0,0 +1,111 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <cassert>
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/kernel/new_kernel_util.h"
+
+namespace oneflow {
+
+namespace {
+
+template<typename T, int pack_size>
+struct alignas(sizeof(T) * pack_size) Packed {
+  __device__ Packed(T val) {
+#pragma unroll
+    for (int i = 0; i < pack_size; i++) { elem[i] = val; }
+  }
+  __device__ Packed() {
+    // do nothing
+  }
+  union {
+    T elem[pack_size];
+  };
+  __device__ void operator=(Packed<T, pack_size> packA) {
+#pragma unroll
+    for (int i = 0; i < pack_size; i++) { elem[i] = packA.elem[i]; }
+  }
+};
+
+// [seq_length, batch_size, hidden_size] -> [seq_length, batch_size, head_num, size_per_head]
+template<typename T, int pack_size>
+__global__ void batch_reshape_for_qkv(const int n, const T* query, const T* key, const T* value,
+                                      T* new_query, T* new_key, T* new_value) {
+  const auto* query_pack_ptr = reinterpret_cast<const Packed<T, pack_size>*>(query);
+  const auto* key_pack_ptr = reinterpret_cast<const Packed<T, pack_size>*>(key);
+  const auto* value_pack_ptr = reinterpret_cast<const Packed<T, pack_size>*>(value);
+  auto* new_query_pack_ptr = reinterpret_cast<Packed<T, pack_size>*>(new_query);
+  auto* new_key_pack_ptr = reinterpret_cast<Packed<T, pack_size>*>(new_key);
+  auto* new_value_pack_ptr = reinterpret_cast<Packed<T, pack_size>*>(new_value);
+  assert(n % pack_size == 0);
+  CUDA_1D_KERNEL_LOOP(i, n) {
+    Packed<T, pack_size> query_pack = query_pack_ptr[i];
+    Packed<T, pack_size> key_pack = key_pack_ptr[i];
+    Packed<T, pack_size> value_pack = value_pack_ptr[i];
+    new_query_pack_ptr[i] = query_pack;
+    new_key_pack_ptr[i] = key_pack;
+    new_value_pack_ptr[i] = value_pack;
+  }
+}
+
+};  // namespace
+
+template<typename T>
+class FusedCodegeexQkvReshapeGpuKernel final : public user_op::OpKernel {
+ public:
+  FusedCodegeexQkvReshapeGpuKernel() = default;
+  ~FusedCodegeexQkvReshapeGpuKernel() = default;
+
+ private:
+  using user_op::OpKernel::Compute;
+  void Compute(user_op::KernelComputeContext* ctx) const override {
+    // [seq_length, batch_size, hidden_size] -> [seq_length, batch_size, head_num, size_per_head]
+    const user_op::Tensor* query = ctx->Tensor4ArgNameAndIndex("query", 0);
+    const user_op::Tensor* key = ctx->Tensor4ArgNameAndIndex("key", 0);
+    const user_op::Tensor* value = ctx->Tensor4ArgNameAndIndex("value", 0);
+
+    user_op::Tensor* new_query = ctx->Tensor4ArgNameAndIndex("new_query", 0);
+    user_op::Tensor* new_key = ctx->Tensor4ArgNameAndIndex("new_key", 0);
+    user_op::Tensor* new_value = ctx->Tensor4ArgNameAndIndex("new_value", 0);
+
+    const int32_t n = query->shape_view().elem_cnt();
+    if (n % 4 == 0) {
+      RUN_CUDA_KERNEL((batch_reshape_for_qkv<T, 4>), ctx->stream(), n / 4, n / 4, query->dptr<T>(),
+                      key->dptr<T>(), value->dptr<T>(), new_query->mut_dptr<T>(),
+                      new_key->mut_dptr<T>(), new_value->mut_dptr<T>());
+    } else if (n % 2 == 0) {
+      RUN_CUDA_KERNEL((batch_reshape_for_qkv<T, 2>), ctx->stream(), n / 2, n / 2, query->dptr<T>(),
+                      key->dptr<T>(), value->dptr<T>(), new_query->mut_dptr<T>(),
+                      new_key->mut_dptr<T>(), new_value->mut_dptr<T>());
+    } else {
+      RUN_CUDA_KERNEL((batch_reshape_for_qkv<T, 1>), ctx->stream(), n, n, query->dptr<T>(),
+                      key->dptr<T>(), value->dptr<T>(), new_query->mut_dptr<T>(),
+                      new_key->mut_dptr<T>(), new_value->mut_dptr<T>());
+    }
+  }
+  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
+};
+
+#define REGISTER_FUSED_CODEGEEX_QKV_RESHAPE_CUDA_KERNEL(dtype)         \
+  REGISTER_USER_KERNEL("fused_codegeex_qkv_reshape")                   \
+      .SetCreateFn<FusedCodegeexQkvReshapeGpuKernel<dtype>>()          \
+      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA) \
+                       && (user_op::HobDataType("query", 0) == GetDataType<dtype>::value));
+
+REGISTER_FUSED_CODEGEEX_QKV_RESHAPE_CUDA_KERNEL(float)
+REGISTER_FUSED_CODEGEEX_QKV_RESHAPE_CUDA_KERNEL(half)
+REGISTER_FUSED_CODEGEEX_QKV_RESHAPE_CUDA_KERNEL(double)
+
+}  // namespace oneflow
diff --git a/oneflow/user/ops/fused_codegeex_qkv_reshape.cpp b/oneflow/user/ops/fused_codegeex_qkv_reshape.cpp
@@ -0,0 +1,88 @@
+/*
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "oneflow/core/common/maybe.h"
+#include "oneflow/core/common/shape_vec.h"
+#include "oneflow/core/framework/framework.h"
+#include "oneflow/core/framework/op_generated.h"
+
+namespace oneflow {
+
+Maybe<void> FusedCodegeexQkvReshapeOp::InferLogicalTensorDesc(user_op::InferContext* ctx) {
+  const user_op::TensorDesc& query = ctx->InputTensorDesc("query", 0);
+  const user_op::TensorDesc& key = ctx->InputTensorDesc("key", 0);
+  const user_op::TensorDesc& value = ctx->InputTensorDesc("value", 0);
+  const int32_t num_attention_heads = ctx->Attr<int32_t>("num_attention_heads");
+  CHECK_EQ_OR_RETURN(query.shape().size(), 3) << "query shape size should be equal 3";
+  CHECK_EQ_OR_RETURN(key.shape().size(), 3) << "key shape size should be equal 3";
+  CHECK_EQ_OR_RETURN(value.shape().size(), 3) << "value shape size should be equal 3";
+  CHECK_EQ_OR_RETURN(query.shape(), key.shape())
+      << "query, key, value should has same shape in codegeex attention block";
+  CHECK_EQ_OR_RETURN(query.shape(), value.shape())
+      << "query, key, value should has same shape in codegeex attention block";
+  CHECK_EQ_OR_RETURN(query.shape()[2] % num_attention_heads, 0)
+      << "hidden_size must be divisible by num_attention_heads";
+
+  Shape new_shape(DimVector{query.shape()[0], query.shape()[1], num_attention_heads,
+                            query.shape()[2] / num_attention_heads});
+  user_op::TensorDesc* new_query = ctx->MutOutputTensorDesc("new_query", 0);
+  new_query->set_is_dynamic(query.is_dynamic());
+  new_query->set_shape(new_shape);
+
+  user_op::TensorDesc* new_key = ctx->MutOutputTensorDesc("new_key", 0);
+  new_key->set_is_dynamic(key.is_dynamic());
+  new_key->set_shape(new_shape);
+
+  user_op::TensorDesc* new_value = ctx->MutOutputTensorDesc("new_value", 0);
+  new_value->set_is_dynamic(value.is_dynamic());
+  new_value->set_shape(new_shape);
+
+  return Maybe<void>::Ok();
+}
+
+Maybe<void> FusedCodegeexQkvReshapeOp::InferPhysicalTensorDesc(user_op::InferContext* ctx) {
+  return FusedCodegeexQkvReshapeOp::InferLogicalTensorDesc(ctx);
+}
+
+Maybe<void> FusedCodegeexQkvReshapeOp::InferDataType(user_op::InferContext* ctx) {
+  const user_op::TensorDesc& query = ctx->InputTensorDesc("query", 0);
+  const user_op::TensorDesc& key = ctx->InputTensorDesc("key", 0);
+  const user_op::TensorDesc& value = ctx->InputTensorDesc("value", 0);
+
+  user_op::TensorDesc* new_query = ctx->MutOutputTensorDesc("new_query", 0);
+  new_query->set_data_type(query.data_type());
+  user_op::TensorDesc* new_key = ctx->MutOutputTensorDesc("new_key", 0);
+  new_key->set_data_type(key.data_type());
+  user_op::TensorDesc* new_value = ctx->MutOutputTensorDesc("new_value", 0);
+  new_value->set_data_type(value.data_type());
+  return Maybe<void>::Ok();
+}
+
+Maybe<void> FusedCodegeexQkvReshapeOp::GetSbp(user_op::SbpContext* ctx) {
+  const user_op::TensorDesc& query = ctx->LogicalTensorDesc4InputArgNameAndIndex("query", 0);
+  FOR_RANGE(int64_t, i, 0, query.shape().NumAxes() - 1) {
+    ctx->NewBuilder()
+        .Split(user_op::OpArg("query", 0), i)
+        .Split(user_op::OpArg("key", 0), i)
+        .Split(user_op::OpArg("value", 0), i)
+        .Split(user_op::OpArg("new_query", 0), i)
+        .Split(user_op::OpArg("new_key", 0), i)
+        .Split(user_op::OpArg("new_value", 0), i)
+        .Build();
+  }
+  return Maybe<void>::Ok();
+}
+
+}  // namespace oneflow
diff --git a/python/oneflow/test/modules/test_fused_codegeex_qkv_reshape.py b/python/oneflow/test/modules/test_fused_codegeex_qkv_reshape.py
@@ -0,0 +1,74 @@
+"""
+Copyright 2020 The OneFlow Authors. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import unittest
+from collections import OrderedDict
+
+import numpy as np
+from oneflow.test_utils.test_util import GenArgList
+
+import oneflow as flow
+import oneflow.unittest
+
+
+def _test_codegeex_qkv_reshape_impl(test_case, device, shape, num_attention_heads):
+    query = flow.randn(shape).to("cuda")
+    key = flow.randn(shape).to("cuda")
+    value = flow.randn(shape).to("cuda")
+    new_shape = (
+        shape[0],
+        shape[1],
+        num_attention_heads,
+        shape[2] / num_attention_heads,
+    )
+    new_query = query.view(new_shape)
+    new_query = new_query.contiguous()
+    new_key = key.view(new_shape)
+    new_key = new_key.contiguous()
+    new_value = value.view(new_shape)
+    new_value = new_value.contiguous()
+    (
+        fused_new_query,
+        fused_new_key,
+        fused_new_value,
+    ) = flow._C.fused_codegeex_qkv_reshape(query, key, value, num_attention_heads)
+
+    def compare(a, b, rtol=1e-5, atol=1e-5):
+        test_case.assertTrue(
+            np.allclose(
+                a.detach().cpu().numpy(), b.detach().cpu().numpy(), rtol=rtol, atol=atol
+            ),
+            f"\na\n{a.detach().cpu().numpy()}\n{'-' * 80}\nb:\n{b.detach().cpu().numpy()}\n{'*' * 80}\ndiff:\n{a.detach().cpu().numpy() - b.detach().cpu().numpy()}",
+        )
+
+    compare(new_query, fused_new_query)
+    compare(new_key, fused_new_key)
+    compare(new_value, fused_new_value)
+
+
+@flow.unittest.skip_unless_1n1d()
+class TestFusedCodegeexQkvReshapeModule(flow.unittest.TestCase):
+    def test_codegeex_qkv_reshape(test_case):
+        arg_dict = OrderedDict()
+        arg_dict["test_fun"] = [_test_codegeex_qkv_reshape_impl]
+        arg_dict["device"] = ["cuda"]
+        arg_dict["shape"] = [(32, 8, 16), (32, 8, 32)]
+        arg_dict["num_attention_heads"] = [(4), (8)]
+        for arg in GenArgList(arg_dict):
+            arg[0](test_case, *arg[1:])
+
+
+if __name__ == "__main__":
+    unittest.main()