Skip to content

Commit

Permalink
Cumsum op implementation (#7050)
Browse files Browse the repository at this point in the history
* add cumsum op's forward definition

* add cumsum forward test case

* cumsum ver3

* remove calculating time

* add cumsum forward gpu implementation

* fix gpu forward error

* change var name

* remove annotation

* add cumsum cpu forward multi-thread support

* add multi-thread annotation

* add cumsum grad definition

* update

* add cumsum cpu backward

* add cumsum cpu backward functor

* add cumsum autograd

* update

* remove user interface

* use random method to test cumsum forward

* add cumsum gpu backward

* add cumsum gpu test

* fix gpu backward bug

* add a 3d cuda kernel try

* Revert "add cumsum gpu test"

This reverts commit 05c31556ba28ecb827b25e54c2f5fa38984e8096.

* Revert "Revert "add cumsum gpu test""

This reverts commit 918ee1569863b008c1d419c3528257416cffd840.

* change nele to ele_cnt

* add test_cumsum.py in oneflow/test/modules

* change original test_cumsum to autotest version

* optimize cumsum for special up_space and down_space

* add two special cu func

* add cumsum doc

* update doc

* update doc

* update code according to bbuf's review

* ditto

* change pin/pout to in_ptr/out_ptr

* remove multi-thread func

* update doc

* use tensor processor

* update by review

* update by review

* update

* update

* auto format by CI

* auto format by CI

* update doc

* update

Co-authored-by: oneflow-ci-bot <ci-bot@oneflow.org>
  • Loading branch information
wyushun and oneflow-ci-bot authored Jan 1, 2022
1 parent eabe79e commit 9869a3f
Show file tree
Hide file tree
Showing 10 changed files with 679 additions and 75 deletions.
1 change: 1 addition & 0 deletions docs/source/oneflow.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,5 +152,6 @@ oneflow
decode_onerec,
read_onerec,
from_numpy,
cumsum,

.. autofunction:: oneflow.relu
64 changes: 64 additions & 0 deletions oneflow/core/autograd/gradient_funcs/cumsum.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/functional/functional.h"

namespace oneflow {
namespace one {

// Forward-pass state captured for computing the cumsum gradient.
struct CumsumCaptureState : public AutoGradCaptureState {
  bool requires_grad = false;  // whether the forward input requires grad
  int64_t dim = 0;             // dimension the forward cumsum ran along
};

// Autograd function for the "cumsum" user op: captures the `dim` attribute on
// the forward pass and dispatches functional::CumsumGrad on the backward pass.
class CumsumGrad : public OpExprGradFunction<CumsumCaptureState> {
 public:
  Maybe<void> Init(const OpExpr& op) override {
    const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
    CHECK_NOTNULL_OR_RETURN(fw_op_expr);
    base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
    return Maybe<void>::Ok();
  }

  Maybe<void> Capture(CumsumCaptureState* ctx, const TensorTuple& inputs,
                      const TensorTuple& outputs, const AttrMap& attrs) const override {
    CHECK_EQ_OR_RETURN(inputs.size(), 1);
    ctx->requires_grad = inputs.at(0)->requires_grad();
    // Nothing else to capture when no gradient is needed.
    if (!ctx->requires_grad) { return Maybe<void>::Ok(); }

    ComposedAttrMap attr_map(attrs, base_attrs_);
    ctx->dim = JUST(attr_map.GetAttr<int64_t>("dim"));
    return Maybe<void>::Ok();
  }

  Maybe<void> Apply(const CumsumCaptureState* ctx, const TensorTuple& out_grads,
                    TensorTuple* in_grads) const override {
    CHECK_EQ_OR_RETURN(out_grads.size(), 1);
    in_grads->resize(1);
    if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
    (*in_grads)[0] = JUST(functional::CumsumGrad(out_grads.at(0), ctx->dim));
    return Maybe<void>::Ok();
  }

 private:
  AttrMap base_attrs_;  // forward op's default attributes, merged in Capture
};

REGISTER_OP_EXPR_GRAD_FUNCTION("cumsum", CumsumGrad);

} // namespace one
} // namespace oneflow
8 changes: 8 additions & 0 deletions oneflow/core/functional/functional_api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1864,3 +1864,11 @@
- name: "in_top_k"
signature: "Tensor (Tensor targets, Tensor predictions, Int32 k) => InTopK"
bind_python: True

- name: "cumsum"
signature: "Tensor (Tensor input, Int64 dim) => Cumsum"
bind_python: True

- name: "cumsum_grad"
signature: "Tensor (Tensor input, Int64 dim) => CumsumGrad"
bind_python: False
39 changes: 39 additions & 0 deletions oneflow/core/functional/impl/math_functor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1666,6 +1666,43 @@ class MovedimIntFunctor {
}
};

// Functional wrapper for the "cumsum" user op.
class CumsumFunctor {
 public:
  CumsumFunctor() { op_ = CHECK_JUST(one::OpBuilder("cumsum").Input("in").Output("out").Build()); }
  // Computes the cumulative sum of `input` along `dim`.
  // `dim` may be negative (Python-style, counted from the last dimension).
  Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& input, int64_t dim) const {
    const auto ndim = input->ndim();
    // Normalize a negative dim but keep the caller's original value, so the
    // error message reports what the user actually passed (previously the
    // already-shifted value was printed).
    const int64_t pos_dim = dim < 0 ? dim + ndim : dim;
    CHECK_OR_RETURN(pos_dim >= 0 && pos_dim < ndim)
        << "IndexError: Dimension out of range (expected to be in range of [" << -ndim << ","
        << ndim << " ) but got " << dim;

    MutableAttrMap attrs;
    JUST(attrs.SetAttr<int64_t>("dim", pos_dim));
    // Promotes inputs to at least int64 via the tensor processor — presumably
    // so small integer types don't overflow while accumulating.
    // TODO(review): confirm the promotion policy for floating-point inputs.
    TensorProcessor tensor_processor;
    JUST(tensor_processor.AddInputs({input}, DType::Int64()).Apply());
    TensorTuple input_tuple = JUST(tensor_processor.GetInputs());
    return OpInterpUtil::Dispatch<Tensor>(*op_, input_tuple, attrs);
  }

 private:
  std::shared_ptr<OpExpr> op_;
};

// Functional wrapper for the "cumsum_grad" user op (backward of cumsum).
class CumsumGradFunctor {
 public:
  CumsumGradFunctor() {
    op_ = CHECK_JUST(one::OpBuilder("cumsum_grad").Input("dy").Output("dx").Build());
  }
  Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& input, int64_t dim) const {
    // `dim` was already validated by CumsumFunctor on the forward pass, so no
    // range check is repeated here.
    MutableAttrMap grad_attrs;
    JUST(grad_attrs.SetAttr<int64_t>("dim", dim));
    return OpInterpUtil::Dispatch<Tensor>(*op_, {input}, grad_attrs);
  }

 private:
  std::shared_ptr<OpExpr> op_;
};
} // namespace impl

using namespace impl;
Expand Down Expand Up @@ -1728,6 +1765,8 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
m.add_functor<DotFunctor>("Dot");
m.add_functor<MovedimVecFunctor>("MovedimVec");
m.add_functor<MovedimIntFunctor>("MovedimInt");
m.add_functor<CumsumFunctor>("Cumsum");
m.add_functor<CumsumGradFunctor>("CumsumGrad");
};

} // namespace functional
Expand Down
129 changes: 129 additions & 0 deletions oneflow/user/kernels/cumsum_kernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <algorithm>

#include "oneflow/core/framework/framework.h"

namespace oneflow {

namespace {
// Cumulative sum along the middle axis of a tensor viewed (row-major) as
// cs_up_space x cs_space x cs_down_space, with elem_cnt elements in total:
//   out[i, j, k] = sum_{m <= j} in[i, m, k]
//
// Loop counters are int64_t: the extents arrive as int64_t and a plain `int`
// counter would overflow (UB) on tensors with more than 2^31 elements.
template<typename T>
void cumsum_forward(const T* in_ptr, T* out_ptr, int64_t cs_up_space, int64_t cs_space,
                    int64_t cs_down_space, int64_t elem_cnt) {
  std::copy_n(in_ptr, elem_cnt, out_ptr);
  auto* tmp_out_ptr_base = out_ptr;
  const int64_t step = cs_space * cs_down_space;
  for (int64_t i = 0; i < cs_up_space; i++) {
    // Accumulate each row into the next, so row j ends up holding the prefix sum.
    for (int64_t j = 1; j < cs_space; j++) {
      auto* tmp_out_ptr = tmp_out_ptr_base + j * cs_down_space;
      auto* last_tmp_out_ptr = tmp_out_ptr - cs_down_space;
      for (int64_t k = 0; k < cs_down_space; k++) { tmp_out_ptr[k] += last_tmp_out_ptr[k]; }
    }
    tmp_out_ptr_base += step;
  }
}

// Gradient of cumsum along the middle axis. Since out[j] = sum_{m<=j} in[m],
// dL/din[j] = sum_{m >= j} dy[m] — a reverse (suffix) cumulative sum of dy.
//
// The previous implementation computed dx[j] = dy[j] * (cs_space - j), which
// coincides with the true gradient only when dy is constant along the axis
// (e.g. out.sum().backward()); for general upstream gradients it was wrong.
// Loop counters are int64_t to avoid int overflow on very large tensors.
template<typename T>
void cumsum_backward(const T* in_ptr, T* out_ptr, int64_t cs_up_space, int64_t cs_space,
                     int64_t cs_down_space, int64_t elem_cnt) {
  if (elem_cnt == 0 || cs_space <= 0) { return; }
  const auto* tmp_in_ptr_base = in_ptr;
  auto* tmp_out_ptr_base = out_ptr;
  const int64_t step = cs_space * cs_down_space;
  for (int64_t i = 0; i < cs_up_space; i++) {
    // The last row of the suffix sum is dy's last row itself.
    std::copy_n(tmp_in_ptr_base + (cs_space - 1) * cs_down_space, cs_down_space,
                tmp_out_ptr_base + (cs_space - 1) * cs_down_space);
    // Walk backwards, adding the already-computed suffix of the following row.
    for (int64_t j = cs_space - 2; j >= 0; j--) {
      const auto* in_row = tmp_in_ptr_base + j * cs_down_space;
      auto* out_row = tmp_out_ptr_base + j * cs_down_space;
      const auto* next_out_row = out_row + cs_down_space;
      for (int64_t k = 0; k < cs_down_space; k++) { out_row[k] = in_row[k] + next_out_row[k]; }
    }
    tmp_in_ptr_base += step;
    tmp_out_ptr_base += step;
  }
}
} // namespace

// CPU forward kernel: flattens the input around `dim` into
// cs_up_space x cs_space x cs_down_space and runs cumsum_forward on it.
template<typename T>
class CpuCumsumKernel final : public user_op::OpKernel {
 public:
  CpuCumsumKernel() = default;
  ~CpuCumsumKernel() = default;

 private:
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    const auto elem_cnt = in->shape().elem_cnt();
    // A tensor with a zero-size dimension has nothing to accumulate.
    if (elem_cnt == 0) { return; }

    auto* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    const auto dim = ctx->Attr<int64_t>("dim");

    // Partition the shape around `dim` (cumsum abbreviated as `cs`):
    // cs_up_space | cs_space | cs_down_space
    const auto cs_space = in->shape().At(dim);
    const auto cs_down_space = in->shape().Count(dim + 1);
    const auto cs_up_space = elem_cnt / in->shape().Count(dim);

    cumsum_forward<T>(in->dptr<T>(), out->mut_dptr<T>(), cs_up_space, cs_space, cs_down_space,
                      elem_cnt);
  }

  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

// Register the CPU forward kernel for each supported element type, matched on
// the output dtype.
#define REGISTER_CUMSUM_KERNEL(dtype)                                                   \
  REGISTER_USER_KERNEL("cumsum").SetCreateFn<CpuCumsumKernel<dtype>>().SetIsMatchedHob( \
      (user_op::HobDeviceType() == DeviceType::kCPU)                                    \
      && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));

REGISTER_CUMSUM_KERNEL(int64_t)
REGISTER_CUMSUM_KERNEL(float)
REGISTER_CUMSUM_KERNEL(double)

// CPU backward kernel: partitions dx's shape around `dim` and runs
// cumsum_backward (reverse cumulative sum of dy) on it.
template<typename T>
class CpuCumsumGradKernel final : public user_op::OpKernel {
 public:
  CpuCumsumGradKernel() = default;
  ~CpuCumsumGradKernel() = default;

 private:
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
    auto elem_cnt = dy->shape().elem_cnt();
    // Mirror the forward kernel: bail out on zero-size tensors. Without this,
    // `Count(dim)` can be 0 and the cs_up_space division below divides by zero.
    if (!elem_cnt) { return; }
    auto dim = ctx->Attr<int64_t>("dim");
    const auto* dy_ptr = dy->dptr<T>();
    auto* dx_ptr = dx->mut_dptr<T>();

    // data partition: cs_up_space|cs_space|cs_down_space
    auto cs_up_space = elem_cnt / dx->shape().Count(dim);
    auto cs_space = dx->shape().At(dim);
    auto cs_down_space = dx->shape().Count(dim + 1);

    cumsum_backward(dy_ptr, dx_ptr, cs_up_space, cs_space, cs_down_space, elem_cnt);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

// Register the CPU backward kernel, matched on the dx dtype.
// NOTE(review): only float/double are registered here while the forward kernel
// also registers int64_t — confirm integer inputs never reach the grad op.
#define REGISTER_CPU_CUMSUM_GRAD_KERNEL(dtype)                          \
  REGISTER_USER_KERNEL("cumsum_grad")                                   \
      .SetCreateFn<CpuCumsumGradKernel<dtype>>()                        \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)   \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));

REGISTER_CPU_CUMSUM_GRAD_KERNEL(float)
REGISTER_CPU_CUMSUM_GRAD_KERNEL(double)

} // namespace oneflow
Loading

0 comments on commit 9869a3f

Please sign in to comment.