Cumsum op implementation #7050

Merged · 46 commits · Jan 1, 2022

Commits:
856f1a5
add cumsum op's forward definition
wyushun Dec 15, 2021
3621aa3
add cumsum forward test case
wyushun Dec 15, 2021
cbaf9a0
cumsum ver3
wyushun Dec 16, 2021
4ddce0f
remove calculating time
wyushun Dec 16, 2021
b6c9cb5
add cumsum forward gpu implementation
wyushun Dec 16, 2021
9cabf38
fix gpu forward error
wyushun Dec 16, 2021
141ef48
change var name
wyushun Dec 16, 2021
8954f02
remove annotation
wyushun Dec 16, 2021
bfae4c4
add cumsum cpu forward multi-thread support
wyushun Dec 16, 2021
213657f
add multi-thread annotation
wyushun Dec 18, 2021
9805fdb
add cumsum grad definition
wyushun Dec 20, 2021
04d9f62
update
wyushun Dec 20, 2021
8832bce
add cumsum cpu backward
wyushun Dec 20, 2021
f21703c
add cumsum cpu backward functor
wyushun Dec 20, 2021
9ee1a6b
add cumsum autograd
wyushun Dec 21, 2021
eaade4a
update
wyushun Dec 21, 2021
5e3e7ea
remove user interface
wyushun Dec 21, 2021
89911d7
use random method to test cumsum forward
wyushun Dec 21, 2021
4ea9ebd
add cumsum gpu backward
wyushun Dec 21, 2021
3d72ce2
add cumsum gpu test
wyushun Dec 21, 2021
9129a96
fix gpu backward bug
wyushun Dec 21, 2021
7b23a4f
add a 3d cuda kernel try
wyushun Dec 21, 2021
fc7dfd7
Revert "add cumsum gpu test"
wyushun Dec 22, 2021
dc70fdf
Revert "Revert "add cumsum gpu test""
wyushun Dec 22, 2021
4763e5a
change nele to ele_cnt
wyushun Dec 22, 2021
0e0c718
add test_cumsum.py in oneflow/test/modules
wyushun Dec 22, 2021
caf8671
change original test_cumsum to autotest version
wyushun Dec 22, 2021
88ac0e0
optimize cumsum for special up_space and down_space
wyushun Dec 23, 2021
4b61610
add two special cu func
wyushun Dec 23, 2021
7174053
add cumsum doc
wyushun Dec 23, 2021
382917b
update doc
wyushun Dec 23, 2021
8943d3a
update doc
wyushun Dec 23, 2021
d25f519
update code according to bbuf's review
wyushun Dec 24, 2021
6a22d7a
ditto
wyushun Dec 24, 2021
a3ce1e6
change pin/pout to in_ptr/out_ptr
wyushun Dec 24, 2021
0f11699
remove multi-thread func
wyushun Dec 24, 2021
51398d1
update doc
wyushun Dec 24, 2021
dbace89
use tensor processor
wyushun Dec 24, 2021
bac94d2
update by review
wyushun Dec 27, 2021
c955b8a
update by review
wyushun Dec 28, 2021
50d53d2
update
wyushun Dec 28, 2021
81a4b8d
update
wyushun Dec 31, 2021
5dc75c6
auto format by CI
oneflow-ci-bot Dec 31, 2021
b85be6f
auto format by CI
oneflow-ci-bot Dec 31, 2021
c7dba96
update doc
wyushun Jan 1, 2022
7a8f774
update
wyushun Jan 1, 2022
1 change: 1 addition & 0 deletions docs/source/oneflow.rst
@@ -152,5 +152,6 @@ oneflow
decode_onerec,
read_onerec,
from_numpy,
cumsum,

.. autofunction:: oneflow.relu
64 changes: 64 additions & 0 deletions oneflow/core/autograd/gradient_funcs/cumsum.cpp
@@ -0,0 +1,64 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/functional/functional.h"

namespace oneflow {
namespace one {

struct CumsumCaptureState : public AutoGradCaptureState {
  bool requires_grad = false;
  int64_t dim = 0;
};

class CumsumGrad : public OpExprGradFunction<CumsumCaptureState> {
 public:
  Maybe<void> Init(const OpExpr& op) override {
    const auto* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
    CHECK_NOTNULL_OR_RETURN(fw_op_expr);
    base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
    return Maybe<void>::Ok();
  }

  Maybe<void> Capture(CumsumCaptureState* ctx, const TensorTuple& inputs,
                      const TensorTuple& outputs, const AttrMap& attrs) const override {
    CHECK_EQ_OR_RETURN(inputs.size(), 1);
    ctx->requires_grad = inputs.at(0)->requires_grad();
    if (!ctx->requires_grad) { return Maybe<void>::Ok(); }

    ComposedAttrMap composed_attrs(attrs, base_attrs_);
    ctx->dim = JUST(composed_attrs.GetAttr<int64_t>("dim"));
    return Maybe<void>::Ok();
  }

  Maybe<void> Apply(const CumsumCaptureState* ctx, const TensorTuple& out_grads,
                    TensorTuple* in_grads) const override {
    CHECK_EQ_OR_RETURN(out_grads.size(), 1);
    in_grads->resize(1);
    if (ctx->requires_grad) {
      in_grads->at(0) = JUST(functional::CumsumGrad(out_grads.at(0), ctx->dim));
    }
    return Maybe<void>::Ok();
  }

 private:
  AttrMap base_attrs_;
};

REGISTER_OP_EXPR_GRAD_FUNCTION("cumsum", CumsumGrad);

} // namespace one
} // namespace oneflow
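For reference while reading the Apply step above: mathematically, if y = cumsum(x, dim), the input gradient is the reversed cumulative sum of the output gradient along the same dim, i.e. dx[i] = sum over j >= i of dy[j]. Below is a minimal NumPy sketch of that identity, shown for illustration only; it is not code from this PR and the helper name is made up.

import numpy as np

def cumsum_grad_reference(dy, dim):
    # Reverse cumulative sum along `dim`: flip, cumsum, flip back.
    return np.flip(np.cumsum(np.flip(dy, axis=dim), axis=dim), axis=dim)

dy = np.random.randn(2, 3, 4)
dx = cumsum_grad_reference(dy, dim=1)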
8 changes: 8 additions & 0 deletions oneflow/core/functional/functional_api.yaml
@@ -1864,3 +1864,11 @@
- name: "in_top_k"
  signature: "Tensor (Tensor targets, Tensor predictions, Int32 k) => InTopK"
  bind_python: True

- name: "cumsum"
  signature: "Tensor (Tensor input, Int64 dim) => Cumsum"
  bind_python: True

- name: "cumsum_grad"
  signature: "Tensor (Tensor input, Int64 dim) => CumsumGrad"
  bind_python: False
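Only the first of the two new entries is exposed to Python (bind_python: True), so after this PR the op should be callable as oneflow.cumsum with a dim argument, matching the doc entry added above. A minimal usage sketch follows; the tensor shape and the requires_grad flag are illustrative assumptions, not taken from this PR.

import oneflow as flow

x = flow.ones(2, 3, requires_grad=True)
y = flow.cumsum(x, 1)   # each row of y becomes [1., 2., 3.]
y.sum().backward()      # exercises the cumsum_grad op, which is not python-bound
print(x.grad)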
39 changes: 39 additions & 0 deletions oneflow/core/functional/impl/math_functor.cpp
@@ -1666,6 +1666,43 @@ class MovedimIntFunctor {
}
};

class CumsumFunctor {
 public:
  CumsumFunctor() { op_ = CHECK_JUST(one::OpBuilder("cumsum").Input("in").Output("out").Build()); }
  Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& input, int64_t dim) const {
    auto ndim = input->ndim();
    if (dim < 0) { dim += ndim; }
    CHECK_OR_RETURN(dim >= 0 && dim < ndim)
        << "IndexError: Dimension out of range (expected to be in range of [" << -ndim << ", "
        << ndim << ") but got " << dim;

    MutableAttrMap attrs;
    JUST(attrs.SetAttr<int64_t>("dim", dim));
    TensorProcessor tensor_processor;
    JUST(tensor_processor.AddInputs({input}, DType::Int64()).Apply());
    TensorTuple input_tuple = JUST(tensor_processor.GetInputs());
    return OpInterpUtil::Dispatch<Tensor>(*op_, input_tuple, attrs);
  }

 private:
  std::shared_ptr<OpExpr> op_;
};

class CumsumGradFunctor {
 public:
  CumsumGradFunctor() {
    op_ = CHECK_JUST(one::OpBuilder("cumsum_grad").Input("dy").Output("dx").Build());
  }
  Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& input, int64_t dim) const {
    // No need to validate dim here; CumsumFunctor has already checked it.
    MutableAttrMap attrs;
    JUST(attrs.SetAttr<int64_t>("dim", dim));
    return OpInterpUtil::Dispatch<Tensor>(*op_, {input}, attrs);
  }

 private:
  std::shared_ptr<OpExpr> op_;
};
} // namespace impl

using namespace impl;
@@ -1728,6 +1765,8 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
  m.add_functor<DotFunctor>("Dot");
  m.add_functor<MovedimVecFunctor>("MovedimVec");
  m.add_functor<MovedimIntFunctor>("MovedimInt");
  m.add_functor<CumsumFunctor>("Cumsum");
  m.add_functor<CumsumGradFunctor>("CumsumGrad");
};

} // namespace functional
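To make the bounds check in CumsumFunctor concrete: a negative dim wraps around once, and anything outside [-ndim, ndim) raises an index error. A small Python sketch of the same rule; the helper name normalize_dim is hypothetical and only for illustration.

def normalize_dim(dim: int, ndim: int) -> int:
    # Same rule as CumsumFunctor: wrap a negative dim once, then bounds-check.
    if dim < 0:
        dim += ndim
    if not (0 <= dim < ndim):
        raise IndexError(
            f"Dimension out of range (expected to be in range of [{-ndim}, {ndim}) but got {dim})"
        )
    return dim

assert normalize_dim(-1, 3) == 2   # last axis of a 3-D tensor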
129 changes: 129 additions & 0 deletions oneflow/user/kernels/cumsum_kernel.cpp
@@ -0,0 +1,129 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"

namespace oneflow {

namespace {
template<typename T>
void cumsum_forward(const T* in_ptr, T* out_ptr, int64_t cs_up_space, int64_t cs_space,
                    int64_t cs_down_space, int64_t elem_cnt) {
  std::copy_n(in_ptr, elem_cnt, out_ptr);
  auto* tmp_out_ptr_base = out_ptr;
  auto step = cs_space * cs_down_space;
  for (auto i = 0; i < cs_up_space; i++) {
    for (auto j = 1; j < cs_space; j++) {
      auto* tmp_out_ptr = tmp_out_ptr_base + j * cs_down_space;
      auto* last_tmp_out_ptr = tmp_out_ptr - cs_down_space;
      for (auto k = 0; k < cs_down_space; k++) { tmp_out_ptr[k] += last_tmp_out_ptr[k]; }
    }
    tmp_out_ptr_base += step;
  }
}

Review thread on the cumsum_backward signature below:

Contributor (reviewer): I feel in_ptr and out_ptr would be easier to understand as dy_ptr and dx_ptr here.

Contributor (author, wyushun): Your suggestion is good. xiaoyu asked me to change pin/pout to in_ptr/out_ptr and I have already updated that. Naming is something where it is hard to please everyone; I still think pin/pout is the most concise and clear, but in the end I respected xiaoyu's suggestion and changed it. I have no problem changing it again, though it still will not satisfy everyone. What do you think?

Contributor (reviewer): I think that is my fault: this is the backward pass, so dy_ptr and dx_ptr are indeed more reasonable. Would you mind changing it once more?

template<typename T>
void cumsum_backward(const T* in_ptr, T* out_ptr, int64_t cs_up_space, int64_t cs_space,
                     int64_t cs_down_space, int64_t elem_cnt) {
  auto* tmp_in_ptr_base = in_ptr;
  auto* tmp_out_ptr_base = out_ptr;
  auto step = cs_space * cs_down_space;
  for (auto i = 0; i < cs_up_space; i++) {
    for (auto j = 0; j < cs_space; j++) {
      auto* tmp_in_ptr = tmp_in_ptr_base + j * cs_down_space;
      auto* tmp_out_ptr = tmp_out_ptr_base + j * cs_down_space;
      std::fill_n(tmp_out_ptr, cs_down_space, cs_space - j);
      for (auto k = 0; k < cs_down_space; k++) { tmp_out_ptr[k] *= tmp_in_ptr[k]; }
    }
    tmp_in_ptr_base += step;
    tmp_out_ptr_base += step;
  }
}
} // namespace

template<typename T>
class CpuCumsumKernel final : public user_op::OpKernel {
 public:
  CpuCumsumKernel() = default;
  ~CpuCumsumKernel() = default;

 private:
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* in = ctx->Tensor4ArgNameAndIndex("in", 0);
    auto elem_cnt = in->shape().elem_cnt();
    // return early if the tensor has a 0-sized dimension
    if (!elem_cnt) { return; }

    auto* out = ctx->Tensor4ArgNameAndIndex("out", 0);
    auto dim = ctx->Attr<int64_t>("dim");
    const auto* in_ptr = in->dptr<T>();
    auto* out_ptr = out->mut_dptr<T>();

    // use `cs` as an abbreviation for cumsum
    // data partition: cs_up_space|cs_space|cs_down_space
    auto cs_up_space = elem_cnt / in->shape().Count(dim);
    auto cs_space = in->shape().At(dim);
    auto cs_down_space = in->shape().Count(dim + 1);

    cumsum_forward<T>(in_ptr, out_ptr, cs_up_space, cs_space, cs_down_space, elem_cnt);
  }

  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

#define REGISTER_CUMSUM_KERNEL(dtype)                                                    \
  REGISTER_USER_KERNEL("cumsum").SetCreateFn<CpuCumsumKernel<dtype>>().SetIsMatchedHob(  \
      (user_op::HobDeviceType() == DeviceType::kCPU)                                     \
      && (user_op::HobDataType("out", 0) == GetDataType<dtype>::value));

REGISTER_CUMSUM_KERNEL(int64_t)
REGISTER_CUMSUM_KERNEL(float)
REGISTER_CUMSUM_KERNEL(double)

template<typename T>
class CpuCumsumGradKernel final : public user_op::OpKernel {
 public:
  CpuCumsumGradKernel() = default;
  ~CpuCumsumGradKernel() = default;

 private:
  void Compute(user_op::KernelComputeContext* ctx) const override {
    const auto* dy = ctx->Tensor4ArgNameAndIndex("dy", 0);
    auto* dx = ctx->Tensor4ArgNameAndIndex("dx", 0);
    auto elem_cnt = dy->shape().elem_cnt();
    auto dim = ctx->Attr<int64_t>("dim");
    const auto* dy_ptr = dy->dptr<T>();
    auto* dx_ptr = dx->mut_dptr<T>();

    // data partition: cs_up_space|cs_space|cs_down_space
    auto cs_up_space = elem_cnt / dx->shape().Count(dim);
    auto cs_space = dx->shape().At(dim);
    auto cs_down_space = dx->shape().Count(dim + 1);

    cumsum_backward(dy_ptr, dx_ptr, cs_up_space, cs_space, cs_down_space, elem_cnt);
  }
  bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};

#define REGISTER_CPU_CUMSUM_GRAD_KERNEL(dtype)                            \
  REGISTER_USER_KERNEL("cumsum_grad")                                     \
      .SetCreateFn<CpuCumsumGradKernel<dtype>>()                          \
      .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCPU)     \
                       && (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));

REGISTER_CPU_CUMSUM_GRAD_KERNEL(float)
REGISTER_CPU_CUMSUM_GRAD_KERNEL(double)

} // namespace oneflow
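To summarize the data layout the CPU kernels rely on: for a contiguous tensor of shape (d0, ..., dn-1) and cumsum dimension k, cs_up_space = d0 * ... * d(k-1), cs_space = dk, and cs_down_space = d(k+1) * ... * d(n-1), so the buffer can be viewed as (up, space, down) and accumulated along the middle axis, which is exactly the loop structure of cumsum_forward. A NumPy sketch of that structure is below, for illustration only; it assumes a contiguous tensor and a non-negative dim, and the helper name is made up.

import numpy as np

def cumsum_forward_reference(x, dim):
    # Mirror the CPU kernel's partition of the flattened buffer.
    up = int(np.prod(x.shape[:dim], dtype=np.int64))
    space = x.shape[dim]
    down = int(np.prod(x.shape[dim + 1:], dtype=np.int64))
    out = x.reshape(up, space, down).copy()
    for j in range(1, space):
        out[:, j, :] += out[:, j - 1, :]   # add the previous slice along `dim`
    return out.reshape(x.shape)

x = np.arange(24, dtype=np.float64).reshape(2, 3, 4)
assert np.allclose(cumsum_forward_reference(x, 1), np.cumsum(x, axis=1))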