Add FusedMatmulBiasAddReluDropout [OneEmbedding] #8222

Merged

merged 79 commits on Jun 8, 2022

Changes from all commits (79 commits)
a89d8a3
previous version for fused_matmul_bias_add_relu_dropout
MARD1NO May 16, 2022
1582363
add op infer
MARD1NO May 16, 2022
abad909
fix detail
MARD1NO May 17, 2022
180d92a
finish forward
MARD1NO May 17, 2022
e868dd6
support dropout rate list
MARD1NO May 17, 2022
cc827e0
add forward test
MARD1NO May 17, 2022
fbdc68c
fix bug for output buffer
MARD1NO May 17, 2022
1b74dd2
Configurable alpha params
MARD1NO May 17, 2022
58b91c7
try to add bit mask logic
MARD1NO May 17, 2022
55b7b88
Add bitmask first version!
MARD1NO May 18, 2022
a0d24cd
Add row col bitmask logic
MARD1NO May 19, 2022
d0f8ab5
support not align4 reludropout
MARD1NO May 19, 2022
1f72d90
simplify relu dropout ld logic
MARD1NO May 19, 2022
0c99841
Add naive relu dropout grad kernel
MARD1NO May 19, 2022
8c2840d
add simple relu dropout grad kernel
MARD1NO May 19, 2022
8a013ca
Rename
MARD1NO May 19, 2022
32adc5b
support relu_dropout bitmask backward
MARD1NO May 19, 2022
e8e263c
add vectorized optimization
MARD1NO May 19, 2022
b8ec8cb
fix tmp buffer
MARD1NO May 19, 2022
cb828ca
add to amp list
MARD1NO May 23, 2022
9954eeb
add lazy backward logic
MARD1NO May 23, 2022
de5f824
Refine kernel
MARD1NO May 23, 2022
fdd9898
Merge branch 'master' into dev_fused_matmul_bias_add_relu_dropout
MARD1NO May 23, 2022
e6e8752
add indextype dispatch
MARD1NO May 23, 2022
37e7845
simplify functor logic
MARD1NO May 23, 2022
0746ef8
fix cublas fused mlp aux_ld shape bug
MARD1NO May 23, 2022
0c675bf
Add more relu dropout kernel
MARD1NO May 24, 2022
233e74c
add full unittest
MARD1NO May 24, 2022
b597577
fix bug in skip final activation
MARD1NO May 24, 2022
845ce90
refine
MARD1NO May 24, 2022
fb60b94
Remove dump func
MARD1NO May 24, 2022
41e924e
fix format
MARD1NO May 24, 2022
076a738
Remove cmake
MARD1NO May 24, 2022
a5baab6
remove redundant divide
MARD1NO May 24, 2022
8bd3611
add padded version
MARD1NO May 24, 2022
f479482
fix dropout
MARD1NO May 24, 2022
b4621c0
oneflow curand
MARD1NO May 25, 2022
84497a3
refine
MARD1NO May 25, 2022
85e3a79
remove redundant kernel
MARD1NO May 25, 2022
9a76eab
add unroll logic
MARD1NO May 26, 2022
84ca43d
add unroll and ballot sync
MARD1NO May 26, 2022
6347107
refine format
MARD1NO May 26, 2022
869b856
Remove fast curand
MARD1NO May 26, 2022
7c39e7d
Refine python interface
MARD1NO May 26, 2022
32f63fa
Add if branch for memset
MARD1NO May 26, 2022
54eca04
fix python logic
MARD1NO May 26, 2022
497c61d
just for debug
MARD1NO May 27, 2022
c704018
not use matmul bias add grad
MARD1NO May 30, 2022
baf26ae
add launch 1 block limit
MARD1NO May 30, 2022
c3966f7
fix unittest
MARD1NO May 30, 2022
7c6edd7
Refine
MARD1NO May 30, 2022
fb5b081
fix graph backward bug
MARD1NO May 30, 2022
d9c8c99
limit to 11060
MARD1NO May 30, 2022
420d66e
change to use int32_t dtype for cublas aux
MARD1NO May 31, 2022
51aa39f
Fix jc comment
MARD1NO May 31, 2022
735b92f
Merge branch 'master' into dev_fused_matmul_bias_add_relu_dropout
liujuncheng Jun 2, 2022
db305cf
fix comment
MARD1NO Jun 2, 2022
33414a5
Merge branch 'master' into dev_fused_matmul_bias_add_relu_dropout
mergify[bot] Jun 2, 2022
cf4b0cf
fix convert
MARD1NO Jun 6, 2022
c748022
Merge branch 'dev_fused_matmul_bias_add_relu_dropout' of github.com:O…
MARD1NO Jun 6, 2022
e3c3069
fix static_analysis
MARD1NO Jun 6, 2022
e4d22a8
Merge branch 'master' into dev_fused_matmul_bias_add_relu_dropout
MARD1NO Jun 6, 2022
271a9d1
fix at
MARD1NO Jun 6, 2022
bc49aec
Merge branch 'dev_fused_matmul_bias_add_relu_dropout' of github.com:O…
MARD1NO Jun 6, 2022
1365234
fix userops td
MARD1NO Jun 6, 2022
5b3d144
Merge branch 'master' into dev_fused_matmul_bias_add_relu_dropout
MARD1NO Jun 6, 2022
4bff58b
fix userops td
MARD1NO Jun 6, 2022
ea1f983
Merge branch 'dev_fused_matmul_bias_add_relu_dropout' of github.com:O…
MARD1NO Jun 6, 2022
e080681
fix const ref
MARD1NO Jun 6, 2022
d5f8bda
Merge branch 'master' into dev_fused_matmul_bias_add_relu_dropout
MARD1NO Jun 6, 2022
eaa39a0
Merge branch 'master' into dev_fused_matmul_bias_add_relu_dropout
MARD1NO Jun 7, 2022
6a96225
Merge branch 'master' into dev_fused_matmul_bias_add_relu_dropout
mergify[bot] Jun 7, 2022
395b952
Merge branch 'master' into dev_fused_matmul_bias_add_relu_dropout
mergify[bot] Jun 7, 2022
781125d
Merge branch 'master' into dev_fused_matmul_bias_add_relu_dropout
MARD1NO Jun 8, 2022
23f1f3a
fix compile error for bfloat16
MARD1NO Jun 8, 2022
b29b3dc
Merge branch 'master' into dev_fused_matmul_bias_add_relu_dropout
mergify[bot] Jun 8, 2022
123fc58
limit to 11060
MARD1NO Jun 8, 2022
2d55661
Merge branch 'dev_fused_matmul_bias_add_relu_dropout' of github.com:O…
MARD1NO Jun 8, 2022
668a5c9
fix bug
MARD1NO Jun 8, 2022
60 changes: 21 additions & 39 deletions oneflow/core/autograd/gradient_funcs/cublas_fused_mlp.cpp
@@ -23,7 +23,7 @@ limitations under the License.
#include "oneflow/core/common/container_util.h"
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/functional/functional_api.yaml.h"
#if CUDA_VERSION >= 11040
#if CUDA_VERSION >= 11060

namespace oneflow {

@@ -94,14 +94,27 @@ Maybe<void> CublasFusedMLP::Apply(const CublasFusedMLPCaptureState* ctx,
const TensorTuple& out_grads, TensorTuple* in_grads) const {
int32_t weight_num = ctx->weight_num;
in_grads->resize(1 + 2 * weight_num);
std::shared_ptr<one::Tensor> last_bias_dy = JUST(VectorAt(out_grads, 0));

if (!ctx->skip_final_activation) {
// step1: use dy and final output to get last layer's relu grad.
last_bias_dy = JUST(functional::ReluGrad(JUST(VectorAt(out_grads, 0)),
JUST(VectorAt(ctx->SavedTensors(), 1 + weight_num))));
}

// step2: use reduce_sum to get last layer's bias grad.
std::vector<int32_t> reduce_axes_vec{0};
if (JUST(VectorAt(ctx->biases_requires_grad, weight_num - 1))) {
JUST(VectorAt(*in_grads, 2 * weight_num)) =
JUST(functional::ReduceSum(last_bias_dy, reduce_axes_vec, false));
}

TensorTuple hiddens(weight_num - 1);
TensorTuple weights(weight_num);
TensorTuple cublas_auxs(weight_num);
TensorTuple dgrad(weight_num);

std::shared_ptr<one::Tensor> x = JUST(VectorAt(ctx->SavedTensors(), 0));
std::shared_ptr<one::Tensor> out = JUST(VectorAt(ctx->SavedTensors(), 1 + weight_num));

for (int32_t i = 0; i < weight_num; ++i) {
weights[i] = JUST(VectorAt(ctx->SavedTensors(), 1 + i));
@@ -115,33 +128,6 @@ Maybe<void> CublasFusedMLP::Apply(const CublasFusedMLPCaptureState* ctx,
hiddens[i] = JUST(VectorAt(ctx->SavedTensors(), i + 2 + 2 * weight_num));
}

std::shared_ptr<one::Tensor> last_bias_dy = JUST(VectorAt(out_grads, 0));

if (!ctx->skip_final_activation) {
// step1: use dy and final output to get last layer's relu grad.
last_bias_dy = JUST(functional::ReluGrad(JUST(VectorAt(out_grads, 0)), out));
}

const bool last_layer_weight_requires_grad =
JUST(VectorAt(ctx->weights_requires_grad, weight_num - 1));
const bool last_layer_bias_requires_grad =
JUST(VectorAt(ctx->biases_requires_grad, weight_num - 1));

// For last layer, we use CublasMatmulBiasAddGrad to get wgrad and b grad.
if ((last_layer_weight_requires_grad || last_layer_bias_requires_grad)) {
// If there is only 1 layer, we use CublasMatmulBiasAddGrad to calculate first layer's dw.
std::shared_ptr<one::Tensor> last_layer_x = x;
if (weight_num != 1) { last_layer_x = JUST(VectorAt(hiddens, weight_num - 2)); }
const auto& last_layer_wgrad_bgrad =
JUST(functional::CublasMatmulBiasAddGrad(last_bias_dy, last_layer_x));
if (last_layer_weight_requires_grad) {
JUST(VectorAt(*in_grads, weight_num)) = JUST(VectorAt(*last_layer_wgrad_bgrad, 0));
}
if (last_layer_bias_requires_grad) {
JUST(VectorAt(*in_grads, 2 * weight_num)) = JUST(VectorAt(*last_layer_wgrad_bgrad, 1));
}
}

std::shared_ptr<one::Tensor> cublas_dy = last_bias_dy;
for (int32_t hidden_layer_idx = weight_num - 1; hidden_layer_idx > 0; hidden_layer_idx--) {
// If it is final layer, we use out_grads[0] as dy.
@@ -154,7 +140,7 @@ Maybe<void> CublasFusedMLP::Apply(const CublasFusedMLPCaptureState* ctx,
*/
const auto& matmul_relu_bias_bgrad = JUST(functional::CublasBiasAddReluMatmulGrad(
cublas_dy, JUST(VectorAt(weights, hidden_layer_idx)),
JUST(VectorAt(cublas_auxs, hidden_layer_idx - 1))));
JUST(VectorAt(cublas_auxs, hidden_layer_idx - 1)), /*alpha=*/1.0));

// dgrad
dgrad.at(hidden_layer_idx) = matmul_relu_bias_bgrad->at(0); // NOLINT
@@ -164,10 +150,8 @@ Maybe<void> CublasFusedMLP::Apply(const CublasFusedMLPCaptureState* ctx,
JUST(VectorAt(*in_grads, weight_num + hidden_layer_idx)) =
matmul_relu_bias_bgrad->at(1); // NOLINT
}
// dw, need to skip final layer, cause final layer's wgrad has used CublasMatmulBiasAddGrad to
// calculate.
if (JUST(VectorAt(ctx->weights_requires_grad, hidden_layer_idx))
&& hidden_layer_idx != weight_num - 1) {
// dw
if (JUST(VectorAt(ctx->weights_requires_grad, hidden_layer_idx))) {
JUST(VectorAt(*in_grads, (1 + hidden_layer_idx))) = JUST(functional::MatMul(
cublas_dy, JUST(VectorAt(hiddens, hidden_layer_idx - 1)), true, false, 1.0));
}
@@ -186,12 +170,10 @@ Maybe<void> CublasFusedMLP::Apply(const CublasFusedMLPCaptureState* ctx,
JUST(VectorAt(*in_grads, 0)) =
JUST(functional::MatMul(last_dy, JUST(VectorAt(weights, 0)), false, false, 1.0));
}
if (JUST(VectorAt(ctx->weights_requires_grad, 0)) && weight_num >= 2) {
// If weight_num == 1, dw has been calculated by CublasMatmulBiasAddGrad, so we need to skip.
if (JUST(VectorAt(ctx->weights_requires_grad, 0))) {
// dw:
JUST(VectorAt(*in_grads, 1)) =
JUST(functional::MatMul(last_dy, JUST(VectorAt(ctx->SavedTensors(), 0)), true, false,
1.0)); // use x instead just vectorat
JUST(functional::MatMul(last_dy, JUST(VectorAt(ctx->SavedTensors(), 0)), true, false, 1.0));
}

return Maybe<void>::Ok();
@@ -202,4 +184,4 @@ REGISTER_OP_EXPR_GRAD_FUNCTION("cublas_fused_mlp", CublasFusedMLP);
} // namespace one

} // namespace oneflow
#endif // CUDA_VERSION >= 11040
#endif // CUDA_VERSION >= 11060
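The net effect of the hunks above: the last layer of CublasFusedMLP no longer goes through CublasMatmulBiasAddGrad. Its bias grad is now a plain ReduceSum over dy (after ReluGrad when the final activation was not skipped), and its weight grad flows through the same MatMul path as every other layer. A minimal NumPy sketch of that last-layer computation, with assumed shapes dy (batch, out), out (batch, out), hidden (batch, in):

```python
import numpy as np

def last_layer_grads_sketch(dy, out, hidden, skip_final_activation):
    """Sketch of the rewritten last-layer grads in CublasFusedMLP::Apply."""
    if not skip_final_activation:
        dy = dy * (out > 0)       # functional::ReluGrad(dy, out)
    db = dy.sum(axis=0)           # functional::ReduceSum(last_bias_dy, {0}, keepdims=False)
    dw = dy.T @ hidden            # functional::MatMul(dy, hidden, transpose_a=True, transpose_b=False)
    return dw, db
```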
205 changes: 205 additions & 0 deletions
@@ -0,0 +1,205 @@
/*
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/common/error.pb.h"
#include "oneflow/core/common/just.h"
#include "oneflow/core/common/maybe.h"
#include "oneflow/core/framework/op_expr_grad_function.h"
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_expr.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
#include "oneflow/core/common/container_util.h"
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/functional/functional_api.yaml.h"
#if CUDA_VERSION >= 11060

namespace oneflow {

namespace one {

struct FusedMatmulBiasAddReluDropoutCaptureState : public AutoGradCaptureState {
int32_t weight_num = 0;
bool skip_final_activation = false;
bool x_requires_grad = false;
std::vector<bool> weights_requires_grad;
std::vector<bool> biases_requires_grad;
std::vector<float> dropout_rate_list;
};

class FusedMatmulBiasAddReluDropout
: public OpExprGradFunction<FusedMatmulBiasAddReluDropoutCaptureState> {
public:
Maybe<void> Init(const OpExpr& op) override;
Maybe<void> Capture(FusedMatmulBiasAddReluDropoutCaptureState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override;
Maybe<void> Apply(const FusedMatmulBiasAddReluDropoutCaptureState* ctx,
const TensorTuple& out_grads, TensorTuple* in_grads) const override;

protected:
AttrMap base_attrs_;
};

Maybe<void> FusedMatmulBiasAddReluDropout::Init(const OpExpr& op) {
const UserOpExpr* fw_op_expr = dynamic_cast<const UserOpExpr*>(&op);
CHECK_NOTNULL_OR_RETURN(fw_op_expr);
base_attrs_ = MakeAttrMapFromUserOpConf(fw_op_expr->proto());
return Maybe<void>::Ok();
}

Maybe<void> FusedMatmulBiasAddReluDropout::Capture(FusedMatmulBiasAddReluDropoutCaptureState* ctx,
const TensorTuple& inputs,
const TensorTuple& outputs,
const AttrMap& attrs) const {
CHECK_OR_RETURN(inputs.size() % 2 == 1) << "Both weight and bias should be passed together. ";
int32_t weight_num = (inputs.size() - 1) / 2;
ctx->weight_num = weight_num;
ctx->x_requires_grad = JUST(VectorAt(inputs, 0))->requires_grad();
ctx->weights_requires_grad.resize(weight_num);
ctx->biases_requires_grad.resize(weight_num);

for (int32_t i = 0; i < weight_num; i++) {
ctx->weights_requires_grad.at(i) = inputs.at(i + 1)->requires_grad(); // NOLINT
ctx->biases_requires_grad.at(i) = inputs.at(i + 1 + weight_num)->requires_grad(); // NOLINT
}

ctx->SaveTensorForBackward(JUST(VectorAt(inputs, 0))); // x. idx_sum:1
for (int32_t i = 0; i < weight_num; i++) {
ctx->SaveTensorForBackward(JUST(VectorAt(inputs, i + 1))); // weights. idx_sum:1+w
}

ctx->SaveTensorForBackward(JUST(VectorAt(outputs, 0))); // final layers output. idx_sum:2+w
for (int32_t i = 0; i < weight_num; i++) {
ctx->SaveTensorForBackward(
JUST(VectorAt(outputs, i + 1))); // cublas aux. need minus 1. idx_sum:2+2w
}
for (int32_t i = 0; i < weight_num - 1; i++) {
ctx->SaveTensorForBackward(JUST(VectorAt(outputs, i + 1 + weight_num))); // hidden.
}

ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->skip_final_activation = JUST(composed_attrs.GetAttr<bool>("skip_final_activation"));
ctx->dropout_rate_list = JUST(composed_attrs.GetAttr<std::vector<float>>("dropout_rate_list"));

return Maybe<void>::Ok();
}

Maybe<void> FusedMatmulBiasAddReluDropout::Apply(
const FusedMatmulBiasAddReluDropoutCaptureState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const {
int32_t weight_num = ctx->weight_num;
in_grads->resize(1 + 2 * weight_num);

TensorTuple hiddens(weight_num - 1);
TensorTuple weights(weight_num);
TensorTuple cublas_auxs(weight_num);
TensorTuple dgrad(weight_num);

std::shared_ptr<one::Tensor> x = JUST(VectorAt(ctx->SavedTensors(), 0));
std::shared_ptr<one::Tensor> out = JUST(VectorAt(ctx->SavedTensors(), 1 + weight_num));

for (int32_t i = 0; i < weight_num; ++i) {
weights[i] = JUST(VectorAt(ctx->SavedTensors(), 1 + i));
}

for (int32_t i = 0; i < weight_num; ++i) {
cublas_auxs[i] = JUST(VectorAt(ctx->SavedTensors(), i + 2 + weight_num));
}

for (int32_t i = 0; i < weight_num - 1; ++i) {
hiddens[i] = JUST(VectorAt(ctx->SavedTensors(), i + 2 + 2 * weight_num));
}
float rate = ctx->dropout_rate_list.at(weight_num - 1);
float scale = 0.0f;
if (rate < 1.0f) { scale = 1.0f / (1.0f - rate); }

/*
step1: use dy and the saved mask to get the last layer's dropout + relu grad.
Because the curand_uniform distribution is (0.0, 1.0], the value after relu is written into the
mask as well. DropoutGrad then uses this mask to generate the grad, producing the dropout and
relu grads simultaneously.
*/
std::shared_ptr<one::Tensor> last_bias_dy = JUST(VectorAt(out_grads, 0));
if (!ctx->skip_final_activation || rate != 0.0f) {
last_bias_dy = JUST(functional::FusedReluDropoutGrad(JUST(VectorAt(out_grads, 0)),
cublas_auxs[weight_num - 1], scale));
}

// step2: use reduce_sum to get last layer's bias grad.
std::vector<int32_t> reduce_axes_vec{0};
if (JUST(VectorAt(ctx->biases_requires_grad, weight_num - 1))) {
JUST(VectorAt(*in_grads, 2 * weight_num)) =
JUST(functional::ReduceSum(last_bias_dy, reduce_axes_vec, false));
}

std::shared_ptr<one::Tensor> cublas_dy = last_bias_dy;
for (int32_t hidden_layer_idx = weight_num - 1; hidden_layer_idx > 0; hidden_layer_idx--) {
// If it is final layer, we use out_grads[0] as dy.
if (hidden_layer_idx != weight_num - 1) {
cublas_dy = JUST(VectorAt(dgrad, hidden_layer_idx + 1));
}
rate = ctx->dropout_rate_list.at(hidden_layer_idx - 1);
scale = 1.0;
if (rate < 1.0f) { scale = 1.0f / (1.0f - rate); }
/*
Here we use cublas to compute bias + relu + matmul grad.
Then use Matmul to compute weight grad.
*/
const auto& matmul_relu_bias_bgrad = JUST(functional::CublasBiasAddReluMatmulGrad(
cublas_dy, JUST(VectorAt(weights, hidden_layer_idx)),
JUST(VectorAt(cublas_auxs, hidden_layer_idx - 1)), /*alpha=*/scale));

// dgrad
dgrad.at(hidden_layer_idx) = matmul_relu_bias_bgrad->at(0); // NOLINT

if (JUST(VectorAt(ctx->biases_requires_grad, (hidden_layer_idx - 1)))) {
// dbias
JUST(VectorAt(*in_grads, weight_num + hidden_layer_idx)) =
matmul_relu_bias_bgrad->at(1); // NOLINT
}
// dw
if (JUST(VectorAt(ctx->weights_requires_grad, hidden_layer_idx))) {
JUST(VectorAt(*in_grads, (1 + hidden_layer_idx))) = JUST(functional::MatMul(
cublas_dy, JUST(VectorAt(hiddens, hidden_layer_idx - 1)), true, false, 1.0));
}
}

// For the first layer, we need to use 2 matmul to get grads.
std::shared_ptr<one::Tensor> last_dy;
if (weight_num != 1) {
last_dy = JUST(VectorAt(dgrad, 1));
} else {
last_dy = last_bias_dy;
}

if (ctx->x_requires_grad) {
// dx:
JUST(VectorAt(*in_grads, 0)) =
JUST(functional::MatMul(last_dy, JUST(VectorAt(weights, 0)), false, false, 1.0));
}
if (JUST(VectorAt(ctx->weights_requires_grad, 0))) {
// dw:
JUST(VectorAt(*in_grads, 1)) =
JUST(functional::MatMul(last_dy, JUST(VectorAt(ctx->SavedTensors(), 0)), true, false, 1.0));
}

return Maybe<void>::Ok();
}

REGISTER_OP_EXPR_GRAD_FUNCTION("fused_matmul_bias_add_relu_dropout", FusedMatmulBiasAddReluDropout);

} // namespace one

} // namespace oneflow
#endif // CUDA_VERSION >= 11060
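The heart of the new gradient function is the shared bitmask: the forward kernel marks a position only if it both passed ReLU and was kept by dropout (the comment above explains that, because curand_uniform samples from (0.0, 1.0], the ReLU result ends up encoded in the same mask). FusedReluDropoutGrad can therefore undo both ops with one masked scale, and the same 1/(1 - rate) factor is forwarded as the new alpha argument of CublasBiasAddReluMatmulGrad for the hidden layers. A minimal sketch, assuming a boolean mask:

```python
import numpy as np

def fused_relu_dropout_grad_sketch(dy, mask, rate):
    """Sketch of FusedReluDropoutGrad semantics: one bitmask encodes both
    'passed ReLU' and 'kept by dropout', so a single masked scale yields
    the combined relu + dropout gradient."""
    scale = 1.0 / (1.0 - rate) if rate < 1.0 else 0.0   # matches the kernel's scale guard
    return dy * mask * scale
```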
12 changes: 11 additions & 1 deletion oneflow/core/functional/functional_api.yaml
@@ -975,14 +975,24 @@

- name: "cublas_bias_add_relu_matmul_grad"
signature:
"TensorTuple (Tensor dy, Tensor weight, Tensor aux) => CublasBiasAddReluMatmulGrad"
"TensorTuple (Tensor dy, Tensor weight, Tensor aux, Double alpha=1.0) => CublasBiasAddReluMatmulGrad"
bind_python: False

- name: "cublas_matmul_bias_add_grad"
signature:
"TensorTuple (Tensor dy, Tensor x) => CublasMatmulBiasAddGrad"
bind_python: False

- name: "fused_matmul_bias_add_relu_dropout"
signature:
"Tensor (Tensor x, TensorTuple weights, TensorTuple biases, Bool skip_final_activation, FloatList dropout_rate_list, Generator generator=None) => FusedMatmulBiasAddReluDropout"
bind_python: True

- name: "fused_relu_dropout_grad"
signature:
"Tensor (Tensor dy, Tensor mask, Float scale) => FusedReluDropoutGrad"
bind_python: False

- name: "broadcast_matmul_grad_b"
signature: "Tensor (Tensor a, Tensor b, Double alpha=1.0) => BroadcastMatmulGradB"
bind_python: False
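Since the new functional entry is registered with bind_python: True, it should be reachable from Python through the generated functional bindings. A hedged usage sketch follows; the flow._C entry point, keyword-argument names, and the two-layer shapes are assumptions based on the yaml signature, not checked against the released API:

```python
import oneflow as flow

x = flow.randn(8, 64, device="cuda", dtype=flow.float16)
w1 = flow.randn(128, 64, device="cuda", dtype=flow.float16)   # (out_features, in_features)
b1 = flow.randn(128, device="cuda", dtype=flow.float16)
w2 = flow.randn(32, 128, device="cuda", dtype=flow.float16)
b2 = flow.randn(32, device="cuda", dtype=flow.float16)

# Assumed binding generated from functional_api.yaml:
#   fused_matmul_bias_add_relu_dropout(x, weights, biases,
#                                      skip_final_activation, dropout_rate_list)
out = flow._C.fused_matmul_bias_add_relu_dropout(
    x, (w1, w2), (b1, b2),
    skip_final_activation=True,      # no ReLU after the last layer
    dropout_rate_list=[0.5, 0.0],    # one rate per layer
)
print(out.shape)  # expected: (8, 32)
```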