[MXNET-978] Second order gradient support for some unary operators #14613
Changes from 7 commits
@@ -224,7 +224,17 @@ The storage type of ``elemwise_mul`` output depends on storage types of inputs
     return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
   })
 .add_alias("_mul").add_alias("_Mul")
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_mul"});
+.set_attr<nnvm::FGradient>("FGradient",
+    [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
+      auto lhs_grad = MakeNode("elemwise_mul", n->attrs.name + "_backward_lhs",
+                               {ograds[0], n->inputs[1]}, nullptr, &n);
+      auto rhs_grad = MakeNode("elemwise_mul", n->attrs.name + "_backward_rhs",
+                               {ograds[0], n->inputs[0]}, nullptr, &n);
+      std::vector<nnvm::NodeEntry> ret;
+      ret.emplace_back(nnvm::NodeEntry{lhs_grad, 0, 0});
+      ret.emplace_back(nnvm::NodeEntry{rhs_grad, 0, 0});
+      return ret;
+    });

 NNVM_REGISTER_OP(_backward_mul)
 .set_num_inputs(3)

Review comment on ret.emplace_back(nnvm::NodeEntry{lhs_grad, 0, 0}):
Can we simplify as per #14095? ret.emplace_back(MakeNode(...));
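For orientation, a minimal Python sketch (not part of this diff) of the identity the lambda above encodes: the product rule gives ograd * rhs and ograd * lhs as the two input gradients, and expressing them with differentiable elemwise_mul nodes is what lets autograd differentiate through the first-order gradient. Values below are purely illustrative.

    from mxnet import nd, autograd

    lhs = nd.array([1.0, 2.0, 3.0])
    rhs = nd.array([4.0, 5.0, 6.0])
    lhs.attach_grad()
    rhs.attach_grad()
    with autograd.record():
        y = nd.elemwise_mul(lhs, rhs)
    y.backward(nd.ones_like(y))   # head gradient (ograd) of ones
    print(lhs.grad)               # ograd * rhs -> [4. 5. 6.]
    print(rhs.grad)               # ograd * lhs -> [1. 2. 3.]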
@@ -83,7 +83,15 @@ The storage type of ``relu`` output depends upon the input storage type:
 - relu(csr) = csr

 )code" ADD_FILELINE)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_relu"});
+.set_attr<nnvm::FGradient>("FGradient",
+    [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
+      auto zero_node = MakeNode("zeros_like", n->attrs.name + "_relu_backward", {n->inputs[0]}, nullptr, &n);
+      auto x_grad = MakeNode("_greater", n->attrs.name + "_mid_x_grad", {n->inputs[0], nnvm::NodeEntry{zero_node, 0, 0}}, nullptr, &n);
+      auto in_grad = MakeNode("elemwise_mul", n->attrs.name + "_backward", {ograds[0], nnvm::NodeEntry{x_grad, 0, 0}}, nullptr, &n);
+      std::vector<nnvm::NodeEntry> ret;
+      ret.emplace_back(nnvm::NodeEntry{in_grad, 0, 0});
+      return ret;
+    });

 MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_backward_relu,
                                                unary_bwd<mshadow_op::relu_grad>);

Review comment on the new FGradient for relu:
We should measure whether this causes regressions, as we discussed; otherwise we should add an FGradient to _backward_relu. I think the same applies to the other functions.
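For intuition, a short sketch (not part of this diff) of what the graph built above computes: the incoming gradient multiplied by the indicator (x > 0), i.e. relu'(x), written with differentiable operators so it can itself be differentiated. The same identity checked directly on NDArrays, with illustrative values:

    from mxnet import nd

    x = nd.array([-1.0, 0.5, 2.0])
    ograd = nd.array([10.0, 10.0, 10.0])          # incoming head gradient
    indicator = x > nd.zeros_like(x)              # plays the role of the _greater node
    in_grad = nd.elemwise_mul(ograd, indicator)   # gradient of relu w.r.t. x
    print(in_grad)                                # [ 0. 10. 10.]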
@@ -648,7 +656,13 @@ The storage type of ``negative`` output depends upon the input storage type:
 - negative(csr) = csr

 )code")
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"negative"});
+.set_attr<nnvm::FGradient>("FGradient",
+    [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
+      auto in_grad = MakeNode("negative", n->attrs.name + "_backward", {ograds[0]}, nullptr, &n);
+      std::vector<nnvm::NodeEntry> ret;
+      ret.emplace_back(nnvm::NodeEntry{in_grad, 0, 0});
+      return ret;
+    });

 // reciprocal
 MXNET_OPERATOR_REGISTER_UNARY(reciprocal)
@@ -0,0 +1,89 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+import numpy as np
+from mxnet import gluon, nd, autograd
+from mxnet.test_utils import assert_almost_equal
+from tests.python.unittest.common import with_seed
+
+
+@with_seed()
+def test_elemwise_mul():
+    x = nd.array([1, 2, 3])
+    y = nd.zeros(3)
+    x.attach_grad()
+    with autograd.record():
+        y = nd.elemwise_mul(x, x)
+        y_grad = autograd.grad(y, x, create_graph=True, retain_graph=True)[0]
+    y_grad.backward()
+    expect_grad = nd.array([2, 2, 2])
+    assert_almost_equal(expect_grad.asnumpy(), x.grad.asnumpy())
+
+

Review comment on y = nd.zeros(3):
Do we need this y?
+@with_seed()
+def test_sin():
+    def sin(x):
+        return nd.sin(x)
+
+    x = nd.array([1, 2, 3])
+    expect_grad = -nd.sin(x)
+    check_second_order_unary(x, sin, expect_grad)
+
+
+@with_seed()
+def test_cos():
+    def cos(x):
+        return nd.cos(x)
+
+    x = nd.array([1, 2, 3])
+    expect_grad = -nd.cos(x)
+    check_second_order_unary(x, cos, expect_grad)
+
+

Review comment on x = nd.array([1, 2, 3]):
Can we randomize the test arrays with random_arrays and rand_shape_2d?

Reply:
I think for second order, not using random inputs helps reason about the gradient result...
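A hedged sketch of what the reviewer's randomization suggestion might look like for test_cos, assuming the random_arrays and rand_shape_2d helpers from mxnet.test_utils and the check_second_order_unary helper defined later in this file; the exact wiring is illustrative, not code from this PR.

    from mxnet import nd
    from mxnet.test_utils import random_arrays, rand_shape_2d

    def test_cos_randomized():
        def cos(x):
            return nd.cos(x)

        shape = rand_shape_2d()                 # random (rows, cols) shape
        x = nd.array(random_arrays(shape))      # random values for that shape
        expect_grad = -nd.cos(x)                # analytic second derivative of cos
        check_second_order_unary(x, cos, expect_grad)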
+@with_seed()
+def test_negative():
+    def negative(x):
+        return nd.negative(x)
+
+    x = nd.array([1, 2, 3])
+    expect_grad = nd.zeros_like(x)
+    check_second_order_unary(x, negative, expect_grad)
+
+

Review comment on x = nd.array([1, 2, 3]):
Same as above, and for the rest of the tests.

+@with_seed()
+def test_relu():
+    def relu(x):
+        return nd.relu(x)
+
+    x = nd.array([1, 2, 3])
+    expect_grad = nd.zeros_like(x)
+    check_second_order_unary(x, relu, expect_grad)
+
+
+def check_second_order_unary(x, op, expect_grad):
+    x.attach_grad()
+    with autograd.record():
+        y = op(x)
+        y_grad = autograd.grad(y, x, create_graph=True, retain_graph=True)[0]
+    y_grad.backward()
+    assert_almost_equal(expect_grad.asnumpy(), x.grad.asnumpy())
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
Review comment:
Why change from CHECK to a warning?

Reply:
The current backward pass requires that an operator have at least one input, because the gradient of a constant is always zero. However, the second-order gradient of some operators, such as relu, is actually the gradient of a constant (ones or zeros). Therefore we need to support gradients for constant operators.

Reply:
I think we should dive deeper into this one. Does it produce the warning (or, previously, the failure) for some of the test cases? In the original code I think the intention is to check whether any input nodes have a gradient attached. I understand your explanation, but what I don't see is where we would store the gradient for such constants. Is it because the grad_req of the constant is kNullOp? The constant is just another node, right?

Reply:
The root cause is the second-order gradient of the negative(x) operator. Its backward graph does not require any input and therefore triggers this condition. If I remove the test for negative(x), then we do not need to modify this.
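To make the negative(x) case concrete, a minimal sketch (not part of the PR, values illustrative): the first-order gradient of negative is just the negated head gradient, so the graph differentiated in the second-order pass has no dependence on x, which is the no-input backward graph described above.

    from mxnet import nd, autograd

    x = nd.array([1.0, 2.0, 3.0])
    x.attach_grad()
    with autograd.record():
        y = nd.negative(x)
        # dy/dx is the negated head gradient; it does not depend on x at all.
        dy_dx = autograd.grad(y, x, create_graph=True, retain_graph=True)[0]
    dy_dx.backward()
    print(x.grad)   # zeros: the second-order gradient of negative is 0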