[MXNET-101] Support float16 in LeakyReLU operator (apache#10169)
* support for any datatype in leaky ReLU

* test for LeakyReLU operators

* make lint

* clean up unnecessary prints

* fix for amalgamation build failure

* add InferType for Leaky ReLU and slight modification to the tests
haojin2 authored and ashokei committed Mar 27, 2018
1 parent 3d10a22 commit 5ee08c7
Showing 6 changed files with 201 additions and 46 deletions.
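
For orientation before the per-file diffs: LeakyReLU covers four act_type variants, and this commit makes all of them dtype-generic. A rough NumPy sketch of what each variant computes (function names are mine; the leaky/elu formulas mirror the reference implementations added to the tests below, and the rrelu/prelu behaviour is my reading of the operator code):

```python
import numpy as np

def leaky(x, slope=0.25):
    # act_type='leaky': x for x > 0, slope * x otherwise
    return np.where(x > 0, x, slope * x)

def elu(x, slope=0.25):
    # act_type='elu': x for x > 0, slope * (exp(x) - 1) otherwise
    return np.where(x > 0, x, slope * np.expm1(x))

def prelu(x, gamma):
    # act_type='prelu': like leaky, but the slope gamma is a learned parameter
    # (scalar here; per-channel gamma applies along axis 1 in the operator)
    return np.where(x > 0, x, gamma * x)

def rrelu(x, lower=0.125, upper=0.334, train=True, rng=np.random):
    # act_type='rrelu': slope drawn uniformly from [lower, upper] at training time,
    # fixed at the midpoint (lower + upper) / 2 at inference time
    slope = rng.uniform(lower, upper, size=x.shape) if train else (lower + upper) / 2.0
    return np.where(x > 0, x, slope * x)
```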
136 changes: 100 additions & 36 deletions src/operator/leaky_relu-inl.h
@@ -34,8 +34,11 @@
#include <string>
#include <vector>
#include <utility>
#include "../common/random_generator.h"
#include "./operator_common.h"
#include "./mshadow_op.h"
#include "./random/sampler.h"
#include "./random/sample_op.h"

namespace mxnet {
namespace op {
@@ -75,7 +78,7 @@ struct prelu_grad {
}
};

template<typename xpu>
template<typename xpu, typename DType>
class LeakyReLUOp : public Operator {
public:
explicit LeakyReLUOp(LeakyReLUParam param) {
@@ -92,25 +95,25 @@ class LeakyReLUOp : public Operator {
size_t expected = param_.act_type == leakyrelu::kPReLU ? 2 : 1;
CHECK_EQ(in_data.size(), expected);
Stream<xpu> *s = ctx.get_stream<xpu>();
Tensor<xpu, 3> data;
Tensor<xpu, 3> out;
Tensor<xpu, 3> mask;
Tensor<xpu, 1> weight;
Tensor<xpu, 3, DType> data;
Tensor<xpu, 3, DType> out;
Tensor<xpu, 3, DType> mask;
Tensor<xpu, 1, DType> weight;
int n = in_data[leakyrelu::kData].shape_[0];
int k = in_data[leakyrelu::kData].shape_[1];
Shape<3> dshape = Shape3(n, k, in_data[leakyrelu::kData].Size()/n/k);
data = in_data[leakyrelu::kData].get_with_shape<xpu, 3, real_t>(dshape, s);
out = out_data[leakyrelu::kOut].get_with_shape<xpu, 3, real_t>(dshape, s);
if (param_.act_type == leakyrelu::kRReLU) {
mask = out_data[leakyrelu::kMask].get_with_shape<xpu, 3, real_t>(dshape, s);
}
data = in_data[leakyrelu::kData].get_with_shape<xpu, 3, DType>(dshape, s);
out = out_data[leakyrelu::kOut].get_with_shape<xpu, 3, DType>(dshape, s);
switch (param_.act_type) {
case leakyrelu::kLeakyReLU: {
Assign(out, req[leakyrelu::kOut], F<mshadow_op::xelu>(data, param_.slope));
MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::xelu, Req>, xpu>::Launch(
s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_, DType(param_.slope));
});
break;
}
case leakyrelu::kPReLU: {
weight = in_data[leakyrelu::kGamma].get<xpu, 1, real_t>(s);
weight = in_data[leakyrelu::kGamma].get<xpu, 1, DType>(s);
if (weight.shape_.Size() == 1) {
Assign(out, req[leakyrelu::kOut],
F<mshadow_op::xelu>(data, mshadow::expr::broadcast_scalar(weight, out.shape_)));
@@ -122,18 +125,43 @@
}
case leakyrelu::kRReLU: {
if (ctx.is_train) {
Random<xpu>* prnd = ctx.requested[leakyrelu::kRandom].get_random<xpu, real_t>(s);
mask = prnd->uniform(mask.shape_);
mask = mask * (param_.upper_bound - param_.lower_bound) + param_.lower_bound;
Assign(out, req[leakyrelu::kOut], F<mshadow_op::xelu>(data, mask));
mask = out_data[leakyrelu::kMask].get_with_shape<xpu, 3, DType>(dshape, s);
mxnet::op::UniformSampler<xpu> sampler;
Tensor<xpu, 1, DType> low, high;
mxnet::op::GetSamplingTempData<xpu, DType>(DType(0.0f), DType(1.0f), ctx, &low, &high);
mxnet::common::random::RandGenerator<xpu, DType> *pgen =
ctx.requested[0].get_parallel_random<xpu, DType>();
Tensor<xpu, 1, DType> out = mask.FlatTo1D();
sampler.Sample(low, high, out, pgen, s);
MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kMask], Req, {
mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::mul, Req>, xpu>::Launch(
s, mask.size(0) * mask.size(1) * mask.size(2), mask.dptr_, mask.dptr_,
DType(param_.upper_bound - param_.lower_bound));
});
MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kMask], Req, {
mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, Req>, xpu>::Launch(
s, mask.size(0) * mask.size(1) * mask.size(2), mask.dptr_, mask.dptr_,
DType(param_.lower_bound));
});
MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::xelu, Req>, xpu>::Launch(
s, mask.size(0) * mask.size(1) * mask.size(2), out.dptr_, data.dptr_, mask.dptr_);
});
} else {
const float slope = (param_.lower_bound + param_.upper_bound) / 2.0f;
Assign(out, req[leakyrelu::kOut], F<mshadow_op::xelu>(data, slope));
MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::xelu, Req>, xpu>::Launch(
s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_, DType(slope));
});
}
break;
}
case leakyrelu::kELU: {
Assign(out, req[leakyrelu::kOut], F<mshadow_op::elu>(data, param_.slope));
MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
mxnet_op::Kernel<mxnet_op::op_with_req<mxnet::op::mshadow_op::elu, Req>, xpu>::Launch(
s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_,
DType(param_.slope));
});
break;
}
default:
@@ -155,33 +183,38 @@ class LeakyReLUOp : public Operator {
CHECK_EQ(req.size(), expected);
CHECK_EQ(in_data.size(), expected);
Stream<xpu> *s = ctx.get_stream<xpu>();
Tensor<xpu, 3> output;
Tensor<xpu, 3> data;
Tensor<xpu, 3> gdata;
Tensor<xpu, 3> grad;
Tensor<xpu, 3> mask;
Tensor<xpu, 1> weight;
Tensor<xpu, 1> grad_weight;
Tensor<xpu, 3, DType> output;
Tensor<xpu, 3, DType> data;
Tensor<xpu, 3, DType> gdata;
Tensor<xpu, 3, DType> grad;
Tensor<xpu, 3, DType> mask;
Tensor<xpu, 1, DType> weight;
Tensor<xpu, 1, DType> grad_weight;
int n = out_grad[leakyrelu::kOut].shape_[0];
int k = out_grad[leakyrelu::kOut].shape_[1];
Shape<3> dshape = Shape3(n, k, out_grad[leakyrelu::kOut].Size()/n/k);
grad = out_grad[leakyrelu::kOut].get_with_shape<xpu, 3, real_t>(dshape, s);
gdata = in_grad[leakyrelu::kData].get_with_shape<xpu, 3, real_t>(dshape, s);
output = out_data[leakyrelu::kOut].get_with_shape<xpu, 3, real_t>(dshape, s);
grad = out_grad[leakyrelu::kOut].get_with_shape<xpu, 3, DType>(dshape, s);
gdata = in_grad[leakyrelu::kData].get_with_shape<xpu, 3, DType>(dshape, s);
output = out_data[leakyrelu::kOut].get_with_shape<xpu, 3, DType>(dshape, s);
if (param_.act_type == leakyrelu::kRReLU) {
mask = out_data[leakyrelu::kMask].get_with_shape<xpu, 3, real_t>(dshape, s);
mask = out_data[leakyrelu::kMask].get_with_shape<xpu, 3, DType>(dshape, s);
}
if (param_.act_type == leakyrelu::kPReLU) {
data = in_data[leakyrelu::kData].get_with_shape<xpu, 3, real_t>(dshape, s);
data = in_data[leakyrelu::kData].get_with_shape<xpu, 3, DType>(dshape, s);
}
switch (param_.act_type) {
case leakyrelu::kLeakyReLU: {
Assign(gdata, req[leakyrelu::kData], F<mshadow_op::xelu_grad>(output, param_.slope) * grad);
MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kData], Req, {
mxnet_op::Kernel<mxnet_op::op_with_req<
mxnet_op::backward_grad_tuned<mxnet::op::mshadow_op::xelu_grad>, Req>, xpu>::Launch(
s, gdata.size(0) * gdata.size(1) * gdata.size(2), gdata.dptr_, grad.dptr_,
output.dptr_, DType(param_.slope));
});
break;
}
case leakyrelu::kPReLU: {
weight = in_data[leakyrelu::kGamma].get<xpu, 1, real_t>(s);
grad_weight = in_grad[leakyrelu::kGamma].get<xpu, 1, real_t>(s);
weight = in_data[leakyrelu::kGamma].get<xpu, 1, DType>(s);
grad_weight = in_grad[leakyrelu::kGamma].get<xpu, 1, DType>(s);
if (weight.shape_.Size() == 1) {
Shape<4> gshape = Shape4(1, grad.shape_[0], grad.shape_[1], grad.shape_[2]);
Assign(grad_weight, req[leakyrelu::kGamma],
@@ -204,7 +237,12 @@ class LeakyReLUOp : public Operator {
break;
}
case leakyrelu::kELU: {
Assign(gdata, req[leakyrelu::kData], F<mshadow_op::elu_grad>(output, param_.slope) * grad);
MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kData], Req, {
mxnet_op::Kernel<mxnet_op::op_with_req<
mxnet_op::backward_grad_tuned<mxnet::op::mshadow_op::elu_grad>, Req>, xpu>::Launch(
s, gdata.size(0) * gdata.size(1) * gdata.size(2), gdata.dptr_, grad.dptr_,
output.dptr_, DType(param_.slope));
});
break;
}
default:
@@ -217,7 +255,7 @@
}; // class LeakyReLUOp

template<typename xpu>
Operator* CreateOp(LeakyReLUParam type);
Operator* CreateOp(LeakyReLUParam type, int dtype);

#if DMLC_USE_CXX11
class LeakyReLUProp : public OperatorProperty {
@@ -256,6 +294,26 @@ class LeakyReLUProp : public OperatorProperty {
return true;
}

bool InferType(std::vector<int> *in_type,
std::vector<int> *out_type,
std::vector<int> *aux_type) const override {
int dtype = -1;
for (const int& type : *in_type) {
type_assign(&dtype, type);
}
for (const int& type : *out_type) {
type_assign(&dtype, type);
}

for (size_t i = 0; i < in_type->size(); ++i) {
TYPE_ASSIGN_CHECK(*in_type, i, dtype);
}
for (size_t i = 0; i < out_type->size(); ++i) {
TYPE_ASSIGN_CHECK(*out_type, i, dtype);
}
return dtype != -1;
}

OperatorProperty* Copy() const override {
auto ptr = new LeakyReLUProp();
ptr->param_ = param_;
@@ -338,7 +396,13 @@
}
}

Operator* CreateOperator(Context ctx) const override;
Operator* CreateOperator(Context ctx) const override {
LOG(FATAL) << "Not Implemented.";
return NULL;
}

Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
std::vector<int> *in_type) const override;

private:
LeakyReLUParam param_;
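
The new InferType above assigns one common dtype to every input and output of the operator, so a float16 input should now propagate through type inference (including to gamma for PReLU). A minimal sketch of how that surfaces in the Python symbol API, assuming a build that includes this change:

```python
import mxnet as mx
import numpy as np

x = mx.sym.Variable('x')
y = mx.sym.LeakyReLU(data=x, act_type='leaky', slope=0.25)
arg_types, out_types, _ = y.infer_type(x=np.float16)
print(arg_types, out_types)   # expect float16 for both the input and the output

g = mx.sym.Variable('gamma')
z = mx.sym.LeakyReLU(data=x, gamma=g, act_type='prelu')
arg_types, _, _ = z.infer_type(x=np.float16)
print(arg_types)              # gamma should be inferred as float16 as well
```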
13 changes: 9 additions & 4 deletions src/operator/leaky_relu.cc
@@ -30,12 +30,17 @@
namespace mxnet {
namespace op {
template<>
Operator *CreateOp<cpu>(LeakyReLUParam param) {
return new LeakyReLUOp<cpu>(param);
Operator *CreateOp<cpu>(LeakyReLUParam param, int dtype) {
Operator* op = NULL;
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
op = new LeakyReLUOp<cpu, DType>(param);
});
return op;
}

Operator *LeakyReLUProp::CreateOperator(Context ctx) const {
DO_BIND_DISPATCH(CreateOp, param_);
Operator *LeakyReLUProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
std::vector<int> *in_type) const {
DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
}

DMLC_REGISTER_PARAMETER(LeakyReLUParam);
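
With CreateOperatorEx dispatching on in_type->at(0), the CPU operator is now instantiated as LeakyReLUOp<cpu, DType> for whichever real dtype was inferred. A rough imperative check over the three supported precisions (the float64 reference and the tolerances are my choices, not part of the commit):

```python
import mxnet as mx
import numpy as np

xa = np.random.uniform(-1.0, 1.0, (3, 4))
ref = mx.nd.LeakyReLU(mx.nd.array(xa, dtype=np.float64),
                      act_type='leaky', slope=0.25).asnumpy()

for dtype, tol in [(np.float16, 1e-3), (np.float32, 1e-6), (np.float64, 1e-12)]:
    out = mx.nd.LeakyReLU(mx.nd.array(xa, dtype=dtype), act_type='leaky', slope=0.25)
    assert out.dtype == dtype                                   # dtype is preserved
    assert np.allclose(out.asnumpy(), ref, rtol=tol, atol=tol)  # matches the high-precision reference
```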
8 changes: 6 additions & 2 deletions src/operator/leaky_relu.cu
@@ -29,8 +29,12 @@
namespace mxnet {
namespace op {
template<>
Operator *CreateOp<gpu>(LeakyReLUParam param) {
return new LeakyReLUOp<gpu>(param);
Operator *CreateOp<gpu>(LeakyReLUParam param, int dtype) {
Operator* op = NULL;
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
op = new LeakyReLUOp<gpu, DType>(param);
});
return op;
}

} // namespace op
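
The .cu change mirrors the CPU dispatch, so the same dtypes should be available on GPU. A quick sanity check, assuming a CUDA-enabled build with a device at mx.gpu(0):

```python
import mxnet as mx
import numpy as np

ctx = mx.gpu(0)   # assumes a CUDA build; swap in mx.cpu() otherwise
x = mx.nd.array(np.random.uniform(-1, 1, (3, 4)), ctx=ctx, dtype=np.float16)
y = mx.nd.LeakyReLU(x, act_type='leaky', slope=0.25)
print(y.dtype)    # expect numpy.float16
```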
15 changes: 11 additions & 4 deletions src/operator/mshadow_op.h
@@ -89,6 +89,13 @@ MXNET_UNARY_MATH_OP_NC(identity, a);

MXNET_UNARY_MATH_OP(identity_grad, 1);

struct identity_with_cast {
template<typename DTypeIn, typename DTypeOut>
MSHADOW_XINLINE static void Map(int i, DTypeOut *out, DTypeIn *in) {
out[i] = DTypeOut(in[i]);
}
};

MXNET_BINARY_MATH_OP_NC(left, a);

MXNET_BINARY_MATH_OP_NC(right, b);
@@ -119,13 +126,13 @@ MXNET_UNARY_MATH_OP_NC(relu, a > DType(0) ? a : DType(0));

MXNET_UNARY_MATH_OP_NC(relu_grad, a > DType(0) ? DType(1) : DType(0));

MXNET_BINARY_MATH_OP(xelu, a > DType(0) ? math::id(a) :
math::id(a) * math::id(b));
MXNET_BINARY_MATH_OP_NC(xelu, a > DType(0) ? a :
DType(static_cast<float>(a) * static_cast<float>(b)));

MXNET_BINARY_MATH_OP_NC(xelu_grad, a > DType(0) ? DType(1) : b);

MXNET_BINARY_MATH_OP(elu, a > DType(0) ? math::id(a) :
math::id(b) * math::expm1(a));
MXNET_BINARY_MATH_OP_NC(elu, a > DType(0) ? a :
DType(math::id(b) * math::expm1(a)));

MXNET_BINARY_MATH_OP_NC(elu_grad, a > DType(0) ? DType(1) : DType(b + a));

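
The reworked xelu/elu macros keep everything in DType, and their gradients are evaluated against the forward output by the backward kernels above, so elu_grad(y, slope) is y + slope on the negative branch, which equals slope * exp(x). A small NumPy restatement plus a finite-difference sanity check of that identity; this is my reading of the macros, not code from the commit:

```python
import numpy as np

def xelu_grad(y, slope):   # d(leaky)/dx: 1 where the output is positive, slope otherwise
    return np.where(y > 0, 1.0, slope)

def elu_grad(y, slope):    # d(elu)/dx = slope * exp(x) = y + slope on the negative branch
    return np.where(y > 0, 1.0, y + slope)

# finite-difference check of the elu gradient identity at a negative point
x, slope, eps = -0.8, 0.25, 1e-6
y = slope * np.expm1(x)
fd = (slope * np.expm1(x + eps) - y) / eps
print(np.isclose(fd, elu_grad(np.float64(y), slope)))   # True
```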
4 changes: 4 additions & 0 deletions src/operator/operator_tune.cc
@@ -314,9 +314,13 @@ IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::right); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::right); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::power); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rpower); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::xelu); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::elu); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::power_grad); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rpower_grad); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::power_rgrad); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::xelu_grad); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::elu_grad); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::maximum); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::minimum); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::hypot); // NOLINT()
71 changes: 71 additions & 0 deletions tests/python/unittest/test_operator.py
@@ -489,6 +489,77 @@ def frelu_grad(x):
check_symbolic_backward(y, [xa], [np.ones(shape)], [ga])


@with_seed(1234)
def test_leaky_relu():
def fleaky_relu(x, act_type, slope=0.25):
neg_indices = x < 0
out = x.copy()
if act_type == 'elu':
out[neg_indices] = slope * (np.exp(out[neg_indices]) - 1.)
elif act_type == 'leaky':
out[neg_indices] = slope * out[neg_indices]
return out
def fleaky_relu_grad(grad, x, y, act_type, slope=0.25):
neg_indices = x < 0
out = np.ones(x.shape)
if act_type == 'elu':
out[neg_indices] = y[neg_indices] + slope
elif act_type == 'leaky':
out[neg_indices] = slope
return out * grad
shape = (3, 4)
x = mx.symbol.Variable("x")
slp = 0.0625
for dtype in [np.float16, np.float32, np.float64]:
xa = np.random.uniform(low=-1.0,high=-0.2,size=shape).astype(dtype)
eps = 1e-4
xa[abs(xa) < eps] = 1.0
# eps = 1e-2 if dtype is np.float16 else 1e-4
for act_type in ['leaky']:
y = mx.symbol.LeakyReLU(data=x, slope=slp, act_type=act_type)
ya = fleaky_relu(xa, slope=slp, act_type=act_type)
ga = fleaky_relu_grad(np.ones(shape), xa, ya, slope=slp, act_type=act_type)
check_numeric_gradient(y, [xa], numeric_eps=eps, rtol=1e-4, atol=1e-4)
check_symbolic_forward(y, [xa], [ya], rtol=eps, atol=1e-5, dtype=dtype)
check_symbolic_backward(y, [xa], [np.ones(shape)], [ga], rtol=eps, atol=1e-5, dtype=dtype)


@with_seed(1234)
def test_prelu():
def fprelu(x, gamma):
pos_indices = x > 0
out = x.copy()
out = np.multiply(out, gamma)
out[pos_indices] = x[pos_indices]
return out
def fprelu_grad(x, y, gamma):
pos_indices = x > 0
grad_x = np.multiply(np.ones(x.shape), gamma)
grad_gam = np.zeros(gamma.shape)
copy_x = x.copy()
copy_x[pos_indices] = 0.0
grad_x[pos_indices] = 1.0
if gamma.shape[0] == 1:
grad_gam = np.sum(np.sum(copy_x))
elif gamma.shape[0] > 1:
grad_gam = np.sum(copy_x, axis=0)
return (grad_x, grad_gam)
shape = (3,4)
x = mx.symbol.Variable("x")
gamma = mx.symbol.Variable("gamma")
for dtype in [np.float16, np.float32, np.float64]:
for gam in [np.array([0.1], dtype=dtype), np.array([0.1, 0.2, 0.3, 0.4], dtype=dtype)]:
xa = np.random.uniform(low=-1.0,high=1.0,size=shape).astype(dtype)
eps = 1e-4
xa[abs(xa) < eps] = 1.0
y = mx.symbol.LeakyReLU(data=x, gamma=gamma, act_type='prelu')
ya = fprelu(xa, gam)
g_xa, g_gam = fprelu_grad(xa, ya, gamma=gam)
check_numeric_gradient(y, [xa, gam], numeric_eps=eps, rtol=1e-3, atol=1e-4)
check_symbolic_forward(y, [xa, gam], [ya], rtol=1e-3, atol=1e-20)
check_symbolic_backward(y, [xa, gam], [np.ones(shape)], [g_xa], rtol=1e-3, atol=1e-20)


@with_seed()
def test_sigmoid():
def fsigmoid(a):
