This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[MXNET-807] Support integer label type in ctc_loss operator #12468

Merged — 17 commits, Sep 12, 2018
Changes from 5 commits
165 changes: 89 additions & 76 deletions — src/operator/contrib/ctc_loss-inl.h

@@ -50,11 +50,10 @@

namespace mxnet {
namespace op {

namespace ctc_loss {
enum CTCLossOpInputs { kData, kLabel };
enum CTCLossOpOutputs { kOut, kGrad };
enum CTCLossOpForwardResource { kTempSpace };

Member: which case does this part fall into? how were we able to check it in without breaking master build if it's either of the cases?

Contributor Author: Sorry, overlooked these lines. I have removed them.

}
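
These enums give symbolic indices into the operator's input and output arrays (kData = 0, kLabel = 1, and so on), which is why the code further down addresses tensors as in_data[ctc_loss::kData], in_data[ctc_loss::kLabel], out_data[ctc_loss::kOut], and out_data[ctc_loss::kGrad].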

template <typename T>
@@ -256,66 +255,69 @@ class CTCLossOp : public Operator {
    exceed_cudnn_limit = false;
    Stream<xpu> *s = ctx.get_stream<xpu>();

-   Tensor<xpu, 3, real_t> data =
-       in_data[ctc_loss::kData].get<xpu, 3, real_t>(s);
-   Tensor<xpu, 2, real_t> labels =
-       in_data[ctc_loss::kLabel].get<xpu, 2, real_t>(s);
-
-   Tensor<xpu, 1, real_t> costs =
-       out_data[ctc_loss::kOut].get<xpu, 1, real_t>(s);
-   Tensor<xpu, 3, real_t> grad =
-       out_data[ctc_loss::kGrad].get<xpu, 3, real_t>(s);
-
-   int max_seq_len = data.size(0);
-   int batch_size = data.size(1);
-   int alphabet_size = data.size(2);
-
-   // data_lengths
-   std::vector<int> data_lengths(batch_size, max_seq_len);
-   if (param_.use_data_lengths) {
-     int kInputLength = 2;
-     IndexTensorToVector(in_data[kInputLength].get<xpu, 1, real_t>(s), &data_lengths);
-   }
-
-   // label_lengths
-   std::vector<int> packed_labels;
-   std::vector<int> label_lengths(batch_size);
-
-   if (param_.use_label_lengths) {
-     int kLabelLength = 2+param_.use_data_lengths;
-     exceed_cudnn_limit = PackLabelByLength(labels, in_data[kLabelLength].get<xpu, 1, real_t>(s),
-                                            &packed_labels, &label_lengths);
-   } else {
-     exceed_cudnn_limit = LabelTensorToPackedVector(labels, param_.blank_label == 0?0:-1,
-                                                    &packed_labels, &label_lengths);
-   }
-
-   // CUDNN is disabled due to lack of support for input lengths
-   /* #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 */
-   /* if (!exceed_cudnn_limit) { */
-   /*   cudnn_forward(ctx, s, data, costs, grad, */
-   /*                 &data_lengths, &label_lengths, &packed_labels, */
-   /*                 max_seq_len, batch_size, alphabet_size, */
-   /*                 req[ctc_loss::kGrad] != mxnet::kNullOp); */
-   /* } else { */
-   /*   baidu_forward(ctx, s, data, costs, grad, */
-   /*                 &data_lengths, &label_lengths, &packed_labels, */
-   /*                 batch_size, alphabet_size, req[ctc_loss::kGrad] != mxnet::kNullOp); */
-   /* } */
-   /* #else */
-
-   baidu_forward(ctx, s, data, costs, grad,
-                 &data_lengths, &label_lengths, &packed_labels,
-                 batch_size, alphabet_size, req[ctc_loss::kGrad] != mxnet::kNullOp);
-
-   if (param_.use_data_lengths) {
-     // baidu warp CTC implementation sometimes includes undefined gradients
-     // for data outside of length mask. Setting to 0 to make it consistent
-     // with CPU implementation.
-     int kInputLength = 2;
-     mxnet_op::SequenceMask(grad, in_data[kInputLength].get<xpu, 1, real_t>(s),
-                            static_cast<real_t>(0));
-   }
+   MSHADOW_TYPE_SWITCH(in_data[ctc_loss::kLabel].type_flag_, DType, {
+     Tensor<xpu, 3, real_t> data =
+         in_data[ctc_loss::kData].get<xpu, 3, real_t>(s);
+     Tensor<xpu, 2, DType> labels =
+         in_data[ctc_loss::kLabel].get<xpu, 2, DType>(s);
+
+     Tensor<xpu, 1, real_t> costs =
+         out_data[ctc_loss::kOut].get<xpu, 1, real_t>(s);
+     Tensor<xpu, 3, real_t> grad =
+         out_data[ctc_loss::kGrad].get<xpu, 3, real_t>(s);
+
+     int max_seq_len = data.size(0);
+     int batch_size = data.size(1);
+     int alphabet_size = data.size(2);
+
+     // data_lengths
+     std::vector<int> data_lengths(batch_size, max_seq_len);
+     if (param_.use_data_lengths) {
+       int kInputLength = 2;
+       IndexTensorToVector(in_data[kInputLength].get<xpu, 1, real_t>(s), &data_lengths);
+     }
+
+     // label_lengths
+     std::vector<int> packed_labels;
+     std::vector<int> label_lengths(batch_size);
+
+     if (param_.use_label_lengths) {
+       int kLabelLength = 2 + param_.use_data_lengths;
+       exceed_cudnn_limit =
+           PackLabelByLength(labels, in_data[kLabelLength].get<xpu, 1, DType>(s),
+                             &packed_labels, &label_lengths);
+     } else {
+       exceed_cudnn_limit = LabelTensorToPackedVector(labels, param_.blank_label == 0?0:-1,

Contributor: Some formatting issues with whitespaces and indentation.

Contributor Author: Sorry, what was exactly the issue? The make lint seems to pass.

Contributor (@lebeg, Sep 6, 2018): Sorry, should be 0 ? 0 : -1

+                                                      &packed_labels, &label_lengths);
+     }
+
+     // CUDNN is disabled due to lack of support for input lengths
+     /* #if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 */
+     /* if (!exceed_cudnn_limit) { */
+     /*   cudnn_forward(ctx, s, data, costs, grad, */
+     /*                 &data_lengths, &label_lengths, &packed_labels, */
+     /*                 max_seq_len, batch_size, alphabet_size, */
+     /*                 req[ctc_loss::kGrad] != mxnet::kNullOp); */
+     /* } else { */
+     /*   baidu_forward(ctx, s, data, costs, grad, */
+     /*                 &data_lengths, &label_lengths, &packed_labels, */
+     /*                 batch_size, alphabet_size, req[ctc_loss::kGrad] != mxnet::kNullOp); */
+     /* } */
+     /* #else */
+
+     baidu_forward(ctx, s, data, costs, grad,
+                   &data_lengths, &label_lengths, &packed_labels,
+                   batch_size, alphabet_size, req[ctc_loss::kGrad] != mxnet::kNullOp);
+
+     if (param_.use_data_lengths) {
+       // baidu warp CTC implementation sometimes includes undefined gradients
+       // for data outside of length mask. Setting to 0 to make it consistent
+       // with CPU implementation.
+       int kInputLength = 2;
+       mxnet_op::SequenceMask(grad, in_data[kInputLength].get<xpu, 1, real_t>(s),
+                              static_cast<real_t>(0));
+     }
+   });
  }
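
A note for readers following the type change above: MSHADOW_TYPE_SWITCH takes a runtime dtype flag and instantiates its body once per supported type, binding DType to the matching C++ type, which is what lets the same packing code serve both float32 and int32 labels. The sketch below shows the pattern in isolation; the TYPE_SWITCH macro, the type flags, and the PackPaddedLabels helper are simplified stand-ins invented for this example (not the actual mshadow/MXNet definitions), and the stop-at-first-padding rule is likewise only an assumption about how such packing can work.

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

// Simplified stand-ins for mshadow's runtime dtype flags.
enum TypeFlag { kFloat32 = 0, kInt32 = 4 };

// Pack a padded (batch_size x max_label_len) label matrix into the flat
// label vector + per-sequence lengths layout that warp-ctc style APIs expect.
// `padding` marks unused label slots (0 or -1 depending on blank_label).
template <typename DType>
void PackPaddedLabels(const std::vector<DType>& labels,
                      int batch_size, int max_label_len, int padding,
                      std::vector<int>* packed, std::vector<int>* lengths) {
  packed->clear();
  lengths->assign(batch_size, 0);
  for (int b = 0; b < batch_size; ++b) {
    for (int l = 0; l < max_label_len; ++l) {
      DType v = labels[b * max_label_len + l];
      if (static_cast<int>(v) == padding) break;  // rest of the row is padding
      packed->push_back(static_cast<int>(v));
      ++(*lengths)[b];
    }
  }
}

// Dispatch on the runtime type flag, binding DType to a concrete type --
// the same idea MSHADOW_TYPE_SWITCH implements as a macro.
#define TYPE_SWITCH(flag, DType, ...)                                \
  switch (flag) {                                                    \
    case kFloat32: { typedef float DType; __VA_ARGS__; break; }      \
    case kInt32:   { typedef int32_t DType; __VA_ARGS__; break; }    \
    default: throw std::runtime_error("unknown type flag");          \
  }

int main() {
  // Two sequences, padded to length 3 with -1: {2, 5} and {1}.
  std::vector<int32_t> labels = {2, 5, -1, 1, -1, -1};
  std::vector<int> packed, lengths;
  TYPE_SWITCH(kInt32, DType, {
    PackPaddedLabels(std::vector<DType>(labels.begin(), labels.end()),
                     2, 3, -1, &packed, &lengths);
  });
  // packed == {2, 5, 1}, lengths == {2, 1}
  std::cout << packed.size() << " labels, first length " << lengths[0] << "\n";
  return 0;
}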

virtual void Backward(const OpContext &ctx,
@@ -331,21 +333,20 @@ class CTCLossOp : public Operator {
Stream<xpu> *s = ctx.get_stream<xpu>();

    Tensor<xpu, 3, real_t> data_grad =
        in_grad[ctc_loss::kData].get<xpu, 3, real_t>(s);
    Tensor<xpu, 1, real_t> output_grad =
        out_grad[ctc_loss::kOut].get<xpu, 1, real_t>(s);

    Tensor<xpu, 3, real_t> data_grad_computed =
        out_data[ctc_loss::kGrad].get<xpu, 3, real_t>(s);

Member: same here.

Contributor Author: Removed. However, these 4 space indentation seems a violation of Google C++ style guide. Are we ignoring them in the lint?
https://google.github.io/styleguide/cppguide.html


    Assign(data_grad, req[ctc_loss::kData],
           mshadow::expr::broadcast<1>(output_grad, data_grad.shape_) * data_grad_computed);
}
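
The Assign above is essentially the whole backward pass: the gradient cached during the forward pass is rescaled by the incoming head gradient, with broadcast<1> replicating the per-batch output_grad across the sequence and alphabet axes. Below is a rough scalar-level equivalent, assuming a row-major (seq_len, batch, alphabet) layout; ScaleCtcGrad is a hypothetical helper written for illustration, not MXNet code.

#include <vector>

// Hypothetical scalar equivalent of the broadcast-multiply in Backward():
// each element of the cached CTC gradient is scaled by the incoming loss
// gradient of its batch element (output_grad holds one value per batch item).
void ScaleCtcGrad(const std::vector<float>& output_grad,    // shape (batch)
                  const std::vector<float>& grad_computed,  // (seq_len, batch, alphabet)
                  int seq_len, int batch, int alphabet,
                  std::vector<float>* data_grad) {          // (seq_len, batch, alphabet)
  data_grad->resize(grad_computed.size());
  for (int t = 0; t < seq_len; ++t)
    for (int b = 0; b < batch; ++b)
      for (int c = 0; c < alphabet; ++c) {
        int i = (t * batch + b) * alphabet + c;
        (*data_grad)[i] = output_grad[b] * grad_computed[i];
      }
}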

private:
CTCLossParam param_;
bool exceed_cudnn_limit;

#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
cudnnDataType_t dtype_;
cudnnCTCLossDescriptor_t ctc_desc_;
@@ -434,17 +435,17 @@
}
#endif // __CUDACC__ && CUDNN

-  inline virtual void baidu_forward(const OpContext &ctx,
-                                    mshadow::Stream<xpu>* s,
-                                    mshadow::Tensor<xpu, 3, real_t> data,
-                                    mshadow::Tensor<xpu, 1, real_t> costs,
-                                    mshadow::Tensor<xpu, 3, real_t> grad,
-                                    std::vector<int>* data_lengths,
-                                    std::vector<int>* label_lengths,
-                                    std::vector<int>* packed_labels,
-                                    int batch_size,
-                                    int alphabet_size,
-                                    bool req_grad) {
+  inline void baidu_forward(const OpContext &ctx,
+                            mshadow::Stream<xpu>* s,
+                            mshadow::Tensor<xpu, 3, real_t> data,
+                            mshadow::Tensor<xpu, 1, real_t> costs,
+                            mshadow::Tensor<xpu, 3, real_t> grad,
+                            std::vector<int>* data_lengths,
+                            std::vector<int>* label_lengths,
+                            std::vector<int>* packed_labels,
+                            int batch_size,
+                            int alphabet_size,
+                            bool req_grad) {
using namespace mshadow;
// allocate temporary workspace
size_t size_bytes;
@@ -534,11 +535,23 @@ class CTCLossProp : public OperatorProperty {
    TShape oshape(1);
    oshape[0] = dshape[1];  // batch size
    out_shape->clear();
-   out_shape->push_back(oshape);
+   out_shape->push_back(oshape);  // forward output
+   out_shape->push_back(dshape);  // grad output
    return true;
  }

+ bool InferType(std::vector<int> *in_type,
+                std::vector<int> *out_type,
+                std::vector<int> *aux_type) const override {
+   CHECK_LE(in_type->size(), this->ListArguments().size());
+   int dtype = (*in_type)[ctc_loss::kData];
+   CHECK_NE(dtype, -1) << "Input data must have specified type";
+
+   out_type->clear();
+   out_type->push_back(dtype);  // forward output
+   out_type->push_back(dtype);  // grad output
+   return true;
+ }
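
This InferType override is the piece that actually admits integer labels: only the data input's dtype is pinned down, the label slot is left unconstrained, and both outputs (the loss and the cached gradient) inherit the data dtype. A hypothetical standalone restatement of that contract, for illustration only:

#include <cassert>
#include <vector>

// Hypothetical restatement of the dtype contract encoded by InferType above:
// outputs follow the data input's dtype; the label dtype is unconstrained.
std::vector<int> InferCtcLossOutputTypes(const std::vector<int>& in_types) {
  const int kData = 0;  // mirrors ctc_loss::kData
  int dtype = in_types[kData];
  assert(dtype != -1 && "input data must have a specified type");
  return {dtype, dtype};  // forward (loss) output, grad output
}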
OperatorProperty *Copy() const override {
auto ptr = new CTCLossProp();
ptr->param_ = param_;
42 changes: 42 additions & 0 deletions — tests/python/unittest/test_contrib_operator.py

@@ -244,6 +244,48 @@ def assert_match(inputs, x, y, threshold, is_ascend=False):
assert_match([[0.5, 0.6], [0.1, 0.2], [0.3, 0.4]], [1, -1, 0], [2, 0], 1e-12, False)
assert_match([[0.5, 0.6], [0.1, 0.2], [0.3, 0.4]], [-1, 0, 1], [1, 2], 100, True)

def test_ctc_loss_op():

Member: Feel free to add test cases for large labels there.

Contributor Author: Thanks for the reference. Moving the unit test there.

batch_size = 10
seq_len = 5
label_len = 3
num_classes = 6
np.random.seed(1)
x = np.random.uniform(size=(seq_len, batch_size, num_classes))
y = np.random.randint(0, num_classes, size=(batch_size, label_len))

def test_cpu(x, y):
data = mx.nd.array(x, ctx=mx.cpu(0))
label = mx.nd.array(y, ctx=mx.cpu(0))
loss = mx.nd.contrib.ctc_loss(data=data, label=label)
loss = mx.nd.make_loss(loss)
expected_output = [9.604521, 7.096151, 4.906869, 5.5237527, 5.9895644, 5.584548,
5.528411, 5.765914, 6.740701, 5.2625823]

Member: This testing strategy (i.e. compare the output from random input and labels with fixed seed from recorded output) is not meaningful and does not guarantee anything. It merely increases the line coverage.

Contributor Author: Did not notice the unit test in test_operator.py. I have removed this one.

assert np.isclose(loss.asnumpy(), expected_output).all()

def test_gpu(x, y):
data = mx.nd.array(x, ctx=mx.gpu(0))
label = mx.nd.array(y, ctx=mx.gpu(0))
loss = mx.nd.contrib.ctc_loss(data=data, label=label)
loss = mx.nd.make_loss(loss)
expected_output = [9.604521, 7.096151, 4.906869, 5.5237527, 5.9895644, 5.584548,
5.528411, 5.765914, 6.740701, 5.2625823]
assert np.isclose(loss.asnumpy(), expected_output).all()

def test_integer_label(x, y):
data = mx.nd.array(x, ctx=mx.cpu(0))
label = mx.nd.array(y, ctx=mx.cpu(0), dtype=np.int32)
loss = mx.nd.contrib.ctc_loss(data=data, label=label)
loss = mx.nd.make_loss(loss)
expected_output = [9.604521, 7.096151, 4.906869, 5.5237527, 5.9895644, 5.584548,
5.528411, 5.765914, 6.740701, 5.2625823]
assert np.isclose(loss.asnumpy(), expected_output).all()

test_cpu(x, y)
if default_context().device_type == 'gpu':
test_gpu(x, y)
test_integer_label(x, y)
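
All three sub-tests check against the same recorded expected_output, which is consistent: casting the labels to int32 leaves their values unchanged, so the integer-label path is expected to produce the same losses (within np.isclose tolerance) as the float-label path.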


if __name__ == '__main__':
import nose
nose.runmodule()