/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file softmax_output.cc
* \brief SoftmaxOutput operator registration and CPU implementation
* \author Bing Xu, Zhang Rong A
*/
#include "./softmax_output-inl.h"
#if MXNET_USE_ONEDNN == 1
#include "operator/nn/dnnl/dnnl_base-inl.h"
#include "operator/nn/dnnl/dnnl_softmax_output-inl.h"
#endif
namespace mxnet {
namespace op {
DMLC_REGISTER_PARAMETER(SoftmaxOutputParam);
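// Builds the backward graph for SoftmaxOutput. Instead of differentiating
// through the forward node, the gradient node takes the forward softmax output
// and the label as its inputs and reuses the forward attributes: the
// cross-entropy gradient only needs `output - label` (see the operator
// description below), so the incoming output gradients are not required.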
struct SoftmaxOutputGrad {
const char* op_name;
std::vector<nnvm::NodeEntry> operator()(const nnvm::ObjectPtr& n,
const std::vector<nnvm::NodeEntry>& ograds) const {
std::vector<nnvm::NodeEntry> out_data(n->num_outputs());
for (uint32_t i = 0; i < out_data.size(); ++i) {
out_data[i] = nnvm::NodeEntry{n, i, 0};
}
std::vector<nnvm::NodeEntry> heads;
heads.push_back(out_data[softmaxout_enum::kOut]);
heads.push_back(n->inputs[softmaxout_enum::kLabel]);
nnvm::ObjectPtr gnode = nnvm::Node::Create();
gnode->inputs = std::move(heads);
gnode->control_deps.emplace_back(n);
gnode->attrs = n->attrs;
gnode->attrs.op = nnvm::Op::Get("_backward_SoftmaxOutput");
gnode->attrs.name = n->attrs.name + "_backward";
std::vector<nnvm::NodeEntry> in_grad(2);
in_grad[0] = nnvm::NodeEntry{gnode, 0, 0};
in_grad[1] = nnvm::NodeEntry{gnode, 1, 0};
return in_grad;
}
};
static inline std::vector<std::string> ListArguments() {
return {"data", "label"};
}
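// Infers a single dtype shared by data, label and output. If the data dtype is
// unknown it is taken from the output (backward inference); otherwise it is
// propagated to the output (forward inference). In both cases any
// still-undefined input is set to that dtype, and already-defined inputs are
// checked for consistency via UNIFORM_TYPE_CHECK.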
static bool SoftmaxOutputType(const nnvm::NodeAttrs& attrs,
std::vector<int>* in_type,
std::vector<int>* out_type) {
CHECK_EQ(in_type->size(), 2U);
int dtype = (*in_type)[0];
if (type_is_none(dtype)) {
// Input type is undefined, we try backward inference
if (out_type->size() == 0 || type_is_none((*out_type)[0])) {
// Neither the input nor the output type is defined,
// so the type cannot be inferred for this op
return false;
} else {
// Input type is undefined but output type is: backward inference
dtype = (*out_type)[0];
}
} else {
// Input type is defined but output type is not: forward inference
out_type->clear();
out_type->push_back(dtype);
}
for (size_t i = 0; i < in_type->size(); ++i) {
if ((*in_type)[i] == -1) {
(*in_type)[i] = dtype;
} else {
UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
}
}
return true;
}
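// Shape inference for SoftmaxOutput. A worked example of the accepted label
// shapes (the full rules follow in the code): for data of shape (N, C, H, W)
// with multi_output=true the label may be (N, H*W), (N, H, W) or (N, 1, H, W);
// with multi_output=false the label shape is the data shape with the trailing
// axis dropped, e.g. (N,) for data of shape (N, C). A label whose shape equals
// the data shape is treated as a probability label. The output shape is always
// the data shape.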
static bool SoftmaxOutputShape(const nnvm::NodeAttrs& attrs,
mxnet::ShapeVector* in_shape,
mxnet::ShapeVector* out_shape) {
using namespace mshadow;
const SoftmaxOutputParam& param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
CHECK_EQ(in_shape->size(), 2U) << "Input:[data, label]";
const mxnet::TShape& dshape = in_shape->at(0);
if (!mxnet::ndim_is_known(dshape))
return false;
// label.shape == data.shape: use probability as label
if (dshape != (*in_shape)[softmaxout_enum::kLabel]) {
if (param.multi_output) {
mxnet::TShape lshape1 = Shape2(dshape[0], dshape.Size() / dshape[0] / dshape[1]);
mxnet::TShape lshape2(dshape.ndim() - 1, -1);
lshape2[0] = dshape[0];
for (int i = 2; i < dshape.ndim(); ++i)
lshape2[i - 1] = dshape[i];
mxnet::TShape lshape3 = dshape;
lshape3[1] = 1;
if (!mxnet::ndim_is_known(in_shape->at(softmaxout_enum::kLabel))) {
in_shape->at(softmaxout_enum::kLabel) = lshape1;
} else if (in_shape->at(softmaxout_enum::kLabel) == lshape1) {
} else if (in_shape->at(softmaxout_enum::kLabel) == lshape2) {
} else if (in_shape->at(softmaxout_enum::kLabel) == lshape3) {
} else {
std::ostringstream os;
os << "Expecting " << lshape1 << " or " << lshape2 << ". But got "
<< in_shape->at(softmaxout_enum::kLabel);
throw InferShapeError(os.str(), softmaxout_enum::kLabel);
}
} else {
mxnet::TShape label_shape(dshape.ndim() - 1, -1);
for (int i = 0; i + 1 < dshape.ndim(); ++i)
label_shape[i] = dshape[i];
SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, label_shape);
}
}
out_shape->clear();
out_shape->push_back(dshape);
return true;
}
#if MXNET_USE_ONEDNN == 1
inline static bool SoftmaxOutputStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
DispatchMode* dispatch_mode,
std::vector<int>* in_attrs,
std::vector<int>* out_attrs) {
CHECK_EQ(in_attrs->size(), 2);
CHECK_EQ(out_attrs->size(), 1);
return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
}
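// oneDNN dispatch: the DNNL kernel is used only at inference time and only when
// SupportDNNLSoftmaxOutput accepts the parameters and input; in every other
// case the call falls back to the generic CPU implementation.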
void SoftmaxOutputComputeExCPU(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
CHECK_EQ(inputs.size(), 2U);
const SoftmaxOutputParam& param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
if (SupportDNNLSoftmaxOutput(param, inputs[0]) && !ctx.is_train) {
DNNL_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
DNNLRun(DNNLSoftmaxOutputForward, attrs, ctx, inputs, req, outputs);
DNNL_OPCHECK_RUN(SoftmaxOutputCompute<cpu>, attrs, ctx, inputs, req, outputs);
return;
}
FallBackCompute(SoftmaxOutputCompute<cpu>, attrs, ctx, inputs, req, outputs);
}
#endif
NNVM_REGISTER_OP(SoftmaxOutput)
.describe(R"code(Computes the gradient of cross entropy loss with respect to softmax output.
- This operator computes the gradient in two steps.
The cross entropy loss does not actually need to be computed.
- Applies the softmax function to the input array.
- Computes and returns the gradient of cross entropy loss w.r.t. the softmax output.
- The softmax function, cross entropy loss and gradient are given by:
- Softmax Function:
.. math:: \text{softmax}(x)_i = \frac{\exp(x_i)}{\sum_j \exp(x_j)}
- Cross Entropy Function:
.. math:: \text{CE(label, output)} = - \sum_i \text{label}_i \log(\text{output}_i)
- The gradient of cross entropy loss w.r.t. the softmax output:
.. math:: \text{gradient} = \text{output} - \text{label}
- During forward propagation, the softmax function is computed for each instance in the input array.
For a general *N*-D input array with shape :math:`(d_1, d_2, ..., d_n)`, the size is
:math:`s = d_1 \cdot d_2 \cdots d_n`. We can use the parameters `preserve_shape`
and `multi_output` to specify the way to compute softmax:
- By default, `preserve_shape` is ``false``. This operator will reshape the input array
into a 2-D array with shape :math:`(d_1, \frac{s}{d_1})` and then compute the softmax function for
each row in the reshaped array, and afterwards reshape it back to the original shape
:math:`(d_1, d_2, ..., d_n)`.
- If `preserve_shape` is ``true``, the softmax function will be computed along
the last axis (`axis` = ``-1``).
- If `multi_output` is ``true``, the softmax function will be computed along
the second axis (`axis` = ``1``).
- During backward propagation, the gradient of cross-entropy loss w.r.t. the softmax output array is computed.
The provided label can be a one-hot label array or a probability label array.
- If the parameter `use_ignore` is ``true``, `ignore_label` can specify input instances
with a particular label to be ignored during backward propagation. **This has no effect when
the softmax `output` has the same shape as `label`**.
Example::
data = [[1,2,3,4],[2,2,2,2],[3,3,3,3],[4,4,4,4]]
label = [1,0,2,3]
ignore_label = 1
SoftmaxOutput(data=data, label = label,\
multi_output=true, use_ignore=true,\
ignore_label=ignore_label)
## forward softmax output
[[ 0.0320586 0.08714432 0.23688284 0.64391428]
[ 0.25 0.25 0.25 0.25 ]
[ 0.25 0.25 0.25 0.25 ]
[ 0.25 0.25 0.25 0.25 ]]
## backward gradient output
[[ 0. 0. 0. 0. ]
[-0.75 0.25 0.25 0.25]
[ 0.25 0.25 -0.75 0.25]
[ 0.25 0.25 0.25 -0.75]]
## notice that the first row is all 0 because label[0] is 1, which is equal to ignore_label.
- The parameter `grad_scale` can be used to rescale the gradient, which is often used to
give each loss function a different weight.
- This operator also supports various ways to normalize the gradient through `normalization`.
The `normalization` is applied only when the softmax output has a different shape than the labels.
The `normalization` mode can be set to one of the following:
- ``'null'``: do nothing.
- ``'batch'``: divide the gradient by the batch size.
- ``'valid'``: divide the gradient by the number of instances which are not ignored.
)code" ADD_FILELINE)
.set_num_inputs(2)
.set_num_outputs(1)
.set_attr_parser(ParamParser<SoftmaxOutputParam>)
#if MXNET_USE_ONEDNN == 1
.set_attr<FInferStorageType>("FInferStorageType", SoftmaxOutputStorageType)
.set_attr<bool>("TIsDNNL", true)
.set_attr<FComputeEx>("FComputeEx<cpu>", SoftmaxOutputComputeExCPU)
#endif
.set_attr<nnvm::FListInputNames>("FListInputNames",
[](const NodeAttrs& attrs) {
return std::vector<std::string>{"data", "label"};
})
.set_attr<nnvm::FListOutputNames>("FListOutputNames",
[](const NodeAttrs& attrs) {
return std::vector<std::string>{"output"};
})
.set_attr<mxnet::FInferShape>("FInferShape", SoftmaxOutputShape)
.set_attr<nnvm::FInferType>("FInferType", SoftmaxOutputType)
.set_attr<FCompute>("FCompute<cpu>", SoftmaxOutputCompute<cpu>)
.set_attr<nnvm::FGradient>("FGradient", SoftmaxOutputGrad{"_backward_SoftmaxOutput"})
.set_attr<nnvm::FInplaceOption>("FInplaceOption",
[](const NodeAttrs& attrs) {
return std::vector<std::pair<int, int> >{{0, 0}};
})
.add_argument("data", "NDArray-or-Symbol", "Input array.")
.add_argument("label", "NDArray-or-Symbol", "Ground truth label.")
.add_arguments(SoftmaxOutputParam::__FIELDS__());
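/*
 * A minimal sketch (illustration only, not compiled as part of this file) of
 * the math documented in the operator description above: a row-wise softmax in
 * the forward pass and `output - label` as the cross-entropy gradient in the
 * backward pass, assuming a 2-D input with one integer class label per row.
 * The helper names below are illustrative, and grad_scale / normalization are
 * intentionally omitted.
 *
 *   #include <algorithm>
 *   #include <cmath>
 *   #include <vector>
 *
 *   // Forward: softmax over each row of an n x k row-major matrix.
 *   void RowSoftmax(std::vector<float>* data, int n, int k) {
 *     for (int i = 0; i < n; ++i) {
 *       float* row = data->data() + i * k;
 *       float max = row[0];
 *       for (int j = 1; j < k; ++j) max = std::max(max, row[j]);
 *       float sum = 0.f;
 *       for (int j = 0; j < k; ++j) { row[j] = std::exp(row[j] - max); sum += row[j]; }
 *       for (int j = 0; j < k; ++j) row[j] /= sum;
 *     }
 *   }
 *
 *   // Backward: grad = output - one_hot(label); rows whose label equals
 *   // ignore_label get a zero gradient (the use_ignore behaviour).
 *   void SoftmaxCEGrad(const std::vector<float>& out, const std::vector<int>& label,
 *                      int n, int k, int ignore_label, std::vector<float>* grad) {
 *     grad->assign(out.begin(), out.end());
 *     for (int i = 0; i < n; ++i) {
 *       if (label[i] == ignore_label) {
 *         for (int j = 0; j < k; ++j) (*grad)[i * k + j] = 0.f;
 *       } else {
 *         (*grad)[i * k + label[i]] -= 1.f;
 *       }
 *     }
 *   }
 *
 * Running this on the example from the description (data rows [1,2,3,4],
 * [2,2,2,2], [3,3,3,3], [4,4,4,4], label [1,0,2,3], ignore_label 1) reproduces
 * the forward and backward matrices shown there.
 */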
// The Softmax symbol was renamed to SoftmaxOutput and has been deprecated since Dec 2015
NNVM_REGISTER_OP(SoftmaxOutput).add_alias("Softmax");
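// Backward pass registration. As wired up by SoftmaxOutputGrad above, the
// backward op receives the forward softmax output and the label as its two
// inputs and produces one gradient per forward input (data and label).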
NNVM_REGISTER_OP(_backward_SoftmaxOutput)
.set_num_inputs(2)
.set_num_outputs(2)
.set_attr<nnvm::TIsBackward>("TIsBackward", true)
.set_attr<nnvm::FInplaceOption>("FInplaceOption",
[](const NodeAttrs& attrs) {
return std::vector<std::pair<int, int> >{{0, 0}};
})
.set_attr<FResourceRequest>("FResourceRequest",
[](const NodeAttrs& n) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
})
.set_attr_parser(ParamParser<SoftmaxOutputParam>)
.set_attr<FCompute>("FCompute<cpu>", SoftmaxOutputGradCompute<cpu>);
} // namespace op
} // namespace mxnet