sigmoid cross-entropy loss: normalize by one/batch size/all/etc.
sig-ce loss handles all the same normalizations as the softmax loss;
refer to BVLC#3296 for more detail

n.b. this changes the default normalization for this loss! previously
this loss normalized by batch size, but now it normalizes by the total
number of outputs/targets.
shelhamer committed Nov 17, 2016
1 parent c6ab965 commit 38d1033
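
Because of the new default, nets that relied on the old divide-by-batch-size scaling will see smaller loss values and gradients (roughly by a factor of the per-instance output count), so loss_weight or the learning rate may need adjusting. The old behavior can be kept per layer through loss_param. A minimal sketch, assuming the normalization field and NormalizationMode enum of LossParameter in caffe.proto; the layer and blob names are placeholders:

layer {
  name: "loss"
  type: "SigmoidCrossEntropyLoss"
  bottom: "pred"
  bottom: "label"
  top: "loss"
  loss_param {
    # one of FULL, VALID, BATCH_SIZE, NONE; BATCH_SIZE restores the old scaling
    normalization: BATCH_SIZE
  }
}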
Showing 4 changed files with 99 additions and 30 deletions.
11 changes: 11 additions & 0 deletions include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp
@@ -97,6 +97,13 @@ class SigmoidCrossEntropyLossLayer : public LossLayer<Dtype> {
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

+ /// Read the normalization mode parameter and compute the normalizer based
+ /// on the blob size. If normalization_mode is VALID, the count of valid
+ /// outputs will be read from valid_count, unless it is -1 in which case
+ /// all outputs are assumed to be valid.
+ virtual Dtype get_normalizer(
+ LossParameter_NormalizationMode normalization_mode, int valid_count);
+
/// The internal SigmoidLayer used to map predictions to probabilities.
shared_ptr<SigmoidLayer<Dtype> > sigmoid_layer_;
/// sigmoid_output stores the output of the SigmoidLayer.
@@ -110,6 +117,10 @@ class SigmoidCrossEntropyLossLayer : public LossLayer<Dtype> {
bool has_ignore_label_;
/// The label indicating that an instance should be ignored.
int ignore_label_;
+ /// How to normalize the loss.
+ LossParameter_NormalizationMode normalization_;
+ Dtype normalizer_;
+ int outer_num_, inner_num_;
};

} // namespace caffe
59 changes: 51 additions & 8 deletions src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp
@@ -1,3 +1,4 @@
+ #include <algorithm>
#include <vector>

#include "caffe/layers/sigmoid_cross_entropy_loss_layer.hpp"
@@ -20,39 +21,82 @@ void SigmoidCrossEntropyLossLayer<Dtype>::LayerSetUp(
if (has_ignore_label_) {
ignore_label_ = this->layer_param_.loss_param().ignore_label();
}
+ if (!this->layer_param_.loss_param().has_normalization() &&
+ this->layer_param_.loss_param().has_normalize()) {
+ normalization_ = this->layer_param_.loss_param().normalize() ?
+ LossParameter_NormalizationMode_VALID :
+ LossParameter_NormalizationMode_BATCH_SIZE;
+ } else {
+ normalization_ = this->layer_param_.loss_param().normalization();
+ }
}

template <typename Dtype>
void SigmoidCrossEntropyLossLayer<Dtype>::Reshape(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
LossLayer<Dtype>::Reshape(bottom, top);
+ outer_num_ = bottom[0]->shape(0); // batch size
+ inner_num_ = bottom[0]->count(1); // instance size: |output| == |target|
CHECK_EQ(bottom[0]->count(), bottom[1]->count()) <<
"SIGMOID_CROSS_ENTROPY_LOSS layer inputs must have the same count.";
sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_);
}

+ // TODO(shelhamer) loss normalization should be pulled up into LossLayer,
+ // instead of duplicated here and in SoftMaxWithLossLayer
+ template <typename Dtype>
+ Dtype SigmoidCrossEntropyLossLayer<Dtype>::get_normalizer(
+ LossParameter_NormalizationMode normalization_mode, int valid_count) {
+ Dtype normalizer;
+ switch (normalization_mode) {
+ case LossParameter_NormalizationMode_FULL:
+ normalizer = Dtype(outer_num_ * inner_num_);
+ break;
+ case LossParameter_NormalizationMode_VALID:
+ if (valid_count == -1) {
+ normalizer = Dtype(outer_num_ * inner_num_);
+ } else {
+ normalizer = Dtype(valid_count);
+ }
+ break;
+ case LossParameter_NormalizationMode_BATCH_SIZE:
+ normalizer = Dtype(outer_num_);
+ break;
+ case LossParameter_NormalizationMode_NONE:
+ normalizer = Dtype(1);
+ break;
+ default:
+ LOG(FATAL) << "Unknown normalization mode: "
+ << LossParameter_NormalizationMode_Name(normalization_mode);
+ }
+ // Some users will have no labels for some examples in order to 'turn off' a
+ // particular loss in a multi-task setup. The max prevents NaNs in that case.
+ return std::max(Dtype(1.0), normalizer);
+ }
+
template <typename Dtype>
void SigmoidCrossEntropyLossLayer<Dtype>::Forward_cpu(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
// The forward pass computes the sigmoid outputs.
sigmoid_bottom_vec_[0] = bottom[0];
sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_);
// Compute the loss (negative log likelihood)
- const int count = bottom[0]->count();
- const int num = bottom[0]->num();
// Stable version of loss computation from input data
const Dtype* input_data = bottom[0]->cpu_data();
const Dtype* target = bottom[1]->cpu_data();
+ int valid_count = 0;
Dtype loss = 0;
- for (int i = 0; i < count; ++i) {
+ for (int i = 0; i < bottom[0]->count(); ++i) {
const int target_value = static_cast<int>(target[i]);
if (has_ignore_label_ && target_value == ignore_label_) {
continue;
}
loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) -
log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
+ ++valid_count;
}
- top[0]->mutable_cpu_data()[0] = loss / num;
+ normalizer_ = get_normalizer(normalization_, valid_count);
+ top[0]->mutable_cpu_data()[0] = loss / normalizer_;
}

template <typename Dtype>
@@ -66,14 +110,10 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Backward_cpu(
if (propagate_down[0]) {
// First, compute the diff
const int count = bottom[0]->count();
- const int num = bottom[0]->num();
const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data();
const Dtype* target = bottom[1]->cpu_data();
Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
caffe_sub(count, sigmoid_output_data, target, bottom_diff);
- // Scale down gradient
- const Dtype loss_weight = top[0]->cpu_diff()[0];
- caffe_scal(count, loss_weight / num, bottom_diff);
// Zero out gradient of ignored targets.
if (has_ignore_label_) {
for (int i = 0; i < count; ++i) {
@@ -83,6 +123,9 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Backward_cpu(
}
}
}
+ // Scale down gradient
+ Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer_;
+ caffe_scal(count, loss_weight, bottom_diff);
}
}

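For reference, the per-element expression that the CPU and GPU forward passes accumulate is a numerically stable rewrite of sigmoid cross-entropy. Writing x for the input (logit) and t for the target, it is equivalent to

loss(x, t) = -[ t * log(sigmoid(x)) + (1 - t) * log(1 - sigmoid(x)) ]
           = max(x, 0) - x * t + log(1 + exp(-|x|))

which keeps the exp() argument non-positive and so avoids overflow for large |x|. The top blob then reports the sum of these terms divided by the normalizer from get_normalizer(): outer_num_ * inner_num_ for FULL (and for VALID when nothing is ignored), the number of non-ignored targets for VALID with an ignore_label, outer_num_ for BATCH_SIZE (the previous default), or 1 for NONE.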
57 changes: 36 additions & 21 deletions src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu
@@ -5,26 +5,38 @@

namespace caffe {


template <typename Dtype>
__global__ void SigmoidCrossEntropyLossForwardGPU(const int nthreads,
- const Dtype* input_data, const Dtype* target, Dtype* loss) {
+ const Dtype* input_data, const Dtype* target, Dtype* loss,
+ const bool has_ignore_label_, const int ignore_label_,
+ Dtype* counts) {
CUDA_KERNEL_LOOP(i, nthreads) {
- loss[i] = input_data[i] * (target[i] - (input_data[i] >= 0)) -
- log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0)));
+ const int target_value = static_cast<int>(target[i]);
+ if (has_ignore_label_ && target_value == ignore_label_) {
+ loss[i] = 0;
+ counts[i] = 0;
+ } else {
+ loss[i] = input_data[i] * (target[i] - (input_data[i] >= 0)) -
+ log(1 + exp(input_data[i] - 2 * input_data[i] *
+ (input_data[i] >= 0)));
+ counts[i] = 1;
+ }
}
}

template <typename Dtype>
- __global__ void SigmoidCrossEntropyLossIgnoreGPU(const int count,
- const int ignore_label, const Dtype* target, Dtype* reference) {
- CUDA_KERNEL_LOOP(index, count) {
- const int target_value = static_cast<int>(target[index]);
+ __global__ void SigmoidCrossEntropyLossIgnoreDiffGPU(const int count,
+ const int ignore_label, const Dtype* target, Dtype* diff) {
+ CUDA_KERNEL_LOOP(i, count) {
+ const int target_value = static_cast<int>(target[i]);
if (target_value == ignore_label) {
- reference[index] = 0;
+ diff[i] = 0;
}
}
}


template <typename Dtype>
void SigmoidCrossEntropyLossLayer<Dtype>::Forward_gpu(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
@@ -33,26 +45,30 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Forward_gpu(
sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_);
// Compute the loss (negative log likelihood)
const int count = bottom[0]->count();
- const int num = bottom[0]->num();
// Stable version of loss computation from input data
const Dtype* input_data = bottom[0]->gpu_data();
const Dtype* target = bottom[1]->gpu_data();
// Since this memory is not used for anything until it is overwritten
// on the backward pass, we use it here to avoid having to allocate new GPU
// memory to accumulate intermediate results in the kernel.
Dtype* loss_data = bottom[0]->mutable_gpu_diff();
+ Dtype* count_data = bottom[1]->mutable_gpu_diff();
+ Dtype valid_count;
// NOLINT_NEXT_LINE(whitespace/operators)
SigmoidCrossEntropyLossForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(count),
- CAFFE_CUDA_NUM_THREADS>>>(count, input_data, target, loss_data);
- // Zero out loss of ignored targets.
- if (has_ignore_label_) {
- // NOLINT_NEXT_LINE(whitespace/operators)
- SigmoidCrossEntropyLossIgnoreGPU<Dtype><<<CAFFE_GET_BLOCKS(count),
- CAFFE_CUDA_NUM_THREADS>>>(count, ignore_label_, target, loss_data);
+ CAFFE_CUDA_NUM_THREADS>>>(count, input_data, target, loss_data,
+ has_ignore_label_, ignore_label_, count_data);
+ // Only launch another CUDA kernel if we actually need the valid count.
+ if (normalization_ == LossParameter_NormalizationMode_VALID &&
+ has_ignore_label_) {
+ caffe_gpu_asum(count, count_data, &valid_count);
+ } else {
+ valid_count = count;
}
Dtype loss;
caffe_gpu_asum(count, loss_data, &loss);
- top[0]->mutable_cpu_data()[0] = loss / num;
+ normalizer_ = get_normalizer(normalization_, valid_count);
+ top[0]->mutable_cpu_data()[0] = loss / normalizer_;
}

template <typename Dtype>
@@ -66,21 +82,20 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(
if (propagate_down[0]) {
// First, compute the diff
const int count = bottom[0]->count();
- const int num = bottom[0]->num();
const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data();
const Dtype* target = bottom[1]->gpu_data();
Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
caffe_copy(count, sigmoid_output_data, bottom_diff);
caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff);
- // Scale down gradient
- const Dtype loss_weight = top[0]->cpu_diff()[0];
- caffe_gpu_scal(count, loss_weight / num, bottom_diff);
// Zero out gradient of ignored targets.
if (has_ignore_label_) {
// NOLINT_NEXT_LINE(whitespace/operators)
- SigmoidCrossEntropyLossIgnoreGPU<Dtype><<<CAFFE_GET_BLOCKS(count),
+ SigmoidCrossEntropyLossIgnoreDiffGPU<Dtype><<<CAFFE_GET_BLOCKS(count),
CAFFE_CUDA_NUM_THREADS>>>(count, ignore_label_, target, bottom_diff);
}
+ // Scale down gradient
+ Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer_;
+ caffe_gpu_scal(count, loss_weight, bottom_diff);
}
}

2 changes: 1 addition & 1 deletion src/caffe/test/test_sigmoid_cross_entropy_loss_layer.cpp
@@ -56,7 +56,7 @@ class SigmoidCrossEntropyLossLayerTest : public MultiDeviceTest<TypeParam> {
loss -= target[i] * log(prediction + (target[i] == Dtype(0)));
loss -= (1 - target[i]) * log(1 - prediction + (target[i] == Dtype(1)));
}
- return loss / num;
+ return loss / count;
}

void TestForward() {
