Better normalization options for SoftmaxWithLoss layer.
cdoersch committed Nov 11, 2015
1 parent 0ec116e commit d5a78e1
Showing 4 changed files with 87 additions and 32 deletions.
11 changes: 8 additions & 3 deletions include/caffe/loss_layers.hpp
@@ -747,6 +747,12 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

/// Read the normalization mode parameter and compute the normalizer based
/// on the blob size. If normalization_mode is VALID, the count of valid
/// outputs will be read from valid_count, unless it is -1 in which case
/// all outputs are assumed to be valid.
virtual Dtype get_normalizer(
LossParameter_NormalizationMode normalization_mode, int valid_count);

/// The internal SoftmaxLayer used to map predictions to a distribution.
shared_ptr<Layer<Dtype> > softmax_layer_;
@@ -760,9 +766,8 @@ class SoftmaxWithLossLayer : public LossLayer<Dtype> {
bool has_ignore_label_;
/// The label indicating that an instance should be ignored.
int ignore_label_;
/// Whether to normalize the loss by the total number of values present
/// (otherwise just by the batch size).
bool normalize_;
/// How to normalize the output loss.
LossParameter_NormalizationMode normalization_;

int softmax_axis_, outer_num_, inner_num_;
};
52 changes: 40 additions & 12 deletions src/caffe/layers/softmax_loss_layer.cpp
@@ -25,7 +25,14 @@ void SoftmaxWithLossLayer<Dtype>::LayerSetUp(
if (has_ignore_label_) {
ignore_label_ = this->layer_param_.loss_param().ignore_label();
}
normalize_ = this->layer_param_.loss_param().normalize();
if (!this->layer_param_.loss_param().has_normalization() &&
this->layer_param_.loss_param().has_normalize()) {
normalization_ = this->layer_param_.loss_param().normalize() ?
LossParameter_NormalizationMode_VALID :
LossParameter_NormalizationMode_BATCH_SIZE;
} else {
normalization_ = this->layer_param_.loss_param().normalization();
}
}

template <typename Dtype>
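
The block above only changes how the configuration is read: the new normalization field always takes precedence when it is set, and the deprecated normalize flag is consulted only when normalization is absent. A minimal prototxt sketch of the resulting mapping (hypothetical snippets, not part of this commit):

loss_param { normalize: false }      # legacy flag only: behaves like normalization: BATCH_SIZE
loss_param { normalize: true }       # legacy flag only: behaves like normalization: VALID
loss_param { normalization: NONE }   # new field set: any legacy flag is ignored
loss_param { }                       # neither field set: normalization defaults to VALID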
@@ -48,6 +55,34 @@ void SoftmaxWithLossLayer<Dtype>::Reshape(
}
}

template <typename Dtype>
Dtype SoftmaxWithLossLayer<Dtype>::get_normalizer(
LossParameter_NormalizationMode normalization_mode, int valid_count) {
Dtype normalizer;
switch (normalization_mode) {
case LossParameter_NormalizationMode_FULL:
normalizer = Dtype(outer_num_ * inner_num_);
break;
case LossParameter_NormalizationMode_VALID:
if (valid_count == -1) {
normalizer = Dtype(outer_num_ * inner_num_);
} else {
normalizer = Dtype(valid_count);
}
break;
case LossParameter_NormalizationMode_BATCH_SIZE:
normalizer = Dtype(outer_num_);
break;
case LossParameter_NormalizationMode_NONE:
normalizer = Dtype(1);
break;
default:
LOG(FATAL) << "Unknown normalization mode: "
<< LossParameter_NormalizationMode_Name(normalization_mode);
}
return normalizer;
}
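
As a hypothetical worked example of the modes handled above: with outer_num_ = 8 images and inner_num_ = 250,000 spatial positions, where 1,500,000 of the 2,000,000 outputs carry a real label and the remainder carry ignore_label, FULL returns 2,000,000, VALID returns 1,500,000 when passed that count (and 2,000,000 when valid_count is -1), BATCH_SIZE returns 8, and NONE returns 1.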

template <typename Dtype>
void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
@@ -71,11 +106,7 @@ void SoftmaxWithLossLayer<Dtype>::Forward_cpu(
++count;
}
}
if (normalize_) {
top[0]->mutable_cpu_data()[0] = loss / count;
} else {
top[0]->mutable_cpu_data()[0] = loss / outer_num_;
}
top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
if (top.size() == 2) {
top[1]->ShareData(prob_);
}
@@ -109,12 +140,9 @@ void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
}
}
// Scale gradient
const Dtype loss_weight = top[0]->cpu_diff()[0];
if (normalize_) {
caffe_scal(prob_.count(), loss_weight / count, bottom_diff);
} else {
caffe_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
}
Dtype loss_weight = top[0]->cpu_diff()[0] /
get_normalizer(normalization_, count);
caffe_scal(prob_.count(), loss_weight, bottom_diff);
}
}

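Note that Forward_cpu and Backward_cpu divide by the same get_normalizer(normalization_, count) value, so the reported loss and the gradient scale remain consistent with each other under every normalization mode.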
32 changes: 18 additions & 14 deletions src/caffe/layers/softmax_loss_layer.cu
@@ -49,14 +49,15 @@ void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
Dtype loss;
caffe_gpu_asum(nthreads, loss_data, &loss);
if (normalize_) {
Dtype count;
caffe_gpu_asum(nthreads, counts, &count);
loss /= count;
} else {
loss /= outer_num_;
Dtype valid_count = -1;
// Only launch another CUDA kernel if we actually need the count of valid
// outputs.
if (normalization_ == LossParameter_NormalizationMode_VALID &&
has_ignore_label_) {
caffe_gpu_asum(nthreads, counts, &valid_count);
}
top[0]->mutable_cpu_data()[0] = loss;
top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_,
valid_count);
if (top.size() == 2) {
top[1]->ShareData(prob_);
}
@@ -108,14 +109,17 @@ void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
SoftmaxLossBackwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
CAFFE_CUDA_NUM_THREADS>>>(nthreads, top_data, label, bottom_diff,
outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
const Dtype loss_weight = top[0]->cpu_diff()[0];
if (normalize_) {
Dtype count;
caffe_gpu_asum(nthreads, counts, &count);
caffe_gpu_scal(prob_.count(), loss_weight / count, bottom_diff);
} else {
caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);

Dtype valid_count = -1;
// Only launch another CUDA kernel if we actually need the count of valid
// outputs.
if (normalization_ == LossParameter_NormalizationMode_VALID &&
has_ignore_label_) {
caffe_gpu_asum(nthreads, counts, &valid_count);
}
const Dtype loss_weight = top[0]->cpu_diff()[0] /
get_normalizer(normalization_, valid_count);
caffe_gpu_scal(prob_.count(), loss_weight, bottom_diff);
}
}

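The valid_count = -1 sentinel used in both GPU paths above relies on get_normalizer() falling back to outer_num_ * inner_num_ in that case: when has_ignore_label_ is false every output is valid anyway, so VALID coincides with FULL and the extra caffe_gpu_asum reduction over counts can be skipped.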
24 changes: 21 additions & 3 deletions src/caffe/proto/caffe.proto
@@ -420,9 +420,27 @@ message TransformationParameter {
message LossParameter {
// If specified, ignore instances with the given label.
optional int32 ignore_label = 1;
// If true, normalize each batch across all instances (including spatial
// dimensions, but not ignored instances); else, divide by batch size only.
optional bool normalize = 2 [default = true];
// How to normalize the loss for loss layers that aggregate across batches,
// spatial dimensions, or other dimensions. Currently only implemented in
// SoftmaxWithLoss layer.
enum NormalizationMode {
// Divide by the number of examples in the batch times spatial dimensions.
// Outputs that receive the ignore label will NOT be ignored in computing
// the normalization factor.
FULL = 0;
// Divide by the total number of output locations that do not take the
// ignore_label. If ignore_label is not set, this behaves like FULL.
VALID = 1;
// Divide by the batch size.
BATCH_SIZE = 2;
// Do not normalize the loss.
NONE = 3;
}
optional NormalizationMode normalization = 3 [default = VALID];
// Deprecated. Ignored if normalization is specified. If normalization
// is not specified, then setting this to false will be equivalent to
// normalization = BATCH_SIZE to be consistent with previous behavior.
optional bool normalize = 2;
}

// Messages that store parameters used by individual layer types follow, in
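As a hypothetical usage sketch (the layer and blob names are made up; the fields come from the LossParameter message above), a net could ignore label 255 and normalize by the number of non-ignored outputs like this:

layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "score"
  bottom: "label"
  top: "loss"
  loss_param {
    ignore_label: 255
    normalization: VALID
  }
}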
