Added batchnorm layer. #1867

Closed · wants to merge 1 commit
31 changes: 31 additions & 0 deletions include/caffe/neuron_layers.hpp
@@ -605,6 +605,37 @@ class ThresholdLayer : public NeuronLayer<Dtype> {
  Dtype threshold_;
};

template <typename Dtype>
class BatchnormLayer : public NeuronLayer<Dtype> {
 public:
  explicit BatchnormLayer(const LayerParameter& param)
      : NeuronLayer<Dtype>(param) {}

  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      vector<Blob<Dtype>*>* top);
  virtual inline LayerParameter_LayerType type() const {
    return LayerParameter_LayerType_BATCHNORM;
  }

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      vector<Blob<Dtype>*>* top);
  // GPU implementations are not provided yet; with these declarations
  // commented out, the Layer base class falls back to the CPU path.
  // virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
  //     vector<Blob<Dtype>*>* top);

  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);
  // virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
  //     const vector<bool>& propagate_down, vector<Blob<Dtype>*>* bottom);

  int num_;
  int bottom_size_;
  Dtype var_epsilon_;
  Blob<Dtype> batch_mean_;
  Blob<Dtype> buffer_blob_;
  Blob<Dtype> batch_variance_;
};

} // namespace caffe

#endif // CAFFE_NEURON_LAYERS_HPP_
2 changes: 2 additions & 0 deletions src/caffe/layer_factory.cpp
@@ -185,6 +185,8 @@ Layer<Dtype>* GetLayer(const LayerParameter& param) {
    return new AbsValLayer<Dtype>(param);
  case LayerParameter_LayerType_ARGMAX:
    return new ArgMaxLayer<Dtype>(param);
  case LayerParameter_LayerType_BATCHNORM:
    return new BatchnormLayer<Dtype>(param);
  case LayerParameter_LayerType_BNLL:
    return new BNLLLayer<Dtype>(param);
  case LayerParameter_LayerType_CONCAT:
159 changes: 159 additions & 0 deletions src/caffe/layers/batchnorm_layer.cpp
@@ -0,0 +1,159 @@
#include <algorithm>
#include <cmath>
#include <vector>

#include "caffe/layer.hpp"
#include "caffe/vision_layers.hpp"

namespace caffe {

template <typename Dtype>
void BatchnormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    vector<Blob<Dtype>*>* top) {
  num_ = bottom[0]->num();
  bottom_size_ = bottom[0]->count() / bottom[0]->num();

  // Initialize gamma (blobs_[0]) to 1 and beta (blobs_[1]) to 0.
  this->blobs_.resize(2);
  for (int i = 0; i < 2; ++i) {
    this->blobs_[i].reset(new Blob<Dtype>(1, bottom_size_, 1, 1));
    caffe_set(this->blobs_[i]->count(), i == 0 ? Dtype(1) : Dtype(0),
        this->blobs_[i]->mutable_cpu_data());
  }

  batch_mean_.Reshape(1, bottom_size_, 1, 1);
  buffer_blob_.Reshape(1, bottom_size_, 1, 1);
  batch_variance_.Reshape(1, bottom_size_, 1, 1);
  var_epsilon_ = Dtype(0.1);

  // Propagate gradients to both parameter blobs during the backward pass.
  this->param_propagate_down_.resize(this->blobs_.size(), true);
}
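A note on shapes, restating the setup above: the parameters and the batch statistics are all 1 x bottom_size_ blobs, where

bottom_size_ = count / num = channels * height * width,

so every element of an example is normalized independently across the batch dimension only; there is no averaging over spatial locations.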

template <typename Dtype>
void BatchnormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    vector<Blob<Dtype>*>* top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* gamma_data = this->blobs_[0]->cpu_data();
  const Dtype* beta_data = this->blobs_[1]->cpu_data();
  Dtype* top_data = (*top)[0]->mutable_cpu_data();

  Dtype* mean_data = batch_mean_.mutable_cpu_data();
  Dtype* variance_data = batch_variance_.mutable_cpu_data();
  Dtype* buffer = buffer_blob_.mutable_cpu_data();

  caffe_set(bottom_size_, Dtype(0), mean_data);
  caffe_set(bottom_size_, Dtype(0), variance_data);

  // Accumulate per-feature sums of x and x^2 over the batch.
  for (int n = 0; n < num_; ++n) {
    caffe_add(bottom_size_, bottom_data + bottom[0]->offset(n), mean_data,
        mean_data);
    caffe_sqr(bottom_size_, bottom_data + bottom[0]->offset(n), buffer);
    caffe_add(bottom_size_, buffer, variance_data, variance_data);
  }
  caffe_cpu_scale(bottom_size_, Dtype(1) / Dtype(num_), mean_data, mean_data);
  caffe_cpu_scale(bottom_size_, Dtype(1) / Dtype(num_), variance_data,
      variance_data);

  // var = E[x^2] - (E[x])^2; after this block variance_data holds
  // sqrt(var + epsilon), i.e. the per-feature standard deviation.
  caffe_sqr(bottom_size_, mean_data, buffer);
  caffe_sub(bottom_size_, variance_data, buffer, variance_data);
  caffe_add_scalar(bottom_size_, var_epsilon_, variance_data);
  caffe_powx(bottom_size_, variance_data, Dtype(0.5), variance_data);

  // y = gamma * (x - mean) / std + beta for each example in the batch.
  for (int n = 0; n < num_; ++n) {
    caffe_sub(bottom_size_, bottom_data + bottom[0]->offset(n), mean_data,
        buffer);
    caffe_div(bottom_size_, buffer, variance_data, buffer);
    caffe_mul(bottom_size_, buffer, gamma_data, buffer);
    caffe_add(bottom_size_, buffer, beta_data,
        top_data + (*top)[0]->offset(n));
  }
}
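In equation form, the forward pass above computes, per feature j and example n (a restatement of the code, with the biased variance obtained via the shortcut formula and epsilon folded in before the square root):

\mu_j = \frac{1}{N}\sum_{n=1}^{N} x_{nj}, \qquad
\sigma_j = \sqrt{\frac{1}{N}\sum_{n=1}^{N} x_{nj}^2 - \mu_j^2 + \varepsilon}, \qquad
y_{nj} = \gamma_j \, \frac{x_{nj} - \mu_j}{\sigma_j} + \beta_j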

template <typename Dtype>
void BatchnormLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    vector<Blob<Dtype>*>* bottom) {
  const Dtype* top_data = top[0]->cpu_data();
  const Dtype* top_diff = top[0]->cpu_diff();
  const Dtype* variance_data = batch_variance_.cpu_data();  // holds std
  const Dtype* gamma_data = this->blobs_[0]->cpu_data();
  const Dtype* beta_data = this->blobs_[1]->cpu_data();

  Dtype* dl_dvar = batch_variance_.mutable_cpu_diff();
  Dtype* dl_dmean = batch_mean_.mutable_cpu_diff();
  Dtype* buffer = buffer_blob_.mutable_cpu_data();

  Dtype* bottom_diff = (*bottom)[0]->mutable_cpu_diff();
  Dtype* gamma_diff = this->blobs_[0]->mutable_cpu_diff();
  Dtype* beta_diff = this->blobs_[1]->mutable_cpu_diff();

  caffe_set((*bottom)[0]->count(), Dtype(0), bottom_diff);
  caffe_set(this->blobs_[0]->count(), Dtype(0), gamma_diff);
  caffe_set(this->blobs_[1]->count(), Dtype(0), beta_diff);
  caffe_set(bottom_size_, Dtype(0), dl_dvar);
  caffe_set(bottom_size_, Dtype(0), dl_dmean);

  // Note: x_hat is reconstructed from the output as (y - beta) / gamma,
  // which assumes every gamma stays nonzero.
  for (int n = 0; n < num_; ++n) {
    // fill gamma_diff: dL/dgamma = sum_n dL/dy * x_hat
    caffe_sub(bottom_size_, top_data + top[0]->offset(n), beta_data, buffer);
    caffe_div(bottom_size_, buffer, gamma_data, buffer);
    caffe_mul(bottom_size_, buffer, top_diff + top[0]->offset(n), buffer);
    caffe_add(bottom_size_, buffer, gamma_diff, gamma_diff);

    // fill beta_diff: dL/dbeta = sum_n dL/dy
    caffe_add(bottom_size_, top_diff + top[0]->offset(n), beta_diff,
        beta_diff);
  }

  // fill bottom_diff direct term: dL/dy * gamma / std
  for (int n = 0; n < num_; ++n) {
    caffe_mul(bottom_size_, top_diff + top[0]->offset(n), gamma_data, buffer);
    caffe_div(bottom_size_, buffer, variance_data, buffer);
    caffe_add(bottom_size_, buffer, bottom_diff + (*bottom)[0]->offset(n),
        bottom_diff + (*bottom)[0]->offset(n));
  }

  // fill bottom_diff variance contribution term:
  // dL/dvar = -0.5 * std^-3 * sum_n dL/dy * gamma * (x - mean)
  for (int n = 0; n < num_; ++n) {
    caffe_sub(bottom_size_, top_data + top[0]->offset(n), beta_data, buffer);
    caffe_mul(bottom_size_, buffer, variance_data, buffer);
    caffe_mul(bottom_size_, buffer, top_diff + top[0]->offset(n), buffer);
    caffe_add(bottom_size_, buffer, dl_dvar, dl_dvar);
  }
  caffe_powx(bottom_size_, variance_data, Dtype(-3.0), buffer);
  caffe_mul(bottom_size_, dl_dvar, buffer, dl_dvar);
  caffe_cpu_scale(bottom_size_, Dtype(-0.5), dl_dvar, dl_dvar);
  for (int n = 0; n < num_; ++n) {
    // Each example receives dL/dvar * 2 * (x - mean) / num; the buffer
    // recovers (x - mean) as x_hat * std = (y - beta) / gamma * std.
    caffe_sub(bottom_size_, top_data + top[0]->offset(n), beta_data, buffer);
    caffe_div(bottom_size_, buffer, gamma_data, buffer);
    caffe_mul(bottom_size_, buffer, variance_data, buffer);
    caffe_cpu_scale(bottom_size_, Dtype(2) / Dtype(num_), buffer, buffer);
    caffe_mul(bottom_size_, buffer, dl_dvar, buffer);
    caffe_add(bottom_size_, buffer, bottom_diff + (*bottom)[0]->offset(n),
        bottom_diff + (*bottom)[0]->offset(n));
  }

  // fill bottom_diff mean contribution term:
  // dL/dmean = -sum_n dL/dy * gamma / std, spread evenly over the batch
  for (int n = 0; n < num_; ++n) {
    caffe_mul(bottom_size_, top_diff + top[0]->offset(n), gamma_data, buffer);
    caffe_div(bottom_size_, buffer, variance_data, buffer);
    caffe_sub(bottom_size_, dl_dmean, buffer, dl_dmean);
  }
  caffe_cpu_scale(bottom_size_, Dtype(1) / Dtype(num_), dl_dmean, dl_dmean);
  for (int n = 0; n < num_; ++n) {
    caffe_add(bottom_size_, dl_dmean, bottom_diff + (*bottom)[0]->offset(n),
        bottom_diff + (*bottom)[0]->offset(n));
  }
}
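These loops implement the batch-normalization gradients of Ioffe & Szegedy (2015), with \hat{x}_{nj} = (y_{nj} - \beta_j) / \gamma_j recovered from the top blob instead of being cached. In the notation of the forward pass:

\frac{\partial L}{\partial \gamma_j} = \sum_n \frac{\partial L}{\partial y_{nj}} \hat{x}_{nj}, \qquad
\frac{\partial L}{\partial \beta_j} = \sum_n \frac{\partial L}{\partial y_{nj}},

\frac{\partial L}{\partial \sigma_j^2} = -\frac{1}{2\sigma_j^3} \sum_n \frac{\partial L}{\partial y_{nj}} \, \gamma_j \, (x_{nj} - \mu_j), \qquad
\frac{\partial L}{\partial \mu_j} = -\frac{\gamma_j}{\sigma_j} \sum_n \frac{\partial L}{\partial y_{nj}},

\frac{\partial L}{\partial x_{nj}} = \frac{\gamma_j}{\sigma_j} \frac{\partial L}{\partial y_{nj}}
  + \frac{\partial L}{\partial \sigma_j^2} \frac{2 (x_{nj} - \mu_j)}{N}
  + \frac{1}{N} \frac{\partial L}{\partial \mu_j}.

The \partial L / \partial \sigma_j^2 contribution to \partial L / \partial \mu_j vanishes because \sum_n (x_{nj} - \mu_j) = 0, which is why it does not appear in the code.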

// Note: no STUB_GPU here. Because the Forward_gpu / Backward_gpu
// declarations are commented out in the header, STUB_GPU would define
// undeclared members and break CPU_ONLY builds; the Layer base class
// already falls back to the CPU implementations.

INSTANTIATE_CLASS(BatchnormLayer);


} // namespace caffe
1 change: 1 addition & 0 deletions src/caffe/proto/caffe.proto
@@ -234,6 +234,7 @@ message LayerParameter {
    ABSVAL = 35;
    ACCURACY = 1;
    ARGMAX = 30;
    BATCHNORM = 104;
    BNLL = 2;
    CONCAT = 3;
    CONTRASTIVE_LOSS = 37;
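For reference, once BATCHNORM is registered a net definition could request the layer like this (a minimal sketch in the V1 prototxt format this enum belongs to; the layer and blob names are illustrative):

layers {
  name: "conv1_bn"
  type: BATCHNORM
  bottom: "conv1"
  top: "conv1_bn"
}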
74 changes: 74 additions & 0 deletions src/caffe/test/test_batchnorm_layer.cpp
@@ -0,0 +1,74 @@
#include <algorithm>
#include <cstring>
#include <vector>

#include "gtest/gtest.h"

#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/vision_layers.hpp"

#include "caffe/test/test_caffe_main.hpp"
#include "caffe/test/test_gradient_check_util.hpp"

#define BATCH_SIZE 2
#define INPUT_DATA_SIZE 3

namespace caffe {

template <typename TypeParam>
class BatchnormLayerTest : public MultiDeviceTest<TypeParam> {
  typedef typename TypeParam::Dtype Dtype;

 protected:
  BatchnormLayerTest()
      : epsilon_(Dtype(1e-5)),
        blob_bottom_(new Blob<Dtype>()),
        blob_top_(new Blob<Dtype>()) {}
  virtual void SetUp() {
    Caffe::set_random_seed(1601);
    blob_bottom_->Reshape(BATCH_SIZE, INPUT_DATA_SIZE, 1, 1);
    // fill the values
    FillerParameter filler_param;
    GaussianFiller<Dtype> filler(filler_param);
    filler.Fill(this->blob_bottom_);
    blob_bottom_vec_.push_back(blob_bottom_);
    blob_top_vec_.push_back(blob_top_);
  }
  virtual ~BatchnormLayerTest() { delete blob_bottom_; delete blob_top_; }

  Dtype epsilon_;
  Blob<Dtype>* const blob_bottom_;
  Blob<Dtype>* const blob_top_;
  vector<Blob<Dtype>*> blob_bottom_vec_;
  vector<Blob<Dtype>*> blob_top_vec_;
};


TYPED_TEST_CASE(BatchnormLayerTest, TestDtypesAndDevices);

TYPED_TEST(BatchnormLayerTest, TestSetupAcrossChannels) {
  typedef typename TypeParam::Dtype Dtype;
  LayerParameter layer_param;
  BatchnormLayer<Dtype> layer(layer_param);
  layer.SetUp(this->blob_bottom_vec_, &(this->blob_top_vec_));
  EXPECT_EQ(this->blob_top_->num(), BATCH_SIZE);
  EXPECT_EQ(this->blob_top_->channels(), INPUT_DATA_SIZE);
  EXPECT_EQ(this->blob_top_->height(), 1);
  EXPECT_EQ(this->blob_top_->width(), 1);
}

TYPED_TEST(BatchnormLayerTest, TestGradientAcrossChannels) {
  typedef typename TypeParam::Dtype Dtype;
  LayerParameter layer_param;
  BatchnormLayer<Dtype> layer(layer_param);
  GradientChecker<Dtype> checker(1e-2, 1e-2);
  checker.CheckGradientExhaustive(&layer, &(this->blob_bottom_vec_),
      &(this->blob_top_vec_));
}

} // namespace caffe