From f86bc6d15ebcc8b1d2746ec6e1e6483590b53bb7 Mon Sep 17 00:00:00 2001 From: Venkat Santhanam Date: Thu, 27 Apr 2017 19:36:41 -0400 Subject: [PATCH] Add BN layer from BVLC/caffe PR #1965 Added bn_layer.[cpp/cu] with corresponding hpp file. Performs batch-normalization with in-place scale/shift. Originally created by ducha-aiki: https://github.com/ducha-aiki ChenglongChen: https://github.com/ChenglongChen Russell91: https://github.com/Russell91 jjkjkj: https://github.com/jjkjkj detailed discussion of this implementation can be found at: https://github.com/BVLC/caffe/pull/1965 --- include/caffe/layers/bn_layer.hpp | 79 ++++++ src/caffe/layers/bn_layer.cpp | 427 ++++++++++++++++++++++++++++++ src/caffe/layers/bn_layer.cu | 325 +++++++++++++++++++++++ src/caffe/proto/caffe.proto | 36 ++- 4 files changed, 855 insertions(+), 12 deletions(-) create mode 100644 include/caffe/layers/bn_layer.hpp create mode 100644 src/caffe/layers/bn_layer.cpp create mode 100644 src/caffe/layers/bn_layer.cu diff --git a/include/caffe/layers/bn_layer.hpp b/include/caffe/layers/bn_layer.hpp new file mode 100644 index 00000000000..0efa4490ae5 --- /dev/null +++ b/include/caffe/layers/bn_layer.hpp @@ -0,0 +1,79 @@ +#ifndef CAFFE_BN_LAYER_HPP_ +#define CAFFE_BN_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +/** +* @brief Batch Normalization per-channel with scale & shift linear transform. +* +*/ +template +class BNLayer : public Layer { + /* + notice: + this code is based on the implementation by the following authors. 
+ + ducha-aiki: https://github.com/ducha-aiki + ChenglongChen: https://github.com/ChenglongChen + Russell91: https://github.com/Russell91 + jjkjkj: https://github.com/jjkjkj + + detailed discussion of this implementation can be found at: + https://github.com/BVLC/caffe/pull/1965 + */ + public: + explicit BNLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual inline const char* type() const { return "BN"; } + virtual inline int ExactNumBottomBlobs() const { return 1; } + virtual inline int MinTopBlobs() const { return 1; } + // if the BNMode is "LEARN" maximum 3 top blobs are available + virtual inline int MaxTopBlobs() const { + return (this->layer_param_.bn_param().bn_mode() == + BNParameter_BNMode_LEARN) ? 3 : 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + // spatial mean & variance + Blob spatial_mean_, spatial_variance_; + // batch mean & variance + Blob batch_mean_, batch_variance_; + // buffer blob + Blob buffer_blob_; + + Blob x_norm_; + // x_sum_multiplier is used to carry out sum using BLAS + Blob spatial_sum_multiplier_, batch_sum_multiplier_; + + // dimension + int N_; + int C_; + int H_; + int W_; + // eps + Dtype var_eps_; +}; + +} // namespace caffe + +#endif // CAFFE_BN_LAYER_HPP_ diff --git a/src/caffe/layers/bn_layer.cpp b/src/caffe/layers/bn_layer.cpp new file mode 100644 index 00000000000..82f4237ff56 --- /dev/null +++ b/src/caffe/layers/bn_layer.cpp @@ -0,0 +1,427 @@ +/* +notice: +this code is based on the implementation by the following authors. 
+ +ducha-aiki: https://github.com/ducha-aiki +ChenglongChen: https://github.com/ChenglongChen +Russell91: https://github.com/Russell91 +jjkjkj: https://github.com/jjkjkj + +detailed discussion of this implementation can be found at: +https://github.com/BVLC/caffe/pull/1965 +*/ + +#include +#include +#include + +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/layers/bn_layer.hpp" + +namespace caffe { + +template +void BNLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + // Figure out the dimensions + N_ = bottom[0]->num(); + C_ = bottom[0]->channels(); + H_ = bottom[0]->height(); + W_ = bottom[0]->width(); + top[0]->Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + if (top.size() > 1) { + // top blob for batch mean + top[1]->Reshape(1, C_, 1, 1); + } + if (top.size() > 2) { + // top blob for batch variance + top[2]->Reshape(1, C_, 1, 1); + } + + x_norm_.Reshape(bottom[0]->num(), bottom[0]->channels(), + bottom[0]->height(), bottom[0]->width()); + + // mean + spatial_mean_.Reshape(N_, C_, 1, 1); + batch_mean_.Reshape(1, C_, 1, 1); + // variance + spatial_variance_.Reshape(N_, C_, 1, 1); + batch_variance_.Reshape(1, C_, 1, 1); + // buffer blob + buffer_blob_.Reshape(N_, C_, H_, W_); + + // fill spatial multiplier + spatial_sum_multiplier_.Reshape(1, 1, H_, W_); + Dtype* spatial_multipl_data = spatial_sum_multiplier_.mutable_cpu_data(); + caffe_set(spatial_sum_multiplier_.count(), Dtype(1), + spatial_multipl_data); + caffe_set(spatial_sum_multiplier_.count(), Dtype(0), + spatial_sum_multiplier_.mutable_cpu_diff()); + // fill batch multiplier + batch_sum_multiplier_.Reshape(N_, 1, 1, 1); + Dtype* batch_multiplier_data = batch_sum_multiplier_.mutable_cpu_data(); + caffe_set(batch_sum_multiplier_.count(), Dtype(1), + batch_multiplier_data); + caffe_set(batch_sum_multiplier_.count(), Dtype(0), + batch_sum_multiplier_.mutable_cpu_diff()); +} 
+template +void BNLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + // Figure out the dimensions + N_ = bottom[0]->num(); + C_ = bottom[0]->channels(); + H_ = bottom[0]->height(); + W_ = bottom[0]->width(); + var_eps_ = 1e-9; + + // Check if we need to set up the weights + if (this->blobs_.size() > 0) { + LOG(INFO) << "Skipping parameter initialization"; + } else { + this->blobs_.resize(2); + + // fill scale with scale_filler + this->blobs_[0].reset(new Blob(1, C_, 1, 1)); + shared_ptr > scale_filler(GetFiller( + this->layer_param_.bn_param().scale_filler())); + scale_filler->Fill(this->blobs_[0].get()); + + // fill shift with shift_filler + this->blobs_[1].reset(new Blob(1, C_, 1, 1)); + shared_ptr > shift_filler(GetFiller( + this->layer_param_.bn_param().shift_filler())); + shift_filler->Fill(this->blobs_[1].get()); + } // parameter initialization + this->param_propagate_down_.resize(this->blobs_.size(), true); +} + +template +void BNLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const Dtype* const_top_data = top[0]->cpu_data(); + + const Dtype* scale_data = this->blobs_[0]->cpu_data(); + const Dtype* shift_data = this->blobs_[1]->cpu_data(); + + switch (this->layer_param_.bn_param().bn_mode()) { + case BNParameter_BNMode_LEARN: + // put the squares of bottom into buffer_blob_ + caffe_powx(bottom[0]->count(), bottom_data, Dtype(2), + buffer_blob_.mutable_cpu_data()); + + // computes variance using var(X) = E(X^2) - (EX)^2 + // EX across spatial + caffe_cpu_gemv(CblasNoTrans, N_ * C_, H_ * W_, + Dtype(1. / (H_ * W_)), bottom_data, + spatial_sum_multiplier_.cpu_data(), Dtype(0), + spatial_mean_.mutable_cpu_data()); + // EX across batch + caffe_cpu_gemv(CblasTrans, N_, C_, Dtype(1. 
/ N_), + spatial_mean_.cpu_data(), + batch_sum_multiplier_.cpu_data(), Dtype(0), + batch_mean_.mutable_cpu_data()); + + // E(X^2) across spatial + caffe_cpu_gemv(CblasNoTrans, N_ * C_, H_ * W_, + Dtype(1. / (H_ * W_)), buffer_blob_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + spatial_variance_.mutable_cpu_data()); + // E(X^2) across batch + caffe_cpu_gemv(CblasTrans, N_, C_, Dtype(1. / N_), + spatial_variance_.cpu_data(), + batch_sum_multiplier_.cpu_data(), Dtype(0), + batch_variance_.mutable_cpu_data()); + + caffe_powx(batch_mean_.count(), batch_mean_.cpu_data(), Dtype(2), + buffer_blob_.mutable_cpu_data()); // (EX)^2 + caffe_sub(batch_mean_.count(), batch_variance_.cpu_data(), + buffer_blob_.cpu_data(), + batch_variance_.mutable_cpu_data()); // variance + + // save top[1] (batch_mean) and top[2] (batch_variance) + if (top.size() > 1) { + caffe_copy(batch_mean_.count(), batch_mean_.cpu_data(), + top[1]->mutable_cpu_data()); + } + if (top.size() > 2) { + caffe_copy(batch_variance_.count(), batch_variance_.cpu_data(), + top[2]->mutable_cpu_data()); + } + + // do mean and variance normalization + // subtract mean + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_, + C_, 1, Dtype(1), + batch_sum_multiplier_.cpu_data(), + batch_mean_.cpu_data(), Dtype(0), + spatial_mean_.mutable_cpu_data()); + + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(-1), + spatial_mean_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + buffer_blob_.mutable_cpu_data()); + + caffe_add(buffer_blob_.count(), bottom_data, + buffer_blob_.cpu_data(), top_data); + + // normalize variance + caffe_add_scalar(batch_variance_.count(), var_eps_, + batch_variance_.mutable_cpu_data()); + caffe_powx(batch_variance_.count(), + batch_variance_.cpu_data(), Dtype(0.5), + batch_variance_.mutable_cpu_data()); + + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_, + C_, 1, Dtype(1), + batch_sum_multiplier_.cpu_data(), + batch_variance_.cpu_data(), Dtype(0), + 
spatial_variance_.mutable_cpu_data()); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, + N_ * C_, H_ * W_, 1, Dtype(1), + spatial_variance_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + buffer_blob_.mutable_cpu_data()); + + caffe_div(buffer_blob_.count(), const_top_data, + buffer_blob_.cpu_data(), top_data); + + // Saving x_norm + caffe_copy(buffer_blob_.count(), const_top_data, + x_norm_.mutable_cpu_data()); + // scale + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.cpu_data(), scale_data, Dtype(0), + spatial_variance_.mutable_cpu_data()); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(1), + spatial_variance_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + buffer_blob_.mutable_cpu_data()); + caffe_mul(buffer_blob_.count(), top_data, + buffer_blob_.cpu_data(), top_data); + + // shift + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.cpu_data(), shift_data, Dtype(0), + spatial_mean_.mutable_cpu_data()); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, + N_ * C_, H_ * W_, 1, Dtype(1), + spatial_mean_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + buffer_blob_.mutable_cpu_data()); + caffe_add(buffer_blob_.count(), const_top_data, + buffer_blob_.cpu_data(), top_data); + break; + case BNParameter_BNMode_INFERENCE: + // scale + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.cpu_data(), scale_data, Dtype(0), + spatial_variance_.mutable_cpu_data()); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(1), + spatial_variance_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + buffer_blob_.mutable_cpu_data()); + caffe_mul(buffer_blob_.count(), bottom_data, + buffer_blob_.cpu_data(), top_data); + + // shift + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.cpu_data(), shift_data, Dtype(0), + 
spatial_mean_.mutable_cpu_data()); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, + N_ * C_, H_ * W_, 1, Dtype(1), + spatial_mean_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + buffer_blob_.mutable_cpu_data()); + caffe_add(buffer_blob_.count(), const_top_data, + buffer_blob_.cpu_data(), top_data); + break; + default: + LOG(FATAL) << "Unknown BN mode."; + } +} + +template +void BNLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + + Dtype* scale_diff = this->blobs_[0]->mutable_cpu_diff(); + Dtype* shift_diff = this->blobs_[1]->mutable_cpu_diff(); + const Dtype* scale_data = this->blobs_[0]->cpu_data(); + + switch (this->layer_param_.bn_param().bn_mode()) { + case BNParameter_BNMode_LEARN: + // Propagate layer to parameters + // gradient w.r.t. scale + caffe_mul(buffer_blob_.count(), x_norm_.cpu_data(), + top_diff, buffer_blob_.mutable_cpu_data()); + // EX across spatial + caffe_cpu_gemv(CblasNoTrans, N_ * C_, + H_ * W_, Dtype(1), buffer_blob_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + spatial_variance_.mutable_cpu_diff()); + // EX across batch + caffe_cpu_gemv(CblasTrans, N_, C_, Dtype(1), + spatial_variance_.cpu_diff(), + batch_sum_multiplier_.cpu_data(), Dtype(0), scale_diff); + + // gradient w.r.t. 
shift + // EX across spatial + caffe_cpu_gemv(CblasNoTrans, N_ * C_, + H_ * W_, Dtype(1), top_diff, + spatial_sum_multiplier_.cpu_data(), + Dtype(0), spatial_mean_.mutable_cpu_diff()); + // EX across batch + caffe_cpu_gemv(CblasTrans, N_, C_, + Dtype(1), spatial_mean_.cpu_diff(), + batch_sum_multiplier_.cpu_data(), + Dtype(0), shift_diff); + + // Propagate down + + // put scale * top_diff to buffer_blob_ + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.cpu_data(), scale_data, Dtype(0), + spatial_variance_.mutable_cpu_data()); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(1), + spatial_variance_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + buffer_blob_.mutable_cpu_data()); + caffe_mul(buffer_blob_.count(), top_diff, buffer_blob_.cpu_data(), + buffer_blob_.mutable_cpu_data()); + + // use new top diff for computation + caffe_mul(buffer_blob_.count(), x_norm_.cpu_data(), + buffer_blob_.cpu_data(), bottom_diff); + // EX across spatial + caffe_cpu_gemv(CblasNoTrans, N_ * C_, H_ * W_, + Dtype(1), bottom_diff, + spatial_sum_multiplier_.cpu_data(), Dtype(0), + spatial_mean_.mutable_cpu_data()); + // EX across batch + caffe_cpu_gemv(CblasTrans, N_, C_, Dtype(1), + spatial_mean_.cpu_data(), + batch_sum_multiplier_.cpu_data(), Dtype(0), + batch_mean_.mutable_cpu_data()); + + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, + N_, C_, 1, Dtype(1), + batch_sum_multiplier_.cpu_data(), + batch_mean_.cpu_data(), Dtype(0), + spatial_mean_.mutable_cpu_data()); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(1), + spatial_mean_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + bottom_diff); + + caffe_mul(buffer_blob_.count(), + x_norm_.cpu_data(), bottom_diff, bottom_diff); + + // EX across spatial + caffe_cpu_gemv(CblasNoTrans, N_ * C_, + H_ * W_, Dtype(1), buffer_blob_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + spatial_mean_.mutable_cpu_data()); + 
// EX across batch + caffe_cpu_gemv(CblasTrans, N_, C_, Dtype(1), + spatial_mean_.cpu_data(), + batch_sum_multiplier_.cpu_data(), Dtype(0), + batch_mean_.mutable_cpu_data()); + + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, + N_, C_, 1, Dtype(1), + batch_sum_multiplier_.cpu_data(), + batch_mean_.cpu_data(), Dtype(0), + spatial_mean_.mutable_cpu_data()); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, + N_ * C_, H_ * W_, 1, Dtype(1), + spatial_mean_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(1), bottom_diff); + + caffe_cpu_axpby(buffer_blob_.count(), Dtype(1), + buffer_blob_.cpu_data(), Dtype(-1. / (N_ * H_ * W_)), + bottom_diff); + + // put the squares of bottom into buffer_blob_ + caffe_powx(buffer_blob_.count(), bottom_data, Dtype(2), + buffer_blob_.mutable_cpu_data()); + + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, + N_, C_, 1, Dtype(1), + batch_sum_multiplier_.cpu_data(), + batch_variance_.cpu_data(), Dtype(0), + spatial_variance_.mutable_cpu_data()); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, + N_ * C_, H_ * W_, 1, Dtype(1), + spatial_variance_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + buffer_blob_.mutable_cpu_data()); + + caffe_div(buffer_blob_.count(), bottom_diff, + buffer_blob_.cpu_data(), bottom_diff); + break; + case BNParameter_BNMode_INFERENCE: + // Propagate layer to parameters + // gradient w.r.t. scale + caffe_mul(buffer_blob_.count(), bottom_data, + top_diff, buffer_blob_.mutable_cpu_data()); + // EX across spatial + caffe_cpu_gemv(CblasNoTrans, N_ * C_, + H_ * W_, Dtype(1), buffer_blob_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + spatial_variance_.mutable_cpu_diff()); + // EX across batch + caffe_cpu_gemv(CblasTrans, N_, C_, Dtype(1), + spatial_variance_.cpu_diff(), + batch_sum_multiplier_.cpu_data(), Dtype(0), scale_diff); + + // gradient w.r.t. 
shift + // EX across spatial + caffe_cpu_gemv(CblasNoTrans, N_ * C_, + H_ * W_, Dtype(1), top_diff, + spatial_sum_multiplier_.cpu_data(), + Dtype(0), spatial_mean_.mutable_cpu_diff()); + // EX across batch + caffe_cpu_gemv(CblasTrans, N_, C_, + Dtype(1), spatial_mean_.cpu_diff(), + batch_sum_multiplier_.cpu_data(), + Dtype(0), shift_diff); + + // Propagate down + // put scale * top_diff to buffer_blob_ + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.cpu_data(), scale_data, Dtype(0), + spatial_variance_.mutable_cpu_data()); + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(1), + spatial_variance_.cpu_data(), + spatial_sum_multiplier_.cpu_data(), Dtype(0), + buffer_blob_.mutable_cpu_data()); + caffe_mul(buffer_blob_.count(), top_diff, buffer_blob_.cpu_data(), + bottom_diff); + break; + default: + LOG(FATAL) << "Unknown BN mode."; + } +} +#ifdef CPU_ONLY +STUB_GPU(BNLayer); +#endif + +INSTANTIATE_CLASS(BNLayer); +REGISTER_LAYER_CLASS(BN); +} // namespace caffe diff --git a/src/caffe/layers/bn_layer.cu b/src/caffe/layers/bn_layer.cu new file mode 100644 index 00000000000..e4a93512e19 --- /dev/null +++ b/src/caffe/layers/bn_layer.cu @@ -0,0 +1,325 @@ +/* +notice: +this code is based on the implementation by the following authors. 
+ +ducha-aiki: https://github.com/ducha-aiki +ChenglongChen: https://github.com/ChenglongChen +Russell91: https://github.com/Russell91 +jjkjkj: https://github.com/jjkjkj + +detailed discussion of this implementation can be found at: +https://github.com/BVLC/caffe/pull/1965 +*/ + +#include +#include + +#include "caffe/filler.hpp" +#include "caffe/layer.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/layers/bn_layer.hpp" + +namespace caffe { + +template +void BNLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* const_top_data = top[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + Dtype* spatial_mean_data = spatial_mean_.mutable_gpu_data(); + Dtype* buffer_data = buffer_blob_.mutable_gpu_data(); + const Dtype* const_buffer_data = buffer_blob_.gpu_data(); + + switch (this->layer_param_.bn_param().bn_mode()) { + case BNParameter_BNMode_LEARN: + // put the squares of bottom into buffer_blob_ + caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2), + buffer_blob_.mutable_gpu_data()); + + // computes variance using var(X) = E(X^2) - (EX)^2 + // EX across spatial + caffe_gpu_gemv(CblasNoTrans, N_ * C_, H_ * W_, + Dtype(1. / (H_ * W_)), + bottom_data, spatial_sum_multiplier_.gpu_data(), + Dtype(0), spatial_mean_data); + // EX across batch + caffe_gpu_gemv(CblasTrans, N_, C_, Dtype(1. / N_), + spatial_mean_.gpu_data(), + batch_sum_multiplier_.gpu_data(), Dtype(0), + batch_mean_.mutable_gpu_data()); + + // E(X^2) across spatial + caffe_gpu_gemv(CblasNoTrans, N_ * C_, H_ * W_, + Dtype(1. / (H_ * W_)), buffer_data, + spatial_sum_multiplier_.gpu_data(), Dtype(0), + spatial_variance_.mutable_gpu_data()); + // E(X^2) across batch + caffe_gpu_gemv(CblasTrans, N_, C_, Dtype(1. 
/ N_), + spatial_variance_.gpu_data(), + batch_sum_multiplier_.gpu_data(), Dtype(0), + batch_variance_.mutable_gpu_data()); + + caffe_gpu_powx(batch_mean_.count(), batch_mean_.gpu_data(), + Dtype(2), buffer_blob_.mutable_gpu_data()); // (EX)^2 + caffe_gpu_sub(batch_mean_.count(), batch_variance_.gpu_data(), + buffer_data, batch_variance_.mutable_gpu_data()); // variance + + // save top[1] (batch_mean) and top[2] (batch_variance) + if (top.size() > 1) { + caffe_copy(batch_mean_.count(), batch_mean_.gpu_data(), + top[1]->mutable_gpu_data()); + } + if (top.size() > 2) { + caffe_copy(batch_variance_.count(), batch_variance_.gpu_data(), + top[2]->mutable_gpu_data()); + } + + // do mean and variance normalization + // subtract mean + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.gpu_data(), batch_mean_.gpu_data(), Dtype(0), + spatial_mean_data); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, H_ * W_, + 1, -Dtype(1), + spatial_mean_.gpu_data(), spatial_sum_multiplier_.gpu_data(), Dtype(0), + buffer_blob_.mutable_gpu_data()); + + caffe_gpu_add(buffer_blob_.count(), bottom_data, buffer_data, top_data); + + // normalize variance + caffe_gpu_add_scalar(batch_variance_.count(), var_eps_, + batch_variance_.mutable_gpu_data()); + caffe_gpu_powx(batch_variance_.count(), batch_variance_.gpu_data(), + Dtype(0.5), batch_variance_.mutable_gpu_data()); + + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.gpu_data(), batch_variance_.gpu_data(), Dtype(0), + spatial_variance_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(1), + spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(), + Dtype(0), buffer_blob_.mutable_gpu_data()); + + caffe_gpu_div(buffer_blob_.count(), top_data, buffer_data, top_data); + + // Saving x_norm + caffe_copy(top[0]->count(), const_top_data, x_norm_.mutable_gpu_data()); + + // scale + caffe_gpu_gemm(CblasNoTrans, 
CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.gpu_data(), this->blobs_[0]->gpu_data(), + Dtype(0), spatial_variance_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(1), + spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(), + Dtype(0), buffer_blob_.mutable_gpu_data()); + + caffe_gpu_mul(buffer_blob_.count(), top_data, buffer_data, top_data); + + // shift + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.gpu_data(), + this->blobs_[1]->gpu_data(), Dtype(0), + spatial_mean_data); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, H_ * W_, 1, + Dtype(1), + spatial_mean_.gpu_data(), spatial_sum_multiplier_.gpu_data(), Dtype(0), + buffer_blob_.mutable_gpu_data()); + caffe_gpu_add(buffer_blob_.count(), top_data, buffer_data, top_data); + break; + case BNParameter_BNMode_INFERENCE: + // scale + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.gpu_data(), this->blobs_[0]->gpu_data(), + Dtype(0), spatial_variance_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(1), + spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(), + Dtype(0), buffer_blob_.mutable_gpu_data()); + + caffe_gpu_mul(buffer_blob_.count(), bottom_data, buffer_data, top_data); + + // shift + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.gpu_data(), + this->blobs_[1]->gpu_data(), Dtype(0), + spatial_mean_data); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, H_ * W_, 1, + Dtype(1), + spatial_mean_.gpu_data(), spatial_sum_multiplier_.gpu_data(), Dtype(0), + buffer_blob_.mutable_gpu_data()); + caffe_gpu_add(buffer_blob_.count(), top_data, buffer_data, top_data); + break; + default: + LOG(FATAL) << "Unknown BN mode."; + } +} + +template +void BNLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + const Dtype* 
top_diff = top[0]->gpu_diff(); + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* const_bottom_diff = bottom[0]->gpu_diff(); + Dtype* spatial_mean_data = spatial_mean_.mutable_gpu_data(); + Dtype* buffer_data = buffer_blob_.mutable_gpu_data(); + const Dtype* const_buffer_data = buffer_blob_.gpu_data(); + + switch (this->layer_param_.bn_param().bn_mode()) { + case BNParameter_BNMode_LEARN: + // Propagate to layer params + // gradient w.r.t. scale + caffe_gpu_mul(buffer_blob_.count(), x_norm_.gpu_data(), + top_diff, buffer_blob_.mutable_gpu_data()); + // EX across spatial + caffe_gpu_gemv(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1), + buffer_data, spatial_sum_multiplier_.gpu_data(), Dtype(0), + spatial_variance_.mutable_gpu_data()); + // EX across batch + caffe_gpu_gemv(CblasTrans, N_, C_, Dtype(1), + spatial_variance_.gpu_data(), + batch_sum_multiplier_.gpu_data(), Dtype(0), + this->blobs_[0]->mutable_gpu_diff()); + + // gradient w.r.t. 
shift + // EX across spatial + caffe_gpu_gemv(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1), + top_diff, spatial_sum_multiplier_.gpu_data(), + Dtype(0), spatial_mean_data); + // EX across batch + caffe_gpu_gemv(CblasTrans, N_, C_, Dtype(1), + spatial_mean_.gpu_data(), + batch_sum_multiplier_.gpu_data(), Dtype(0), + this->blobs_[1]->mutable_gpu_diff()); + + // Propagate down + // scale top diff + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.gpu_data(), this->blobs_[0]->gpu_data(), + Dtype(0), spatial_variance_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(1), + spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(), + Dtype(0), + buffer_blob_.mutable_gpu_data()); + caffe_gpu_mul(buffer_blob_.count(), top_diff, buffer_data, + buffer_blob_.mutable_gpu_data()); + + // use new top diff for computation + caffe_gpu_mul(buffer_blob_.count(), x_norm_.gpu_data(), + buffer_data, bottom_diff); + // EX across spatial + caffe_gpu_gemv(CblasNoTrans, N_ * C_, H_ * W_, + Dtype(1), bottom_diff, + spatial_sum_multiplier_.gpu_data(), Dtype(0), spatial_mean_data); + // EX across batch + caffe_gpu_gemv(CblasTrans, N_, C_, Dtype(1), + spatial_mean_.gpu_data(), + batch_sum_multiplier_.gpu_data(), Dtype(0), + batch_mean_.mutable_gpu_data()); + + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.gpu_data(), + batch_mean_.gpu_data(), Dtype(0), + spatial_mean_data); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(1), spatial_mean_.gpu_data(), + spatial_sum_multiplier_.gpu_data(), Dtype(0), + bottom_diff); + + caffe_gpu_mul(buffer_blob_.count(), x_norm_.gpu_data(), + bottom_diff, bottom_diff); + + // EX across spatial + caffe_gpu_gemv(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1), + buffer_data, spatial_sum_multiplier_.gpu_data(), + Dtype(0), spatial_mean_data); + + // EX across batch + caffe_gpu_gemv(CblasTrans, N_, C_, Dtype(1), + 
spatial_mean_.gpu_data(), + batch_sum_multiplier_.gpu_data(), Dtype(0), + batch_mean_.mutable_gpu_data()); + + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_, + C_, 1, Dtype(1), + batch_sum_multiplier_.gpu_data(), + batch_mean_.gpu_data(), Dtype(0), + spatial_mean_data); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(1), + spatial_mean_.gpu_data(), spatial_sum_multiplier_.gpu_data(), + Dtype(1), + bottom_diff); + + caffe_gpu_axpby(buffer_blob_.count(), Dtype(1), buffer_data, + Dtype(-1. / (N_ * H_ * W_)), + bottom_diff); + + // put the squares of bottom into buffer_blob_ + caffe_gpu_powx(buffer_blob_.count(), bottom_data, Dtype(2), + buffer_blob_.mutable_gpu_data()); + + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.gpu_data(), batch_variance_.gpu_data(), Dtype(0), + spatial_variance_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(1), + spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(), + Dtype(0), + buffer_blob_.mutable_gpu_data()); + + caffe_gpu_div(buffer_blob_.count(), const_bottom_diff, + const_buffer_data, bottom_diff); + break; + case BNParameter_BNMode_INFERENCE: + // Propagate to layer params + // gradient w.r.t. scale + caffe_gpu_mul(buffer_blob_.count(), bottom_data, + top_diff, buffer_blob_.mutable_gpu_data()); + // EX across spatial + caffe_gpu_gemv(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1), + buffer_data, spatial_sum_multiplier_.gpu_data(), Dtype(0), + spatial_variance_.mutable_gpu_data()); + // EX across batch + caffe_gpu_gemv(CblasTrans, N_, C_, Dtype(1), + spatial_variance_.gpu_data(), + batch_sum_multiplier_.gpu_data(), Dtype(0), + this->blobs_[0]->mutable_gpu_diff()); + + // gradient w.r.t. 
shift + // EX across spatial + caffe_gpu_gemv(CblasNoTrans, N_ * C_, H_ * W_, Dtype(1), + top_diff, spatial_sum_multiplier_.gpu_data(), + Dtype(0), spatial_mean_data); + // EX across batch + caffe_gpu_gemv(CblasTrans, N_, C_, Dtype(1), + spatial_mean_.gpu_data(), + batch_sum_multiplier_.gpu_data(), Dtype(0), + this->blobs_[1]->mutable_gpu_diff()); + + // Propagate down + // scale top diff + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_, C_, 1, Dtype(1), + batch_sum_multiplier_.gpu_data(), this->blobs_[0]->gpu_data(), + Dtype(0), spatial_variance_.mutable_gpu_data()); + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, N_ * C_, + H_ * W_, 1, Dtype(1), + spatial_variance_.gpu_data(), spatial_sum_multiplier_.gpu_data(), + Dtype(0), + buffer_blob_.mutable_gpu_data()); + caffe_gpu_mul(buffer_blob_.count(), top_diff, buffer_data, + bottom_diff); + break; + default: + LOG(FATAL) << "Unknown BN mode."; + } +} + +INSTANTIATE_LAYER_GPU_FUNCS(BNLayer); + +} // namespace caffe diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 5469acc3b03..22e6a48c8fe 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -308,7 +308,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. 
// -// LayerParameter next available layer-specific ID: 148 (last added: recursive_conv_param) +// LayerParameter next available layer-specific ID: 149 (last added: bn_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -364,6 +364,7 @@ message LayerParameter { optional ArgMaxParameter argmax_param = 103; optional BatchNormParameter batch_norm_param = 139; optional BiasParameter bias_param = 141; + optional BNParameter bn_param = 148; optional ConcatParameter concat_param = 104; optional ContrastiveLossParameter contrastive_loss_param = 105; optional ConvolutionParameter convolution_param = 106; @@ -491,17 +492,6 @@ message ArgMaxParameter { optional int32 axis = 3; } -message ConcatParameter { - // The axis along which to concatenate -- may be negative to index from the - // end (e.g., -1 for the last axis). Other axes must have the - // same dimension for all the bottom blobs. - // By default, ConcatLayer concatenates blobs along the "channels" axis (1). - optional int32 axis = 2 [default = 1]; - - // DEPRECATED: alias for "axis" -- does not support negative indexing. - optional uint32 concat_dim = 1 [default = 1]; -} - message BatchNormParameter { // If false, normalization is performed over the current mini-batch // and global statistics are accumulated (but not yet used) by a moving @@ -556,6 +546,28 @@ message BiasParameter { optional FillerParameter filler = 3; } +// Message that stores parameters used by BN (Batch Normalization) layer +message BNParameter { + enum BNMode { + LEARN = 0; + INFERENCE = 1; + } + optional BNMode bn_mode = 3 [default = LEARN]; + optional FillerParameter scale_filler = 1; // The filler for the scale + optional FillerParameter shift_filler = 2; // The filler for the shift +} + +message ConcatParameter { + // The axis along which to concatenate -- may be negative to index from the + // end (e.g., -1 for the last axis). 
Other axes must have the + // same dimension for all the bottom blobs. + // By default, ConcatLayer concatenates blobs along the "channels" axis (1). + optional int32 axis = 2 [default = 1]; + + // DEPRECATED: alias for "axis" -- does not support negative indexing. + optional uint32 concat_dim = 1 [default = 1]; +} + message ContrastiveLossParameter { // margin for dissimilar pair optional float margin = 1 [default = 1.0];