Merge pull request BVLC#4 from m1lhaus/master
Merged latest changes from upstream Caffe repo
pmgysel committed Apr 15, 2018
2 parents 131b5b6 + 9c2284c commit ab718bc
Showing 19 changed files with 807 additions and 170 deletions.
2 changes: 1 addition & 1 deletion docs/installation.md
@@ -80,7 +80,7 @@ The main requirements are `numpy` and `boost.python` (provided by boost). `panda

You can install the dependencies with

for req in $(cat requirements.txt); do pip install $req; done
pip install -r requirements.txt

but we suggest first installing the [Anaconda](https://store.continuum.io/cshop/anaconda/) Python distribution, which provides most of the necessary packages, as well as the `hdf5` library dependency.

18 changes: 12 additions & 6 deletions include/caffe/filler.hpp
@@ -108,9 +108,9 @@ class PositiveUnitballFiller : public Filler<Dtype> {
caffe_rng_uniform<Dtype>(blob->count(), 0, 1, blob->mutable_cpu_data());
// We expect the filler to not be called very frequently, so we will
// just use a simple implementation
int dim = blob->count() / blob->num();
int dim = blob->count() / blob->shape(0);
CHECK(dim);
for (int i = 0; i < blob->num(); ++i) {
for (int i = 0; i < blob->shape(0); ++i) {
Dtype sum = 0;
for (int j = 0; j < dim; ++j) {
sum += data[i * dim + j];
@@ -147,8 +147,11 @@ class XavierFiller : public Filler<Dtype> {
: Filler<Dtype>(param) {}
virtual void Fill(Blob<Dtype>* blob) {
CHECK(blob->count());
int fan_in = blob->count() / blob->num();
int fan_out = blob->count() / blob->channels();
int fan_in = blob->count() / blob->shape(0);
// Compatibility with ND blobs
int fan_out = blob->num_axes() > 1 ?
blob->count() / blob->shape(1) :
blob->count();
Dtype n = fan_in; // default to fan_in
if (this->filler_param_.variance_norm() ==
FillerParameter_VarianceNorm_AVERAGE) {
@@ -189,8 +192,11 @@ class MSRAFiller : public Filler<Dtype> {
: Filler<Dtype>(param) {}
virtual void Fill(Blob<Dtype>* blob) {
CHECK(blob->count());
int fan_in = blob->count() / blob->num();
int fan_out = blob->count() / blob->channels();
int fan_in = blob->count() / blob->shape(0);
// Compatibility with ND blobs
int fan_out = blob->num_axes() > 1 ?
blob->count() / blob->shape(1) :
blob->count();
Dtype n = fan_in; // default to fan_in
if (this->filler_param_.variance_norm() ==
FillerParameter_VarianceNorm_AVERAGE) {
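
Replacing the legacy num()/channels() accessors with shape(0)/shape(1) lets XavierFiller and MSRAFiller work on blobs with an arbitrary number of axes. A rough NumPy sketch of the fan computation these hunks implement (the shapes below are illustrative, not taken from the diff):

import numpy as np

def fan_in_out(shape):
    # Mirrors the updated filler logic: fan_in divides the element count by
    # shape[0]; fan_out divides by shape[1] when a second axis exists.
    count = int(np.prod(shape))
    fan_in = count // shape[0]
    fan_out = count // shape[1] if len(shape) > 1 else count
    return fan_in, fan_out

# 4-D convolution weight blob (output channels, input channels, kH, kW):
print(fan_in_out((64, 32, 3, 3)))  # (288, 576)
# 1-D blob: no channels axis is required anymore
print(fan_in_out((10,)))           # (1, 10)
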
96 changes: 96 additions & 0 deletions include/caffe/layers/swish_layer.hpp
@@ -0,0 +1,96 @@
#ifndef CAFFE_SWISH_LAYER_HPP_
#define CAFFE_SWISH_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/neuron_layer.hpp"
#include "caffe/layers/sigmoid_layer.hpp"

namespace caffe {

/**
* @brief Swish non-linearity @f$ y = x \sigma (\beta x) @f$.
* A novel activation function that tends to work better than ReLU [1].
*
* [1] Prajit Ramachandran, Barret Zoph, Quoc V. Le. "Searching for
* Activation Functions". arXiv preprint arXiv:1710.05941v2 (2017).
*/
template <typename Dtype>
class SwishLayer : public NeuronLayer<Dtype> {
public:
/**
* @param param provides SwishParameter swish_param,
* with SwishLayer options:
* - beta (\b optional, default 1).
* the value @f$ \beta @f$ in the @f$ y = x \sigma (\beta x) @f$.
*/
explicit SwishLayer(const LayerParameter& param)
: NeuronLayer<Dtype>(param),
sigmoid_layer_(new SigmoidLayer<Dtype>(param)),
sigmoid_input_(new Blob<Dtype>()),
sigmoid_output_(new Blob<Dtype>()) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);

virtual inline const char* type() const { return "Swish"; }

protected:
/**
* @param bottom input Blob vector (length 1)
* -# @f$ (N \times C \times H \times W) @f$
* the inputs @f$ x @f$
* @param top output Blob vector (length 1)
* -# @f$ (N \times C \times H \times W) @f$
* the computed outputs @f$
* y = x \sigma (\beta x)
* @f$.
*/
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);

/**
* @brief Computes the error gradient w.r.t. the sigmoid inputs.
*
* @param top output Blob vector (length 1), providing the error gradient with
* respect to the outputs
* -# @f$ (N \times C \times H \times W) @f$
* containing error gradients @f$ \frac{\partial E}{\partial y} @f$
* with respect to computed outputs @f$ y @f$
* @param propagate_down see Layer::Backward.
* @param bottom input Blob vector (length 1)
* -# @f$ (N \times C \times H \times W) @f$
* the inputs @f$ x @f$; Backward fills their diff with
* gradients @f$
* \frac{\partial E}{\partial x}
* = \frac{\partial E}{\partial y}(\beta y +
* \sigma (\beta x)(1 - \beta y))
* @f$ if propagate_down[0]
*/
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

/// The internal SigmoidLayer
shared_ptr<SigmoidLayer<Dtype> > sigmoid_layer_;
/// sigmoid_input_ stores the input of the SigmoidLayer.
shared_ptr<Blob<Dtype> > sigmoid_input_;
/// sigmoid_output_ stores the output of the SigmoidLayer.
shared_ptr<Blob<Dtype> > sigmoid_output_;
/// bottom vector holder to call the underlying SigmoidLayer::Forward
vector<Blob<Dtype>*> sigmoid_bottom_vec_;
/// top vector holder to call the underlying SigmoidLayer::Forward
vector<Blob<Dtype>*> sigmoid_top_vec_;
};

} // namespace caffe

#endif // CAFFE_SWISH_LAYER_HPP_
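
The header above documents both the forward mapping y = x * sigma(beta * x) and the gradient that Backward fills in. A small NumPy sketch of those two formulas, useful for checking them numerically (a standalone sketch, not part of the committed code):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def swish_forward(x, beta=1.0):
    # y = x * sigmoid(beta * x)
    return x * sigmoid(beta * x)

def swish_backward(x, top_diff, beta=1.0):
    # dE/dx = dE/dy * (beta*y + sigmoid(beta*x) * (1 - beta*y))
    y = swish_forward(x, beta)
    s = sigmoid(beta * x)
    return top_diff * (beta * y + s * (1.0 - beta * y))

x = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])
print(swish_forward(x))                    # Swish with the default beta = 1
print(swish_backward(x, np.ones_like(x)))  # gradient of sum(y) w.r.t. x
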
144 changes: 107 additions & 37 deletions python/caffe/draw.py
@@ -59,18 +59,60 @@ def get_edge_label(layer):
return edge_label


def get_layer_label(layer, rankdir):
def get_layer_lr_mult(layer):
"""Get the learning rate multipliers.
Get the learning rate multipliers for the given layer. Assumes a
Convolution/Deconvolution/InnerProduct layer.
Parameters
----------
layer : caffe_pb2.LayerParameter
A Convolution, Deconvolution, or InnerProduct layer.
Returns
-------
learning_rates : tuple of floats
the learning rate multipliers for the weights and biases.
"""
if layer.type not in ['Convolution', 'Deconvolution', 'InnerProduct']:
raise ValueError("%s layers do not have a "
"learning rate multiplier" % layer.type)

if not hasattr(layer, 'param'):
return (1.0, 1.0)

params = getattr(layer, 'param')

if len(params) == 0:
return (1.0, 1.0)

if len(params) == 1:
lrm0 = getattr(params[0],'lr_mult', 1.0)
return (lrm0, 1.0)

if len(params) == 2:
lrm0, lrm1 = [getattr(p,'lr_mult', 1.0) for p in params]
return (lrm0, lrm1)

raise ValueError("Could not parse the learning rate multiplier")


def get_layer_label(layer, rankdir, display_lrm=False):
"""Define node label based on layer type.
Parameters
----------
layer : ?
layer : caffe_pb2.LayerParameter
rankdir : {'LR', 'TB', 'BT'}
Direction of graph layout.
display_lrm : boolean, optional
If True include the learning rate multipliers in the label (default is
False).
Returns
-------
string :
node_label : string
A label for the current layer
"""

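A minimal sketch of how the new get_layer_lr_mult helper from the hunk above behaves, assuming pycaffe and its compiled protos are importable; the layer names are made up for illustration:

from caffe.proto import caffe_pb2
from caffe.draw import get_layer_lr_mult

conv = caffe_pb2.LayerParameter(name='conv1', type='Convolution')
conv.param.add(lr_mult=1.0)   # weights
conv.param.add(lr_mult=2.0)   # biases
print(get_layer_lr_mult(conv))   # (1.0, 2.0)

fc = caffe_pb2.LayerParameter(name='fc1', type='InnerProduct')
print(get_layer_lr_mult(fc))     # no param entries -> defaults to (1.0, 1.0)

try:
    get_layer_lr_mult(caffe_pb2.LayerParameter(name='relu1', type='ReLU'))
except ValueError as e:
    print(e)   # "ReLU layers do not have a learning rate multiplier"
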
@@ -81,36 +123,54 @@ def get_layer_label(layer, rankdir):
else:
# If graph orientation is horizontal, vertical space is free and
# horizontal space is not; separate words with newlines
separator = '\\n'

if layer.type == 'Convolution' or layer.type == 'Deconvolution':
# Outer double quotes needed or else colon characters don't parse
# properly
node_label = '"%s%s(%s)%skernel size: %d%sstride: %d%spad: %d"' %\
(layer.name,
separator,
layer.type,
separator,
layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size) else 1,
separator,
layer.convolution_param.stride[0] if len(layer.convolution_param.stride) else 1,
separator,
layer.convolution_param.pad[0] if len(layer.convolution_param.pad) else 0)
elif layer.type == 'Pooling':
separator = r'\n'

# Initializes a list of descriptors that will be concatenated into the
# `node_label`
descriptors_list = []
# Add the layer's name
descriptors_list.append(layer.name)
# Add layer's type
if layer.type == 'Pooling':
pooling_types_dict = get_pooling_types_dict()
node_label = '"%s%s(%s %s)%skernel size: %d%sstride: %d%spad: %d"' %\
(layer.name,
separator,
pooling_types_dict[layer.pooling_param.pool],
layer.type,
separator,
layer.pooling_param.kernel_size,
separator,
layer.pooling_param.stride,
separator,
layer.pooling_param.pad)
layer_type = '(%s %s)' % (layer.type,
pooling_types_dict[layer.pooling_param.pool])
else:
node_label = '"%s%s(%s)"' % (layer.name, separator, layer.type)
layer_type = '(%s)' % layer.type
descriptors_list.append(layer_type)

# Describe parameters for spatial operation layers
if layer.type in ['Convolution', 'Deconvolution', 'Pooling']:
if layer.type == 'Pooling':
kernel_size = layer.pooling_param.kernel_size
stride = layer.pooling_param.stride
padding = layer.pooling_param.pad
else:
kernel_size = layer.convolution_param.kernel_size[0] if \
len(layer.convolution_param.kernel_size) else 1
stride = layer.convolution_param.stride[0] if \
len(layer.convolution_param.stride) else 1
padding = layer.convolution_param.pad[0] if \
len(layer.convolution_param.pad) else 0
spatial_descriptor = separator.join([
"kernel size: %d" % kernel_size,
"stride: %d" % stride,
"pad: %d" % padding,
])
descriptors_list.append(spatial_descriptor)

# Add LR multiplier for learning layers
if display_lrm and layer.type in ['Convolution', 'Deconvolution', 'InnerProduct']:
lrm0, lrm1 = get_layer_lr_mult(layer)
if any([lrm0, lrm1]):
lr_mult = "lr mult: %.1f, %.1f" % (lrm0, lrm1)
descriptors_list.append(lr_mult)

# Concatenate the descriptors into one label
node_label = separator.join(descriptors_list)
# Outer double quotes needed or else colon characters don't parse
# properly
node_label = '"%s"' % node_label
return node_label
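
To make the refactored labelling concrete: for a hypothetical 3x3 convolution with distinct weight and bias multipliers, the descriptor list above would yield roughly the following label (a sketch assuming pycaffe is importable; the expected string is inferred from the code, not captured output):

from caffe.proto import caffe_pb2
from caffe.draw import get_layer_label

conv = caffe_pb2.LayerParameter(name='conv1', type='Convolution')
conv.convolution_param.kernel_size.append(3)
conv.convolution_param.stride.append(1)
conv.convolution_param.pad.append(1)
conv.param.add(lr_mult=1.0)
conv.param.add(lr_mult=2.0)

# With a horizontal layout the descriptors are joined by a literal '\n',
# which graphviz renders as a line break inside the node:
print(get_layer_label(conv, 'LR', display_lrm=True))
# "conv1\n(Convolution)\nkernel size: 3\nstride: 1\npad: 1\nlr mult: 1.0, 2.0"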


@@ -127,7 +187,7 @@ def choose_color_by_layertype(layertype):
return color


def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None):
def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None, display_lrm=False):
"""Create a data structure which represents the `caffe_net`.
Parameters
@@ -140,6 +200,9 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None):
phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional
Include layers from this network phase. If None, include all layers.
(the default is None)
display_lrm : boolean, optional
If True display the learning rate multipliers when relevant (default is
False).
Returns
-------
@@ -164,7 +227,7 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None):
included = included and not layer_phase.phase == phase
if not included:
continue
node_label = get_layer_label(layer, rankdir)
node_label = get_layer_label(layer, rankdir, display_lrm=display_lrm)
node_name = "%s_%s" % (layer.name, layer.type)
if (len(layer.bottom) == 1 and len(layer.top) == 1 and
layer.bottom[0] == layer.top[0]):
@@ -202,7 +265,7 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None):
return pydot_graph


def draw_net(caffe_net, rankdir, ext='png', phase=None):
def draw_net(caffe_net, rankdir, ext='png', phase=None, display_lrm=False):
"""Draws a caffe net and returns the image string encoded using the given
extension.
@@ -214,16 +277,20 @@ def draw_net(caffe_net, rankdir, ext='png', phase=None):
phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional
Include layers from this network phase. If None, include all layers.
(the default is None)
display_lrm : boolean, optional
If True display the learning rate multipliers for the learning layers
(default is False).
Returns
-------
string :
Postscript representation of the graph.
"""
return get_pydot_graph(caffe_net, rankdir, phase=phase).create(format=ext)
return get_pydot_graph(caffe_net, rankdir, phase=phase,
display_lrm=display_lrm).create(format=ext)


def draw_net_to_file(caffe_net, filename, rankdir='LR', phase=None):
def draw_net_to_file(caffe_net, filename, rankdir='LR', phase=None, display_lrm=False):
"""Draws a caffe net, and saves it to file using the format given as the
file extension. Use '.raw' to output raw text that you can manually feed
to graphviz to draw graphs.
@@ -238,7 +305,10 @@ def draw_net_to_file(caffe_net, filename, rankdir='LR', phase=None):
phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional
Include layers from this network phase. If None, include all layers.
(the default is None)
display_lrm : boolean, optional
If True display the learning rate multipliers for the learning layers
(default is False).
"""
ext = filename[filename.rfind('.')+1:]
with open(filename, 'wb') as fid:
fid.write(draw_net(caffe_net, rankdir, ext, phase))
fid.write(draw_net(caffe_net, rankdir, ext, phase, display_lrm))
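
End to end, the new display_lrm keyword threads through get_pydot_graph, draw_net, and draw_net_to_file. A hedged usage sketch mirroring what python/draw_net.py does (the prototxt and output paths are placeholders):

import caffe.draw
from caffe.proto import caffe_pb2
from google.protobuf import text_format

net = caffe_pb2.NetParameter()
with open('train_val.prototxt') as f:   # placeholder path
    text_format.Merge(f.read(), net)

# Render left to right and annotate learning layers with their lr_mult values.
caffe.draw.draw_net_to_file(net, 'net.png', rankdir='LR', display_lrm=True)
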
6 changes: 5 additions & 1 deletion python/draw_net.py
@@ -33,6 +33,10 @@ def parse_args():
'TEST, or ALL. If ALL, then all layers are drawn '
'regardless of phase.'),
default="ALL")
parser.add_argument('--display_lrm', action='store_true',
help=('Use this flag to visualize the learning rate '
'multiplier, when non-zero, for the learning '
'layers (Convolution, Deconvolution, InnerProduct).'))

args = parser.parse_args()
return args
@@ -51,7 +55,7 @@ def main():
elif args.phase != "ALL":
raise ValueError("Unknown phase: " + args.phase)
caffe.draw.draw_net_to_file(net, args.output_image_file, args.rankdir,
phase)
phase, args.display_lrm)


if __name__ == '__main__':
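
With the flag wired into the parser, the annotated drawing can be produced from the command line with something along the lines of `python python/draw_net.py train_val.prototxt net.png --rankdir LR --display_lrm` (file names here are illustrative).
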
5 changes: 5 additions & 0 deletions src/caffe/layers/embed_layer.cu
@@ -15,6 +15,11 @@ __global__ void EmbedForward(const int nthreads, const Dtype* bottom_data,
const int n = top_index / N;
const int d = top_index % N;
const int index = static_cast<int>(bottom_data[n]);
#ifdef DEBUG
assert(index >= 0);
assert(index < K);
assert(static_cast<Dtype>(index) == bottom_data[n]);
#endif
const int weight_index = index * N + d;
top_data[top_index] = weight[weight_index];
}
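
The asserts above are compiled only when DEBUG is defined and guard the embedding lookup against out-of-range or fractional indices. A host-side NumPy sketch of the same checks (illustrative only, not part of the CUDA kernel):

import numpy as np

def embed_forward(bottom_data, weight):
    # bottom_data holds one vocabulary index per input; weight is K x N.
    K, N = weight.shape
    indices = bottom_data.astype(int)
    # The same conditions the DEBUG-only device asserts verify per thread:
    assert np.all(indices >= 0) and np.all(indices < K), "index out of range"
    assert np.all(indices == bottom_data), "indices must be whole numbers"
    return weight[indices]   # one N-dimensional embedding row per index

weight = np.arange(12, dtype=np.float32).reshape(4, 3)   # K=4, N=3
print(embed_forward(np.array([0.0, 2.0, 3.0]), weight))
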
(Diffs for the remaining 13 changed files are not shown.)
