diff --git a/docs/tutorial/data.md b/docs/tutorial/data.md index 40605f7cd73..3bf7d932eda 100644 --- a/docs/tutorial/data.md +++ b/docs/tutorial/data.md @@ -10,15 +10,15 @@ New input types are supported by developing a new data layer -- the rest of the This data layer definition - layers { + layer { name: "mnist" - # DATA layer loads leveldb or lmdb storage DBs for high-throughput. - type: DATA + # Data layer loads leveldb or lmdb storage DBs for high-throughput. + type: "Data" # the 1st top is the data itself: the name is only convention top: "data" # the 2nd top is the ground truth: the name is only convention top: "label" - # the DATA layer configuration + # the Data layer configuration data_param { # path to the DB source: "examples/mnist/mnist_train_lmdb" @@ -46,9 +46,9 @@ The (data, label) pairing is a convenience for classification models. **Transformations**: data preprocessing is parametrized by transformation messages within the data layer definition. - layers { + layer { name: "data" - type: DATA + type: "Data" [...] transform_param { scale: 0.1 diff --git a/docs/tutorial/layers.md b/docs/tutorial/layers.md index 839939f5ad6..ff2ee491244 100644 --- a/docs/tutorial/layers.md +++ b/docs/tutorial/layers.md @@ -23,7 +23,7 @@ In contrast, other layers (with few exceptions) ignore the spatial structure of #### Convolution -* LayerType: `CONVOLUTION` +* Layer type: `Convolution` * CPU implementation: `./src/caffe/layers/convolution_layer.cpp` * CUDA GPU implementation: `./src/caffe/layers/convolution_layer.cu` * Parameters (`ConvolutionParameter convolution_param`) @@ -43,15 +43,15 @@ In contrast, other layers (with few exceptions) ignore the spatial structure of - `n * c_o * h_o * w_o`, where `h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1` and `w_o` likewise. * Sample (as seen in `./examples/imagenet/imagenet_train_val.prototxt`) - layers { + layer { name: "conv1" - type: CONVOLUTION + type: "Convolution" bottom: "data" top: "conv1" - blobs_lr: 1 # learning rate multiplier for the filters - blobs_lr: 2 # learning rate multiplier for the biases - weight_decay: 1 # weight decay multiplier for the filters - weight_decay: 0 # weight decay multiplier for the biases + # learning rate and decay multipliers for the filters + param { lr_mult: 1 decay_mult: 1 } + # learning rate and decay multipliers for the biases + param { lr_mult: 2 decay_mult: 0 } convolution_param { num_output: 96 # learn 96 filters kernel_size: 11 # each filter is 11x11 @@ -67,11 +67,11 @@ In contrast, other layers (with few exceptions) ignore the spatial structure of } } -The `CONVOLUTION` layer convolves the input image with a set of learnable filters, each producing one feature map in the output image. +The `Convolution` layer convolves the input image with a set of learnable filters, each producing one feature map in the output image. #### Pooling -* LayerType: `POOLING` +* Layer type: `Pooling` * CPU implementation: `./src/caffe/layers/pooling_layer.cpp` * CUDA GPU implementation: `./src/caffe/layers/pooling_layer.cu` * Parameters (`PoolingParameter pooling_param`) @@ -87,9 +87,9 @@ The `CONVOLUTION` layer convolves the input image with a set of learnable filter - `n * c * h_o * w_o`, where h_o and w_o are computed in the same way as convolution. 
* Sample (as seen in `./examples/imagenet/imagenet_train_val.prototxt`) - layers { + layer { name: "pool1" - type: POOLING + type: "Pooling" bottom: "conv1" top: "pool1" pooling_param { @@ -101,7 +101,7 @@ The `CONVOLUTION` layer convolves the input image with a set of learnable filter #### Local Response Normalization (LRN) -* LayerType: `LRN` +* Layer type: `LRN` * CPU Implementation: `./src/caffe/layers/lrn_layer.cpp` * CUDA GPU Implementation: `./src/caffe/layers/lrn_layer.cu` * Parameters (`LRNParameter lrn_param`) @@ -115,7 +115,7 @@ The local response normalization layer performs a kind of "lateral inhibition" b #### im2col -`IM2COL` is a helper for doing the image-to-column transformation that you most likely do not need to know about. This is used in Caffe's original convolution to do matrix multiplication by laying out all patches into a matrix. +`Im2col` is a helper for doing the image-to-column transformation that you most likely do not need to know about. This is used in Caffe's original convolution to do matrix multiplication by laying out all patches into a matrix. ### Loss Layers @@ -123,19 +123,19 @@ Loss drives learning by comparing an output to a target and assigning cost to mi #### Softmax -* LayerType: `SOFTMAX_LOSS` +* Layer type: `SoftmaxWithLoss` The softmax loss layer computes the multinomial logistic loss of the softmax of its inputs. It's conceptually identical to a softmax layer followed by a multinomial logistic loss layer, but provides a more numerically stable gradient. #### Sum-of-Squares / Euclidean -* LayerType: `EUCLIDEAN_LOSS` +* Layer type: `EuclideanLoss` The Euclidean loss layer computes the sum of squares of differences of its two inputs, $$\frac 1 {2N} \sum_{i=1}^N \| x^1_i - x^2_i \|_2^2$$. #### Hinge / Margin -* LayerType: `HINGE_LOSS` +* Layer type: `HingeLoss` * CPU implementation: `./src/caffe/layers/hinge_loss_layer.cpp` * CUDA GPU implementation: none yet * Parameters (`HingeLossParameter hinge_loss_param`) @@ -149,17 +149,17 @@ The Euclidean loss layer computes the sum of squares of differences of its two i * Samples # L1 Norm - layers { + layer { name: "loss" - type: HINGE_LOSS + type: "HingeLoss" bottom: "pred" bottom: "label" } # L2 Norm - layers { + layer { name: "loss" - type: HINGE_LOSS + type: "HingeLoss" bottom: "pred" bottom: "label" top: "loss" @@ -172,15 +172,15 @@ The hinge loss layer computes a one-vs-all hinge or squared hinge loss. #### Sigmoid Cross-Entropy -`SIGMOID_CROSS_ENTROPY_LOSS` +`SigmoidCrossEntropyLoss` #### Infogain -`INFOGAIN_LOSS` +`InfogainLoss` #### Accuracy and Top-k -`ACCURACY` scores the output as the accuracy of output with respect to target -- it is not actually a loss and has no backward step. +`Accuracy` scores the output as the accuracy of output with respect to target -- it is not actually a loss and has no backward step. ### Activation / Neuron Layers @@ -193,7 +193,7 @@ In general, activation / Neuron layers are element-wise operators, taking one bo #### ReLU / Rectified-Linear and Leaky-ReLU -* LayerType: `RELU` +* Layer type: `ReLU` * CPU implementation: `./src/caffe/layers/relu_layer.cpp` * CUDA GPU implementation: `./src/caffe/layers/relu_layer.cu` * Parameters (`ReLUParameter relu_param`) @@ -201,66 +201,66 @@ In general, activation / Neuron layers are element-wise operators, taking one bo - `negative_slope` [default 0]: specifies whether to leak the negative part by multiplying it with the slope value rather than setting it to 0. 
* Sample (as seen in `./examples/imagenet/imagenet_train_val.prototxt`) - layers { + layer { name: "relu1" - type: RELU + type: "ReLU" bottom: "conv1" top: "conv1" } -Given an input value x, The `RELU` layer computes the output as x if x > 0 and negative_slope * x if x <= 0. When the negative slope parameter is not set, it is equivalent to the standard ReLU function of taking max(x, 0). It also supports in-place computation, meaning that the bottom and the top blob could be the same to preserve memory consumption. +Given an input value x, The `ReLU` layer computes the output as x if x > 0 and negative_slope * x if x <= 0. When the negative slope parameter is not set, it is equivalent to the standard ReLU function of taking max(x, 0). It also supports in-place computation, meaning that the bottom and the top blob could be the same to preserve memory consumption. #### Sigmoid -* LayerType: `SIGMOID` +* Layer type: `Sigmoid` * CPU implementation: `./src/caffe/layers/sigmoid_layer.cpp` * CUDA GPU implementation: `./src/caffe/layers/sigmoid_layer.cu` * Sample (as seen in `./examples/imagenet/mnist_autoencoder.prototxt`) - layers { + layer { name: "encode1neuron" bottom: "encode1" top: "encode1neuron" - type: SIGMOID + type: "Sigmoid" } -The `SIGMOID` layer computes the output as sigmoid(x) for each input element x. +The `Sigmoid` layer computes the output as sigmoid(x) for each input element x. #### TanH / Hyperbolic Tangent -* LayerType: `TANH` +* Layer type: `TanH` * CPU implementation: `./src/caffe/layers/tanh_layer.cpp` * CUDA GPU implementation: `./src/caffe/layers/tanh_layer.cu` * Sample - layers { + layer { name: "layer" bottom: "in" top: "out" - type: TANH + type: "TanH" } -The `TANH` layer computes the output as tanh(x) for each input element x. +The `TanH` layer computes the output as tanh(x) for each input element x. #### Absolute Value -* LayerType: `ABSVAL` +* Layer type: `AbsVal` * CPU implementation: `./src/caffe/layers/absval_layer.cpp` * CUDA GPU implementation: `./src/caffe/layers/absval_layer.cu` * Sample - layers { + layer { name: "layer" bottom: "in" top: "out" - type: ABSVAL + type: "AbsVal" } -The `ABSVAL` layer computes the output as abs(x) for each input element x. +The `AbsVal` layer computes the output as abs(x) for each input element x. #### Power -* LayerType: `POWER` +* Layer type: `Power` * CPU implementation: `./src/caffe/layers/power_layer.cpp` * CUDA GPU implementation: `./src/caffe/layers/power_layer.cu` * Parameters (`PowerParameter power_param`) @@ -270,11 +270,11 @@ The `ABSVAL` layer computes the output as abs(x) for each input element x. - `shift` [default 0] * Sample - layers { + layer { name: "layer" bottom: "in" top: "out" - type: POWER + type: "Power" power_param { power: 1 scale: 1 @@ -282,16 +282,16 @@ The `ABSVAL` layer computes the output as abs(x) for each input element x. } } -The `POWER` layer computes the output as (shift + scale * x) ^ power for each input element x. +The `Power` layer computes the output as (shift + scale * x) ^ power for each input element x. 
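
For intuition, here is a minimal NumPy sketch of the element-wise computation described above (an illustration of the formula only, not Caffe's implementation; the function name is made up):

    import numpy as np

    def power_forward(x, power=1.0, scale=1.0, shift=0.0):
        """Element-wise y = (shift + scale * x) ** power, as in PowerParameter."""
        return np.power(shift + scale * x, power)

    x = np.array([-1.0, 0.5, 2.0])
    print(power_forward(x, power=2.0))   # [ 1.    0.25  4.  ]
    print(power_forward(x))              # defaults reproduce the identity: [-1.   0.5  2. ]

With the default `power: 1`, `scale: 1`, `shift: 0` the layer passes its input through unchanged, which is why the sample above is a no-op as written.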
#### BNLL -* LayerType: `BNLL` +* Layer type: `BNLL` * CPU implementation: `./src/caffe/layers/bnll_layer.cpp` * CUDA GPU implementation: `./src/caffe/layers/bnll_layer.cu` * Sample - layers { + layer { name: "layer" bottom: "in" top: "out" @@ -309,7 +309,7 @@ Common input preprocessing (mean subtraction, scaling, random cropping, and mirr #### Database -* LayerType: `DATA` +* Layer type: `Data` * Parameters - Required - `source`: the name of the directory containing the database @@ -322,7 +322,7 @@ Common input preprocessing (mean subtraction, scaling, random cropping, and mirr #### In-Memory -* LayerType: `MEMORY_DATA` +* Layer type: `MemoryData` * Parameters - Required - `batch_size`, `channels`, `height`, `width`: specify the size of input chunks to read from memory @@ -331,7 +331,7 @@ The memory data layer reads data directly from memory, without copying it. In or #### HDF5 Input -* LayerType: `HDF5_DATA` +* Layer type: `HDF5Data` * Parameters - Required - `source`: the name of the file to read from @@ -339,7 +339,7 @@ The memory data layer reads data directly from memory, without copying it. In or #### HDF5 Output -* LayerType: `HDF5_OUTPUT` +* Layer type: `HDF5Output` * Parameters - Required - `file_name`: name of file to write to @@ -348,7 +348,7 @@ The HDF5 output layer performs the opposite function of the other layers in this #### Images -* LayerType: `IMAGE_DATA` +* Layer type: `ImageData` * Parameters - Required - `source`: name of a text file, with each line giving an image filename and label @@ -360,17 +360,17 @@ The HDF5 output layer performs the opposite function of the other layers in this #### Windows -`WINDOW_DATA` +`WindowData` #### Dummy -`DUMMY_DATA` is for development and debugging. See `DummyDataParameter`. +`DummyData` is for development and debugging. See `DummyDataParameter`. ### Common Layers #### Inner Product -* LayerType: `INNER_PRODUCT` +* Layer type: `InnerProduct` * CPU implementation: `./src/caffe/layers/inner_product_layer.cpp` * CUDA GPU implementation: `./src/caffe/layers/inner_product_layer.cu` * Parameters (`InnerProductParameter inner_product_param`) @@ -387,13 +387,13 @@ The HDF5 output layer performs the opposite function of the other layers in this - `n * c_o * 1 * 1` * Sample - layers { + layer { name: "fc8" - type: INNER_PRODUCT - blobs_lr: 1 # learning rate multiplier for the filters - blobs_lr: 2 # learning rate multiplier for the biases - weight_decay: 1 # weight decay multiplier for the filters - weight_decay: 0 # weight decay multiplier for the biases + type: "InnerProduct" + # learning rate and decay multipliers for the weights + param { lr_mult: 1 decay_mult: 1 } + # learning rate and decay multipliers for the biases + param { lr_mult: 2 decay_mult: 0 } inner_product_param { num_output: 1000 weight_filler { @@ -409,79 +409,118 @@ The HDF5 output layer performs the opposite function of the other layers in this top: "fc8" } -The `INNER_PRODUCT` layer (also usually referred to as the fully connected layer) treats the input as a simple vector and produces an output in the form of a single vector (with the blob's height and width set to 1). +The `InnerProduct` layer (also usually referred to as the fully connected layer) treats the input as a simple vector and produces an output in the form of a single vector (with the blob's height and width set to 1). #### Splitting -The `SPLIT` layer is a utility layer that splits an input blob to multiple output blobs. This is used when a blob is fed into multiple output layers. 
+The `Split` layer is a utility layer that splits an input blob to multiple output blobs. This is used when a blob is fed into multiple output layers. #### Flattening -The `FLATTEN` layer is a utility layer that flattens an input of shape `n * c * h * w` to a simple vector output of shape `n * (c*h*w) * 1 * 1`. +The `Flatten` layer is a utility layer that flattens an input of shape `n * c * h * w` to a simple vector output of shape `n * (c*h*w)` + +#### Reshape + +* Layer type: `Reshape` +* Implementation: `./src/caffe/layers/reshape_layer.cpp` +* Parameters (`ReshapeParameter reshape_param`) + - Optional: (also see detailed description below) + - `shape` + +* Input + - a single blob with arbitrary dimensions +* Output + - the same blob, with modified dimensions, as specified by `reshape_param` + +* Sample + + layer { + name: "reshape" + type: "Reshape" + bottom: "input" + top: "output" + reshape_param { + shape { + dim: 0 # copy the dimension from below + dim: 2 + dim: 3 + dim: -1 # infer it from the other dimensions + } + } + } + +The `Reshape` layer can be used to change the dimensions of its input, without changing its data. Just like the `Flatten` layer, only the dimensions are changed; no data is copied in the process. + +Output dimensions are specified by the `ReshapeParam` proto. Positive numbers are used directly, setting the corresponding dimension of the output blob. In addition, two special values are accepted for any of the target dimension values: + +* **0** means "copy the respective dimension of the bottom layer". That is, if the bottom has 2 as its 1st dimension, the top will have 2 as its 1st dimension as well, given `dim: 0` as the 1st target dimension. +* **-1** stands for "infer this from the other dimensions". This behavior is similar to that of -1 in *numpy*'s or `[]` for *MATLAB*'s reshape: this dimension is calculated to keep the overall element count the same as in the bottom layer. At most one -1 can be used in a reshape operation. + +As another example, specifying `reshape_param { shape { dim: 0 dim: -1 } }` makes the layer behave in exactly the same way as the `Flatten` layer. #### Concatenation -* LayerType: `CONCAT` +* Layer type: `Concat` * CPU implementation: `./src/caffe/layers/concat_layer.cpp` * CUDA GPU implementation: `./src/caffe/layers/concat_layer.cu` * Parameters (`ConcatParameter concat_param`) - Optional - - `concat_dim` [default 1]: 0 for concatenation along num and 1 for channels. + - `axis` [default 1]: 0 for concatenation along num and 1 for channels. * Input - `n_i * c_i * h * w` for each input blob i from 1 to K. * Output - - if `concat_dim = 0`: `(n_1 + n_2 + ... + n_K) * c_1 * h * w`, and all input `c_i` should be the same. - - if `concat_dim = 1`: `n_1 * (c_1 + c_2 + ... + c_K) * h * w`, and all input `n_i` should be the same. + - if `axis = 0`: `(n_1 + n_2 + ... + n_K) * c_1 * h * w`, and all input `c_i` should be the same. + - if `axis = 1`: `n_1 * (c_1 + c_2 + ... + c_K) * h * w`, and all input `n_i` should be the same. * Sample - layers { + layer { name: "concat" bottom: "in1" bottom: "in2" top: "out" - type: CONCAT + type: "Concat" concat_param { - concat_dim: 1 + axis: 1 } } -The `CONCAT` layer is a utility layer that concatenates its multiple input blobs to one single output blob. Currently, the layer supports concatenation along num or channels only. +The `Concat` layer is a utility layer that concatenates its multiple input blobs to one single output blob. 
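
The shape arithmetic of `Concat`, and the `Reshape` conventions described above, map directly onto NumPy; a small illustrative sketch (not Caffe code, shapes chosen arbitrarily):

    import numpy as np

    a = np.zeros((4, 3, 8, 8))                    # n_1 x c x h x w
    b = np.zeros((2, 3, 8, 8))                    # n_2 x c x h x w
    print(np.concatenate([a, b], axis=0).shape)   # (6, 3, 8, 8)  -- axis: 0

    c = np.zeros((4, 5, 8, 8))                    # n x c_2 x h x w
    print(np.concatenate([a, c], axis=1).shape)   # (4, 8, 8, 8)  -- axis: 1

    # Reshape's dim: -1 behaves like -1 in numpy.reshape; dim: 0 keeps the bottom's size.
    print(a.reshape(4, -1).shape)                 # (4, 192)  ~ shape { dim: 0 dim: -1 }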
#### Slicing -The `SLICE` layer is a utility layer that slices an input layer to multiple output layers along a given dimension (currently num or channel only) with given slice indices. +The `Slice` layer is a utility layer that slices an input layer to multiple output layers along a given dimension (currently num or channel only) with given slice indices. * Sample - layers { + layer { name: "slicer_label" - type: SLICE + type: "Slice" bottom: "label" ## Example of label with a shape N x 3 x 1 x 1 top: "label1" top: "label2" top: "label3" slice_param { - slice_dim: 1 - slice_point: 1 - slice_point: 2 + axis: 1 + slice_point: 1 + slice_point: 2 } } -`slice_dim` indicates the target dimension and can assume only two values: 0 for num or 1 for channel; `slice_point` indicates indexes in the selected dimension (the number of indexes must be equal to the number of top blobs minus one). +`axis` indicates the target axis; `slice_point` indicates indexes in the selected dimension (the number of indices must be equal to the number of top blobs minus one). #### Elementwise Operations -`ELTWISE` +`Eltwise` #### Argmax -`ARGMAX` +`ArgMax` #### Softmax -`SOFTMAX` +`Softmax` #### Mean-Variance Normalization diff --git a/docs/tutorial/loss.md b/docs/tutorial/loss.md index aac561774bb..d2d0e77fbed 100644 --- a/docs/tutorial/loss.md +++ b/docs/tutorial/loss.md @@ -10,30 +10,30 @@ Hence, the goal of learning is to find a setting of the weights that *minimizes* The loss in Caffe is computed by the Forward pass of the network. Each layer takes a set of input (`bottom`) blobs and produces a set of output (`top`) blobs. Some of these layers' outputs may be used in the loss function. -A typical choice of loss function for one-versus-all classification tasks is the `SOFTMAX_LOSS` function, used in a network definition as follows, for example: +A typical choice of loss function for one-versus-all classification tasks is the `SoftmaxWithLoss` function, used in a network definition as follows, for example: - layers { + layer { name: "loss" - type: SOFTMAX_LOSS + type: "SoftmaxWithLoss" bottom: "pred" bottom: "label" top: "loss" } -In a `SOFTMAX_LOSS` function, the `top` blob is a scalar (dimensions $$1 \times 1 \times 1 \times 1$$) which averages the loss (computed from predicted labels `pred` and actuals labels `label`) over the entire mini-batch. +In a `SoftmaxWithLoss` function, the `top` blob is a scalar (empty shape) which averages the loss (computed from predicted labels `pred` and actuals labels `label`) over the entire mini-batch. ### Loss weights -For nets with multiple layers producing a loss (e.g., a network that both classifies the input using a `SOFTMAX_LOSS` layer and reconstructs it using a `EUCLIDEAN_LOSS` layer), *loss weights* can be used to specify their relative importance. +For nets with multiple layers producing a loss (e.g., a network that both classifies the input using a `SoftmaxWithLoss` layer and reconstructs it using a `EuclideanLoss` layer), *loss weights* can be used to specify their relative importance. -By convention, Caffe layer types with the suffix `_LOSS` contribute to the loss function, but other layers are assumed to be purely used for intermediate computations. +By convention, Caffe layer types with the suffix `Loss` contribute to the loss function, but other layers are assumed to be purely used for intermediate computations. However, any layer can be used as a loss by adding a field `loss_weight: ` to a layer definition for each `top` blob produced by the layer. 
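
The objective Caffe minimizes is then the sum of each such loss scaled by its `loss_weight`; schematically (a sketch with made-up numbers, not Caffe's actual bookkeeping):

    # one classification loss and one reconstruction loss, as in the example above
    losses       = {'softmax_loss': 0.87, 'euclidean_loss': 12.4}   # example values
    loss_weights = {'softmax_loss': 1.0,  'euclidean_loss': 0.01}   # from loss_weight fields
    total_loss = sum(loss_weights[name] * losses[name] for name in losses)
    print(total_loss)                                               # 0.994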
-Layers with the suffix `_LOSS` have an implicit `loss_weight: 1` for the first `top` blob (and `loss_weight: 0` for any additional `top`s); other layers have an implicit `loss_weight: 0` for all `top`s. -So, the above `SOFTMAX_LOSS` layer could be equivalently written as: +Layers with the suffix `Loss` have an implicit `loss_weight: 1` for the first `top` blob (and `loss_weight: 0` for any additional `top`s); other layers have an implicit `loss_weight: 0` for all `top`s. +So, the above `SoftmaxWithLoss` layer could be equivalently written as: - layers { + layer { name: "loss" - type: SOFTMAX_LOSS + type: "SoftmaxWithLoss" bottom: "pred" bottom: "label" top: "loss" diff --git a/docs/tutorial/net_layer_blob.md b/docs/tutorial/net_layer_blob.md index 1f0966f88a4..e8b7bd316a9 100644 --- a/docs/tutorial/net_layer_blob.md +++ b/docs/tutorial/net_layer_blob.md @@ -11,22 +11,20 @@ We will go over the details of these components in more detail. ## Blob storage and communication -A Blob is a wrapper over the actual data being processed and passed along by Caffe, and also under the hood provides synchronization capability between the CPU and the GPU. Mathematically, a blob is a 4-dimensional array that stores things in the order of (Num, Channels, Height and Width), from major to minor, and stored in a C-contiguous fashion. The main reason for putting Num (the name is due to legacy reasons, and is equivalent to the notation of "batch" as in minibatch SGD). +A Blob is a wrapper over the actual data being processed and passed along by Caffe, and also under the hood provides synchronization capability between the CPU and the GPU. Mathematically, a blob is an N-dimensional array stored in a C-contiguous fashion. -Caffe stores and communicates data in 4-dimensional arrays called blobs. Blobs provide a unified memory interface, holding data e.g. batches of images, model parameters, and derivatives for optimization. +Caffe stores and communicates data using blobs. Blobs provide a unified memory interface holding data; e.g., batches of images, model parameters, and derivatives for optimization. Blobs conceal the computational and mental overhead of mixed CPU/GPU operation by synchronizing from the CPU host to the GPU device as needed. Memory on the host and device is allocated on demand (lazily) for efficient memory usage. -The conventional blob dimensions for data are number N x channel K x height H x width W. Blob memory is row-major in layout so the last / rightmost dimension changes fastest. For example, the value at index (n, k, h, w) is physically located at index ((n * K + k) * H + h) * W + w. +The conventional blob dimensions for batches of image data are number N x channel K x height H x width W. Blob memory is row-major in layout, so the last / rightmost dimension changes fastest. For example, in a 4D blob, the value at index (n, k, h, w) is physically located at index ((n * K + k) * H + h) * W + w. - Number / N is the batch size of the data. Batch processing achieves better throughput for communication and device processing. For an ImageNet training batch of 256 images B = 256. - Channel / K is the feature dimension e.g. for RGB images K = 3. -Note that although we have designed blobs with its dimensions corresponding to image applications, they are named purely for notational purpose and it is totally valid for you to do non-image applications. 
For example, if you simply need fully-connected networks like the conventional multi-layer perceptron, use blobs of dimensions (Num, Channels, 1, 1) and call the InnerProductLayer (which we will cover soon). +Note that although many blobs in Caffe examples are 4D with axes for image applications, it is totally valid to use blobs for non-image applications. For example, if you simply need fully-connected networks like the conventional multi-layer perceptron, use 2D blobs (shape (N, D)) and call the InnerProductLayer (which we will cover soon). -Caffe operations are general with respect to the channel dimension / K. Grayscale and hyperspectral imagery are fine. Caffe can likewise model and process arbitrary vectors in blobs with singleton. That is, the shape of blob holding 1000 vectors of 16 feature dimensions is 1000 x 16 x 1 x 1. - -Parameter blob dimensions vary according to the type and configuration of the layer. For a convolution layer with 96 filters of 11 x 11 spatial dimension and 3 inputs the blob is 96 x 3 x 11 x 11. For an inner product / fully-connected layer with 1000 output channels and 1024 input channels the parameter blob is 1 x 1 x 1000 x 1024. +Parameter blob dimensions vary according to the type and configuration of the layer. For a convolution layer with 96 filters of 11 x 11 spatial dimension and 3 inputs the blob is 96 x 3 x 11 x 11. For an inner product / fully-connected layer with 1000 output channels and 1024 input channels the parameter blob is 1000 x 1024. For custom data it may be necessary to hack your own input preparation tool or data layer. However once your data is in your job is done. The modularity of layers accomplishes the rest of the work for you. @@ -95,9 +93,9 @@ A simple logistic regression classifier is defined by name: "LogReg" - layers { + layer { name: "mnist" - type: DATA + type: "Data" top: "data" top: "label" data_param { @@ -105,18 +103,18 @@ is defined by batch_size: 64 } } - layers { + layer { name: "ip" - type: INNER_PRODUCT + type: "InnerProduct" bottom: "data" top: "ip" inner_product_param { num_output: 2 } } - layers { + layer { name: "loss" - type: SOFTMAX_LOSS + type: "SoftmaxWithLoss" bottom: "ip" bottom: "label" top: "loss" @@ -135,19 +133,19 @@ Model initialization is handled by `Net::Init()`. 
The initialization mainly does I0902 22:52:17.935807 2079114000 data_layer.cpp:135] Opening leveldb input_leveldb I0902 22:52:17.937155 2079114000 data_layer.cpp:195] output data size: 64,1,28,28 I0902 22:52:17.938570 2079114000 net.cpp:103] Top shape: 64 1 28 28 (50176) - I0902 22:52:17.938593 2079114000 net.cpp:103] Top shape: 64 1 1 1 (64) + I0902 22:52:17.938593 2079114000 net.cpp:103] Top shape: 64 (64) I0902 22:52:17.938611 2079114000 net.cpp:67] Creating Layer ip I0902 22:52:17.938617 2079114000 net.cpp:394] ip <- data I0902 22:52:17.939177 2079114000 net.cpp:356] ip -> ip I0902 22:52:17.939196 2079114000 net.cpp:96] Setting up ip - I0902 22:52:17.940289 2079114000 net.cpp:103] Top shape: 64 2 1 1 (128) + I0902 22:52:17.940289 2079114000 net.cpp:103] Top shape: 64 2 (128) I0902 22:52:17.941270 2079114000 net.cpp:67] Creating Layer loss I0902 22:52:17.941305 2079114000 net.cpp:394] loss <- ip I0902 22:52:17.941314 2079114000 net.cpp:394] loss <- label I0902 22:52:17.941323 2079114000 net.cpp:356] loss -> loss # set up the loss and configure the backward pass I0902 22:52:17.941328 2079114000 net.cpp:96] Setting up loss - I0902 22:52:17.941328 2079114000 net.cpp:103] Top shape: 1 1 1 1 (1) + I0902 22:52:17.941328 2079114000 net.cpp:103] Top shape: (1) I0902 22:52:17.941329 2079114000 net.cpp:109] with loss weight 1 I0902 22:52:17.941779 2079114000 net.cpp:170] loss needs backward computation. I0902 22:52:17.941787 2079114000 net.cpp:170] ip needs backward computation. diff --git a/examples/mnist/readme.md b/examples/mnist/readme.md index ef7f5da67d5..269e53ab9b9 100644 --- a/examples/mnist/readme.md +++ b/examples/mnist/readme.md @@ -38,9 +38,9 @@ Specifically, we will write a `caffe::NetParameter` (or in python, `caffe.proto. Currently, we will read the MNIST data from the lmdb we created earlier in the demo. This is defined by a data layer: - layers { + layer { name: "mnist" - type: DATA + type: "Data" data_param { source: "mnist_train_lmdb" backend: LMDB @@ -57,14 +57,14 @@ Specifically, this layer has name `mnist`, type `data`, and it reads the data fr Let's define the first convolution layer: - layers { + layer { name: "conv1" - type: CONVOLUTION - blobs_lr: 1. - blobs_lr: 2. + type: "Convolution" + param { lr_mult: 1 } + param { lr_mult: 2 } convolution_param { num_output: 20 - kernelsize: 5 + kernel_size: 5 stride: 1 weight_filler { type: "xavier" @@ -81,15 +81,15 @@ This layer takes the `data` blob (it is provided by the data layer), and produce The fillers allow us to randomly initialize the value of the weights and bias. For the weight filler, we will use the `xavier` algorithm that automatically determines the scale of initialization based on the number of input and output neurons. For the bias filler, we will simply initialize it as constant, with the default filling value 0. -`blobs_lr` are the learning rate adjustments for the layer's learnable parameters. In this case, we will set the weight learning rate to be the same as the learning rate given by the solver during runtime, and the bias learning rate to be twice as large as that - this usually leads to better convergence rates. +`lr_mult`s are the learning rate adjustments for the layer's learnable parameters. In this case, we will set the weight learning rate to be the same as the learning rate given by the solver during runtime, and the bias learning rate to be twice as large as that - this usually leads to better convergence rates. ### Writing the Pooling Layer Phew. 
Pooling layers are actually much easier to define: - layers { + layer { name: "pool1" - type: POOLING + type: "Pooling" pooling_param { kernel_size: 2 stride: 2 @@ -107,11 +107,11 @@ Similarly, you can write up the second convolution and pooling layers. Check `$C Writing a fully connected layer is also simple: - layers { + layer { name: "ip1" - type: INNER_PRODUCT - blobs_lr: 1. - blobs_lr: 2. + type: "InnerProduct" + param { lr_mult: 1 } + param { lr_mult: 2 } inner_product_param { num_output: 500 weight_filler { @@ -125,15 +125,15 @@ Writing a fully connected layer is also simple: top: "ip1" } -This defines a fully connected layer (for some legacy reason, Caffe calls it an `innerproduct` layer) with 500 outputs. All other lines look familiar, right? +This defines a fully connected layer (known in Caffe as an `InnerProduct` layer) with 500 outputs. All other lines look familiar, right? ### Writing the ReLU Layer A ReLU Layer is also simple: - layers { + layer { name: "relu1" - type: RELU + type: "ReLU" bottom: "ip1" top: "ip1" } @@ -142,11 +142,11 @@ Since ReLU is an element-wise operation, we can do *in-place* operations to save After the ReLU layer, we will write another innerproduct layer: - layers { + layer { name: "ip2" - type: INNER_PRODUCT - blobs_lr: 1. - blobs_lr: 2. + type: "InnerProduct" + param { lr_mult: 1 } + param { lr_mult: 2 } inner_product_param { num_output: 10 weight_filler { @@ -164,9 +164,9 @@ After the ReLU layer, we will write another innerproduct layer: Finally, we will write the loss! - layers { + layer { name: "loss" - type: SOFTMAX_LOSS + type: "SoftmaxWithLoss" bottom: "ip2" bottom: "label" } @@ -178,7 +178,7 @@ The `softmax_loss` layer implements both the softmax and the multinomial logisti Layer definitions can include rules for whether and when they are included in the network definition, like the one below: - layers { + layer { // ...layer definition... include: { phase: TRAIN } } @@ -190,7 +190,7 @@ In the above example, this layer will be included only in `TRAIN` phase. If we change `TRAIN` with `TEST`, then this layer will be used only in test phase. By default, that is without layer rules, a layer is always included in the network. Thus, `lenet_train_test.prototxt` has two `DATA` layers defined (with different `batch_size`), one for the training phase and one for the testing phase. -Also, there is an `ACCURACY` layer which is included only in `TEST` phase for reporting the model accuracy every 100 iteration, as defined in `lenet_solver.prototxt`. +Also, there is an `Accuracy` layer which is included only in `TEST` phase for reporting the model accuracy every 100 iteration, as defined in `lenet_solver.prototxt`. ## Define the MNIST Solver diff --git a/examples/siamese/readme.md b/examples/siamese/readme.md index ce98ec10819..83db8c94395 100644 --- a/examples/siamese/readme.md +++ b/examples/siamese/readme.md @@ -39,13 +39,19 @@ exactly the same as the [LeNet model](mnist.html), the only difference is that we have replaced the top layers that produced probabilities over the 10 digit classes with a linear "feature" layer that produces a 2 dimensional vector. - layers { + layer { name: "feat" - type: INNER_PRODUCT + type: "InnerProduct" bottom: "ip2" top: "feat" - blobs_lr: 1 - blobs_lr: 2 + param { + name: "feat_w" + lr_mult: 1 + } + param { + name: "feat_b" + lr_mult: 2 + } inner_product_param { num_output: 2 } @@ -64,17 +70,19 @@ earlier. 
Each entry in this database contains the image data for a pair of images (`pair_data`) and a binary label saying if they belong to the same class or different classes (`sim`). - layers { + layer { name: "pair_data" - type: DATA + type: "Data" top: "pair_data" top: "sim" - data_param { - source: "examples/siamese/mnist-siamese-train-leveldb" + include { phase: TRAIN } + transform_param { scale: 0.00390625 + } + data_param { + source: "examples/siamese/mnist_siamese_train_leveldb" batch_size: 64 } - include: { phase: TRAIN } } In order to pack a pair of images into the same blob in the database we pack one @@ -83,16 +91,16 @@ so we add a slice layer after the data layer. This takes the `pair_data` and slices it along the channel dimension so that we have a single image in `data` and its paired image in `data_p.` - layers { - name: "slice_pair" - type: SLICE - bottom: "pair_data" - top: "data" - top: "data_p" - slice_param { - slice_dim: 1 - slice_point: 1 - } + layer { + name: "slice_pair" + type: "Slice" + bottom: "pair_data" + top: "data" + top: "data_p" + slice_param { + slice_dim: 1 + slice_point: 1 + } } ### Building the First Side of the Siamese Net @@ -105,17 +113,17 @@ parameters allows Caffe to share the parameters between layers on both sides of the siamese net. In the definition this looks like: ... - param: "conv1_w" - param: "conv1_b" + param { name: "conv1_w" ... } + param { name: "conv1_b" ... } ... - param: "conv2_w" - param: "conv2_b" + param { name: "conv2_w" ... } + param { name: "conv2_b" ... } ... - param: "ip1_w" - param: "ip1_b" + param { name: "ip1_w" ... } + param { name: "ip1_b" ... } ... - param: "ip2_w" - param: "ip2_b" + param { name: "ip2_w" ... } + param { name: "ip2_b" ... } ... ### Building the Second Side of the Siamese Net @@ -133,9 +141,9 @@ an Invariant Mapping". This loss function encourages matching pairs to be close together in feature space while pushing non-matching pairs apart. This cost function is implemented with the `CONTRASTIVE_LOSS` layer: - layers { + layer { name: "loss" - type: CONTRASTIVE_LOSS + type: "ContrastiveLoss" contrastive_loss_param { margin: 1.0 } diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp index b0193be5e9c..01fdaf9a298 100644 --- a/include/caffe/common_layers.hpp +++ b/include/caffe/common_layers.hpp @@ -345,7 +345,6 @@ class MVNLayer : public Layer { Blob sum_multiplier_; }; - /** * @brief Generating Perspective Maps -- a shading map controlled * by input blobs "slop" and "intercept" @@ -427,8 +426,11 @@ class ReductionLayer : public Layer { Blob sum_multiplier_; }; - /** - * @brief Reshapes an input Blob. +/* + * @brief Reshapes the input Blob into an arbitrary-sized output Blob. + * + * Note: similarly to FlattenLayer, this layer does not change the input values + * (see FlattenLayer, Blob::ShareData and Blob::ShareDiff). */ template class ReshapeLayer : public Layer { @@ -445,35 +447,21 @@ class ReshapeLayer : public Layer { virtual inline int ExactNumTopBlobs() const { return 1; } protected: - /** - * @param bottom input Blob vector (length 1) - * -# @f$ (D_1 \times D_2 \times ... \times D_m) @f$ - * the inputs - * @param top output Blob vector (length 1) - * -# @f$ (d_1 \times d_2 \times ... \times d_n) @f$, - * the outputs -- i.e., the (virtually) copied inputs. - * The shape is specified by reshape_param.shape(), and the - * product of the dimensions in the new shape must match that of the - * input shape; i.e., @f$ d_1 d_2 ... d_n = D_1 D_2 ... D_m @f$. 
- */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top) {} - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top) {} - - /** - * @brief Computes the error gradient w.r.t. the concatenate inputs. - * - * @param top output Blob vector (length 1), providing the error gradient with - * respect to the outputs - * @param propagate_down see Layer::Backward. - * @param bottom input Blob vector (length K), into which the top error - * gradient is (virtually) copied - */ virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) {} + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top) {} virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) {} + + /// @brief vector of axes indices whose dimensions we'll copy from the bottom + vector copy_axes_; + /// @brief the index of the axis whose dimension we infer, or -1 if none + int inferred_axis_; + /// @brief the product of the "constant" output dimensions + int constant_count_; }; /** diff --git a/include/caffe/data_layers.hpp b/include/caffe/data_layers.hpp index b34b78bfc14..ee6f0e40919 100644 --- a/include/caffe/data_layers.hpp +++ b/include/caffe/data_layers.hpp @@ -14,7 +14,6 @@ #include "caffe/filler.hpp" #include "caffe/internal_thread.hpp" #include "caffe/layer.hpp" -#include "caffe/net.hpp" #include "caffe/proto/caffe.pb.h" #include "caffe/util/db.hpp" @@ -29,7 +28,6 @@ template class BaseDataLayer : public Layer { public: explicit BaseDataLayer(const LayerParameter& param); - virtual ~BaseDataLayer() {} // LayerSetUp: implements common data layer setup functionality, and calls // DataLayerSetUp to do special data layer setup for individual layer types. // This method may not be overridden except by the BasePrefetchingDataLayer. @@ -58,7 +56,6 @@ class BasePrefetchingDataLayer : public: explicit BasePrefetchingDataLayer(const LayerParameter& param) : BaseDataLayer(param) {} - virtual ~BasePrefetchingDataLayer() {} // LayerSetUp: implements common data layer setup functionality, and calls // DataLayerSetUp to do special data layer setup for individual layer types. // This method may not be overridden. diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp index 2d13ef97c05..8f924a75755 100644 --- a/include/caffe/layer.hpp +++ b/include/caffe/layer.hpp @@ -406,6 +406,7 @@ template inline Dtype Layer::Forward(const vector*>& bottom, const vector*>& top) { Dtype loss = 0; + Reshape(bottom, top); switch (Caffe::mode()) { case Caffe::CPU: Forward_cpu(bottom, top); diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp index 86f75193398..70eda2054ae 100644 --- a/include/caffe/loss_layers.hpp +++ b/include/caffe/loss_layers.hpp @@ -633,8 +633,6 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { /// @copydoc SigmoidCrossEntropyLossLayer virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); - virtual void Forward_gpu(const vector*>& bottom, - const vector*>& top); /** * @brief Computes the sigmoid cross-entropy loss error gradient w.r.t. 
the diff --git a/include/caffe/neuron_layers.hpp b/include/caffe/neuron_layers.hpp index 323215134c7..9cf233f0eb3 100644 --- a/include/caffe/neuron_layers.hpp +++ b/include/caffe/neuron_layers.hpp @@ -8,7 +8,6 @@ #include "caffe/blob.hpp" #include "caffe/common.hpp" #include "caffe/layer.hpp" -#include "caffe/net.hpp" #include "caffe/proto/caffe.pb.h" #define HDF5_DATA_DATASET_NAME "data" @@ -734,7 +733,8 @@ class PReLULayer : public NeuronLayer { const vector& propagate_down, const vector*>& bottom); bool channel_shared_; - Blob multiplier_; // dot multipler for backward computation of params + Blob multiplier_; // dot multiplier for backward computation of params + Blob backward_buff_; // temporary buffer for backward computation Blob bottom_memory_; // memory for in-place computation }; diff --git a/include/caffe/python_layer.hpp b/include/caffe/python_layer.hpp index 816ef453720..19cf18c9742 100644 --- a/include/caffe/python_layer.hpp +++ b/include/caffe/python_layer.hpp @@ -14,12 +14,12 @@ template class PythonLayer : public Layer { public: PythonLayer(PyObject* self, const LayerParameter& param) - : Layer(param), self_(self) { } + : Layer(param), self_(bp::handle<>(bp::borrowed(self))) { } virtual void LayerSetUp(const vector*>& bottom, const vector*>& top) { try { - bp::call_method(self_, "setup", bottom, top); + self_.attr("setup")(bottom, top); } catch (bp::error_already_set) { PyErr_Print(); throw; @@ -29,7 +29,7 @@ class PythonLayer : public Layer { virtual void Reshape(const vector*>& bottom, const vector*>& top) { try { - bp::call_method(self_, "reshape", bottom, top); + self_.attr("reshape")(bottom, top); } catch (bp::error_already_set) { PyErr_Print(); throw; @@ -42,7 +42,7 @@ class PythonLayer : public Layer { virtual void Forward_cpu(const vector*>& bottom, const vector*>& top) { try { - bp::call_method(self_, "forward", bottom, top); + self_.attr("forward")(bottom, top); } catch (bp::error_already_set) { PyErr_Print(); throw; @@ -51,8 +51,7 @@ class PythonLayer : public Layer { virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { try { - bp::call_method(self_, "backward", top, propagate_down, - bottom); + self_.attr("backward")(top, propagate_down, bottom); } catch (bp::error_already_set) { PyErr_Print(); throw; @@ -60,7 +59,7 @@ class PythonLayer : public Layer { } private: - PyObject* self_; + bp::object self_; }; } // namespace caffe diff --git a/include/caffe/vision_layers.hpp b/include/caffe/vision_layers.hpp index 6ad24dd5b9f..cd94ed1eadf 100644 --- a/include/caffe/vision_layers.hpp +++ b/include/caffe/vision_layers.hpp @@ -542,6 +542,72 @@ class CuDNNPoolingLayer : public PoolingLayer { }; #endif +/** + * @brief Does spatial pyramid pooling on the input image + * by taking the max, average, etc. within regions + * so that the result vector of different sized + * images are of the same size. + */ +template +class SPPLayer : public Layer { + public: + explicit SPPLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "SPP"; } + virtual inline int ExactNumBottomBlobs() const { return 1; } + virtual inline int MinTopBlobs() const { return 1; } + // MAX POOL layers can output an extra top blob for the mask; + // others can only output the pooled inputs. 
+ virtual inline int MaxTopBlobs() const { + return (this->layer_param_.pooling_param().pool() == + PoolingParameter_PoolMethod_MAX) ? 2 : 1; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + // calculates the kernel and stride dimensions for the pooling layer, + // returns a correctly configured LayerParameter for a PoolingLayer + virtual LayerParameter GetPoolingParam(const int pyramid_level, + const int bottom_h, const int bottom_w, const SPPParameter spp_param); + + int pyramid_height_; + int bottom_h_, bottom_w_; + int channels_; + int kernel_h_, kernel_w_; + int pad_h_, pad_w_; + + /// the internal Split layer that feeds the pooling layers + shared_ptr > split_layer_; + /// top vector holder used in call to the underlying SplitLayer::Forward + vector*> split_top_vec_; + /// bottom vector holder used in call to the underlying PoolingLayer::Forward + vector*>*> pooling_bottom_vecs_; + /// the internal Pooling layers of different kernel sizes + vector > > pooling_layers_; + /// top vector holders used in call to the underlying PoolingLayer::Forward + vector*>*> pooling_top_vecs_; + /// pooling_outputs stores the outputs of the PoolingLayers + vector*> pooling_outputs_; + /// the internal Flatten layers that the Pooling layers feed into + vector*> flatten_layers_; + /// top vector holders used in call to the underlying FlattenLayer::Forward + vector*>*> flatten_top_vecs_; + /// flatten_outputs stores the outputs of the FlattenLayers + vector*> flatten_outputs_; + /// bottom vector holder used in call to the underlying ConcatLayer::Forward + vector*> concat_bottom_vec_; + /// the internal Concat layers that the Flatten layers feed into + shared_ptr > concat_layer_; +}; + } // namespace caffe #endif // CAFFE_VISION_LAYERS_HPP_ diff --git a/matlab/caffe/hdf5creation/demo.m b/matlab/caffe/hdf5creation/demo.m index f554b87e5f6..4f9f7b5a454 100644 --- a/matlab/caffe/hdf5creation/demo.m +++ b/matlab/caffe/hdf5creation/demo.m @@ -52,9 +52,9 @@ fprintf('HDF5 filename listed in %s \n', 'list.txt'); % NOTE: In net definition prototxt, use list.txt as input to HDF5_DATA as: -% layers { +% layer { % name: "data" -% type: HDF5_DATA +% type: "HDF5Data" % top: "data" % top: "labelvec" % hdf5_data_param { diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py index 37e8956da4f..fbe7112e868 100644 --- a/python/caffe/__init__.py +++ b/python/caffe/__init__.py @@ -3,4 +3,4 @@ from .proto.caffe_pb2 import TRAIN, TEST from .classifier import Classifier from .detector import Detector -import io +from . import io diff --git a/python/caffe/classifier.py b/python/caffe/classifier.py index 49f8003ce9d..7fb2ccc8ff3 100644 --- a/python/caffe/classifier.py +++ b/python/caffe/classifier.py @@ -29,7 +29,7 @@ def __init__(self, model_file, pretrained_file, image_dims=None, in_ = self.inputs[0] self.transformer = caffe.io.Transformer( {in_: self.blobs[in_].data.shape}) - self.transformer.set_transpose(in_, (2,0,1)) + self.transformer.set_transpose(in_, (2, 0, 1)) if mean is not None: self.transformer.set_mean(in_, mean) if input_scale is not None: @@ -44,7 +44,6 @@ def __init__(self, model_file, pretrained_file, image_dims=None, image_dims = self.crop_dims self.image_dims = image_dims - def predict(self, inputs, oversample=True): """ Predict classification probabilities of inputs. 
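
As an illustration of the preprocessing wiring in the constructor above, a `caffe.io.Transformer` can also be configured standalone; the input name, blob shape, and mean values below are placeholders:

    import numpy as np
    import caffe

    # the shape dict maps an input blob name to its (batch, channels, height, width)
    transformer = caffe.io.Transformer({'data': (1, 3, 227, 227)})
    transformer.set_transpose('data', (2, 0, 1))      # H x W x K image -> K x H x W blob
    transformer.set_raw_scale('data', 255)            # images loaded in [0, 1] -> [0, 255]
    transformer.set_channel_swap('data', (2, 1, 0))   # RGB -> BGR
    transformer.set_mean('data', np.float32([104, 117, 123]))  # placeholder per-channel mean

    image = np.random.rand(360, 480, 3).astype(np.float32)     # stand-in H x W x K input
    blob_data = transformer.preprocess('data', image)          # -> 3 x 227 x 227 ndarray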
@@ -78,7 +77,7 @@ def predict(self, inputs, oversample=True): input_ = input_[:, crop[0]:crop[2], crop[1]:crop[3], :] # Classify - caffe_in = np.zeros(np.array(input_.shape)[[0,3,1,2]], + caffe_in = np.zeros(np.array(input_.shape)[[0, 3, 1, 2]], dtype=np.float32) for ix, in_ in enumerate(input_): caffe_in[ix] = self.transformer.preprocess(self.inputs[0], in_) diff --git a/python/caffe/detector.py b/python/caffe/detector.py index a67b818b93f..f72b548ac9a 100644 --- a/python/caffe/detector.py +++ b/python/caffe/detector.py @@ -41,7 +41,7 @@ def __init__(self, model_file, pretrained_file, mean=None, in_ = self.inputs[0] self.transformer = caffe.io.Transformer( {in_: self.blobs[in_].data.shape}) - self.transformer.set_transpose(in_, (2,0,1)) + self.transformer.set_transpose(in_, (2, 0, 1)) if mean is not None: self.transformer.set_mean(in_, mean) if input_scale is not None: @@ -53,17 +53,18 @@ def __init__(self, model_file, pretrained_file, mean=None, self.configure_crop(context_pad) - def detect_windows(self, images_windows): """ Do windowed detection over given images and windows. Windows are extracted then warped to the input dimensions of the net. - Take + Parameters + ---------- images_windows: (image filename, window list) iterable. context_crop: size of context border to crop in pixels. - Give + Returns + ------- detections: list of {filename: image filename, window: crop coordinates, predictions: prediction vector} dicts. """ @@ -82,7 +83,7 @@ def detect_windows(self, images_windows): for ix, window_in in enumerate(window_inputs): caffe_in[ix] = self.transformer.preprocess(in_, window_in) out = self.forward_all(**{in_: caffe_in}) - predictions = out[self.outputs[0]].squeeze(axis=(2,3)) + predictions = out[self.outputs[0]].squeeze(axis=(2, 3)) # Package predictions with images and windows. detections = [] @@ -97,16 +98,17 @@ def detect_windows(self, images_windows): ix += 1 return detections - def detect_selective_search(self, image_fnames): """ Do windowed detection over Selective Search proposals by extracting the crop and warping to the input dimensions of the net. - Take + Parameters + ---------- image_fnames: list - Give + Returns + ------- detections: list of {filename: image filename, window: crop coordinates, predictions: prediction vector} dicts. """ @@ -120,17 +122,18 @@ def detect_selective_search(self, image_fnames): # Run windowed detection on the selective search list. return self.detect_windows(zip(image_fnames, windows_list)) - def crop(self, im, window): """ Crop a window from the image for detection. Include surrounding context according to the `context_pad` configuration. - Take + Parameters + ---------- im: H x W x K image ndarray to crop. window: bounding box coordinates as ymin, xmin, ymax, xmax. - Give + Returns + ------- crop: cropped window. """ # Crop window from the image. @@ -175,14 +178,14 @@ def crop(self, im, window): return crop - def configure_crop(self, context_pad): """ Configure crop dimensions and amount of context for cropping. If context is included, make the special input mean for context padding. - Take - context_pad: amount of context for cropping. + Parameters + ---------- + context_pad : amount of context for cropping. 
""" # crop dimensions in_ = self.inputs[0] @@ -204,8 +207,8 @@ def configure_crop(self, context_pad): crop_mean = mean.copy().transpose(inv_transpose) if channel_order is not None: channel_order_inverse = [channel_order.index(i) - for i in range(crop_mean.shape[2])] - crop_mean = crop_mean[:,:, channel_order_inverse] + for i in range(crop_mean.shape[2])] + crop_mean = crop_mean[:, :, channel_order_inverse] if raw_scale is not None: crop_mean /= raw_scale self.crop_mean = crop_mean diff --git a/python/caffe/draw.py b/python/caffe/draw.py index 6a4dbd47351..08b7c1de14b 100644 --- a/python/caffe/draw.py +++ b/python/caffe/draw.py @@ -11,19 +11,23 @@ import pydot # Internal layer and blob styles. -LAYER_STYLE_DEFAULT = {'shape': 'record', 'fillcolor': '#6495ED', - 'style': 'filled'} -NEURON_LAYER_STYLE = {'shape': 'record', 'fillcolor': '#90EE90', - 'style': 'filled'} -BLOB_STYLE = {'shape': 'octagon', 'fillcolor': '#E0E0E0', - 'style': 'filled'} +LAYER_STYLE_DEFAULT = {'shape': 'record', + 'fillcolor': '#6495ED', + 'style': 'filled'} +NEURON_LAYER_STYLE = {'shape': 'record', + 'fillcolor': '#90EE90', + 'style': 'filled'} +BLOB_STYLE = {'shape': 'octagon', + 'fillcolor': '#E0E0E0', + 'style': 'filled'} + def get_pooling_types_dict(): """Get dictionary mapping pooling type number to type name """ desc = caffe_pb2.PoolingParameter.PoolMethod.DESCRIPTOR d = {} - for k,v in desc.values_by_name.items(): + for k, v in desc.values_by_name.items(): d[v.number] = k return d @@ -145,21 +149,24 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True): label=edge['label'])) return pydot_graph + def draw_net(caffe_net, rankdir, ext='png'): - """Draws a caffe net and returns the image string encoded using the given - extension. + """Draws a caffe net and returns the image string encoded using the given + extension. - Input: + Parameters + ---------- caffe_net: a caffe.proto.caffe_pb2.NetParameter protocol buffer. ext: the image extension. Default 'png'. - """ - return get_pydot_graph(caffe_net, rankdir).create(format=ext) + """ + return get_pydot_graph(caffe_net, rankdir).create(format=ext) + def draw_net_to_file(caffe_net, filename, rankdir='LR'): - """Draws a caffe net, and saves it to file using the format given as the - file extension. Use '.raw' to output raw text that you can manually feed - to graphviz to draw graphs. - """ - ext = filename[filename.rfind('.')+1:] - with open(filename, 'wb') as fid: - fid.write(draw_net(caffe_net, rankdir, ext)) + """Draws a caffe net, and saves it to file using the format given as the + file extension. Use '.raw' to output raw text that you can manually feed + to graphviz to draw graphs. + """ + ext = filename[filename.rfind('.')+1:] + with open(filename, 'wb') as fid: + fid.write(draw_net(caffe_net, rankdir, ext)) diff --git a/python/caffe/io.py b/python/caffe/io.py index 6ae2cf13cc0..e5feff38796 100644 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -8,16 +8,16 @@ from caffe.proto import caffe_pb2 except: import sys - if sys.version_info >= (3,0): + if sys.version_info >= (3, 0): print("Failed to include caffe_pb2, things might go wrong!") else: raise -## proto / datum / ndarray conversion +## proto / datum / ndarray conversion def blobproto_to_array(blob, return_diff=False): - """Convert a blob proto to an array. In default, we will just return the data, - unless return_diff is True, in which case we will return the diff. + """Convert a blob proto to an array. 
In default, we will just return the + data, unless return_diff is True, in which case we will return the diff. """ if return_diff: return np.array(blob.diff).reshape( @@ -35,7 +35,7 @@ def array_to_blobproto(arr, diff=None): if arr.ndim != 4: raise ValueError('Incorrect array shape.') blob = caffe_pb2.BlobProto() - blob.num, blob.channels, blob.height, blob.width = arr.shape; + blob.num, blob.channels, blob.height, blob.width = arr.shape blob.data.extend(arr.astype(float).flat) if diff is not None: blob.diff.extend(diff.astype(float).flat) @@ -81,7 +81,7 @@ def datum_to_array(datum): as one can easily get it by calling datum.label. """ if len(datum.data): - return np.fromstring(datum.data, dtype = np.uint8).reshape( + return np.fromstring(datum.data, dtype=np.uint8).reshape( datum.channels, datum.height, datum.width) else: return np.array(datum.float_data).astype(float).reshape( @@ -97,8 +97,9 @@ class Transformer: Note: this is mostly for illustrative purposes and it is likely better to define your own input preprocessing routine for your needs. - Take - net: a Net for which the input should be prepared + Parameters + ---------- + net : a Net for which the input should be prepared """ def __init__(self, inputs): self.inputs = inputs @@ -108,13 +109,11 @@ def __init__(self, inputs): self.mean = {} self.input_scale = {} - def __check_input(self, in_): if in_ not in self.inputs: raise Exception('{} is not one of the net inputs: {}'.format( in_, self.inputs)) - def preprocess(self, in_, data): """ Format input for Caffe: @@ -155,7 +154,6 @@ def preprocess(self, in_, data): caffe_in *= input_scale return caffe_in - def deprocess(self, in_, data): """ Invert Caffe formatting; see preprocess(). @@ -179,7 +177,6 @@ def deprocess(self, in_, data): decaf_in = decaf_in.transpose([transpose[t] for t in transpose]) return decaf_in - def set_transpose(self, in_, order): """ Set the input channel order for e.g. RGB to BGR conversion @@ -195,7 +192,6 @@ def set_transpose(self, in_, order): 'dimensions as the input.') self.transpose[in_] = order - def set_channel_swap(self, in_, order): """ Set the input channel order for e.g. RGB to BGR conversion @@ -213,7 +209,6 @@ def set_channel_swap(self, in_, order): 'dimensions as the input channels.') self.channel_swap[in_] = order - def set_raw_scale(self, in_, scale): """ Set the scale of raw features s.t. the input blob = input * scale. @@ -228,7 +223,6 @@ def set_raw_scale(self, in_, scale): self.__check_input(in_) self.raw_scale[in_] = scale - def set_mean(self, in_, mean): """ Set the mean to subtract for centering the data. @@ -254,7 +248,6 @@ def set_mean(self, in_, mean): raise ValueError('Mean shape incompatible with input shape.') self.mean[in_] = mean - def set_input_scale(self, in_, scale): """ Set the scale of preprocessed inputs s.t. the blob = blob * scale. 
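
Tying together the proto / ndarray helpers defined earlier in this file, a quick round-trip sketch (assumes pycaffe and its compiled protos are importable):

    import numpy as np
    from caffe.io import array_to_blobproto, blobproto_to_array

    arr = np.arange(24, dtype=np.float32).reshape(1, 2, 3, 4)  # must be 4-D: N x C x H x W
    proto = array_to_blobproto(arr)         # fills num/channels/height/width and flat data
    restored = blobproto_to_array(proto)    # reshapes the stored data back to N x C x H x W
    assert np.allclose(arr, restored)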
@@ -359,7 +352,7 @@ def oversample(images, crop_dims): # Extract crops crops = np.empty((10 * len(images), crop_dims[0], crop_dims[1], - im_shape[-1]), dtype=np.float32) + im_shape[-1]), dtype=np.float32) ix = 0 for im in images: for crop in crops_ix: diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py index 3c19261f690..e8a676a26d2 100644 --- a/python/caffe/pycaffe.py +++ b/python/caffe/pycaffe.py @@ -5,9 +5,9 @@ from collections import OrderedDict try: - from itertools import izip_longest + from itertools import izip_longest except: - from itertools import zip_longest as izip_longest + from itertools import zip_longest as izip_longest import numpy as np from ._caffe import Net, SGDSolver @@ -53,16 +53,19 @@ def _Net_forward(self, blobs=None, start=None, end=None, **kwargs): """ Forward pass: prepare inputs and run the net forward. - Take - blobs: list of blobs to return in addition to output blobs. - kwargs: Keys are input blob names and values are blob ndarrays. - For formatting inputs for Caffe, see Net.preprocess(). - If None, input is taken from data layers. - start: optional name of layer at which to begin the forward pass - end: optional name of layer at which to finish the forward pass (inclusive) - - Give - outs: {blob name: blob ndarray} dict. + Parameters + ---------- + blobs : list of blobs to return in addition to output blobs. + kwargs : Keys are input blob names and values are blob ndarrays. + For formatting inputs for Caffe, see Net.preprocess(). + If None, input is taken from data layers. + start : optional name of layer at which to begin the forward pass + end : optional name of layer at which to finish the forward pass + (inclusive) + + Returns + ------- + outs : {blob name: blob ndarray} dict. """ if blobs is None: blobs = [] @@ -99,14 +102,17 @@ def _Net_backward(self, diffs=None, start=None, end=None, **kwargs): """ Backward pass: prepare diffs and run the net backward. - Take - diffs: list of diffs to return in addition to bottom diffs. - kwargs: Keys are output blob names and values are diff ndarrays. + Parameters + ---------- + diffs : list of diffs to return in addition to bottom diffs. + kwargs : Keys are output blob names and values are diff ndarrays. If None, top diffs are taken from forward loss. - start: optional name of layer at which to begin the backward pass - end: optional name of layer at which to finish the backward pass (inclusive) + start : optional name of layer at which to begin the backward pass + end : optional name of layer at which to finish the backward pass + (inclusive) - Give + Returns + ------- outs: {blob name: diff ndarray} dict. """ if diffs is None: @@ -146,13 +152,15 @@ def _Net_forward_all(self, blobs=None, **kwargs): """ Run net forward in batches. - Take - blobs: list of blobs to extract as in forward() - kwargs: Keys are input blob names and values are blob ndarrays. - Refer to forward(). + Parameters + ---------- + blobs : list of blobs to extract as in forward() + kwargs : Keys are input blob names and values are blob ndarrays. + Refer to forward(). - Give - all_outs: {blob name: list of blobs} dict. + Returns + ------- + all_outs : {blob name: list of blobs} dict. """ # Collect outputs from batches all_outs = {out: [] for out in set(self.outputs + (blobs or []))} @@ -175,14 +183,16 @@ def _Net_forward_backward_all(self, blobs=None, diffs=None, **kwargs): """ Run net forward + backward in batches. 
- Take + Parameters + ---------- blobs: list of blobs to extract as in forward() diffs: list of diffs to extract as in backward() kwargs: Keys are input (for forward) and output (for backward) blob names and values are ndarrays. Refer to forward() and backward(). Prefilled variants are called for lack of input or output blobs. - Give + Returns + ------- all_blobs: {blob name: blob ndarray} dict. all_diffs: {blob name: diff ndarray} dict. """ @@ -229,11 +239,13 @@ def _Net_batch(self, blobs): """ Batch blob lists according to net's batch size. - Take + Parameters + ---------- blobs: Keys blob names and values are lists of blobs (of any length). Naturally, all the lists should have the same length. - Give (yield) + Yields + ------ batch: {blob name: list of blobs} dict for a single batch. """ num = len(blobs.itervalues().next()) diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py index 62b407da8aa..cc367477752 100644 --- a/python/caffe/test/test_net.py +++ b/python/caffe/test/test_net.py @@ -5,6 +5,7 @@ import caffe + def simple_net_file(num_output): """Make a simple net prototxt, based on test_net.cpp, returning the name of the (temporary) file.""" @@ -31,6 +32,7 @@ def simple_net_file(num_output): f.close() return f.name + class TestNet(unittest.TestCase): def setUp(self): self.num_output = 13 diff --git a/python/caffe/test/test_python_layer.py b/python/caffe/test/test_python_layer.py index dd99f6f15b9..6fba49143bb 100644 --- a/python/caffe/test/test_python_layer.py +++ b/python/caffe/test/test_python_layer.py @@ -4,6 +4,7 @@ import caffe + class SimpleLayer(caffe.Layer): """A layer that just multiplies by ten""" @@ -19,6 +20,7 @@ def forward(self, bottom, top): def backward(self, top, propagate_down, bottom): bottom[0].diff[...] 
= 10 * top[0].diff + def python_net_file(): with tempfile.NamedTemporaryFile(delete=False) as f: f.write("""name: 'pythonnet' force_backward: true @@ -31,6 +33,7 @@ def python_net_file(): python_param { module: 'test_python_layer' layer: 'SimpleLayer' } }""") return f.name + class TestPythonLayer(unittest.TestCase): def setUp(self): net_file = python_net_file() diff --git a/python/caffe/test/test_solver.py b/python/caffe/test/test_solver.py index d59f23d973a..09b974dad66 100644 --- a/python/caffe/test/test_solver.py +++ b/python/caffe/test/test_solver.py @@ -6,6 +6,7 @@ import caffe from test_net import simple_net_file + class TestSolver(unittest.TestCase): def setUp(self): self.num_output = 13 diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp index 6d2b3f502d9..94fdcc35fb6 100644 --- a/src/caffe/blob.cpp +++ b/src/caffe/blob.cpp @@ -26,6 +26,7 @@ void Blob::Reshape(const vector& shape) { shape_.resize(shape.size()); for (int i = 0; i < shape.size(); ++i) { CHECK_GE(shape[i], 0); + CHECK_LE(shape[i], INT_MAX / count_) << "blob size exceeds INT_MAX"; count_ *= shape[i]; shape_[i] = shape[i]; } diff --git a/src/caffe/layers/base_data_layer.cpp b/src/caffe/layers/base_data_layer.cpp index 352200915d7..931e4a9c0ab 100644 --- a/src/caffe/layers/base_data_layer.cpp +++ b/src/caffe/layers/base_data_layer.cpp @@ -2,7 +2,6 @@ #include #include "caffe/data_layers.hpp" -#include "caffe/net.hpp" #include "caffe/util/io.hpp" namespace caffe { diff --git a/src/caffe/layers/contrastive_loss_layer.cpp b/src/caffe/layers/contrastive_loss_layer.cpp index 0692c11c257..25e167819d3 100644 --- a/src/caffe/layers/contrastive_loss_layer.cpp +++ b/src/caffe/layers/contrastive_loss_layer.cpp @@ -41,6 +41,8 @@ void ContrastiveLossLayer::Forward_cpu( diff_.mutable_cpu_data()); // a_i-b_i const int channels = bottom[0]->channels(); Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); Dtype loss(0.0); for (int i = 0; i < bottom[0]->num(); ++i) { dist_sq_.mutable_cpu_data()[i] = caffe_cpu_dot(channels, @@ -48,7 +50,12 @@ void ContrastiveLossLayer::Forward_cpu( if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs loss += dist_sq_.cpu_data()[i]; } else { // dissimilar pairs - loss += std::max(margin-dist_sq_.cpu_data()[i], Dtype(0.0)); + if (legacy_version) { + loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); + } else { + Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), 0.0); + loss += dist*dist; + } } } loss = loss / static_cast(bottom[0]->num()) / Dtype(2); @@ -59,6 +66,8 @@ template void ContrastiveLossLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); for (int i = 0; i < 2; ++i) { if (propagate_down[i]) { const Dtype sign = (i == 0) ? 
1 : -1; @@ -76,10 +85,20 @@ void ContrastiveLossLayer::Backward_cpu(const vector*>& top, Dtype(0.0), bout + (j*channels)); } else { // dissimilar pairs - if ((margin-dist_sq_.cpu_data()[j]) > Dtype(0.0)) { + Dtype mdist(0.0); + Dtype beta(0.0); + if (legacy_version) { + mdist = margin - dist_sq_.cpu_data()[j]; + beta = -alpha; + } else { + Dtype dist = sqrt(dist_sq_.cpu_data()[j]); + mdist = margin - dist; + beta = -alpha * mdist / (dist + Dtype(1e-4)); + } + if (mdist > Dtype(0.0)) { caffe_cpu_axpby( channels, - -alpha, + beta, diff_.cpu_data() + (j*channels), Dtype(0.0), bout + (j*channels)); diff --git a/src/caffe/layers/contrastive_loss_layer.cu b/src/caffe/layers/contrastive_loss_layer.cu index 78a55995a0a..931239316ac 100644 --- a/src/caffe/layers/contrastive_loss_layer.cu +++ b/src/caffe/layers/contrastive_loss_layer.cu @@ -32,12 +32,20 @@ void ContrastiveLossLayer::Forward_gpu( Dtype(0.0), dist_sq_.mutable_gpu_data()); // \Sum (a_i-b_i)^2 Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); Dtype loss(0.0); for (int i = 0; i < bottom[0]->num(); ++i) { if (static_cast(bottom[2]->cpu_data()[i])) { // similar pairs loss += dist_sq_.cpu_data()[i]; } else { // dissimilar pairs - loss += std::max(margin-dist_sq_.cpu_data()[i], Dtype(0.0)); + if (legacy_version) { + loss += std::max(margin - dist_sq_.cpu_data()[i], Dtype(0.0)); + } else { + Dtype dist = std::max(margin - sqrt(dist_sq_.cpu_data()[i]), + Dtype(0.0)); + loss += dist*dist; + } } } loss = loss / static_cast(bottom[0]->num()) / Dtype(2); @@ -45,8 +53,8 @@ void ContrastiveLossLayer::Forward_gpu( } template -__global__ void CLLForward(const int count, const int channels, - const Dtype margin, const Dtype alpha, +__global__ void CLLBackward(const int count, const int channels, + const Dtype margin, const bool legacy_version, const Dtype alpha, const Dtype* y, const Dtype* diff, const Dtype* dist_sq, Dtype *bottom_diff) { CUDA_KERNEL_LOOP(i, count) { @@ -54,8 +62,18 @@ __global__ void CLLForward(const int count, const int channels, if (static_cast(y[n])) { // similar pairs bottom_diff[i] = alpha * diff[i]; } else { // dissimilar pairs - if ((margin-dist_sq[n]) > 0.0) { - bottom_diff[i] = -alpha * diff[i]; + Dtype mdist(0.0); + Dtype beta(0.0); + if (legacy_version) { + mdist = (margin - dist_sq[n]); + beta = -alpha; + } else { + Dtype dist = sqrt(dist_sq[n]); + mdist = (margin - dist); + beta = -alpha * mdist / (dist + Dtype(1e-4)) * diff[i]; + } + if (mdist > 0.0) { + bottom_diff[i] = beta; } else { bottom_diff[i] = 0; } @@ -71,12 +89,14 @@ void ContrastiveLossLayer::Backward_gpu(const vector*>& top, const int count = bottom[0]->count(); const int channels = bottom[0]->channels(); Dtype margin = this->layer_param_.contrastive_loss_param().margin(); + const bool legacy_version = + this->layer_param_.contrastive_loss_param().legacy_version(); const Dtype sign = (i == 0) ? 
1 : -1; const Dtype alpha = sign * top[0]->cpu_diff()[0] / static_cast(bottom[0]->num()); // NOLINT_NEXT_LINE(whitespace/operators) - CLLForward<<>>( - count, channels, margin, alpha, + CLLBackward<<>>( + count, channels, margin, legacy_version, alpha, bottom[2]->gpu_data(), // pair similarity 0 or 1 diff_.gpu_data(), // the cached eltwise difference between a and b dist_sq_.gpu_data(), // the cached square distance between a and b diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp index 7119a274dd3..7a38f9fac80 100644 --- a/src/caffe/layers/prelu_layer.cpp +++ b/src/caffe/layers/prelu_layer.cpp @@ -45,7 +45,8 @@ void PReLULayer::LayerSetUp(const vector*>& bottom, // Propagate gradients to the parameters (as directed by backward pass). this->param_propagate_down_.resize(this->blobs_.size(), true); - multiplier_.Reshape(vector(1, bottom[0]->count() / bottom[0]->num())); + multiplier_.Reshape(vector(1, bottom[0]->count(1))); + backward_buff_.Reshape(vector(1, bottom[0]->count(1))); caffe_set(multiplier_.count(), Dtype(1), multiplier_.mutable_cpu_data()); } diff --git a/src/caffe/layers/prelu_layer.cu b/src/caffe/layers/prelu_layer.cu index fd0eda5d191..dfa238d85bd 100644 --- a/src/caffe/layers/prelu_layer.cu +++ b/src/caffe/layers/prelu_layer.cu @@ -86,22 +86,22 @@ void PReLULayer::Backward_gpu(const vector*>& top, int cdim = channels * dim; Dtype dsum = 0.; for (int n = 0; n < bottom[0]->num(); ++n) { - Dtype* temp_buff = multiplier_.mutable_gpu_diff(); // compute element-wise diff // NOLINT_NEXT_LINE(whitespace/operators) - PReLUParamBackward<<<<>>( cdim, top_diff + top[0]->offset(n), - bottom_data + bottom[0]->offset(n), multiplier_.mutable_gpu_diff()); + bottom_data + bottom[0]->offset(n), + backward_buff_.mutable_gpu_diff()); CUDA_POST_KERNEL_CHECK; if (channel_shared_) { Dtype d; - caffe_gpu_dot(channels * dim, multiplier_.gpu_diff(), + caffe_gpu_dot(channels * dim, backward_buff_.gpu_diff(), multiplier_.gpu_data(), &d); dsum += d; } else { caffe_gpu_gemv(CblasNoTrans, channels, dim, 1., - multiplier_.gpu_diff(), multiplier_.gpu_data(), 1., + backward_buff_.gpu_diff(), multiplier_.gpu_data(), 1., slope_diff); } } diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp index 3e87b1a8383..ffe970f2689 100644 --- a/src/caffe/layers/reshape_layer.cpp +++ b/src/caffe/layers/reshape_layer.cpp @@ -7,17 +7,86 @@ namespace caffe { template void ReshapeLayer::LayerSetUp(const vector*>& bottom, - const vector*>& top) { - top[0]->Reshape(this->layer_param_.reshape_param().shape()); - top[0]->ShareData(*bottom[0]); - top[0]->ShareDiff(*bottom[0]); + const vector*>& top) { + inferred_axis_ = -1; + copy_axes_.clear(); + const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape(); + const int top_num_axes = top_blob_shape.dim_size(); + constant_count_ = 1; + for (int i = 0; i < top_num_axes; ++i) { + const int top_dim = top_blob_shape.dim(i); + if (top_dim == 0) { + copy_axes_.push_back(i); + } else if (top_dim == -1) { + CHECK_EQ(inferred_axis_, -1) << "new shape contains multiple " + << "-1 dims; at most a single (1) value of -1 may be specified"; + inferred_axis_ = i; + } else { + constant_count_ *= top_dim; + } + } } template void ReshapeLayer::Reshape(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { + const int input_start_axis = this->layer_param_.reshape_param().axis(); + const int start_axis = (input_start_axis >= 0) ? 
input_start_axis : + bottom[0]->num_axes() + input_start_axis + 1; + CHECK_GE(start_axis, 0) << "axis " << input_start_axis << " out of range"; + CHECK_LE(start_axis, bottom[0]->num_axes()) << "axis " << input_start_axis + << " out of range for " << bottom[0]->num_axes() << "-D input blob"; + const int num_axes = this->layer_param_.reshape_param().num_axes(); + CHECK_GE(num_axes, -1) << "num_axes must be >= 0, or -1 for all"; + const int end_axis = + (num_axes == -1) ? bottom[0]->num_axes() : (start_axis + num_axes); + CHECK_LE(end_axis, bottom[0]->num_axes()) + << "end_axis = axis + num_axes is out of range"; + const int num_axes_replaced = end_axis - start_axis; + const int num_axes_retained = bottom[0]->num_axes() - num_axes_replaced; + const BlobShape& top_blob_shape = this->layer_param_.reshape_param().shape(); + const int num_new_axes = top_blob_shape.dim_size(); + vector top_shape(num_axes_retained + num_new_axes); + int top_shape_index = 0; + for (int i = 0; i < start_axis; ++i) { + top_shape[top_shape_index++] = bottom[0]->shape(i); + } + for (int i = 0; i < num_new_axes; ++i) { + top_shape[top_shape_index++] = top_blob_shape.dim(i); + } + for (int i = end_axis; i < bottom[0]->num_axes(); ++i) { + top_shape[top_shape_index++] = bottom[0]->shape(i); + } + CHECK_EQ(top_shape_index, top_shape.size()); + for (int i = 0; i < copy_axes_.size(); ++i) { + const int copy_axis_index = copy_axes_[i]; + CHECK_GT(bottom[0]->num_axes(), start_axis + copy_axis_index) + << "new shape contains a 0, but there was no corresponding bottom axis " + << "to copy"; + top_shape[start_axis + copy_axis_index] = + bottom[0]->shape(start_axis + copy_axis_index); + } + if (inferred_axis_ >= 0) { + // A -1 dim was specified; infer the correct dimension by computing the + // product of the other dimensions. 
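The code that follows fills in the `-1` dimension by dividing the bottom blob's count by the product of every explicitly specified (or copied) dimension, and checks that the division is exact. A small Python sketch of that bookkeeping for the simple `axis: 0, num_axes: -1` case; the helper name and variables are illustrative, not the layer's actual members.

    import numpy as np

    def infer_reshape(bottom_shape, new_dims):
        """Sketch of ReshapeLayer dim handling: 0 copies the bottom axis,
        a single -1 is inferred from the total element count."""
        bottom_count = int(np.prod(bottom_shape))
        top_shape = []
        inferred_axis = -1
        for i, d in enumerate(new_dims):
            if d == 0:                       # copy the corresponding bottom axis
                top_shape.append(bottom_shape[i])
            elif d == -1:                    # at most one inferred axis allowed
                assert inferred_axis == -1, "only one -1 dim may be specified"
                inferred_axis = i
                top_shape.append(1)          # placeholder, filled in below
            else:
                top_shape.append(d)
        if inferred_axis >= 0:
            explicit = int(np.prod(top_shape))
            assert bottom_count % explicit == 0, "count not divisible by explicit dims"
            top_shape[inferred_axis] = bottom_count // explicit
        assert int(np.prod(top_shape)) == bottom_count   # reshape never changes the count
        return top_shape

    print(infer_reshape((2, 3, 6, 5), (0, 3, 10, -1)))   # [2, 3, 10, 3]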
+ int explicit_count = constant_count_; + explicit_count *= bottom[0]->count(0, start_axis); + explicit_count *= bottom[0]->count(end_axis); + for (int i = 0; i < copy_axes_.size(); ++i) { + const int copy_axis_index = copy_axes_[i]; + explicit_count *= top_shape[start_axis + copy_axis_index]; + } + CHECK_EQ(0, bottom[0]->count() % explicit_count) << "bottom count (" + << bottom[0]->count() << ") must be divisible by the product of " + << "the specified dimensions (" << explicit_count << ")"; + const int inferred_dim = bottom[0]->count() / explicit_count; + top_shape[start_axis + inferred_axis_] = inferred_dim; + } + top[0]->Reshape(top_shape); CHECK_EQ(top[0]->count(), bottom[0]->count()) - << "new shape must have the same count as input"; + << "output count must match input count"; + top[0]->ShareData(*bottom[0]); + top[0]->ShareDiff(*bottom[0]); } INSTANTIATE_CLASS(ReshapeLayer); diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp index 077d949981c..cc236fe1e8e 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cpp @@ -71,7 +71,7 @@ void SigmoidCrossEntropyLossLayer::Backward_cpu( } #ifdef CPU_ONLY -STUB_GPU(SigmoidCrossEntropyLossLayer); +STUB_GPU_BACKWARD(SigmoidCrossEntropyLossLayer, Backward); #endif INSTANTIATE_CLASS(SigmoidCrossEntropyLossLayer); diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu index 08f7f492297..547fa80c72f 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu @@ -8,26 +8,6 @@ namespace caffe { -template -void SigmoidCrossEntropyLossLayer::Forward_gpu( - const vector*>& bottom, const vector*>& top) { - // The forward pass computes the sigmoid outputs. 
- sigmoid_bottom_vec_[0] = bottom[0]; - sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_); - // Compute the loss (negative log likelihood) - const int count = bottom[0]->count(); - const int num = bottom[0]->num(); - // Stable version of loss computation from input data - const Dtype* input_data = bottom[0]->cpu_data(); - const Dtype* target = bottom[1]->cpu_data(); - Dtype loss = 0; - for (int i = 0; i < count; ++i) { - loss -= input_data[i] * (target[i] - (input_data[i] >= 0)) - - log(1 + exp(input_data[i] - 2 * input_data[i] * (input_data[i] >= 0))); - } - top[0]->mutable_cpu_data()[0] = loss / num; -} - template void SigmoidCrossEntropyLossLayer::Backward_gpu( const vector*>& top, const vector& propagate_down, @@ -51,7 +31,7 @@ void SigmoidCrossEntropyLossLayer::Backward_gpu( } } -INSTANTIATE_LAYER_GPU_FUNCS(SigmoidCrossEntropyLossLayer); +INSTANTIATE_LAYER_GPU_BACKWARD(SigmoidCrossEntropyLossLayer); } // namespace caffe diff --git a/src/caffe/layers/spp_layer.cpp b/src/caffe/layers/spp_layer.cpp new file mode 100644 index 00000000000..795dd71693e --- /dev/null +++ b/src/caffe/layers/spp_layer.cpp @@ -0,0 +1,193 @@ +#include +#include +#include + +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/syncedmem.hpp" +#include "caffe/util/math_functions.hpp" +#include "caffe/vision_layers.hpp" + +namespace caffe { + +using std::min; +using std::max; + +template +LayerParameter SPPLayer::GetPoolingParam(const int pyramid_level, + const int bottom_h, const int bottom_w, const SPPParameter spp_param) { + LayerParameter pooling_param; + int num_bins = pow(2, pyramid_level); + + // find padding and kernel size so that the pooling is + // performed across the entire image + int kernel_h = ceil(bottom_h / static_cast(num_bins)); + // remainder_h is the min number of pixels that need to be padded before + // entire image height is pooled over with the chosen kernel dimension + int remainder_h = kernel_h * num_bins - bottom_h; + // pooling layer pads (2 * pad_h) pixels on the top and bottom of the + // image. 
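As the comments around `GetPoolingParam` describe, each pyramid level is sized so that a `num_bins x num_bins` grid of pooling windows exactly tiles the (padded) input. A quick arithmetic sketch of that sizing, independent of the C++ layer code; the function name and return layout are illustrative only.

    import math

    def spp_level_params(level, bottom_h, bottom_w):
        """Kernel/stride/pad for one SPP pyramid level, mirroring the
        arithmetic in GetPoolingParam (a sketch, not the C++ API)."""
        num_bins = 2 ** level
        kernel_h = int(math.ceil(bottom_h / float(num_bins)))
        kernel_w = int(math.ceil(bottom_w / float(num_bins)))
        # pad just enough that num_bins kernels of this size cover the input
        pad_h = (kernel_h * num_bins - bottom_h + 1) // 2
        pad_w = (kernel_w * num_bins - bottom_w + 1) // 2
        return {'bins': num_bins, 'kernel': (kernel_h, kernel_w),
                'stride': (kernel_h, kernel_w), 'pad': (pad_h, pad_w)}

    # Level 2 on the 9 x 8 bottom used in the tests: a 4 x 4 grid of bins.
    print(spp_level_params(2, 9, 8))
    # {'bins': 4, 'kernel': (3, 2), 'stride': (3, 2), 'pad': (2, 0)}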
+ int pad_h = (remainder_h + 1) / 2; + + // similar logic for width + int kernel_w = ceil(bottom_w / static_cast(num_bins)); + int remainder_w = kernel_w * num_bins - bottom_w; + int pad_w = (remainder_w + 1) / 2; + + pooling_param.mutable_pooling_param()->set_pad_h(pad_h); + pooling_param.mutable_pooling_param()->set_pad_w(pad_w); + pooling_param.mutable_pooling_param()->set_kernel_h(kernel_h); + pooling_param.mutable_pooling_param()->set_kernel_w(kernel_w); + pooling_param.mutable_pooling_param()->set_stride_h(kernel_h); + pooling_param.mutable_pooling_param()->set_stride_w(kernel_w); + + switch (spp_param.pool()) { + case SPPParameter_PoolMethod_MAX: + pooling_param.mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_MAX); + break; + case SPPParameter_PoolMethod_AVE: + pooling_param.mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_AVE); + break; + case SPPParameter_PoolMethod_STOCHASTIC: + pooling_param.mutable_pooling_param()->set_pool( + PoolingParameter_PoolMethod_STOCHASTIC); + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } + + return pooling_param; +} + +template +void SPPLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + SPPParameter spp_param = this->layer_param_.spp_param(); + + bottom_h_ = bottom[0]->height(); + bottom_w_ = bottom[0]->width(); + CHECK_GT(bottom_h_, 0) << "Input dimensions cannot be zero."; + CHECK_GT(bottom_w_, 0) << "Input dimensions cannot be zero."; + + pyramid_height_ = spp_param.pyramid_height(); + split_top_vec_.clear(); + pooling_bottom_vecs_.clear(); + pooling_layers_.clear(); + pooling_top_vecs_.clear(); + pooling_outputs_.clear(); + flatten_layers_.clear(); + flatten_top_vecs_.clear(); + flatten_outputs_.clear(); + concat_bottom_vec_.clear(); + + // split layer output holders setup + for (int i = 0; i < pyramid_height_; i++) { + split_top_vec_.push_back(new Blob()); + } + + // split layer setup + LayerParameter split_param; + split_layer_.reset(new SplitLayer(split_param)); + split_layer_->SetUp(bottom, split_top_vec_); + + for (int i = 0; i < pyramid_height_; i++) { + // pooling layer input holders setup + pooling_bottom_vecs_.push_back(new vector*>); + pooling_bottom_vecs_[i]->push_back(split_top_vec_[i]); + + // pooling layer output holders setup + pooling_outputs_.push_back(new Blob()); + pooling_top_vecs_.push_back(new vector*>); + pooling_top_vecs_[i]->push_back(pooling_outputs_[i]); + + // pooling layer setup + LayerParameter pooling_param = GetPoolingParam( + i, bottom_h_, bottom_w_, spp_param); + + pooling_layers_.push_back(shared_ptr > ( + new PoolingLayer(pooling_param))); + pooling_layers_[i]->SetUp(*pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + + // flatten layer output holders setup + flatten_outputs_.push_back(new Blob()); + flatten_top_vecs_.push_back(new vector*>); + flatten_top_vecs_[i]->push_back(flatten_outputs_[i]); + + // flatten layer setup + LayerParameter flatten_param; + flatten_layers_.push_back(new FlattenLayer(flatten_param)); + flatten_layers_[i]->SetUp(*pooling_top_vecs_[i], *flatten_top_vecs_[i]); + + // concat layer input holders setup + concat_bottom_vec_.push_back(flatten_outputs_[i]); + } + + // concat layer setup + LayerParameter concat_param; + concat_layer_.reset(new ConcatLayer(concat_param)); + concat_layer_->SetUp(concat_bottom_vec_, top); +} + +template +void SPPLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, " + << "corresponding to (num, channels, height, 
width)"; + channels_ = bottom[0]->channels(); + bottom_h_ = bottom[0]->height(); + bottom_w_ = bottom[0]->width(); + SPPParameter spp_param = this->layer_param_.spp_param(); + split_layer_->Reshape(bottom, split_top_vec_); + for (int i = 0; i < pyramid_height_; i++) { + LayerParameter pooling_param = GetPoolingParam( + i, bottom_h_, bottom_w_, spp_param); + + pooling_layers_[i].reset( + new PoolingLayer(pooling_param)); + pooling_layers_[i]->SetUp( + *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + pooling_layers_[i]->Reshape( + *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + flatten_layers_[i]->Reshape( + *pooling_top_vecs_[i], *flatten_top_vecs_[i]); + } + concat_layer_->Reshape(concat_bottom_vec_, top); +} + +template +void SPPLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + split_layer_->Forward(bottom, split_top_vec_); + for (int i = 0; i < pyramid_height_; i++) { + pooling_layers_[i]->Forward( + *pooling_bottom_vecs_[i], *pooling_top_vecs_[i]); + flatten_layers_[i]->Forward( + *pooling_top_vecs_[i], *flatten_top_vecs_[i]); + } + concat_layer_->Forward(concat_bottom_vec_, top); +} + +template +void SPPLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + vector concat_propagate_down(pyramid_height_, true); + concat_layer_->Backward(top, concat_propagate_down, concat_bottom_vec_); + for (int i = 0; i < pyramid_height_; i++) { + flatten_layers_[i]->Backward( + *flatten_top_vecs_[i], propagate_down, *pooling_top_vecs_[i]); + pooling_layers_[i]->Backward( + *pooling_top_vecs_[i], propagate_down, *pooling_bottom_vecs_[i]); + } + split_layer_->Backward(split_top_vec_, propagate_down, bottom); +} + + +INSTANTIATE_CLASS(SPPLayer); +REGISTER_LAYER_CLASS(SPP); + +} // namespace caffe diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 888eec1d501..3694f25d860 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -410,7 +410,7 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, // (i.e., not given a param_name) or explicitly given a name that we // haven't already seen. param_owners_.push_back(-1); - if (param_size) { + if (param_name.size()) { param_names_index_[param_name] = net_param_id; } } else { @@ -470,7 +470,6 @@ Dtype Net::ForwardFromTo(int start, int end) { } for (int i = start; i <= end; ++i) { // LOG(ERROR) << "Forwarding " << layer_names_[i]; - layers_[i]->Reshape(bottom_vecs_[i], top_vecs_[i]); Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]); loss += layer_loss; if (debug_info_) { ForwardDebugInfo(i); } diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 1b0ea261e49..239fc038437 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -262,7 +262,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. 
// -// LayerParameter next available layer-specific ID: 138 (last added: perspective_param) +// LayerParameter next available layer-specific ID: 139 (last added: spp_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -336,6 +336,7 @@ message LayerParameter { optional ReshapeParameter reshape_param = 136; optional SigmoidParameter sigmoid_param = 124; optional SoftmaxParameter softmax_param = 125; + optional SPPParameter spp_param = 138; optional SliceParameter slice_param = 126; optional TanHParameter tanh_param = 127; optional ThresholdParameter threshold_param = 128; @@ -410,8 +411,15 @@ message ConcatParameter { // Message that stores parameters used by ContrastiveLossLayer message ContrastiveLossParameter { - //margin for dissimilar pair + // margin for dissimilar pair optional float margin = 1 [default = 1.0]; + // The first implementation of this cost did not exactly match the cost of + // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2. + // legacy_version = false (the default) uses (margin - d)^2 as proposed in the + // Hadsell paper. New models should probably use this version. + // legacy_version = true uses (margin - d^2). This is kept to support / + // reproduce existing models and results + optional bool legacy_version = 2 [default = false]; } // Message that stores parameters used by ConvolutionLayer @@ -724,9 +732,67 @@ message ReductionParameter { // Message that stores parameters used by ReshapeLayer message ReshapeParameter { - // The new shape of the Blob. Must have the same "count" (product of - // dimensions) as the input Blob. + // Specify the output dimensions. If some of the dimensions are set to 0, + // the corresponding dimension from the bottom layer is used (unchanged). + // Exactly one dimension may be set to -1, in which case its value is + // inferred from the count of the bottom blob and the remaining dimensions. + // For example, suppose we want to reshape a 2D blob "input" with shape 2 x 8: + // + // layer { + // type: "Reshape" bottom: "input" top: "output" + // reshape_param { ... } + // } + // + // If "input" is 2D with shape 2 x 8, then the following reshape_param + // specifications are all equivalent, producing a 3D blob "output" with shape + // 2 x 2 x 4: + // + // reshape_param { shape { dim: 2 dim: 2 dim: 4 } } + // reshape_param { shape { dim: 0 dim: 2 dim: 4 } } + // reshape_param { shape { dim: 0 dim: 2 dim: -1 } } + // reshape_param { shape { dim: -1 dim: 0 dim: 2 } } + // optional BlobShape shape = 1; + + // axis and num_axes control the portion of the bottom blob's shape that are + // replaced by (included in) the reshape. By default (axis == 0 and + // num_axes == -1), the entire bottom blob shape is included in the reshape, + // and hence the shape field must specify the entire output shape. + // + // axis may be non-zero to retain some portion of the beginning of the input + // shape (and may be negative to index from the end; e.g., -1 to begin the + // reshape after the last axis, including nothing in the reshape, + // -2 to include only the last axis, etc.). + // + // For example, suppose "input" is a 2D blob with shape 2 x 8. 
+ // Then the following ReshapeLayer specifications are all equivalent, + // producing a blob "output" with shape 2 x 2 x 4: + // + // reshape_param { shape { dim: 2 dim: 2 dim: 4 } } + // reshape_param { shape { dim: 2 dim: 4 } axis: 1 } + // reshape_param { shape { dim: 2 dim: 4 } axis: -3 } + // + // num_axes specifies the extent of the reshape. + // If num_axes >= 0 (and axis >= 0), the reshape will be performed only on + // input axes in the range [axis, axis+num_axes]. + // num_axes may also be -1, the default, to include all remaining axes + // (starting from axis). + // + // For example, suppose "input" is a 2D blob with shape 2 x 8. + // Then the following ReshapeLayer specifications are equivalent, + // producing a blob "output" with shape 1 x 2 x 8. + // + // reshape_param { shape { dim: 1 dim: 2 dim: 8 } } + // reshape_param { shape { dim: 1 dim: 2 } num_axes: 1 } + // reshape_param { shape { dim: 1 } num_axes: 0 } + // + // On the other hand, these would produce output blob shape 2 x 1 x 8: + // + // reshape_param { shape { dim: 2 dim: 1 dim: 8 } } + // reshape_param { shape { dim: 1 } axis: 1 num_axes: 0 } + // + optional int32 axis = 2 [default = 0]; + optional int32 num_axes = 3 [default = -1]; } // Message that stores parameters used by ReLULayer @@ -848,6 +914,23 @@ message WindowDataParameter { optional string root_folder = 13 [default = ""]; } +// Message that stores parameters used by SPPLayer +message SPPParameter { + enum PoolMethod { + MAX = 0; + AVE = 1; + STOCHASTIC = 2; + } + optional uint32 pyramid_height = 1; + optional PoolMethod pool = 2 [default = MAX]; // The pooling method + enum Engine { + DEFAULT = 0; + CAFFE = 1; + CUDNN = 2; + } + optional Engine engine = 6 [default = DEFAULT]; +} + // DEPRECATED: use LayerParameter. message V1LayerParameter { repeated string bottom = 2; diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 096980dd7af..877b19b86f8 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -167,7 +167,7 @@ void Solver::Step(int iters) { vector losses; Dtype smoothed_loss = 0; - for (; iter_ < stop_iter; ++iter_) { + while (iter_ < stop_iter) { if (param_.test_interval() && iter_ % param_.test_interval() == 0 && (iter_ > 0 || param_.test_initialization())) { TestAll(); @@ -210,8 +210,12 @@ void Solver::Step(int iters) { ComputeUpdateValue(); net_->Update(); + // Increment the internal iter_ counter -- its value should always indicate + // the number of times the weights have been updated. + ++iter_; + // Save a snapshot if needed. - if (param_.snapshot() && (iter_ + 1) % param_.snapshot() == 0) { + if (param_.snapshot() && iter_ % param_.snapshot() == 0) { Snapshot(); } } @@ -327,15 +331,14 @@ void Solver::Snapshot() { string model_filename, snapshot_filename; const int kBufferSize = 20; char iter_str_buffer[kBufferSize]; - // Add one to iter_ to get the number of iterations that have completed. 
- snprintf(iter_str_buffer, kBufferSize, "_iter_%d", iter_ + 1); + snprintf(iter_str_buffer, kBufferSize, "_iter_%d", iter_); filename += iter_str_buffer; model_filename = filename + ".caffemodel"; LOG(INFO) << "Snapshotting to " << model_filename; WriteProtoToBinaryFile(net_param, model_filename.c_str()); SolverState state; SnapshotSolverState(&state); - state.set_iter(iter_ + 1); + state.set_iter(iter_); state.set_learned_net(model_filename); state.set_current_step(current_step_); snapshot_filename = filename + ".solverstate"; diff --git a/src/caffe/test/test_contrastive_loss_layer.cpp b/src/caffe/test/test_contrastive_loss_layer.cpp index d269fbc26f2..1e9447cbc51 100644 --- a/src/caffe/test/test_contrastive_loss_layer.cpp +++ b/src/caffe/test/test_contrastive_loss_layer.cpp @@ -22,15 +22,15 @@ class ContrastiveLossLayerTest : public MultiDeviceTest { protected: ContrastiveLossLayerTest() - : blob_bottom_data_i_(new Blob(128, 10, 1, 1)), - blob_bottom_data_j_(new Blob(128, 10, 1, 1)), - blob_bottom_y_(new Blob(128, 1, 1, 1)), + : blob_bottom_data_i_(new Blob(512, 2, 1, 1)), + blob_bottom_data_j_(new Blob(512, 2, 1, 1)), + blob_bottom_y_(new Blob(512, 1, 1, 1)), blob_top_loss_(new Blob()) { // fill the values FillerParameter filler_param; - filler_param.set_mean(0.0); - filler_param.set_std(0.3); // distances~=1.0 to test both sides of margin - GaussianFiller filler(filler_param); + filler_param.set_min(-1.0); + filler_param.set_max(1.0); // distances~=1.0 to test both sides of margin + UniformFiller filler(filler_param); filler.Fill(this->blob_bottom_data_i_); blob_bottom_vec_.push_back(blob_bottom_data_i_); filler.Fill(this->blob_bottom_data_j_); @@ -79,7 +79,8 @@ TYPED_TEST(ContrastiveLossLayerTest, TestForward) { if (this->blob_bottom_y_->cpu_data()[i]) { // similar pairs loss += dist_sq; } else { - loss += std::max(margin-dist_sq, Dtype(0)); + Dtype dist = std::max(margin - sqrt(dist_sq), 0.0); + loss += dist*dist; } } loss /= static_cast(num) * Dtype(2); @@ -99,4 +100,47 @@ TYPED_TEST(ContrastiveLossLayerTest, TestGradient) { this->blob_top_vec_, 1); } +TYPED_TEST(ContrastiveLossLayerTest, TestForwardLegacy) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_contrastive_loss_param()->set_legacy_version(true); + ContrastiveLossLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // manually compute to compare + const Dtype margin = layer_param.contrastive_loss_param().margin(); + const int num = this->blob_bottom_data_i_->num(); + const int channels = this->blob_bottom_data_i_->channels(); + Dtype loss(0); + for (int i = 0; i < num; ++i) { + Dtype dist_sq(0); + for (int j = 0; j < channels; ++j) { + Dtype diff = this->blob_bottom_data_i_->cpu_data()[i*channels+j] - + this->blob_bottom_data_j_->cpu_data()[i*channels+j]; + dist_sq += diff*diff; + } + if (this->blob_bottom_y_->cpu_data()[i]) { // similar pairs + loss += dist_sq; + } else { + loss += std::max(margin - dist_sq, Dtype(0.0)); + } + } + loss /= static_cast(num) * Dtype(2); + EXPECT_NEAR(this->blob_top_loss_->cpu_data()[0], loss, 1e-6); +} + +TYPED_TEST(ContrastiveLossLayerTest, TestGradientLegacy) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_contrastive_loss_param()->set_legacy_version(true); + ContrastiveLossLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + GradientChecker checker(1e-2, 1e-2, 
1701); + // check the gradient for the first two bottom layers + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 1); +} + } // namespace caffe diff --git a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp index c9d52f247a6..030f4bbae7f 100644 --- a/src/caffe/test/test_neuron_layer.cpp +++ b/src/caffe/test/test_neuron_layer.cpp @@ -541,14 +541,10 @@ TYPED_TEST(NeuronLayerTest, TestPReLUInPlace) { caffe_copy(ip2.blobs()[0]->count(), ip.blobs()[0]->cpu_data(), ip2.blobs()[0]->mutable_cpu_data()); // Forward in-place - ip.Reshape(this->blob_bottom_vec_, this->blob_top_vec_); ip.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - prelu.Reshape(this->blob_top_vec_, this->blob_top_vec_); prelu.Forward(this->blob_top_vec_, this->blob_top_vec_); // Forward non-in-place - ip2.Reshape(blob_bottom_vec_2, blob_middle_vec_2); ip2.Forward(blob_bottom_vec_2, blob_middle_vec_2); - prelu2.Reshape(blob_middle_vec_2, blob_top_vec_2); prelu2.Forward(blob_middle_vec_2, blob_top_vec_2); // Check numbers for (int s = 0; s < blob_top_2->count(); ++s) { diff --git a/src/caffe/test/test_reshape_layer.cpp b/src/caffe/test/test_reshape_layer.cpp index 78f157b81a7..9d08ec60d4e 100644 --- a/src/caffe/test/test_reshape_layer.cpp +++ b/src/caffe/test/test_reshape_layer.cpp @@ -5,8 +5,8 @@ #include "caffe/blob.hpp" #include "caffe/common.hpp" +#include "caffe/common_layers.hpp" #include "caffe/filler.hpp" -#include "caffe/vision_layers.hpp" #include "caffe/test/test_caffe_main.hpp" #include "caffe/test/test_gradient_check_util.hpp" @@ -18,9 +18,8 @@ class ReshapeLayerTest : public MultiDeviceTest { typedef typename TypeParam::Dtype Dtype; protected: ReshapeLayerTest() - : blob_bottom_(new Blob(2, 3, 6, 5)), - blob_top_(new Blob()) { - Caffe::set_random_seed(1701); + : blob_bottom_(new Blob(2, 3, 6, 5)), + blob_top_(new Blob()) { // fill the values FillerParameter filler_param; GaussianFiller filler(filler_param); @@ -28,7 +27,9 @@ class ReshapeLayerTest : public MultiDeviceTest { blob_bottom_vec_.push_back(blob_bottom_); blob_top_vec_.push_back(blob_top_); } + virtual ~ReshapeLayerTest() { delete blob_bottom_; delete blob_top_; } + Blob* const blob_bottom_; Blob* const blob_top_; vector*> blob_bottom_vec_; @@ -37,44 +38,186 @@ class ReshapeLayerTest : public MultiDeviceTest { TYPED_TEST_CASE(ReshapeLayerTest, TestDtypesAndDevices); -TYPED_TEST(ReshapeLayerTest, TestSetup) { +TYPED_TEST(ReshapeLayerTest, TestFlattenOutputSizes) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; - BlobShape* shape = layer_param.mutable_reshape_param()->mutable_shape(); - shared_ptr > layer; + BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); + blob_shape->add_dim(0); + blob_shape->add_dim(-1); + blob_shape->add_dim(1); + blob_shape->add_dim(1); - shape->Clear(); - shape->add_dim(2 * 3 * 6 * 5); - layer.reset(new ReshapeLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_top_->num_axes(), 1); - EXPECT_EQ(this->blob_top_->shape(0), 2 * 3 * 6 * 5); + ReshapeLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 3 * 6 * 5); + EXPECT_EQ(this->blob_top_->height(), 1); + EXPECT_EQ(this->blob_top_->width(), 1); +} - shape->Clear(); - shape->add_dim(2 * 3 * 6); - shape->add_dim(5); 
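The `legacy_version` flag exercised in the contrastive-loss tests above switches the dissimilar-pair cost between the original form, max(margin - d^2, 0), and the Hadsell et al. 2006 form, max(margin - d, 0)^2. A NumPy sketch of the two forward computations for comparison; this is not the layer code itself, and the variable names are illustrative.

    import numpy as np

    def contrastive_loss(a, b, y, margin=1.0, legacy=False):
        """Sketch of the contrastive loss forward pass. a, b: N x C features,
        y: N labels (1 = similar pair, 0 = dissimilar pair)."""
        dist_sq = np.sum((a - b) ** 2, axis=1)                     # squared Euclidean distance
        if legacy:
            dissim = np.maximum(margin - dist_sq, 0.0)             # max(margin - d^2, 0)
        else:
            dissim = np.maximum(margin - np.sqrt(dist_sq), 0.0) ** 2  # max(margin - d, 0)^2
        per_pair = np.where(y > 0, dist_sq, dissim)
        return per_pair.sum() / (2.0 * len(y))

    rng = np.random.RandomState(1701)
    a = rng.uniform(-1, 1, (8, 2))
    b = rng.uniform(-1, 1, (8, 2))
    y = rng.randint(0, 2, 8)
    print(contrastive_loss(a, b, y), contrastive_loss(a, b, y, legacy=True))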
- layer.reset(new ReshapeLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_top_->num_axes(), 2); - EXPECT_EQ(this->blob_top_->shape(0), 2 * 3 * 6); - EXPECT_EQ(this->blob_top_->shape(1), 5); +TYPED_TEST(ReshapeLayerTest, TestFlattenValues) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); + blob_shape->add_dim(0); + blob_shape->add_dim(-1); + blob_shape->add_dim(1); + blob_shape->add_dim(1); + ReshapeLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + for (int c = 0; c < 3 * 6 * 5; ++c) { + EXPECT_EQ(this->blob_top_->data_at(0, c, 0, 0), + this->blob_bottom_->data_at(0, c / (6 * 5), (c / 5) % 6, c % 5)); + EXPECT_EQ(this->blob_top_->data_at(1, c, 0, 0), + this->blob_bottom_->data_at(1, c / (6 * 5), (c / 5) % 6, c % 5)); + } +} - shape->Clear(); - shape->add_dim(6); - shape->add_dim(1); - shape->add_dim(2); - shape->add_dim(3); - shape->add_dim(1); - shape->add_dim(5); - layer.reset(new ReshapeLayer(layer_param)); - layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - ASSERT_EQ(this->blob_top_->num_axes(), 6); - EXPECT_EQ(this->blob_top_->shape(0), 6); +// Test whether setting output dimensions to 0 either explicitly or implicitly +// copies the respective dimension of the input layer. +TYPED_TEST(ReshapeLayerTest, TestCopyDimensions) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); + blob_shape->add_dim(0); + blob_shape->add_dim(0); + blob_shape->add_dim(0); + blob_shape->add_dim(0); + ReshapeLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 3); + EXPECT_EQ(this->blob_top_->height(), 6); + EXPECT_EQ(this->blob_top_->width(), 5); +} + +// When a dimension is set to -1, we should infer its value from the other +// dimensions (including those that get copied from below). +TYPED_TEST(ReshapeLayerTest, TestInferenceOfUnspecified) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); + blob_shape->add_dim(0); + blob_shape->add_dim(3); + blob_shape->add_dim(10); + blob_shape->add_dim(-1); + + // Count is 180, thus height should be 180 / (2*3*10) = 3. 
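The `axis` / `num_axes` variants exercised in the tests below splice the new dims into only part of the bottom shape, leaving the remaining axes untouched. A short sketch of that splicing with a hypothetical helper (it ignores the 0/-1 handling shown earlier and is not the layer's code):

    def splice_shape(bottom_shape, new_dims, axis=0, num_axes=-1):
        """Replace only the axes in [axis, axis + num_axes) with new_dims;
        axis may be negative, with -1 meaning 'after the last axis'."""
        n = len(bottom_shape)
        start = axis if axis >= 0 else n + axis + 1
        end = n if num_axes == -1 else start + num_axes
        return list(bottom_shape[:start]) + list(new_dims) + list(bottom_shape[end:])

    # Inserting three singleton axes without consuming any input axes:
    print(splice_shape((2, 3, 6, 5), (1, 1, 1), axis=0, num_axes=0))   # [1, 1, 1, 2, 3, 6, 5]
    print(splice_shape((2, 3, 6, 5), (1, 1, 1), axis=2, num_axes=0))   # [2, 3, 1, 1, 1, 6, 5]
    print(splice_shape((2, 3, 6, 5), (1, 1, 1), axis=-1, num_axes=0))  # [2, 3, 6, 5, 1, 1, 1]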
+ + ReshapeLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 3); + EXPECT_EQ(this->blob_top_->height(), 10); + EXPECT_EQ(this->blob_top_->width(), 3); +} + +TYPED_TEST(ReshapeLayerTest, TestInferenceOfUnspecifiedWithStartAxis) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_reshape_param()->set_axis(1); + BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); + blob_shape->add_dim(3); + blob_shape->add_dim(10); + blob_shape->add_dim(-1); + + ReshapeLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + ASSERT_EQ(this->blob_top_->num_axes(), 4); + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 3); + EXPECT_EQ(this->blob_top_->height(), 10); + EXPECT_EQ(this->blob_top_->width(), 3); +} + +TYPED_TEST(ReshapeLayerTest, TestInsertSingletonAxesStart) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_reshape_param()->set_axis(0); + layer_param.mutable_reshape_param()->set_num_axes(0); + BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); + blob_shape->add_dim(1); + blob_shape->add_dim(1); + blob_shape->add_dim(1); + + ReshapeLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + ASSERT_EQ(this->blob_top_->num_axes(), 7); + EXPECT_EQ(this->blob_top_->shape(0), 1); EXPECT_EQ(this->blob_top_->shape(1), 1); - EXPECT_EQ(this->blob_top_->shape(2), 2); - EXPECT_EQ(this->blob_top_->shape(3), 3); + EXPECT_EQ(this->blob_top_->shape(2), 1); + EXPECT_EQ(this->blob_top_->shape(3), 2); + EXPECT_EQ(this->blob_top_->shape(4), 3); + EXPECT_EQ(this->blob_top_->shape(5), 6); + EXPECT_EQ(this->blob_top_->shape(6), 5); +} + +TYPED_TEST(ReshapeLayerTest, TestInsertSingletonAxesMiddle) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_reshape_param()->set_axis(2); + layer_param.mutable_reshape_param()->set_num_axes(0); + BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); + blob_shape->add_dim(1); + blob_shape->add_dim(1); + blob_shape->add_dim(1); + + ReshapeLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + ASSERT_EQ(this->blob_top_->num_axes(), 7); + EXPECT_EQ(this->blob_top_->shape(0), 2); + EXPECT_EQ(this->blob_top_->shape(1), 3); + EXPECT_EQ(this->blob_top_->shape(2), 1); + EXPECT_EQ(this->blob_top_->shape(3), 1); EXPECT_EQ(this->blob_top_->shape(4), 1); - EXPECT_EQ(this->blob_top_->shape(5), 5); + EXPECT_EQ(this->blob_top_->shape(5), 6); + EXPECT_EQ(this->blob_top_->shape(6), 5); +} + +TYPED_TEST(ReshapeLayerTest, TestInsertSingletonAxesEnd) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_reshape_param()->set_axis(-1); + layer_param.mutable_reshape_param()->set_num_axes(0); + BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); + blob_shape->add_dim(1); + blob_shape->add_dim(1); + blob_shape->add_dim(1); + + ReshapeLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + ASSERT_EQ(this->blob_top_->num_axes(), 7); + EXPECT_EQ(this->blob_top_->shape(0), 2); + EXPECT_EQ(this->blob_top_->shape(1), 3); + EXPECT_EQ(this->blob_top_->shape(2), 6); + EXPECT_EQ(this->blob_top_->shape(3), 5); + EXPECT_EQ(this->blob_top_->shape(4), 1); + 
EXPECT_EQ(this->blob_top_->shape(5), 1); + EXPECT_EQ(this->blob_top_->shape(6), 1); +} + +TYPED_TEST(ReshapeLayerTest, TestFlattenMiddle) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_reshape_param()->set_axis(1); + layer_param.mutable_reshape_param()->set_num_axes(2); + BlobShape* blob_shape = layer_param.mutable_reshape_param()->mutable_shape(); + blob_shape->add_dim(-1); + + ReshapeLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + ASSERT_EQ(this->blob_top_->num_axes(), 3); + EXPECT_EQ(this->blob_top_->shape(0), 2); + EXPECT_EQ(this->blob_top_->shape(1), 3 * 6); + EXPECT_EQ(this->blob_top_->shape(2), 5); } TYPED_TEST(ReshapeLayerTest, TestForward) { @@ -134,5 +277,4 @@ TYPED_TEST(ReshapeLayerTest, TestGradient) { this->blob_top_vec_); } - } // namespace caffe diff --git a/src/caffe/test/test_spp_layer.cpp b/src/caffe/test/test_spp_layer.cpp new file mode 100644 index 00000000000..b2585f1a5fa --- /dev/null +++ b/src/caffe/test/test_spp_layer.cpp @@ -0,0 +1,131 @@ +#include +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/vision_layers.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +template +class SPPLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + SPPLayerTest() + : blob_bottom_(new Blob()), + blob_bottom_2_(new Blob()), + blob_bottom_3_(new Blob()), + blob_top_(new Blob()) {} + virtual void SetUp() { + Caffe::set_random_seed(1701); + blob_bottom_->Reshape(2, 3, 9, 8); + blob_bottom_2_->Reshape(4, 3, 1024, 765); + blob_bottom_3_->Reshape(10, 3, 7, 7); + // fill the values + FillerParameter filler_param; + GaussianFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_bottom_vec_2_.push_back(blob_bottom_2_); + blob_bottom_vec_3_.push_back(blob_bottom_3_); + blob_top_vec_.push_back(blob_top_); + } + virtual ~SPPLayerTest() { delete blob_bottom_; delete blob_top_; } + + Blob* const blob_bottom_; + Blob* const blob_bottom_2_; + Blob* const blob_bottom_3_; + Blob* const blob_top_; + vector*> blob_bottom_vec_; + vector*> blob_bottom_vec_2_; + vector*> blob_bottom_vec_3_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(SPPLayerTest, TestDtypesAndDevices); + +TYPED_TEST(SPPLayerTest, TestSetup) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_spp_param()->set_pyramid_height(3); + SPPLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + // expected number of pool results is geometric sum + // (1 - r ** n)/(1 - r) where r = 4 and n = pyramid_height + // (1 - 4 ** 3)/(1 - 4) = 21 + // multiply bottom num_channels * expected_pool_results + // to get expected num_channels (3 * 21 = 63) + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 63); + EXPECT_EQ(this->blob_top_->height(), 1); + EXPECT_EQ(this->blob_top_->width(), 1); +} + +TYPED_TEST(SPPLayerTest, TestEqualOutputDims) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_spp_param()->set_pyramid_height(5); + SPPLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_2_, this->blob_top_vec_); + // expected number of pool results is geometric sum + // (1 - r ** n)/(1 - r) where r = 4 and n = pyramid_height + // (1 
- 4 ** 5)/(1 - 4) = 341 + // multiply bottom num_channels * expected_pool_results + // to get expected num_channels (3 * 341 = 1023) + EXPECT_EQ(this->blob_top_->num(), 4); + EXPECT_EQ(this->blob_top_->channels(), 1023); + EXPECT_EQ(this->blob_top_->height(), 1); + EXPECT_EQ(this->blob_top_->width(), 1); +} + +TYPED_TEST(SPPLayerTest, TestEqualOutputDims2) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_spp_param()->set_pyramid_height(3); + SPPLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_3_, this->blob_top_vec_); + // expected number of pool results is geometric sum + // (1 - r ** n)/(1 - r) where r = 4 and n = pyramid_height + // (1 - 4 ** 3)/(1 - 4) = 21 + // multiply bottom num_channels * expected_pool_results + // to get expected num_channels (3 * 21 = 63) + EXPECT_EQ(this->blob_top_->num(), 10); + EXPECT_EQ(this->blob_top_->channels(), 63); + EXPECT_EQ(this->blob_top_->height(), 1); + EXPECT_EQ(this->blob_top_->width(), 1); +} + +TYPED_TEST(SPPLayerTest, TestForwardBackward) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_spp_param()->set_pyramid_height(3); + SPPLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + vector propagate_down(this->blob_bottom_vec_.size(), true); + layer.Backward(this->blob_top_vec_, propagate_down, + this->blob_bottom_vec_); +} + +TYPED_TEST(SPPLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + SPPParameter* spp_param = layer_param.mutable_spp_param(); + spp_param->set_pyramid_height(3); + SPPLayer layer(layer_param); + GradientChecker checker(1e-4, 1e-2); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + + +} // namespace caffe diff --git a/tools/caffe.cpp b/tools/caffe.cpp index eb9e97f5e27..70b15f890f7 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -252,9 +252,6 @@ int time() { forward_timer.Start(); for (int i = 0; i < layers.size(); ++i) { timer.Start(); - // Although Reshape should be essentially free, we include it here - // so that we will notice Reshape performance bugs. - layers[i]->Reshape(bottom_vecs[i], top_vecs[i]); layers[i]->Forward(bottom_vecs[i], top_vecs[i]); forward_time_per_layer[i] += timer.MicroSeconds(); }
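The expected channel counts in the SPP tests above come from the geometric series noted in their comments: after pooling, flattening, and concatenation, each pyramid level l contributes 4^l bins per input channel. A quick check of those numbers in plain Python, assuming the same pyramid heights and 3 input channels as the tests:

    def spp_output_channels(pyramid_height, input_channels):
        """Total output channels of SPP: sum of 4**l bins over all levels,
        times the number of input channels."""
        bins = sum(4 ** level for level in range(pyramid_height))  # (1 - 4**n) / (1 - 4)
        return input_channels * bins

    print(spp_output_channels(3, 3))   # 63, as in TestSetup / TestEqualOutputDims2
    print(spp_output_channels(5, 3))   # 1023, as in TestEqualOutputDims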