diff --git a/data/coco/README.md b/data/coco/README.md
new file mode 100644
index 00000000000..53d36707f2e
--- /dev/null
+++ b/data/coco/README.md
@@ -0,0 +1,24 @@
+For details about the Microsoft COCO ("Common Objects in Context") dataset [1],
+visit mscoco.org.  This README provides instructions for downloading and
+installing the tools and dataset.
+
+1) Download and extract the COCO Python tools by running:
+
+    ./download_tools.sh
+
+2) Install the tools, and optionally download the data by running:
+
+    cd tools
+    python setup.py install  # follow prompts to download or skip data
+
+3) Download train/val/test splits using:
+
+    ./get_coco2014_aux.sh
+
+See the COCO README (tools/README) for more information.
+
+
+[1] Lin, Tsung-Yi, Michael Maire, Serge Belongie, James Hays, Pietro Perona,
+    Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick.
+    "Microsoft COCO: Common Objects in Context."
+    arXiv preprint arXiv:1405.0312 (2014).
diff --git a/data/coco/download_tools.sh b/data/coco/download_tools.sh
new file mode 100755
index 00000000000..c90bc1a2624
--- /dev/null
+++ b/data/coco/download_tools.sh
@@ -0,0 +1,5 @@
+#!/usr/bin/env bash
+set -e  # abort on the first failure so a failed download is never unzipped or removed
+wget http://msvocds.blob.core.windows.net/annotations-0-9/tools.zip
+unzip tools.zip
+rm tools.zip
diff --git a/data/coco/get_coco2014_aux.sh b/data/coco/get_coco2014_aux.sh
new file mode 100755
index 00000000000..91cf3e1ef0d
--- /dev/null
+++ b/data/coco/get_coco2014_aux.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env sh
+#
+# Downloads Andrej Karpathy's train/val/test splits of COCO2014 as text files.
+set -e  # stop on the first failure so "Done." is only printed on success
+
+echo "Downloading..."
+wget http://dl.caffe.berkeleyvision.org/coco2014_aux.tar.gz
+
+echo "Unzipping..."
+tar -xf coco2014_aux.tar.gz
+rm -f coco2014_aux.tar.gz
+
+echo "Done."
diff --git a/examples/coco_caption/.gitignore b/examples/coco_caption/.gitignore
new file mode 100644
index 00000000000..e040331b7f2
--- /dev/null
+++ b/examples/coco_caption/.gitignore
@@ -0,0 +1 @@
+h5_data/
diff --git a/examples/coco_caption/coco_to_hdf5_data.py b/examples/coco_caption/coco_to_hdf5_data.py
new file mode 100755
index 00000000000..233ee829078
--- /dev/null
+++ b/examples/coco_caption/coco_to_hdf5_data.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python
+
+from hashlib import sha1
+import os
+import random
+random.seed(3)
+import re
+import sys
+
+sys.path.append('./examples/coco_caption/')
+
+COCO_PATH = './data/coco/tools'
+COCO_TOOL_PATH = '%s/pycocotools' % COCO_PATH
+
+MAX_HASH = 100000
+
+sys.path.append(COCO_TOOL_PATH)
+from coco import COCO
+
+from hdf5_sequence_generator import SequenceGenerator, HDF5SequenceWriter
+
+# UNK_IDENTIFIER is the word used to identify unknown words
+UNK_IDENTIFIER = '<unk>'
+
+SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
+def split_sentence(sentence):
+  """Lowercase a caption and split it into word/punctuation tokens, minus a final '.'."""
+  tokens = [s.lower() for s in SENTENCE_SPLIT_REGEX.split(sentence.strip()) if len(s.strip()) > 0]
+  # An empty or whitespace-only caption produces no tokens; return [] instead
+  # of raising IndexError when peeking at the last token.
+  if tokens and tokens[-1] == '.':
+    return tokens[:-1]  # drop the sentence-final period
+  return tokens
+
+MAX_WORDS = 20
+
+class CocoSequenceGenerator(SequenceGenerator):
+  def __init__(self, coco, batch_num_streams, vocab=None,
+               max_words=MAX_WORDS, align=True, shuffle=True, gt_captions=True,
+               pad=True, truncate=True, split_ids=None):
+    """Collect (image path, tokenized caption) pairs for the images in split_ids."""
+    self.max_words = max_words
+    self.images = []
+    num_total = 0
+    num_missing = 0
+    num_captions = 0
+    known_images = {}
+    image_root = '%s/%s' % (COCO_PATH, coco.image_folder)
+    if split_ids is None:
+      split_ids = coco.images.keys()  # default: every image known to the COCO object
+    for image_id in split_ids:
+      image_info = coco.images[image_id]
+      image_path = '%s/%s/%s' % \
+          (image_root, image_info['file_path'], image_info['file_name'])
+      if os.path.isfile(image_path):
+        assert image_id not in known_images  # no duplicates allowed
+        known_images[image_id] = {}
+        known_images[image_id]['path'] = image_path
+        if gt_captions:
+          known_images[image_id]['sentences'] = [split_sentence(anno['sentence'])
+              for anno in coco.image_to_annotations[image_id]]
+          num_captions += len(known_images[image_id]['sentences'])
+        else:
+          known_images[image_id]['sentences'] = []
+      else:
+        num_missing += 1
+        print 'Warning (#%d): image not found: %s' % (num_missing, image_path)
+      num_total += 1
+    print '%d/%d images missing' % (num_missing, num_total)
+    if vocab is None:
+      self.init_vocabulary(known_images)
+    else:
+      self.vocabulary_inverted = vocab  # reuse a given vocabulary: list index -> word
+      self.vocabulary = {}
+      for index, word in enumerate(self.vocabulary_inverted):
+        self.vocabulary[word] = index
+    self.image_sentence_pairs = []
+    num_no_sentences = 0
+    for image_id, metadata in known_images.iteritems():  # keys are image ids, not filenames
+      if not metadata['sentences']:
+        num_no_sentences += 1
+        print 'Warning (#%d): image with no sentences: %s' % (num_no_sentences, image_id)
+      for sentence in metadata['sentences']:
+        self.image_sentence_pairs.append((metadata['path'], sentence))
+    self.index = 0
+    self.num_resets = 0
+    self.num_truncates = 0
+    self.num_pads = 0
+    self.num_outs = 0
+    self.image_list = []
+    SequenceGenerator.__init__(self)  # base defaults; batch_num_streams is overridden just below
+    self.batch_num_streams = batch_num_streams
+    # make the number of image/sentence pairs a multiple of the buffer size
+    # so each timestep of each batch is useful and we can align the images
+    if align:
+      num_pairs = len(self.image_sentence_pairs)
+      remainder = num_pairs % batch_num_streams
+      if remainder > 0:
+        num_needed = batch_num_streams - remainder
+        for i in range(num_needed):
+          choice = random.randint(0, num_pairs - 1)  # duplicate a randomly chosen original pair
+          self.image_sentence_pairs.append(self.image_sentence_pairs[choice])
+      assert len(self.image_sentence_pairs) % batch_num_streams == 0
+    if shuffle:
+      random.shuffle(self.image_sentence_pairs)
+    self.pad = pad
+    self.truncate = truncate
+    self.negative_one_padded_streams = frozenset(('input_sentence', 'target_sentence'))
+
+  def streams_exhausted(self):
+    return self.num_resets >= 1  # true once next_line() has wrapped around at least once
+
+  def init_vocabulary(self, image_annotations, min_count=5):
+    """Build self.vocabulary / self.vocabulary_inverted from caption word counts.
+
+    Index 0 is reserved for UNK_IDENTIFIER; words seen fewer than min_count
+    times are left out of the vocabulary.
+    """
+    counts = {}
+    for metadata in image_annotations.itervalues():
+      for tokens in metadata['sentences']:
+        for token in tokens:
+          token = token.strip()
+          counts[token] = counts.get(token, 0) + 1
+    ranked = sorted(counts, key=lambda w: (-counts[w], w))  # by count, then alphabetically
+    print 'Initialized vocabulary with %d words; top 10 words:' % len(ranked)
+    for token in ranked[:10]:
+      print '\t%s (%d)' % (token, counts[token])
+    self.vocabulary = {UNK_IDENTIFIER: 0}
+    self.vocabulary_inverted = [UNK_IDENTIFIER]
+    # ranked is in descending count order, so the frequent words form a prefix of it.
+    frequent = [w for w in ranked if counts[w] >= min_count]
+    for position, token in enumerate(frequent, 1):
+      self.vocabulary_inverted.append(token)
+      self.vocabulary[token] = position
+    print 'Final vocabulary (restricted to words with counts of %d+) has %d words' % \
+        (min_count, len(self.vocabulary))
+
+  def dump_vocabulary(self, vocab_filename):
+    """Write self.vocabulary_inverted to vocab_filename, one word per line, in list order."""
+    print 'Dumping vocabulary to file: %s' % vocab_filename
+    with open(vocab_filename, 'wb') as out_file:
+      out_file.writelines('%s\n' % entry for entry in self.vocabulary_inverted)
+    print 'Done.'
+
+  def dump_image_file(self, image_filename, dummy_image_filename=None):
+    """Write the emitted image paths, optionally also as '<path> <hash>' lines."""
+    print 'Dumping image list to file: %s' % image_filename
+    with open(image_filename, 'wb') as path_file:
+      path_file.writelines('%s\n' % entry[0] for entry in self.image_list)
+    if dummy_image_filename is not None:
+      # The second column is the image hash stored next to each path in image_list.
+      print 'Dumping image list with dummy labels to file: %s' % dummy_image_filename
+      with open(dummy_image_filename, 'wb') as labeled_file:
+        labeled_file.writelines('%s %d\n' % entry for entry in self.image_list)
+    print 'Done.'
+
+  def next_line(self):
+    """Advance to the next pair, logging progress; wrap around (and count it) at the end."""
+    total = len(self.image_sentence_pairs)
+    self.index += 1
+    at_end = self.index == total
+    if at_end or self.index == 1 or self.index % 10000 == 0:
+      print 'Processed %d/%d (%f%%) lines' % (self.index, total, 100.0 * self.index / total)
+    if at_end:
+      self.index, self.num_resets = 0, self.num_resets + 1
+
+  def line_to_stream(self, sentence):
+    """Map each token to its vocabulary index plus one (0 is left for EOS).
+
+    Tokens missing from the vocabulary are looked up as UNK_IDENTIFIER instead.
+    """
+    def lookup(token):
+      token = token.strip()
+      key = token if token in self.vocabulary else UNK_IDENTIFIER
+      return self.vocabulary[key]
+    # Shift every index up by one so that 0 can act as the EOS character.
+    return [lookup(token) + 1 for token in sentence]
+
+  def get_pad_value(self, stream_name):
+    return 0 if stream_name not in self.negative_one_padded_streams else -1  # -1 pads the listed streams
+
+  def get_streams(self):
+    """Return the equal-length streams for the current image/caption pair, then advance."""
+    image_path, caption = self.image_sentence_pairs[self.index]
+    words = self.line_to_stream(caption)
+    length = len(words) + 1  # one extra slot for the leading/trailing 0 marker
+    pad = self.max_words - length if self.pad else 0
+    if pad > 0: self.num_pads += 1
+    self.num_outs += 1
+    out = {
+        'stage_indicators': [1] * length + [0] * pad,
+        'cont_sentence': [0] + [1] * len(words) + [0] * pad,
+        'input_sentence': [0] + words + [-1] * pad,
+        'target_sentence': words + [0] + [-1] * pad}
+    if self.truncate:
+      too_long = [name for name in out if len(out[name]) > self.max_words]
+      for name in too_long:
+        out[name] = out[name][:self.max_words]
+      self.num_truncates += bool(too_long)  # counts sequences, not individual streams
+    digest = self.image_hash(image_path)
+    out['hashed_image_path'] = [digest] * len(out['input_sentence'])
+    self.image_list.append((image_path, digest))
+    self.next_line()
+    return out
+
+  def image_hash(self, filename):
+    digest = int(sha1(filename).hexdigest(), 16) % MAX_HASH  # SHA-1 of the path, reduced modulo MAX_HASH
+    assert digest == float(digest)  # the value must compare equal to its float conversion
+    return digest
+
+COCO_ANNO_PATH = '%s/annotations/sentences_%%s2014.json' % COCO_PATH  # remaining %s: COCO split name
+COCO_IMAGE_PATTERN = '%s/images/%%s2014' % COCO_PATH  # not referenced elsewhere in this script
+COCO_IMAGE_ID_PATTERN = 'COCO_%s2014_%%012d.jpg'  # not referenced elsewhere in this script
+
+BUFFER_SIZE = 100  # passed to CocoSequenceGenerator as batch_num_streams
+OUTPUT_DIR = './examples/coco_caption/h5_data/buffer_%d' % BUFFER_SIZE
+SPLITS_PATTERN = './data/coco/coco2014_cocoid.%s.txt'  # %s: split name; file holds one image id per line
+OUTPUT_DIR_PATTERN = '%s/%%s_batches' % OUTPUT_DIR  # remaining %s: output dataset name
+
+def preprocess_dataset(split_name, coco_split_name, batch_stream_length,
+                       vocab=None, aligned=True):
+  """Write one split's HDF5 batches and image lists; return the vocabulary used.
+
+  split_name selects the image-id list (SPLITS_PATTERN) and the output
+  directory; coco_split_name selects the COCO annotation file.  When vocab is
+  None a vocabulary is built from this split and dumped to
+  OUTPUT_DIR/vocabulary.txt; otherwise the given vocabulary is reused.
+  """
+  with open(SPLITS_PATTERN % split_name, 'r') as split_file:
+    # Skip blank lines (e.g. a trailing empty line), which int() would reject.
+    split_image_ids = [int(line) for line in split_file if line.strip()]
+  suffix = '_aligned_%d' % MAX_WORDS if aligned else '_unaligned'
+  output_path = OUTPUT_DIR_PATTERN % (split_name + suffix)
+  coco = COCO(COCO_ANNO_PATH % coco_split_name)
+  sg = CocoSequenceGenerator(coco, BUFFER_SIZE, split_ids=split_image_ids,
+      vocab=vocab, align=aligned, pad=aligned, truncate=aligned)
+  # Replace the generator's default number of timesteps per written batch.
+  sg.batch_stream_length = batch_stream_length
+  writer = HDF5SequenceWriter(sg, output_dir=output_path)
+  writer.write_to_exhaustion()
+  writer.write_filelists()
+  if vocab is None:
+    sg.dump_vocabulary('%s/vocabulary.txt' % OUTPUT_DIR)
+  sg.dump_image_file('%s/image_list.txt' % output_path,
+                     '%s/image_list.with_dummy_labels.txt' % output_path)
+  print 'Padded %d/%d sequences; truncated %d/%d sequences' % \
+      (sg.num_pads, sg.num_outs, sg.num_truncates, sg.num_outs)
+  return sg.vocabulary_inverted
+
+def preprocess_coco():
+  """Preprocess train/val/test, first aligned and then unaligned.
+
+  The vocabulary returned by the first (train, aligned) run is passed on to
+  every later run.
+  """
+  # (split name, COCO annotation split); 'test' uses the COCO 'val' annotations.
+  splits = [('train', 'train'), ('val', 'val'), ('test', 'val')]
+  batch_stream_length = 100000
+  vocab = None
+  for aligned in (True, False):
+    for split_name, coco_split_name in splits:
+      vocab = preprocess_dataset(split_name, coco_split_name, batch_stream_length,
+                                 vocab=vocab, aligned=aligned)
+
+if __name__ == "__main__":
+  preprocess_coco()
diff --git a/examples/coco_caption/hdf5_sequence_generator.py b/examples/coco_caption/hdf5_sequence_generator.py
new file mode 100644
index 00000000000..98d4657b6bf
--- /dev/null
+++ b/examples/coco_caption/hdf5_sequence_generator.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+
+import h5py
+import numpy as np
+import os
+import random
+import sys
+
+class SequenceGenerator():
+  def __init__(self):
+    """Set default batch geometry; callers and subclasses may overwrite these attributes."""
+    self.dimension = 10
+    self.batch_stream_length, self.batch_num_streams = 2000, 8  # rows x columns of a batch
+    self.min_stream_length, self.max_stream_length = 13, 17
+    # Filled in on first use by init_streams() / reset_stream().
+    self.substream_names = None
+    self.streams_initialized = False
+
+  def streams_exhausted(self):
+    return False  # the base generator never reports exhaustion; subclasses may override this
+
+  def init_streams(self):
+    """Allocate the per-stream slots and load the first sequence into stream 0."""
+    self.streams, self.stream_indices = [[filler] * self.batch_num_streams for filler in (None, 0)]
+    self.reset_stream(0)  # the first reset also records substream_names
+    self.streams_initialized = True
+
+  def reset_stream(self, stream_index):
+    """Load the next get_streams() result into slot stream_index and rewind it."""
+    fresh = self.get_streams()
+    names = sorted(fresh)
+    if self.substream_names is None:
+      assert len(names) > 0
+      self.substream_names = names  # the first call fixes the expected key set
+    assert self.substream_names == names
+    if self.streams[stream_index] is None: self.streams[stream_index] = {}
+    expected_length = len(fresh[names[0]])
+    for name in fresh:
+      assert expected_length == len(fresh[name])  # all substreams must be equally long
+      self.streams[stream_index][name] = fresh[name]
+    self.stream_indices[stream_index] = 0
+
+  # Pad with zeroes by default -- override this to pad with something else
+  # for a particular stream
+  def get_pad_value(self, stream_name):
+    return 0
+
+  def get_next_batch(self, truncate_at_exhaustion=True):
+    """Fill one (batch_stream_length x batch_num_streams) batch from the streams.
+    Returns (dict of arrays keyed by substream name, 0/1 continuation indicators)."""
+    if not self.streams_initialized:
+      self.init_streams()
+    batch = {}
+    batch_indicators = np.zeros((self.batch_stream_length, self.batch_num_streams))
+    for name in self.substream_names:
+      batch[name] = self.get_pad_value(name) * np.ones_like(batch_indicators)  # pre-fill with pad value
+    exhausted = [False] * self.batch_num_streams
+    all_exhausted = reached_exhaustion = False
+    num_completed_streams = 0
+    for t in range(self.batch_stream_length):
+      all_exhausted = True
+      for i in range(self.batch_num_streams):
+        if not exhausted[i]:
+          if self.streams[i] is None or \
+              self.stream_indices[i] == len(self.streams[i][self.substream_names[0]]):
+            self.stream_indices[i] = 0
+            reached_exhaustion = reached_exhaustion or self.streams_exhausted()
+            if reached_exhaustion: exhausted[i] = True
+            if not reached_exhaustion or not truncate_at_exhaustion:
+              self.reset_stream(i)
+            else:
+              continue
+          for name in self.substream_names:
+            batch[name][t, i] = self.streams[i][name][self.stream_indices[i]]
+          batch_indicators[t, i] = 0 if self.stream_indices[i] == 0 else 1  # 0 marks a sequence start
+          self.stream_indices[i] += 1
+          if self.stream_indices[i] == len(self.streams[i][self.substream_names[0]]):
+            num_completed_streams += 1
+        if not exhausted[i]: all_exhausted = False
+      if all_exhausted and truncate_at_exhaustion:
+        print ('Exhausted all data; cutting off batch at timestep %d ' +
+               'with %d streams completed') % (t, num_completed_streams)
+        for name in self.substream_names:
+          batch[name] = batch[name][:t, :]  # keep only the timesteps before t
+        batch_indicators = batch_indicators[:t, :]
+        break
+    return batch, batch_indicators
+
+  def get_streams(self):
+    """Abstract hook: subclasses must return a dict of equal-length iterables."""
+    raise NotImplementedError('get_streams should be overridden to return a dict of equal-length iterables.')
+
+class HDF5SequenceWriter():
+  def __init__(self, sequence_generator, output_dir=None, verbose=False):
+    """Remember the generator and create output_dir, which must not exist yet."""
+    self.generator = sequence_generator
+    assert output_dir is not None  # required
+    self.output_dir = output_dir
+    if os.path.exists(output_dir):
+      raise Exception('Output directory already exists: ' + output_dir)
+    os.makedirs(output_dir)
+    self.verbose, self.filenames = verbose, []  # filenames: batch files written so far
+
+  def write_batch(self, stop_at_exhaustion=False):
+    """Fetch the next batch from the generator and write it to <output_dir>/batch_<N>.h5."""
+    batch_comps, cont_indicators = self.generator.get_next_batch()  # NOTE(review): stop_at_exhaustion is not forwarded
+    batch_index = len(self.filenames)
+    filename = '%s/batch_%d.h5' % (self.output_dir, batch_index)
+    self.filenames.append(filename)
+    with h5py.File(filename, 'w') as h5file:  # closed even if one of the writes below raises
+      dataset = h5file.create_dataset('cont', shape=cont_indicators.shape, dtype=cont_indicators.dtype)
+      dataset[:] = cont_indicators
+      dataset = h5file.create_dataset('buffer_size', shape=(1,), dtype=int)  # np.int was only an alias of int
+      dataset[:] = self.generator.batch_num_streams
+      for key, batch in batch_comps.iteritems():
+        if self.verbose:
+          for s in range(self.generator.batch_num_streams):
+            stream = np.array(self.generator.streams[s][key])
+            print 'batch %d, stream %s, index %d: ' % (batch_index, key, s), stream
+        h5dataset = h5file.create_dataset(key, shape=batch.shape, dtype=batch.dtype)
+        h5dataset[:] = batch
+
+  def write_to_exhaustion(self):
+    while not self.generator.streams_exhausted():  # keep writing batch files until the generator reports exhaustion
+      self.write_batch(stop_at_exhaustion=True)
+
+  def write_filelists(self):
+    """Write <output_dir>/hdf5_chunk_list.txt listing each written batch file, one per line."""
+    assert self.filenames is not None
+    index_path = '%s/hdf5_chunk_list.txt' % self.output_dir
+    with open(index_path, 'w') as index_file:
+      index_file.writelines('%s\n' % batch_path for batch_path in self.filenames)
diff --git a/examples/coco_caption/lrcn.prototxt b/examples/coco_caption/lrcn.prototxt
new file mode 100644
index 00000000000..f0888df23bf
--- /dev/null
+++ b/examples/coco_caption/lrcn.prototxt
@@ -0,0 +1,768 @@
+# The network is used for the image captioning experiments of LRCN [1].
+# Please consider citing LRCN [1] if you use this example in your work.
+#
+# [1] J. Donahue, L. A. Hendricks, S. Guadarrama, M. Rohrbach, S. Venugopalan,
+#     K. Saenko, T. Darrell. "Long-term Recurrent Convolutional Networks for
+#     Visual Recognition and Description." arXiv preprint arXiv:1411.4389 (2014).
+
+name: "lrcn_caffenet_to_lstm"
+layer {
+  name: "data"
+  type: "ImageData"
+  top: "data"
+  top: "label"
+  include { phase: TRAIN }
+  transform_param {
+    mirror: true
+    crop_size: 227
+    mean_value: 104
+    mean_value: 117
+    mean_value: 123
+  }
+  image_data_param {
+    source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt"
+    batch_size: 100
+    new_height: 256
+    new_width: 256
+  }
+}
+layer {
+  name: "data"
+  type: "HDF5Data"
+  top: "cont_sentence"
+  top: "input_sentence"
+  top: "target_sentence"
+  include { phase: TRAIN }
+  hdf5_data_param {
+    source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt"
+    batch_size: 20
+  }
+}
+layer {
+  name: "data"
+  type: "ImageData"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+    stage: "test-on-train"
+  }
+  transform_param {
+    mirror: true
+    crop_size: 227
+    mean_value: 104
+    mean_value: 117
+    mean_value: 123
+  }
+  image_data_param {
+    source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt"
+    batch_size: 100
+    new_height: 256
+    new_width: 256
+  }
+}
+layer {
+  name: "data"
+  type: "HDF5Data"
+  top: "cont_sentence"
+  top: "input_sentence"
+  top: "target_sentence"
+  include {
+    phase: TEST
+    stage: "test-on-train"
+  }
+  hdf5_data_param {
+    source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt"
+    batch_size: 20
+  }
+}
+layer {
+  name: "data"
+  type: "ImageData"
+  top: "data"
+  top: "label"
+  include {
+    phase: TEST
+    stage: "test-on-val"
+  }
+  transform_param {
+    mirror: true
+    crop_size: 227
+    mean_value: 104
+    mean_value: 117
+    mean_value: 123
+  }
+  image_data_param {
+    source: "./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/image_list.with_dummy_labels.txt"
+    batch_size: 100
+    new_height: 256
+    new_width: 256
+  }
+}
+layer {
+  name: "data"
+  type: "HDF5Data"
+  top: "cont_sentence"
+  top: "input_sentence"
+  top: "target_sentence"
+  include {
+    phase: TEST
+    stage: "test-on-val"
+  }
+  hdf5_data_param {
+    source: "./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/hdf5_chunk_list.txt"
+    batch_size: 20
+  }
+}
+layer {
+  name: "silence"
+  type: "Silence"
+  bottom: "label"
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  include { stage: "freeze-convnet" }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+  }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  param {
+    lr_mult: 0.1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 0.2
+    decay_mult: 0
+  }
+  exclude { stage: "freeze-convnet" }
+  convolution_param {
+    num_output: 96
+    kernel_size: 11
+    stride: 4
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "norm1"
+  type: "LRN"
+  bottom: "pool1"
+  top: "norm1"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "norm1"
+  top: "conv2"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  include { stage: "freeze-convnet" }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+  }
+}
+layer {
+  name: "conv2"
+  type: "Convolution"
+  bottom: "norm1"
+  top: "conv2"
+  param {
+    lr_mult: 0.1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 0.2
+    decay_mult: 0
+  }
+  exclude { stage: "freeze-convnet" }
+  convolution_param {
+    num_output: 256
+    pad: 2
+    kernel_size: 5
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 1
+    }
+  }
+}
+layer {
+  name: "relu2"
+  type: "ReLU"
+  bottom: "conv2"
+  top: "conv2"
+}
+layer {
+  name: "pool2"
+  type: "Pooling"
+  bottom: "conv2"
+  top: "pool2"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "norm2"
+  type: "LRN"
+  bottom: "pool2"
+  top: "norm2"
+  lrn_param {
+    local_size: 5
+    alpha: 0.0001
+    beta: 0.75
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "norm2"
+  top: "conv3"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  include { stage: "freeze-convnet" }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "conv3"
+  type: "Convolution"
+  bottom: "norm2"
+  top: "conv3"
+  param {
+    lr_mult: 0.1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 0.2
+    decay_mult: 0
+  }
+  exclude { stage: "freeze-convnet" }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "relu3"
+  type: "ReLU"
+  bottom: "conv3"
+  top: "conv3"
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  include { stage: "freeze-convnet" }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+  }
+}
+layer {
+  name: "conv4"
+  type: "Convolution"
+  bottom: "conv3"
+  top: "conv4"
+  param {
+    lr_mult: 0.1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 0.2
+    decay_mult: 0
+  }
+  exclude { stage: "freeze-convnet" }
+  convolution_param {
+    num_output: 384
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 1
+    }
+  }
+}
+layer {
+  name: "relu4"
+  type: "ReLU"
+  bottom: "conv4"
+  top: "conv4"
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  include { stage: "freeze-convnet" }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+  }
+}
+layer {
+  name: "conv5"
+  type: "Convolution"
+  bottom: "conv4"
+  top: "conv5"
+  param {
+    lr_mult: 0.1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 0.2
+    decay_mult: 0
+  }
+  exclude { stage: "freeze-convnet" }
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+    group: 2
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 1
+    }
+  }
+}
+layer {
+  name: "relu5"
+  type: "ReLU"
+  bottom: "conv5"
+  top: "conv5"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "conv5"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  include { stage: "freeze-convnet" }
+  inner_product_param {
+    num_output: 4096
+  }
+}
+layer {
+  name: "fc6"
+  type: "InnerProduct"
+  bottom: "pool5"
+  top: "fc6"
+  param {
+    lr_mult: 0.1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 0.2
+    decay_mult: 0
+  }
+  exclude { stage: "freeze-convnet" }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 1
+    }
+  }
+}
+layer {
+  name: "relu6"
+  type: "ReLU"
+  bottom: "fc6"
+  top: "fc6"
+}
+layer {
+  name: "drop6"
+  type: "Dropout"
+  bottom: "fc6"
+  top: "fc6"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 0
+  }
+  param {
+    lr_mult: 0
+  }
+  include { stage: "freeze-convnet" }
+  inner_product_param {
+    num_output: 4096
+  }
+}
+layer {
+  name: "fc7"
+  type: "InnerProduct"
+  bottom: "fc6"
+  top: "fc7"
+  param {
+    lr_mult: 0.1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 0.2
+    decay_mult: 0
+  }
+  exclude { stage: "freeze-convnet" }
+  inner_product_param {
+    num_output: 4096
+    weight_filler {
+      type: "gaussian"
+      std: 0.005
+    }
+    bias_filler {
+      type: "constant"
+      value: 1
+    }
+  }
+}
+layer {
+  name: "relu7"
+  type: "ReLU"
+  bottom: "fc7"
+  top: "fc7"
+}
+layer {
+  name: "drop7"
+  type: "Dropout"
+  bottom: "fc7"
+  top: "fc7"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layer {
+  name: "fc8"
+  type: "InnerProduct"
+  bottom: "fc7"
+  top: "fc8"
+  param {
+    lr_mult: 0.1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 0.2
+    decay_mult: 0
+  }
+  # exclude { stage: "freeze-convnet" }
+  inner_product_param {
+    num_output: 1000
+    weight_filler {
+      type: "gaussian"
+      std: 0.01
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "embedding"
+  type: "Embed"
+  bottom: "input_sentence"
+  top: "embedded_input_sentence"
+  param {
+    lr_mult: 1
+  }
+  embed_param {
+    bias_term: false
+    input_dim: 8801
+    num_output: 1000
+    weight_filler {
+      type: "uniform"
+      min: -0.08
+      max: 0.08
+    }
+  }
+}
+layer {
+  name: "lstm1"
+  type: "LSTM"
+  bottom: "embedded_input_sentence"
+  bottom: "cont_sentence"
+  bottom: "fc8"
+  top: "lstm1"
+  include { stage: "unfactored" }
+  recurrent_param {
+    num_output: 1000
+    weight_filler {
+      type: "uniform"
+      min: -0.08
+      max: 0.08
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "lstm2"
+  type: "LSTM"
+  bottom: "lstm1"
+  bottom: "cont_sentence"
+  top: "lstm2"
+  include {
+    stage: "unfactored"
+    stage: "2-layer"
+  }
+  recurrent_param {
+    num_output: 1000
+    weight_filler {
+      type: "uniform"
+      min: -0.08
+      max: 0.08
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "lstm1"
+  type: "LSTM"
+  bottom: "embedded_input_sentence"
+  bottom: "cont_sentence"
+  top: "lstm1"
+  include { stage: "factored" }
+  recurrent_param {
+    num_output: 1000
+    weight_filler {
+      type: "uniform"
+      min: -0.08
+      max: 0.08
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "lstm2"
+  type: "LSTM"
+  bottom: "lstm1"
+  bottom: "cont_sentence"
+  bottom: "fc8"
+  top: "lstm2"
+  include { stage: "factored" }
+  recurrent_param {
+    num_output: 1000
+    weight_filler {
+      type: "uniform"
+      min: -0.08
+      max: 0.08
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "predict"
+  type: "InnerProduct"
+  bottom: "lstm1"
+  top: "predict"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  exclude { stage: "2-layer" }
+  inner_product_param {
+    num_output: 8801
+    weight_filler {
+      type: "uniform"
+      min: -0.08
+      max: 0.08
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    axis: 2
+  }
+}
+layer {
+  name: "predict"
+  type: "InnerProduct"
+  bottom: "lstm2"
+  top: "predict"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  include { stage: "2-layer" }
+  inner_product_param {
+    num_output: 8801
+    weight_filler {
+      type: "uniform"
+      min: -0.08
+      max: 0.08
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    axis: 2
+  }
+}
+layer {
+  name: "cross_entropy_loss"
+  type: "SoftmaxWithLoss"
+  bottom: "predict"
+  bottom: "target_sentence"
+  top: "cross_entropy_loss"
+  loss_weight: 20
+  loss_param {
+    ignore_label: -1
+  }
+  softmax_param {
+    axis: 2
+  }
+}
+layer {
+  name: "accuracy"
+  type: "Accuracy"
+  bottom: "predict"
+  bottom: "target_sentence"
+  top: "accuracy"
+  include { phase: TEST }
+  accuracy_param {
+    axis: 2
+    ignore_label: -1
+  }
+}
diff --git a/examples/coco_caption/lrcn_solver.prototxt b/examples/coco_caption/lrcn_solver.prototxt
new file mode 100644
index 00000000000..65ca272b30c
--- /dev/null
+++ b/examples/coco_caption/lrcn_solver.prototxt
@@ -0,0 +1,30 @@
+net: "./examples/coco_caption/lrcn.prototxt"
+
+# lrcn.prototxt supports three variants of the LRCN architecture:
+# (1) stage: 'factored' stage: '2-layer'
+# (2) stage: 'unfactored' stage: '1-layer'
+# (3) stage: 'unfactored' stage: '2-layer'
+# This solver uses variant (1).
+# To use a different variant, modify the states (train_state, test_state)
+# below as appropriate:
+
+train_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' }
+test_iter: 25
+test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-train' }
+test_iter: 25
+test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-val' }
+test_interval: 1000
+base_lr: 0.01
+lr_policy: "step"
+gamma: 0.5
+stepsize: 20000
+display: 1
+max_iter: 110000
+momentum: 0.9
+weight_decay: 0.0000
+snapshot: 5000
+snapshot_prefix: "./examples/coco_caption/lrcn"
+solver_mode: GPU
+random_seed: 1701
+average_loss: 100
+clip_gradients: 10
diff --git a/examples/coco_caption/lstm_language_model.prototxt b/examples/coco_caption/lstm_language_model.prototxt
new file mode 100644
index 00000000000..68fda5464fe
--- /dev/null
+++ b/examples/coco_caption/lstm_language_model.prototxt
@@ -0,0 +1,150 @@
+name: "lstm_language_model"
+layer {
+  name: "data"
+  type: "HDF5Data"
+  top: "cont_sentence"
+  top: "input_sentence"
+  top: "target_sentence"
+  include { phase: TRAIN }
+  hdf5_data_param {
+    source: "./examples/coco_caption/h5_data/buffer_100/train_unaligned_batches/hdf5_chunk_list.txt"
+    batch_size: 20
+  }
+}
+layer {
+  name: "data"
+  type: "HDF5Data"
+  top: "cont_sentence"
+  top: "input_sentence"
+  top: "target_sentence"
+  include {
+    phase: TEST
+    stage: "test-on-train"
+  }
+  hdf5_data_param {
+    source: "./examples/coco_caption/h5_data/buffer_100/train_unaligned_batches/hdf5_chunk_list.txt"
+    batch_size: 20
+  }
+}
+layer {
+  name: "data"
+  type: "HDF5Data"
+  top: "cont_sentence"
+  top: "input_sentence"
+  top: "target_sentence"
+  include {
+    phase: TEST
+    stage: "test-on-val"
+  }
+  hdf5_data_param {
+    source: "./examples/coco_caption/h5_data/buffer_100/val_unaligned_batches/hdf5_chunk_list.txt"
+    batch_size: 20
+  }
+}
+layer {
+  name: "embedding"
+  type: "Embed"
+  bottom: "input_sentence"
+  top: "embedded_input_sentence"
+  param {
+    lr_mult: 1
+  }
+  embed_param {
+    bias_term: false
+    input_dim: 8801  # = vocab_size + 1 (for EOS)
+    num_output: 1000
+    weight_filler {
+      type: "uniform"
+      min: -0.08
+      max: 0.08
+    }
+  }
+}
+layer {
+  name: "embed-drop"
+  type: "Dropout"
+  bottom: "embedded_input_sentence"
+  top: "embedded_input_sentence"
+  dropout_param { dropout_ratio: 0.5 }
+  include { stage: "embed-drop" }
+}
+layer {
+  name: "lstm1"
+  type: "LSTM"
+  bottom: "embedded_input_sentence"
+  bottom: "cont_sentence"
+  top: "lstm1"
+  recurrent_param {
+    num_output: 1000
+    weight_filler {
+      type: "uniform"
+      min: -0.08
+      max: 0.08
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+  }
+}
+layer {
+  name: "lstm-drop"
+  type: "Dropout"
+  bottom: "lstm1"
+  top: "lstm1"
+  dropout_param { dropout_ratio: 0.5 }
+  include { stage: "lstm-drop" }
+}
+layer {
+  name: "predict"
+  type: "InnerProduct"
+  bottom: "lstm1"
+  top: "predict"
+  param {
+    lr_mult: 1
+    decay_mult: 1
+  }
+  param {
+    lr_mult: 2
+    decay_mult: 0
+  }
+  inner_product_param {
+    num_output: 8801  # = vocab_size + 1 (+1 for EOS)
+    weight_filler {
+      type: "uniform"
+      min: -0.08
+      max: 0.08
+    }
+    bias_filler {
+      type: "constant"
+      value: 0
+    }
+    axis: 2
+  }
+}
+layer {
+  name: "cross_entropy_loss"
+  type: "SoftmaxWithLoss"
+  bottom: "predict"
+  bottom: "target_sentence"
+  top: "cross_entropy_loss"
+  loss_weight: 20
+  loss_param {
+    ignore_label: -1
+  }
+  softmax_param {
+    axis: 2
+  }
+}
+layer {
+  name: "accuracy"
+  type: "Accuracy"
+  bottom: "predict"
+  bottom: "target_sentence"
+  top: "accuracy"
+  include { phase: TEST }
+  accuracy_param {
+    axis: 2
+    ignore_label: -1
+  }
+}
diff --git a/examples/coco_caption/lstm_lm_solver.prototxt b/examples/coco_caption/lstm_lm_solver.prototxt
new file mode 100644
index 00000000000..fb36ad15a5b
--- /dev/null
+++ b/examples/coco_caption/lstm_lm_solver.prototxt
@@ -0,0 +1,21 @@
+net: "./examples/coco_caption/lstm_language_model.prototxt"
+train_state: { stage: 'embed-drop' stage: 'lstm-drop' }
+test_iter: 25
+test_state: { stage: 'test-on-train' }
+test_iter: 25
+test_state: { stage: 'test-on-val' }
+test_interval: 1000
+base_lr: 0.1
+lr_policy: "step"
+gamma: 0.5
+stepsize: 20000
+display: 1
+max_iter: 110000
+momentum: 0.9
+weight_decay: 0.0000
+snapshot: 5000
+snapshot_prefix: "./examples/coco_caption/lstm_lm"
+solver_mode: GPU
+random_seed: 1701
+average_loss: 100
+clip_gradients: 10
diff --git a/examples/coco_caption/train_language_model.sh b/examples/coco_caption/train_language_model.sh
new file mode 100755
index 00000000000..6e8a8c47b37
--- /dev/null
+++ b/examples/coco_caption/train_language_model.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+GPU_ID=0
+DATA_DIR=./examples/coco_caption/h5_data/
+if [ ! -d $DATA_DIR ]; then
+    echo "Data directory not found: $DATA_DIR"
+    echo "First, download the COCO dataset (follow instructions in data/coco)"
+    echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data"
+    exit 1
+fi
+
+./build/tools/caffe train \
+    -solver ./examples/coco_caption/lstm_lm_solver.prototxt \
+    -gpu $GPU_ID
diff --git a/examples/coco_caption/train_lrcn.sh b/examples/coco_caption/train_lrcn.sh
new file mode 100755
index 00000000000..5099e762ccd
--- /dev/null
+++ b/examples/coco_caption/train_lrcn.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+GPU_ID=0
+WEIGHTS=\
+./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel
+DATA_DIR=./examples/coco_caption/h5_data/
+if [ ! -d $DATA_DIR ]; then
+    echo "Data directory not found: $DATA_DIR"
+    echo "First, download the COCO dataset (follow instructions in data/coco)"
+    echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data"
+    exit 1
+fi
+
+./build/tools/caffe train \
+    -solver ./examples/coco_caption/lrcn_solver.prototxt \
+    -weights $WEIGHTS \
+    -gpu $GPU_ID
diff --git a/include/caffe/common_layers.hpp b/include/caffe/common_layers.hpp
index b1ac3a93eff..a23c671b7e9 100644
--- a/include/caffe/common_layers.hpp
+++ b/include/caffe/common_layers.hpp
@@ -85,7 +85,7 @@ class ConcatLayer : public Layer<Dtype> {
       const vector<Blob<Dtype>*>& top);
 
   virtual inline const char* type() const { return "Concat"; }
-  virtual inline int MinBottomBlobs() const { return 2; }
+  virtual inline int MinBottomBlobs() const { return 1; }
   virtual inline int ExactNumTopBlobs() const { return 1; }
 
  protected:
@@ -176,10 +176,49 @@ class EltwiseLayer : public Layer<Dtype> {
   EltwiseParameter_EltwiseOp op_;
   vector<Dtype> coeffs_;
   Blob<int> max_idx_;
+  bool coeff_blob_;
 
   bool stable_prod_grad_;
 };
 
+/**
+ * @brief A layer for learning "embeddings" of one-hot vector input.
+ *        Equivalent to an InnerProductLayer with one-hot vectors as input, but
+ *        for efficiency the input is the "hot" index of each column itself.
+ *
+ * TODO(dox): thorough documentation for Forward, Backward, and proto params.
+ */
+template <typename Dtype>
+class EmbedLayer : public Layer<Dtype> {
+ public:
+  explicit EmbedLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Embed"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  int M_;
+  int K_;
+  int N_;
+  bool bias_term_;
+  Blob<Dtype> bias_multiplier_;
+};
+
 /**
  * @brief Reshapes the input Blob into flat vectors.
  *
@@ -212,7 +251,9 @@ class FlattenLayer : public Layer<Dtype> {
    *      the outputs -- i.e., the (virtually) copied, flattened inputs
    */
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top);
+      const vector<Blob<Dtype>*>& top) {}
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
 
   /**
    * @brief Computes the error gradient w.r.t. the concatenate inputs.
@@ -224,7 +265,9 @@ class FlattenLayer : public Layer<Dtype> {
    *        gradient is (virtually) copied
    */
   virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
 };
 
 /**
@@ -297,6 +340,53 @@ class MVNLayer : public Layer<Dtype> {
   Blob<Dtype> sum_multiplier_;
 };
 
+/**
+ * @brief Reshapes an input Blob.
+ */
+template <typename Dtype>
+class ReshapeLayer : public Layer<Dtype> {
+ public:
+  explicit ReshapeLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Reshape"; }
+  virtual inline int ExactNumBottomBlobs() const { return 1; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (D_1 \times D_2 \times ... \times D_m) @f$
+   *      the inputs
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (d_1 \times d_2 \times ... \times d_n) @f$,
+   *      the outputs -- i.e., the (virtually) copied inputs.
+   *      The shape is specified by <code>reshape_param.shape()</code>, and the
+   *      product of the dimensions in the new shape must match that of the
+   *      input shape; i.e., @f$ d_1 d_2 ... d_n = D_1 D_2 ... D_m @f$.
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {}
+
+  /**
+   * @brief Computes the error gradient w.r.t. the reshaped input.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *        respect to the outputs
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1), into which the top error
+   *        gradient is (virtually) copied
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {}
+};
+
 /**
  * @brief Ignores bottom blobs while producing no top blobs. (This is useful
  *        to suppress outputs during testing.)
@@ -440,7 +530,7 @@ class SliceLayer : public Layer<Dtype> {
 
   virtual inline const char* type() const { return "Slice"; }
   virtual inline int ExactNumBottomBlobs() const { return 1; }
-  virtual inline int MinTopBlobs() const { return 2; }
+  virtual inline int MinTopBlobs() const { return 1; }
 
  protected:
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
diff --git a/include/caffe/loss_layers.hpp b/include/caffe/loss_layers.hpp
index d3eecd2e510..bb78001b305 100644
--- a/include/caffe/loss_layers.hpp
+++ b/include/caffe/loss_layers.hpp
@@ -86,6 +86,8 @@ class AccuracyLayer : public Layer<Dtype> {
   bool has_ignore_label_;
   /// The label indicating that an instance should be ignored.
   int ignore_label_;
+
+  Dtype denominator_;
 };
 
 /**
diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
index 075afebc9b0..10fc1705192 100644
--- a/include/caffe/net.hpp
+++ b/include/caffe/net.hpp
@@ -84,6 +84,13 @@ class Net {
 
   /// @brief Updates the network weights based on the diff values computed.
   void Update();
+  /**
+   * @brief Shares weight data of owner blobs with shared blobs.
+   *
+   * Note: this is called by Net::Init, and thus should normally not be
+   * called manually.
+   */
+  void ShareWeightData();
 
   /**
    * @brief For an already initialized net, implicitly copies (i.e., using no
@@ -150,6 +157,9 @@ class Net {
     return param_names_index_;
   }
   inline const vector<int>& param_owners() const { return param_owners_; }
+  inline const vector<string>& param_display_names() const {
+    return param_display_names_;
+  }
   /// @brief Input and output blob numbers
   inline int num_inputs() const { return net_input_blobs_.size(); }
   inline int num_outputs() const { return net_output_blobs_.size(); }
diff --git a/include/caffe/sequence_layers.hpp b/include/caffe/sequence_layers.hpp
new file mode 100644
index 00000000000..8ac735435a4
--- /dev/null
+++ b/include/caffe/sequence_layers.hpp
@@ -0,0 +1,312 @@
+#ifndef CAFFE_SEQUENCE_LAYERS_HPP_
+#define CAFFE_SEQUENCE_LAYERS_HPP_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/net.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+namespace caffe {
+
+template <typename Dtype> class RecurrentLayer;
+
+/**
+ * @brief An abstract class for implementing recurrent behavior inside of an
+ *        unrolled network.  This Layer type cannot be instantiated -- instead,
+ *        you should use one of its implementations which defines the recurrent
+ *        architecture, such as RNNLayer or LSTMLayer.
+ */
+template <typename Dtype>
+class RecurrentLayer : public Layer<Dtype> {
+ public:
+  explicit RecurrentLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reset();
+
+  virtual inline const char* type() const { return "Recurrent"; }
+  virtual inline int MinBottomBlobs() const { return 2; }
+  virtual inline int MaxBottomBlobs() const { return 3; }
+  virtual inline int ExactNumTopBlobs() const { return 1; }
+
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    // Can't propagate to sequence continuation indicators.
+    return bottom_index != 1;
+  }
+
+ protected:
+  /**
+   * @brief Fills net_param with the recurrent network architecture.  Subclasses
+   *        should define this -- see RNNLayer and LSTMLayer for examples.
+   */
+  virtual void FillUnrolledNet(NetParameter* net_param) const = 0;
+
+  /**
+   * @brief Fills names with the names of the 0th timestep recurrent input
+   *        Blob&s.  Subclasses should define this -- see RNNLayer and LSTMLayer
+   *        for examples.
+   */
+  virtual void RecurrentInputBlobNames(vector<string>* names) const = 0;
+
+  /**
+   * @brief Fills names with the names of the Tth timestep recurrent output
+   *        Blob&s.  Subclasses should define this -- see RNNLayer and LSTMLayer
+   *        for examples.
+   */
+  virtual void RecurrentOutputBlobNames(vector<string>* names) const = 0;
+
+  /**
+   * @brief Fills names with the names of the output blobs, concatenated across
+   *        all timesteps.  Should return a name for each top Blob.
+   *        Subclasses should define this -- see RNNLayer and LSTMLayer for
+   *        examples.
+   */
+  virtual void OutputBlobNames(vector<string>* names) const = 0;
+
+  /**
+   * @param bottom input Blob vector (length 2-3)
+   *
+   *   -# @f$ (T \times N \times ...) @f$
+   *      the time-varying input @f$ x @f$.  After the first two axes, whose
+   *      dimensions must correspond to the number of timesteps @f$ T @f$ and
+   *      the number of independent streams @f$ N @f$, respectively, its
+   *      dimensions may be arbitrary.  Note that the ordering of dimensions --
+   *      @f$ (T \times N \times ...) @f$, rather than
+   *      @f$ (N \times T \times ...) @f$ -- means that the @f$ N @f$
+   *      independent input streams must be "interleaved".
+   *
+   *   -# @f$ (T \times N) @f$
+   *      the sequence continuation indicators @f$ \delta @f$.
+   *      These inputs should be binary (0 or 1) indicators, where
+   *      @f$ \delta_{t,n} = 0 @f$ means that timestep @f$ t @f$ of stream
+   *      @f$ n @f$ is the beginning of a new sequence, and hence the previous
+   *      hidden state @f$ h_{t-1} @f$ is multiplied by @f$ \delta_t = 0 @f$
+   *      and has no effect on the cell's output at timestep @f$ t @f$, and
+   *      a value of @f$ \delta_{t,n} = 1 @f$ means that timestep @f$ t @f$ of
+   *      stream @f$ n @f$ is a continuation from the previous timestep
+   *      @f$ t-1 @f$, and the previous hidden state @f$ h_{t-1} @f$ affects the
+   *      updated hidden state and output.
+   *
+   *   -# @f$ (N \times ...) @f$ (optional)
+   *      the static (non-time-varying) input @f$ x_{static} @f$.
+   *      After the first axis, whose dimension must be the number of
+   *      independent streams, its dimensions may be arbitrary.
+   *      This is mathematically equivalent to using a time-varying input of
+   *      @f$ x'_t = [x_t; x_{static}] @f$ -- i.e., tiling the static input
+   *      across the @f$ T @f$ timesteps and concatenating with the time-varying
+   *      input.  Note that if this input is used, all timesteps in a single
+   *      batch within a particular one of the @f$ N @f$ streams must share the
+   *      same static input, even if the sequence continuation indicators
+   *      suggest that different sequences are ending and beginning within a
+   *      single batch.  This may require padding and/or truncation for uniform
+   *      length.
+   *
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (T \times N \times D) @f$
+   *      the time-varying output @f$ y @f$, where @f$ D @f$ is
+   *      <code>recurrent_param.num_output()</code>.
+   *      Refer to documentation for particular RecurrentLayer implementations
+   *      (such as RNNLayer and LSTMLayer) for the definition of @f$ y @f$.
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// @brief A helper function, useful for stringifying timestep indices.
+  virtual string int_to_str(const int t) const;
+
+  /// @brief A Net to implement the Recurrent functionality.
+  shared_ptr<Net<Dtype> > unrolled_net_;
+
+  /// @brief The number of independent streams to process simultaneously.
+  int N_;
+
+  /**
+   * @brief The number of timesteps in the layer's input, and the number of
+   *        timesteps over which to backpropagate through time.
+   */
+  int T_;
+
+  /// @brief Whether the layer has a "static" input copied across all timesteps.
+  bool static_input_;
+
+  vector<Blob<Dtype>* > recur_input_blobs_;
+  vector<Blob<Dtype>* > recur_output_blobs_;
+  vector<Blob<Dtype>* > output_blobs_;
+  Blob<Dtype>* x_input_blob_;
+  Blob<Dtype>* x_static_input_blob_;
+  Blob<Dtype>* cont_input_blob_;
+};
+
+/**
+ * @brief Processes sequential inputs using a "Long Short-Term Memory" (LSTM)
+ *        [1] style recurrent neural network (RNN). Implemented as a network
+ *        unrolling the LSTM computation in time.
+ *
+ *
+ * The specific architecture used in this implementation is as described in
+ * "Learning to Execute" [2], reproduced below:
+ *     i_t := \sigmoid[ W_{hi} * h_{t-1} + W_{xi} * x_t + b_i ]
+ *     f_t := \sigmoid[ W_{hf} * h_{t-1} + W_{xf} * x_t + b_f ]
+ *     o_t := \sigmoid[ W_{ho} * h_{t-1} + W_{xo} * x_t + b_o ]
+ *     g_t :=    \tanh[ W_{hg} * h_{t-1} + W_{xg} * x_t + b_g ]
+ *     c_t := (f_t .* c_{t-1}) + (i_t .* g_t)
+ *     h_t := o_t .* \tanh[c_t]
+ * In the implementation, the i, f, o, and g computations are performed as a
+ * single inner product.
+ *
+ * Notably, this implementation lacks the "diagonal" gates, as used in the
+ * LSTM architectures described by Alex Graves [3] and others.
+ *
+ * [1] Hochreiter, Sepp, and Schmidhuber, Jürgen. "Long short-term memory."
+ *     Neural Computation 9, no. 8 (1997): 1735-1780.
+ *
+ * [2] Zaremba, Wojciech, and Sutskever, Ilya. "Learning to execute."
+ *     arXiv preprint arXiv:1410.4615 (2014).
+ *
+ * [3] Graves, Alex. "Generating sequences with recurrent neural networks."
+ *     arXiv preprint arXiv:1308.0850 (2013).
+ */
+template <typename Dtype>
+class LSTMLayer : public RecurrentLayer<Dtype> {
+ public:
+  explicit LSTMLayer(const LayerParameter& param)
+      : RecurrentLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "LSTM"; }
+
+ protected:
+  virtual void FillUnrolledNet(NetParameter* net_param) const;
+  virtual void RecurrentInputBlobNames(vector<string>* names) const;
+  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
+  virtual void OutputBlobNames(vector<string>* names) const;
+};
+
+/**
+ * @brief A helper for LSTMLayer: computes a single timestep of the
+ *        non-linearity of the LSTM, producing the updated cell and hidden
+ *        states.
+ */
+template <typename Dtype>
+class LSTMUnitLayer : public Layer<Dtype> {
+ public:
+  explicit LSTMUnitLayer(const LayerParameter& param)
+      : Layer<Dtype>(param) {}
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "LSTMUnit"; }
+  virtual inline int ExactNumBottomBlobs() const { return 3; }
+  virtual inline int ExactNumTopBlobs() const { return 2; }
+
+  virtual inline bool AllowForceBackward(const int bottom_index) const {
+    // Can't propagate to sequence continuation indicators.
+    return bottom_index != 2;
+  }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 3)
+   *   -# @f$ (1 \times N \times D) @f$
+   *      the previous timestep cell state @f$ c_{t-1} @f$
+   *   -# @f$ (1 \times N \times 4D) @f$
+   *      the "gate inputs" @f$ [i_t', f_t', o_t', g_t'] @f$
+   *   -# @f$ (1 \times 1 \times N) @f$
+   *      the sequence continuation indicators  @f$ \delta_t @f$
+   * @param top output Blob vector (length 2)
+   *   -# @f$ (1 \times N \times D) @f$
+   *      the updated cell state @f$ c_t @f$, computed as:
+   *          i_t := \sigmoid[i_t']
+   *          f_t := \sigmoid[f_t']
+   *          o_t := \sigmoid[o_t']
+   *          g_t := \tanh[g_t']
+   *          c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t)
+   *   -# @f$ (1 \times N \times D) @f$
+   *      the updated hidden state @f$ h_t @f$, computed as:
+   *          h_t := o_t .* \tanh[c_t]
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the LSTMUnit inputs.
+   *
+   * @param top output Blob vector (length 2), providing the error gradient with
+   *        respect to the outputs
+   *   -# @f$ (1 \times N \times D) @f$:
+   *      containing error gradients @f$ \frac{\partial E}{\partial c_t} @f$
+   *      with respect to the updated cell state @f$ c_t @f$
+   *   -# @f$ (1 \times N \times D) @f$:
+   *      containing error gradients @f$ \frac{\partial E}{\partial h_t} @f$
+   *      with respect to the updated hidden state @f$ h_t @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 3), into which the error gradients
+   *        with respect to the LSTMUnit inputs @f$ c_{t-1} @f$ and the gate
+   *        inputs are computed.  Computation of the error gradients w.r.t.
+   *        the sequence indicators is not implemented.
+   *   -# @f$ (1 \times N \times D) @f$
+   *      the error gradient w.r.t. the previous timestep cell state
+   *      @f$ c_{t-1} @f$
+   *   -# @f$ (1 \times N \times 4D) @f$
+   *      the error gradient w.r.t. the "gate inputs"
+   *      @f$ [
+   *          \frac{\partial E}{\partial i_t}
+   *          \frac{\partial E}{\partial f_t}
+   *          \frac{\partial E}{\partial o_t}
+   *          \frac{\partial E}{\partial g_t}
+   *          ] @f$
+   *   -# @f$ (1 \times 1 \times N) @f$
+   *      the gradient w.r.t. the sequence continuation indicators
+   *      @f$ \delta_t @f$ is currently not computed.
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// @brief The hidden and output dimension.
+  int hidden_dim_;
+  Blob<Dtype> X_acts_;
+};
+
+/**
+ * @brief Processes time-varying inputs using a simple recurrent neural network
+ *        (RNN). Implemented as a network unrolling the RNN computation in time.
+ *
+ * Given time-varying inputs @f$ x_t @f$, computes hidden state @f$
+ *     h_t := \tanh[ W_{hh} h_{t-1} + W_{xh} x_t + b_h ]
+ * @f$, and outputs @f$
+ *     o_t := \tanh[ W_{ho} h_t + b_o ]
+ * @f$.
+ */
+template <typename Dtype>
+class RNNLayer : public RecurrentLayer<Dtype> {
+ public:
+  explicit RNNLayer(const LayerParameter& param)
+      : RecurrentLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "RNN"; }
+
+ protected:
+  virtual void FillUnrolledNet(NetParameter* net_param) const;
+  virtual void RecurrentInputBlobNames(vector<string>* names) const;
+  virtual void RecurrentOutputBlobNames(vector<string>* names) const;
+  virtual void OutputBlobNames(vector<string>* names) const;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_SEQUENCE_LAYERS_HPP_
diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp
index 22937711b58..25f35d1589e 100644
--- a/include/caffe/test/test_gradient_check_util.hpp
+++ b/include/caffe/test/test_gradient_check_util.hpp
@@ -45,6 +45,10 @@ class GradientChecker {
   void CheckGradientEltwise(Layer<Dtype>* layer,
       const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top);
 
+  // Checks the gradient of a single output with respect to particular input
+  // blob(s).  If check_bottom = i >= 0, check only the ith bottom Blob.
+  // If check_bottom == -1, check everything -- all bottom Blobs and all
+  // param Blobs.  Otherwise (if check_bottom < -1), check only param Blobs.
   void CheckGradientSingle(Layer<Dtype>* layer,
       const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
       int check_bottom, int top_id, int top_data_id, bool element_wise = false);
@@ -80,21 +84,25 @@ void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>* layer,
       CHECK_EQ(top_count, bottom[blob_id]->count());
     }
   }
-  // First, figure out what blobs we need to check against.
+  // First, figure out what blobs we need to check against, and zero init
+  // parameter blobs.
   vector<Blob<Dtype>*> blobs_to_check;
-  vector<bool> propagate_down(bottom.size(), check_bottom < 0);
+  vector<bool> propagate_down(bottom.size(), check_bottom == -1);
   for (int i = 0; i < layer->blobs().size(); ++i) {
-    blobs_to_check.push_back(layer->blobs()[i].get());
+    Blob<Dtype>* blob = layer->blobs()[i].get();
+    caffe_set(blob->count(), static_cast<Dtype>(0), blob->mutable_cpu_diff());
+    blobs_to_check.push_back(blob);
   }
-  if (check_bottom < 0) {
+  if (check_bottom == -1) {
     for (int i = 0; i < bottom.size(); ++i) {
       blobs_to_check.push_back(bottom[i]);
     }
-  } else {
+  } else if (check_bottom >= 0) {
     CHECK_LT(check_bottom, bottom.size());
     blobs_to_check.push_back(bottom[check_bottom]);
     propagate_down[check_bottom] = true;
   }
+  CHECK_GT(blobs_to_check.size(), 0) << "No blobs to check.";
   // Compute the gradient analytically using Backward
   Caffe::set_random_seed(seed_);
   // Ignore the loss from the layer (it's just the weighted sum of the losses
diff --git a/include/caffe/util/gpu_util.cuh b/include/caffe/util/gpu_util.cuh
new file mode 100644
index 00000000000..994202f2a1a
--- /dev/null
+++ b/include/caffe/util/gpu_util.cuh
@@ -0,0 +1,35 @@
+#ifndef CAFFE_UTIL_GPU_UTIL_H_
+#define CAFFE_UTIL_GPU_UTIL_H_
+
+namespace caffe {
+
+template <typename Dtype>
+inline __device__ Dtype caffe_gpu_atomic_add(const Dtype val, Dtype* address);
+
+template <>
+inline __device__
+float caffe_gpu_atomic_add(const float val, float* address) {
+  return atomicAdd(address, val);
+}
+
+// double atomicAdd implementation taken from:
+// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#axzz3PVCpVsEG
+template <>
+inline __device__
+double caffe_gpu_atomic_add(const double val, double* address) {
+  unsigned long long int* address_as_ull =  // NOLINT(runtime/int)
+      // NOLINT_NEXT_LINE(runtime/int)
+      reinterpret_cast<unsigned long long int*>(address);
+  unsigned long long int old = *address_as_ull;  // NOLINT(runtime/int)
+  unsigned long long int assumed;  // NOLINT(runtime/int)
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+        __double_as_longlong(val + __longlong_as_double(assumed)));
+  } while (assumed != old);
+  return __longlong_as_double(old);
+}
+
+}  // namespace caffe
+
+#endif  // CAFFE_UTIL_GPU_UTIL_H_
diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp
index 90aad675ed3..73ba83fc9e9 100644
--- a/src/caffe/layers/accuracy_layer.cpp
+++ b/src/caffe/layers/accuracy_layer.cpp
@@ -20,6 +20,10 @@ void AccuracyLayer<Dtype>::LayerSetUp(
   if (has_ignore_label_) {
     ignore_label_ = this->layer_param_.accuracy_param().ignore_label();
   }
+
+  denominator_ = this->layer_param_.accuracy_param().denominator();
+  CHECK_GE(denominator_, 0)
+      << "Denominator must be positive; or 0, for the batch size.";
 }
 
 template <typename Dtype>
@@ -81,7 +85,8 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   }
 
   // LOG(INFO) << "Accuracy: " << accuracy;
-  top[0]->mutable_cpu_data()[0] = accuracy / count;
+  const Dtype denominator = (denominator_ == 0) ? count : denominator_;
+  top[0]->mutable_cpu_data()[0] = accuracy / denominator;
   // Accuracy layer should not be used as a loss function.
 }
 
diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index c0c9f6f3371..928ef5ee468 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -39,13 +39,6 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->cpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
-  if (this->param_propagate_down_[0]) {
-    caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
-  }
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    caffe_set(this->blobs_[1]->count(), Dtype(0),
-        this->blobs_[1]->mutable_cpu_diff());
-  }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->cpu_diff();
     const Dtype* bottom_data = bottom[i]->cpu_data();
diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu
index 3902fdf3930..b8a98ff7cc9 100644
--- a/src/caffe/layers/conv_layer.cu
+++ b/src/caffe/layers/conv_layer.cu
@@ -31,13 +31,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->gpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  if (this->param_propagate_down_[0]) {
-    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
-  }
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0),
-        this->blobs_[1]->mutable_gpu_diff());
-  }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->gpu_diff();
     // Bias gradient, if necessary.
diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu
index 071014e1b48..b5bfdb098e0 100644
--- a/src/caffe/layers/cudnn_conv_layer.cu
+++ b/src/caffe/layers/cudnn_conv_layer.cu
@@ -54,12 +54,10 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   if (this->param_propagate_down_[0]) {
     weight = this->blobs_[0]->gpu_data();
     weight_diff = this->blobs_[0]->mutable_gpu_diff();
-    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
   }
   Dtype* bias_diff = NULL;
   if (this->bias_term_ && this->param_propagate_down_[1]) {
     bias_diff = this->blobs_[1]->mutable_gpu_diff();
-    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
   }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->gpu_diff();
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index e6d65ab526b..a4612963b6b 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -39,13 +39,6 @@ void DeconvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->cpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
-  if (this->param_propagate_down_[0]) {
-    caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
-  }
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    caffe_set(this->blobs_[1]->count(), Dtype(0),
-        this->blobs_[1]->mutable_cpu_diff());
-  }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->cpu_diff();
     const Dtype* bottom_data = bottom[i]->cpu_data();
diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/deconv_layer.cu
index 9198dd64c72..39bc4de8c66 100644
--- a/src/caffe/layers/deconv_layer.cu
+++ b/src/caffe/layers/deconv_layer.cu
@@ -31,13 +31,6 @@ void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->gpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  if (this->param_propagate_down_[0]) {
-    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
-  }
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0),
-        this->blobs_[1]->mutable_gpu_diff());
-  }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->gpu_diff();
     const Dtype* bottom_data = bottom[i]->gpu_data();
diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp
index a80700736bd..55d2d955c7c 100644
--- a/src/caffe/layers/eltwise_layer.cpp
+++ b/src/caffe/layers/eltwise_layer.cpp
@@ -10,18 +10,23 @@ namespace caffe {
 template <typename Dtype>
 void EltwiseLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-  CHECK(this->layer_param().eltwise_param().coeff_size() == 0
-      || this->layer_param().eltwise_param().coeff_size() == bottom.size()) <<
+  op_ = this->layer_param_.eltwise_param().operation();
+  coeff_blob_ = this->layer_param().eltwise_param().coeff_blob();
+  if (coeff_blob_) {
+    CHECK_EQ(op_, EltwiseParameter_EltwiseOp_SUM)
+        << "coeff_blob option only implemented for the SUM operation";
+  }
+  const int coeff_size = this->layer_param().eltwise_param().coeff_size();
+  CHECK(coeff_size == 0 || (!coeff_blob_ && coeff_size == bottom.size())
+                        || (coeff_blob_ && coeff_size == bottom.size() - 1)) <<
       "Eltwise Layer takes one coefficient per bottom blob.";
-  CHECK(!(this->layer_param().eltwise_param().operation()
-      == EltwiseParameter_EltwiseOp_PROD
-      && this->layer_param().eltwise_param().coeff_size())) <<
+  CHECK(op_ == EltwiseParameter_EltwiseOp_SUM
+      || this->layer_param().eltwise_param().coeff_size() == 0) <<
       "Eltwise layer only takes coefficients for summation.";
-  op_ = this->layer_param_.eltwise_param().operation();
   // Blob-wise coefficients for the elementwise operation.
-  coeffs_ = vector<Dtype>(bottom.size(), 1);
-  if (this->layer_param().eltwise_param().coeff_size()) {
-    for (int i = 0; i < bottom.size(); ++i) {
+  coeffs_.resize(bottom.size(), 1);
+  if (coeff_size) {
+    for (int i = 0; i < bottom.size() - coeff_blob_; ++i) {
       coeffs_[i] = this->layer_param().eltwise_param().coeff(i);
     }
   }
@@ -32,7 +37,19 @@ template <typename Dtype>
 void EltwiseLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   for (int i = 1; i < bottom.size(); ++i) {
-    CHECK(bottom[i]->shape() == bottom[0]->shape());
+    if (coeff_blob_ && i == bottom.size() - 1) {
+      CHECK_EQ(i, bottom[i]->shape(0))
+          << "Dimension of coeff blob axis 0 must equal the number of bottom "
+          << "blobs (not including the coeff blob itself).";
+      for (int input_axis = 0, coeff_axis = 1;
+           coeff_axis < bottom[i]->num_axes(); ++input_axis, ++coeff_axis) {
+        CHECK_EQ(bottom[0]->shape(input_axis), bottom[i]->shape(coeff_axis))
+            << "Each axis i >= 1 of the coeff blob must match the (i-1)th "
+            << "axis of the input.";
+      }
+    } else {
+      CHECK(bottom[i]->shape() == bottom[0]->shape());
+    }
   }
   top[0]->ReshapeLike(*bottom[0]);
   // If max operation, we will initialize the vector index part.
@@ -60,8 +77,21 @@ void EltwiseLayer<Dtype>::Forward_cpu(
   case EltwiseParameter_EltwiseOp_SUM:
     caffe_set(count, Dtype(0), top_data);
     // TODO(shelhamer) does BLAS optimize to sum for coeff = 1?
-    for (int i = 0; i < bottom.size(); ++i) {
-      caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data);
+    for (int i = 0; i < bottom.size() - coeff_blob_; ++i) {
+      if (coeff_blob_) {
+        const int num = bottom[bottom.size() - 1]->count() /
+                        (bottom.size() - 1);
+        const int dim = bottom[i]->count() / num;
+        const Dtype* bottom_data = bottom[i]->cpu_data();
+        const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data();
+        for (int j = 0; j < num; ++j, bottom_data += dim, top_data += dim) {
+          const Dtype coeff = coeffs_[i] * coeff_data[i * num + j];
+          caffe_axpy(dim, coeff, bottom_data, top_data);
+        }
+        top_data = top[0]->mutable_cpu_data();
+      } else {
+        caffe_axpy(count, coeffs_[i], bottom[i]->cpu_data(), top_data);
+      }
     }
     break;
   case EltwiseParameter_EltwiseOp_MAX:
@@ -104,7 +134,7 @@ void EltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   const int count = top[0]->count();
   const Dtype* top_data = top[0]->cpu_data();
   const Dtype* top_diff = top[0]->cpu_diff();
-  for (int i = 0; i < bottom.size(); ++i) {
+  for (int i = 0; i < bottom.size() - coeff_blob_; ++i) {
     if (propagate_down[i]) {
       const Dtype* bottom_data = bottom[i]->cpu_data();
       Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
@@ -128,7 +158,16 @@ void EltwiseLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
         caffe_mul(count, bottom_diff, top_diff, bottom_diff);
         break;
       case EltwiseParameter_EltwiseOp_SUM:
-        if (coeffs_[i] == Dtype(1)) {
+        if (coeff_blob_) {
+          const int num = bottom[bottom.size() - 1]->count() /
+                          (bottom.size() - 1);
+          const int dim = bottom[i]->count() / num;
+          const Dtype* coeff_data = bottom[bottom.size() - 1]->cpu_data();
+          for (int j = 0; j < num; ++j, bottom_diff += dim, top_diff += dim) {
+            const Dtype coeff = coeffs_[i] * coeff_data[i * num + j];
+            caffe_cpu_scale(dim, coeff, top_diff, bottom_diff);
+          }
+        } else if (coeffs_[i] == Dtype(1.)) {
           caffe_copy(count, top_diff, bottom_diff);
         } else {
           caffe_cpu_scale(count, coeffs_[i], top_diff, bottom_diff);
diff --git a/src/caffe/layers/eltwise_layer.cu b/src/caffe/layers/eltwise_layer.cu
index 2247870d97f..97f52079108 100644
--- a/src/caffe/layers/eltwise_layer.cu
+++ b/src/caffe/layers/eltwise_layer.cu
@@ -31,12 +31,33 @@ __global__ void MaxForward(const int nthreads, const Dtype* bottom_data_a,
   }
 }
 
+// Writes in[index] * coeff (times coeff_data[num_offset + index / dim] when
+// coeff_data is set) into out, overwriting or accumulating per `overwrite`.
+template <typename Dtype>
+__global__ void CoeffSum(const int count, const int dim,
+    const int num_offset, const Dtype coeff, const Dtype* coeff_data,
+    const bool backward, const Dtype* in, Dtype* out) {
+  const bool overwrite = (num_offset == 0) || backward;
+  CUDA_KERNEL_LOOP(index, count) {
+    Dtype scale = coeff;
+    if (coeff_data) {
+      scale *= coeff_data[num_offset + index / dim];
+    }
+    const Dtype scaled = in[index] * scale;
+    out[index] = overwrite ? scaled : out[index] + scaled;
+  }
+}
+
 template <typename Dtype>
 void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
   int* mask = NULL;
   const int count = top[0]->count();
+  const int num = top[0]->num();
+  const int dim = count / num;
   Dtype* top_data = top[0]->mutable_gpu_data();
+  const Dtype* coeff_data = NULL;
+  const bool kBackward = false;
   switch (op_) {
   case EltwiseParameter_EltwiseOp_PROD:
     caffe_gpu_mul(count, bottom[0]->gpu_data(), bottom[1]->gpu_data(),
@@ -46,10 +67,17 @@ void EltwiseLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     }
     break;
   case EltwiseParameter_EltwiseOp_SUM:
-    caffe_gpu_set(count, Dtype(0.), top_data);
     // TODO(shelhamer) does cuBLAS optimize to sum for coeff = 1?
-    for (int i = 0; i < bottom.size(); ++i) {
-      caffe_gpu_axpy(count, coeffs_[i], bottom[i]->gpu_data(), top_data);
+    if (coeff_blob_) {
+      coeff_data = bottom[bottom.size() - 1]->gpu_data();
+    }
+    for (int i = 0; i < bottom.size() - coeff_blob_; ++i) {
+      const Dtype* bottom_data = bottom[i]->gpu_data();
+      CoeffSum<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+          <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+          count, dim, i * num, coeffs_[i], coeff_data,
+          kBackward, bottom_data, top_data);
+      CUDA_POST_KERNEL_CHECK;
     }
     break;
   case EltwiseParameter_EltwiseOp_MAX:
@@ -85,11 +113,18 @@ void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const int* mask = NULL;
   const int count = top[0]->count();
+  const int num = top[0]->num();
+  const int dim = count / num;
   const Dtype* top_data = top[0]->gpu_data();
-  const Dtype* top_diff = top[0]->gpu_diff();
-  for (int i = 0; i < bottom.size(); ++i) {
+  const Dtype* coeff_data = NULL;
+  if (coeff_blob_) {
+    coeff_data = bottom[bottom.size() - 1]->gpu_data();
+  }
+  const bool kBackward = true;
+  for (int i = 0; i < bottom.size() - coeff_blob_; ++i) {
     if (propagate_down[i]) {
       const Dtype* bottom_data = bottom[i]->gpu_data();
+      const Dtype* top_diff = top[0]->gpu_diff();
       Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
       switch (op_) {
       case EltwiseParameter_EltwiseOp_PROD:
@@ -111,11 +146,11 @@ void EltwiseLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
         caffe_gpu_mul(count, bottom_diff, top_diff, bottom_diff);
         break;
       case EltwiseParameter_EltwiseOp_SUM:
-        if (coeffs_[i] == Dtype(1.)) {
-          caffe_copy(count, top_diff, bottom_diff);
-        } else {
-          caffe_gpu_scale(count, coeffs_[i], top_diff, bottom_diff);
-        }
+        CoeffSum<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+            <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+            count, dim, i * num, coeffs_[i], coeff_data,
+            kBackward, top_diff, bottom_diff);
+        CUDA_POST_KERNEL_CHECK;
         break;
       case EltwiseParameter_EltwiseOp_MAX:
         mask = max_idx_.gpu_data();
diff --git a/src/caffe/layers/embed_layer.cpp b/src/caffe/layers/embed_layer.cpp
new file mode 100644
index 00000000000..be6b2cd2727
--- /dev/null
+++ b/src/caffe/layers/embed_layer.cpp
@@ -0,0 +1,122 @@
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/common_layers.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+// Reads num_output (N_), input_dim (K_) and bias_term from embed_param. When
+// blobs_ is still empty, creates and fills the K_ x N_ weight blob and, if
+// bias_term_ is set, a length-N_ bias blob; existing blobs are left untouched.
+// Aborts (CHECK) unless both N_ and K_ are positive.
+template <typename Dtype>
+void EmbedLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  N_ = this->layer_param_.embed_param().num_output();
+  CHECK_GT(N_, 0) << "EmbedLayer num_output must be positive.";
+  K_ = this->layer_param_.embed_param().input_dim();
+  CHECK_GT(K_, 0) << "EmbedLayer input_dim must be positive.";
+  bias_term_ = this->layer_param_.embed_param().bias_term();
+  if (this->blobs_.empty()) {
+    // One blob for the weights, plus one for the bias when it is enabled.
+    this->blobs_.resize(bias_term_ ? 2 : 1);
+    // The weight matrix is K_ x N_ --
+    // transposed from InnerProductLayer for spatial locality.
+    vector<int> table_shape(2);
+    table_shape[0] = K_;
+    table_shape[1] = N_;
+    this->blobs_[0].reset(new Blob<Dtype>(table_shape));
+    shared_ptr<Filler<Dtype> > table_filler(GetFiller<Dtype>(
+        this->layer_param_.embed_param().weight_filler()));
+    table_filler->Fill(this->blobs_[0].get());
+    if (bias_term_) {
+      // The bias is a single vector of length N_.
+      this->blobs_[1].reset(new Blob<Dtype>(vector<int>(1, N_)));
+      shared_ptr<Filler<Dtype> > offset_filler(GetFiller<Dtype>(
+          this->layer_param_.embed_param().bias_filler()));
+      offset_filler->Fill(this->blobs_[1].get());
+    }
+  } else {
+    // blobs_ was populated before setup ran; keep what is there.
+    LOG(INFO) << "Skipping parameter initialization";
+  }
+  // Entries added by this resize default to true.
+  this->param_propagate_down_.resize(this->blobs_.size(), true);
+}
+
+// Gives the top the bottom's shape plus one trailing axis of size N_, records
+// the bottom's element count in M_, and (with a bias) rebuilds the all-ones
+// bias multiplier of length M_.
+template <typename Dtype>
+void EmbedLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  M_ = bottom[0]->count();
+  vector<int> out_shape(bottom[0]->shape());
+  out_shape.push_back(N_);
+  top[0]->Reshape(out_shape);
+  if (bias_term_) {
+    bias_multiplier_.Reshape(vector<int>(1, M_));
+    caffe_set(M_, Dtype(1), bias_multiplier_.mutable_cpu_data());
+  }
+}
+
+// top row n <- weight row bottom_data[n] (each row has N_ values), plus bias.
+template <typename Dtype>
+void EmbedLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  const Dtype* table = this->blobs_[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  for (int n = 0; n < M_; ++n) {
+    const int index = static_cast<int>(bottom_data[n]);
+    DCHECK_GE(index, 0);
+    DCHECK_LT(index, K_);
+    DCHECK_EQ(static_cast<Dtype>(index), bottom_data[n]) << "non-integer input";
+    caffe_copy(N_, table + index * N_, top_data + n * N_);
+  }
+  if (!bias_term_) { return; }
+  // Rank-1 update: top += bias_multiplier (M_ x 1) * bias (1 x N_).
+  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, Dtype(1),
+      bias_multiplier_.cpu_data(), this->blobs_[1]->cpu_data(), Dtype(1),
+      top_data);
+}
+
+// Accumulates top_diff row n into weight diff row bottom_data[n], and the
+// column sums of top_diff into the bias diff. The input gets no gradient.
+template <typename Dtype>
+void EmbedLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  CHECK(!propagate_down[0]) << "Can't backpropagate to EmbedLayer input.";
+  if (this->param_propagate_down_[0]) {
+    const Dtype* top_diff = top[0]->cpu_diff();
+    const Dtype* bottom_data = bottom[0]->cpu_data();
+    Dtype* table_diff = this->blobs_[0]->mutable_cpu_diff();
+    // axpy adds to the existing diff contents rather than overwriting them.
+    for (int n = 0; n < M_; ++n) {
+      const int index = static_cast<int>(bottom_data[n]);
+      DCHECK_GE(index, 0);
+      DCHECK_LT(index, K_);
+      DCHECK_EQ(static_cast<Dtype>(index), bottom_data[n])
+          << "non-integer input";
+      caffe_axpy(N_, Dtype(1), top_diff + n * N_, table_diff + index * N_);
+    }
+  }
+  if (bias_term_ && this->param_propagate_down_[1]) {
+    Dtype* offset_diff = this->blobs_[1]->mutable_cpu_diff();
+    caffe_cpu_gemv<Dtype>(CblasTrans, M_, N_, Dtype(1), top[0]->cpu_diff(),
+        bias_multiplier_.cpu_data(), Dtype(1), offset_diff);
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(EmbedLayer);
+#endif
+
+INSTANTIATE_CLASS(EmbedLayer);
+REGISTER_LAYER_CLASS(Embed);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/embed_layer.cu b/src/caffe/layers/embed_layer.cu
new file mode 100644
index 00000000000..672fb9c608c
--- /dev/null
+++ b/src/caffe/layers/embed_layer.cu
@@ -0,0 +1,85 @@
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/common_layers.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/util/gpu_util.cuh"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+// For every top element (row n, column d, N columns per row):
+//   top_data[n * N + d] = weight[int(bottom_data[n]) * N + d]. M, K are unused.
+template <typename Dtype>
+__global__ void EmbedForward(const int nthreads, const Dtype* bottom_data,
+    const Dtype* weight, const int M, const int N, const int K,
+    Dtype* top_data) {
+  CUDA_KERNEL_LOOP(top_index, nthreads) {
+    const int row = static_cast<int>(bottom_data[top_index / N]);
+    const int col = top_index % N;
+    top_data[top_index] = weight[row * N + col];
+  }
+}
+
+// Adds top_diff[n * N + d] into weight_diff[int(bottom_data[n]) * N + d]. The
+// add is atomic, so top rows sharing an index accumulate into one weight row.
+template <typename Dtype>
+__global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data,
+    const Dtype* top_diff, const int M, const int N, const int K,
+    Dtype* weight_diff);
+
+template <typename Dtype>
+__global__ void EmbedBackward(const int nthreads, const Dtype* bottom_data,
+    const Dtype* top_diff, const int M, const int N, const int K,
+    Dtype* weight_diff) {
+  CUDA_KERNEL_LOOP(top_index, nthreads) {
+    const int row = static_cast<int>(bottom_data[top_index / N]);
+    caffe_gpu_atomic_add(top_diff[top_index],
+        weight_diff + (row * N + top_index % N));
+  }
+}
+
+// GPU forward: gathers one weight row per bottom index, then adds the bias.
+template <typename Dtype>
+void EmbedLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = top[0]->count();
+  EmbedForward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+      count, bottom_data, this->blobs_[0]->gpu_data(), M_, N_, K_, top_data);
+  CUDA_POST_KERNEL_CHECK;  // report a failed launch instead of ignoring it
+  if (!bias_term_) { return; }
+  caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, 1, Dtype(1),
+      bias_multiplier_.gpu_data(),
+      this->blobs_[1]->gpu_data(), Dtype(1), top_data);
+}
+
+// GPU backward: scatter-adds top_diff rows into weight_diff; sums into bias.
+template <typename Dtype>
+void EmbedLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  CHECK(!propagate_down[0]) << "Can't backpropagate to EmbedLayer input.";
+  if (this->param_propagate_down_[0]) {
+    const int top_count = top[0]->count();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
+    EmbedBackward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<CAFFE_GET_BLOCKS(top_count), CAFFE_CUDA_NUM_THREADS>>>(
+        top_count, bottom_data, top_diff, M_, N_, K_, weight_diff);
+    CUDA_POST_KERNEL_CHECK;  // report a failed launch instead of ignoring it
+  }
+  if (bias_term_ && this->param_propagate_down_[1]) {
+    Dtype* bias_diff = this->blobs_[1]->mutable_gpu_diff();
+    caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, Dtype(1), top[0]->gpu_diff(),
+        bias_multiplier_.gpu_data(), Dtype(1), bias_diff);
+  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(EmbedLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/flatten_layer.cpp b/src/caffe/layers/flatten_layer.cpp
index 745f271ea45..3a078fabb81 100644
--- a/src/caffe/layers/flatten_layer.cpp
+++ b/src/caffe/layers/flatten_layer.cpp
@@ -14,18 +14,8 @@ void FlattenLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
   top_shape[1] = bottom[0]->count() / bottom[0]->num();
   top[0]->Reshape(top_shape);
   CHECK_EQ(top[0]->count(), bottom[0]->count());
-}
-
-template <typename Dtype>
-void FlattenLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
   top[0]->ShareData(*bottom[0]);
-}
-
-template <typename Dtype>
-void FlattenLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
-      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
-  bottom[0]->ShareDiff(*top[0]);
+  top[0]->ShareDiff(*bottom[0]);
 }
 
 INSTANTIATE_CLASS(FlattenLayer);
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index 89e0c8fbad7..83c3235eb71 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -101,13 +101,13 @@ void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     const Dtype* bottom_data = bottom[0]->cpu_data();
     // Gradient with respect to weight
     caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
-        top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_cpu_diff());
+        top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff());
   }
   if (bias_term_ && this->param_propagate_down_[1]) {
     const Dtype* top_diff = top[0]->cpu_diff();
     // Gradient with respect to bias
     caffe_cpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
-        bias_multiplier_.cpu_data(), (Dtype)0.,
+        bias_multiplier_.cpu_data(), (Dtype)1.,
         this->blobs_[1]->mutable_cpu_diff());
   }
   if (propagate_down[0]) {
diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu
index a9e1784a205..dd90cac12a8 100644
--- a/src/caffe/layers/inner_product_layer.cu
+++ b/src/caffe/layers/inner_product_layer.cu
@@ -33,13 +33,13 @@ void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const Dtype* bottom_data = bottom[0]->gpu_data();
     // Gradient with respect to weight
     caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
-        top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_gpu_diff());
+        top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_gpu_diff());
   }
   if (bias_term_ && this->param_propagate_down_[1]) {
     const Dtype* top_diff = top[0]->gpu_diff();
     // Gradient with respect to bias
     caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
-        bias_multiplier_.gpu_data(), (Dtype)0.,
+        bias_multiplier_.gpu_data(), (Dtype)1.,
         this->blobs_[1]->mutable_gpu_diff());
   }
   if (propagate_down[0]) {
diff --git a/src/caffe/layers/lstm_layer.cpp b/src/caffe/layers/lstm_layer.cpp
new file mode 100644
index 00000000000..91543f73f71
--- /dev/null
+++ b/src/caffe/layers/lstm_layer.cpp
@@ -0,0 +1,221 @@
+#include <string>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/sequence_layers.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+// Sets *names to {"h_0", "c_0"}, which FillUnrolledNet adds as net inputs.
+template <typename Dtype>
+void LSTMLayer<Dtype>::RecurrentInputBlobNames(vector<string>* names) const {
+  names->assign(2, "h_0");
+  names->back() = "c_0";
+}
+
+// Sets *names to {"h_" + int_to_str(T_), "c_T"}.
+template <typename Dtype>
+void LSTMLayer<Dtype>::RecurrentOutputBlobNames(vector<string>* names) const {
+  names->assign(2, "c_T");
+  names->front() = "h_" + this->int_to_str(this->T_);
+}
+
+// Sets *names to the single output blob name, "h".
+template <typename Dtype>
+void LSTMLayer<Dtype>::OutputBlobNames(vector<string>* names) const {
+  names->assign(1, "h");
+}
+
+template <typename Dtype>
+void LSTMLayer<Dtype>::FillUnrolledNet(NetParameter* net_param) const {
+  const int num_output = this->layer_param_.recurrent_param().num_output();
+  CHECK_GT(num_output, 0) << "num_output must be positive";
+  const FillerParameter& weight_filler =
+      this->layer_param_.recurrent_param().weight_filler();
+  const FillerParameter& bias_filler =
+      this->layer_param_.recurrent_param().bias_filler();
+
+  // Fills net_param with the unrolled net. First: reusable LayerParameters
+  // (without bottoms/tops) that layers added below are copied from.
+  LayerParameter hidden_param;
+  hidden_param.set_type("InnerProduct");
+  hidden_param.mutable_inner_product_param()->set_num_output(num_output * 4);
+  hidden_param.mutable_inner_product_param()->set_bias_term(false);
+  hidden_param.mutable_inner_product_param()->set_axis(2);
+  hidden_param.mutable_inner_product_param()->
+      mutable_weight_filler()->CopyFrom(weight_filler);
+
+  LayerParameter biased_hidden_param(hidden_param);
+  biased_hidden_param.mutable_inner_product_param()->set_bias_term(true);
+  biased_hidden_param.mutable_inner_product_param()->
+      mutable_bias_filler()->CopyFrom(bias_filler);
+
+  LayerParameter sum_param;
+  sum_param.set_type("Eltwise");
+  sum_param.mutable_eltwise_param()->set_operation(
+      EltwiseParameter_EltwiseOp_SUM);
+
+  LayerParameter slice_param;
+  slice_param.set_type("Slice");
+  slice_param.mutable_slice_param()->set_axis(0);
+
+  LayerParameter split_param;
+  split_param.set_type("Split");
+
+  BlobShape input_shape;
+  input_shape.add_dim(1);  // c_0 and h_0 are a single timestep
+  input_shape.add_dim(this->N_);
+  input_shape.add_dim(num_output);
+
+  net_param->add_input("c_0");
+  net_param->add_input_shape()->CopyFrom(input_shape);
+
+  net_param->add_input("h_0");
+  net_param->add_input_shape()->CopyFrom(input_shape);
+
+  LayerParameter* cont_slice_param = net_param->add_layer();
+  cont_slice_param->CopyFrom(slice_param);
+  cont_slice_param->set_name("cont_slice");
+  cont_slice_param->add_bottom("cont");
+  cont_slice_param->mutable_slice_param()->set_axis(1);  // one top per t
+
+  // Add layer to transform all timesteps of x to the hidden state dimension.
+  //     W_xc_x = W_xc * x + b_c
+  {
+    LayerParameter* x_transform_param = net_param->add_layer();
+    x_transform_param->CopyFrom(biased_hidden_param);
+    x_transform_param->set_name("x_transform");
+    x_transform_param->add_param()->set_name("W_xc");
+    x_transform_param->add_param()->set_name("b_c");
+    x_transform_param->add_bottom("x");
+    x_transform_param->add_top("W_xc_x");
+  }
+
+  if (this->static_input_) {
+    // Add layer to transform x_static to the gate dimension.
+    //     W_xc_x_static = W_xc_static * x_static
+    LayerParameter* x_static_transform_param = net_param->add_layer();
+    x_static_transform_param->CopyFrom(hidden_param);
+    x_static_transform_param->mutable_inner_product_param()->set_axis(1);
+    x_static_transform_param->set_name("W_xc_x_static");
+    x_static_transform_param->add_param()->set_name("W_xc_static");
+    x_static_transform_param->add_bottom("x_static");
+    x_static_transform_param->add_top("W_xc_x_static");
+
+    LayerParameter* reshape_param = net_param->add_layer();
+    reshape_param->set_type("Reshape");
+    BlobShape* new_shape =
+         reshape_param->mutable_reshape_param()->mutable_shape();
+    new_shape->add_dim(1);  // One timestep.
+    new_shape->add_dim(this->N_);
+    new_shape->add_dim(
+        x_static_transform_param->inner_product_param().num_output());
+    reshape_param->add_bottom("W_xc_x_static");
+    reshape_param->add_top("W_xc_x_static");  // top reuses the bottom's name
+  }
+
+  LayerParameter* x_slice_param = net_param->add_layer();
+  x_slice_param->CopyFrom(slice_param);
+  x_slice_param->add_bottom("W_xc_x");
+  x_slice_param->set_name("W_xc_x_slice");
+
+  LayerParameter output_concat_layer;
+  output_concat_layer.set_name("h_concat");
+  output_concat_layer.set_type("Concat");
+  output_concat_layer.add_top("h");
+  output_concat_layer.mutable_concat_param()->set_axis(0);
+
+  for (int t = 1; t <= this->T_; ++t) {
+    string tm1s = this->int_to_str(t - 1);
+    string ts = this->int_to_str(t);
+
+    cont_slice_param->add_top("cont_" + ts);
+    x_slice_param->add_top("W_xc_x_" + ts);
+
+    // Add layers to flush the hidden state when beginning a new
+    // sequence, as indicated by cont_t.
+    //     h_conted_{t-1} := cont_t * h_{t-1}
+    //
+    // Normally, cont_t is binary (i.e., 0 or 1), so:
+    //     h_conted_{t-1} := h_{t-1} if cont_t == 1
+    //                       0   otherwise
+    {
+      LayerParameter* cont_h_param = net_param->add_layer();
+      cont_h_param->CopyFrom(sum_param);
+      cont_h_param->mutable_eltwise_param()->set_coeff_blob(true);
+      cont_h_param->set_name("h_conted_" + tm1s);
+      cont_h_param->add_bottom("h_" + tm1s);
+      cont_h_param->add_bottom("cont_" + ts);
+      cont_h_param->add_top("h_conted_" + tm1s);
+    }
+
+    // Add layer to compute
+    //     W_hc_h_{t-1} := W_hc * h_conted_{t-1}
+    {
+      LayerParameter* w_param = net_param->add_layer();
+      w_param->CopyFrom(hidden_param);
+      w_param->set_name("transform_" + ts);
+      w_param->add_param()->set_name("W_hc");
+      w_param->add_bottom("h_conted_" + tm1s);
+      w_param->add_top("W_hc_h_" + tm1s);
+      w_param->mutable_inner_product_param()->set_axis(2);
+    }
+
+    // Add the outputs of the linear transformations to compute the gate input.
+    //     gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c
+    //                   = W_hc_h_{t-1} + W_xc_x_t + b_c
+    {
+      LayerParameter* input_sum_layer = net_param->add_layer();
+      input_sum_layer->CopyFrom(sum_param);
+      input_sum_layer->set_name("gate_input_" + ts);
+      input_sum_layer->add_bottom("W_hc_h_" + tm1s);
+      input_sum_layer->add_bottom("W_xc_x_" + ts);
+      if (this->static_input_) {
+        input_sum_layer->add_bottom("W_xc_x_static");
+      }
+      input_sum_layer->add_top("gate_input_" + ts);
+    }
+
+    // Add LSTMUnit layer to compute the cell & hidden vectors c_t and h_t.
+    // Inputs: c_{t-1}, gate_input_t = (i_t, f_t, o_t, g_t), cont_t
+    // Outputs: c_t, h_t
+    //     [ i_t' ]
+    //     [ f_t' ] := gate_input_t
+    //     [ o_t' ]
+    //     [ g_t' ]
+    //         i_t := \sigmoid[i_t']
+    //         f_t := \sigmoid[f_t']
+    //         o_t := \sigmoid[o_t']
+    //         g_t := \tanh[g_t']
+    //         c_t := cont_t * (f_t .* c_{t-1}) + (i_t .* g_t)
+    //         h_t := o_t .* \tanh[c_t]
+    {
+      LayerParameter* lstm_unit_param = net_param->add_layer();
+      lstm_unit_param->set_type("LSTMUnit");
+      lstm_unit_param->add_bottom("c_" + tm1s);
+      lstm_unit_param->add_bottom("gate_input_" + ts);
+      lstm_unit_param->add_bottom("cont_" + ts);
+      lstm_unit_param->add_top("c_" + ts);
+      lstm_unit_param->add_top("h_" + ts);
+      lstm_unit_param->set_name("unit_" + ts);
+    }
+    output_concat_layer.add_bottom("h_" + ts);
+  }  // for (int t = 1; t <= this->T_; ++t)
+
+  {
+    LayerParameter* c_T_copy_param = net_param->add_layer();
+    c_T_copy_param->CopyFrom(split_param);  // Split: c_<T_> -> "c_T"
+    c_T_copy_param->add_bottom("c_" + this->int_to_str(this->T_));
+    c_T_copy_param->add_top("c_T");
+  }
+  net_param->add_layer()->CopyFrom(output_concat_layer);  // h = concat(h_t)
+}
+
+INSTANTIATE_CLASS(LSTMLayer);
+REGISTER_LAYER_CLASS(LSTM);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/lstm_unit_layer.cpp b/src/caffe/layers/lstm_unit_layer.cpp
new file mode 100644
index 00000000000..74078d264f5
--- /dev/null
+++ b/src/caffe/layers/lstm_unit_layer.cpp
@@ -0,0 +1,128 @@
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "caffe/layer.hpp"
+#include "caffe/sequence_layers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+inline Dtype sigmoid(Dtype x) {  // logistic function: 1 / (1 + e^-x)
+  return 1. / (1. + exp(-x));
+}
+
+template <typename Dtype>
+inline Dtype tanh(Dtype x) {  // via the identity tanh(x) = 2 * sigmoid(2x) - 1
+  return 2. * sigmoid(2. * x) - 1.;
+}
+
+template <typename Dtype>
+void LSTMUnitLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  for (int i = 0; i < bottom.size(); ++i) {
+    CHECK_EQ(3, bottom[i]->num_axes());
+    CHECK_EQ(1, bottom[i]->shape(0));  // leading axis must be 1
+  }
+  const int num_instances = bottom[0]->shape(1);
+  hidden_dim_ = bottom[0]->shape(2);  // bottom[0] (C_prev): 1 x num x hidden
+  CHECK_EQ(num_instances, bottom[1]->shape(1));
+  CHECK_EQ(4 * hidden_dim_, bottom[1]->shape(2));  // bottom[1]: four gates
+  CHECK_EQ(1, bottom[2]->shape(1));
+  CHECK_EQ(num_instances, bottom[2]->shape(2));  // bottom[2]: 1 x 1 x num
+  top[0]->ReshapeLike(*bottom[0]);
+  top[1]->ReshapeLike(*bottom[0]);
+  X_acts_.ReshapeLike(*bottom[1]);  // gate activation buffer (GPU path)
+}
+
+template <typename Dtype>
+void LSTMUnitLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const int num = bottom[0]->shape(1);
+  const int x_dim = hidden_dim_ * 4;  // four gate pre-activations per unit
+  const Dtype* C_prev = bottom[0]->cpu_data();
+  const Dtype* X = bottom[1]->cpu_data();
+  const Dtype* flush = bottom[2]->cpu_data();  // one value per instance
+  Dtype* C = top[0]->mutable_cpu_data();
+  Dtype* H = top[1]->mutable_cpu_data();
+  for (int n = 0; n < num; ++n) {
+    for (int d = 0; d < hidden_dim_; ++d) {
+      const Dtype i = sigmoid(X[d]);
+      const Dtype f = (*flush == 0) ? 0 :  // flush == 0 drops c_prev entirely
+          (*flush * sigmoid(X[1 * hidden_dim_ + d]));
+      const Dtype o = sigmoid(X[2 * hidden_dim_ + d]);
+      const Dtype g = tanh(X[3 * hidden_dim_ + d]);
+      const Dtype c_prev = C_prev[d];
+      const Dtype c = f * c_prev + i * g;
+      C[d] = c;
+      const Dtype tanh_c = tanh(c);
+      H[d] = o * tanh_c;
+    }
+    C_prev += hidden_dim_;  // advance every pointer to the next instance
+    X += x_dim;
+    C += hidden_dim_;
+    H += hidden_dim_;
+    ++flush;
+  }
+}
+
+template <typename Dtype>
+void LSTMUnitLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators.";
+  if (!propagate_down[0] && !propagate_down[1]) { return; }
+  // Writes the diffs of c_{t-1} (bottom[0]) and the gate inputs (bottom[1]).
+  const int num = bottom[0]->shape(1);
+  const int x_dim = hidden_dim_ * 4;
+  const Dtype* C_prev = bottom[0]->cpu_data();
+  const Dtype* X = bottom[1]->cpu_data();
+  const Dtype* flush = bottom[2]->cpu_data();
+  const Dtype* C = top[0]->cpu_data();
+  const Dtype* C_diff = top[0]->cpu_diff();
+  const Dtype* H_diff = top[1]->cpu_diff();
+  Dtype* C_prev_diff = bottom[0]->mutable_cpu_diff();
+  Dtype* X_diff = bottom[1]->mutable_cpu_diff();
+  for (int n = 0; n < num; ++n) {
+    for (int d = 0; d < hidden_dim_; ++d) {
+      const Dtype i = sigmoid(X[d]);
+      const Dtype sig_f = sigmoid(X[1 * hidden_dim_ + d]);
+      const Dtype f = (*flush == 0) ? 0 : (*flush * sig_f);
+      const Dtype o = sigmoid(X[2 * hidden_dim_ + d]);
+      const Dtype g = tanh(X[3 * hidden_dim_ + d]);
+      const Dtype c_prev = C_prev[d];
+      const Dtype c = C[d];
+      const Dtype tanh_c = tanh(c);
+      Dtype* c_prev_diff = C_prev_diff + d;
+      Dtype* i_diff = X_diff + d;
+      Dtype* f_diff = X_diff + 1 * hidden_dim_ + d;
+      Dtype* o_diff = X_diff + 2 * hidden_dim_ + d;
+      Dtype* g_diff = X_diff + 3 * hidden_dim_ + d;
+      const Dtype c_term_diff =
+          C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c);
+      *c_prev_diff = c_term_diff * f;
+      *i_diff = c_term_diff * g * i * (1 - i);
+      // f = cont * sig_f, so df/df' = cont * sig_f * (1 - sig_f) = f * (1 -
+      // sig_f); f * (1 - f) is only right for binary cont (GPU kernels agree).
+      *f_diff = c_term_diff * c_prev * f * (1 - sig_f);
+      *o_diff = H_diff[d] * tanh_c * o * (1 - o);
+      *g_diff = c_term_diff * i * (1 - g * g);
+    }
+    C_prev += hidden_dim_;
+    X += x_dim;
+    C += hidden_dim_;
+    C_diff += hidden_dim_;
+    H_diff += hidden_dim_;
+    X_diff += x_dim;
+    C_prev_diff += hidden_dim_;
+    ++flush;
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(LSTMUnitLayer);
+#endif
+
+INSTANTIATE_CLASS(LSTMUnitLayer);
+REGISTER_LAYER_CLASS(LSTMUnit);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/lstm_unit_layer.cu b/src/caffe/layers/lstm_unit_layer.cu
new file mode 100644
index 00000000000..d6bf85071f5
--- /dev/null
+++ b/src/caffe/layers/lstm_unit_layer.cu
@@ -0,0 +1,154 @@
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "caffe/layer.hpp"
+#include "caffe/sequence_layers.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+__device__ Dtype sigmoid(const Dtype x) {  // logistic: 1 / (1 + e^-x)
+  return Dtype(1) / (Dtype(1) + exp(-x));
+}
+
+template <typename Dtype>
+__device__ Dtype tanh(const Dtype x) {  // tanh(x) = 2 * sigmoid(2x) - 1
+  return Dtype(2) * sigmoid(Dtype(2) * x) - Dtype(1);
+}
+
+template <typename Dtype>
+__global__ void LSTMActsForward(const int nthreads, const int dim,
+                                const Dtype* X, Dtype* X_acts) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int x_dim = 4 * dim;
+    const int d = index % x_dim;  // offset within one instance's 4 * dim row
+    if (d < 3 * dim) {  // first three dim-sized chunks (i, f, o): sigmoid
+      X_acts[index] = sigmoid(X[index]);
+    } else {  // last chunk (g): tanh
+      X_acts[index] = tanh(X[index]);
+    }
+  }
+}
+
+template <typename Dtype>
+__global__ void LSTMUnitForward(const int nthreads, const int dim,
+    const Dtype* C_prev, const Dtype* X, const Dtype* flush,
+    Dtype* C, Dtype* H) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int n = index / dim;  // instance index
+    const int d = index % dim;  // hidden unit within the instance
+    const Dtype* X_offset = X + 4 * dim * n;  // instance n's i, f, o, g chunks
+    const Dtype i = X_offset[d];  // X is read as-is: no nonlinearity here
+    const Dtype f = X_offset[1 * dim + d];
+    const Dtype o = X_offset[2 * dim + d];
+    const Dtype g = X_offset[3 * dim + d];
+    const Dtype c_prev = C_prev[index];
+    const Dtype c = flush[n] * f * c_prev + i * g;
+    C[index] = c;
+    const Dtype tanh_c = tanh(c);
+    H[index] = o * tanh_c;
+  }
+}
+
+template <typename Dtype>
+void LSTMUnitLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const int count = top[1]->count();  // one kernel index per entry of H
+  const Dtype* C_prev = bottom[0]->gpu_data();
+  const Dtype* X = bottom[1]->gpu_data();
+  const Dtype* flush = bottom[2]->gpu_data();
+  Dtype* X_acts = X_acts_.mutable_gpu_data();  // reused by Backward_gpu
+  Dtype* C = top[0]->mutable_gpu_data();
+  Dtype* H = top[1]->mutable_gpu_data();
+  const int X_count = bottom[1]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LSTMActsForward<Dtype><<<CAFFE_GET_BLOCKS(X_count), CAFFE_CUDA_NUM_THREADS>>>(
+      X_count, hidden_dim_, X, X_acts);  // pass 1: apply gate nonlinearities
+  CUDA_POST_KERNEL_CHECK;
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  LSTMUnitForward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+      count, hidden_dim_, C_prev, X_acts, flush, C, H);  // pass 2: C and H
+  CUDA_POST_KERNEL_CHECK;
+}
+
+template <typename Dtype>
+__global__ void LSTMUnitBackward(const int nthreads, const int dim,
+    const Dtype* C_prev, const Dtype* X, const Dtype* C, const Dtype* H,
+    const Dtype* flush, const Dtype* C_diff, const Dtype* H_diff,
+    Dtype* C_prev_diff, Dtype* X_diff) {  // H is accepted but never read
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int n = index / dim;
+    const int d = index % dim;
+    const Dtype* X_offset = X + 4 * dim * n;
+    const Dtype i = X_offset[d];  // X is read as-is: no nonlinearity here
+    const Dtype f = X_offset[1 * dim + d];
+    const Dtype o = X_offset[2 * dim + d];
+    const Dtype g = X_offset[3 * dim + d];
+    const Dtype c_prev = C_prev[index];
+    const Dtype c = C[index];
+    const Dtype tanh_c = tanh(c);
+    Dtype* c_prev_diff = C_prev_diff + index;
+    Dtype* X_diff_offset = X_diff + 4 * dim * n;
+    Dtype* i_diff = X_diff_offset + d;
+    Dtype* f_diff = X_diff_offset + 1 * dim + d;
+    Dtype* o_diff = X_diff_offset + 2 * dim + d;
+    Dtype* g_diff = X_diff_offset + 3 * dim + d;
+    const Dtype c_term_diff =  // diff reaching c: direct plus through h
+        C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c);
+    const Dtype flush_n = flush[n];
+    *c_prev_diff = flush_n * c_term_diff * f;
+    *i_diff = c_term_diff * g;  // w.r.t. gate values, not pre-activations
+    *f_diff = flush_n * c_term_diff * c_prev;
+    *o_diff = H_diff[index] * tanh_c;
+    *g_diff = c_term_diff * i;
+  }
+}
+
+template <typename Dtype>
+__global__ void LSTMActsBackward(const int nthreads, const int dim,
+    const Dtype* X_acts, const Dtype* X_acts_diff, Dtype* X_diff) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int x_dim = 4 * dim;
+    const int d = index % x_dim;
+    const Dtype X_act = X_acts[index];
+    if (d < 3 * dim) {  // sigmoid chunks: derivative is s * (1 - s)
+      X_diff[index] = X_acts_diff[index] * X_act * (Dtype(1) - X_act);
+    } else {  // tanh chunk: derivative is 1 - t^2
+      X_diff[index] = X_acts_diff[index] * (Dtype(1) - X_act * X_act);
+    }
+  }
+}
+
+template <typename Dtype>
+void LSTMUnitLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  CHECK(!propagate_down[2]) << "Cannot backpropagate to sequence indicators.";
+  if (!propagate_down[0] && !propagate_down[1]) { return; }
+
+  const int count = top[1]->count();
+  const Dtype* C_prev = bottom[0]->gpu_data();
+  const Dtype* X_acts = X_acts_.gpu_data();  // activations from Forward_gpu
+  const Dtype* flush = bottom[2]->gpu_data();
+  const Dtype* C = top[0]->gpu_data();
+  const Dtype* H = top[1]->gpu_data();
+  const Dtype* C_diff = top[0]->gpu_diff();
+  const Dtype* H_diff = top[1]->gpu_diff();
+  Dtype* C_prev_diff = bottom[0]->mutable_gpu_diff();
+  Dtype* X_acts_diff = X_acts_.mutable_gpu_diff();  // diff w.r.t. activations
+  LSTMUnitBackward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(count, hidden_dim_,
+      C_prev, X_acts, C, H, flush, C_diff, H_diff, C_prev_diff, X_acts_diff);
+  CUDA_POST_KERNEL_CHECK;
+  const int X_count = bottom[1]->count();
+  Dtype* X_diff = bottom[1]->mutable_gpu_diff();
+  LSTMActsBackward<Dtype>  // NOLINT_NEXT_LINE(whitespace/operators)
+      <<<CAFFE_GET_BLOCKS(X_count), CAFFE_CUDA_NUM_THREADS>>>(
+      X_count, hidden_dim_, X_acts, X_acts_diff, X_diff);  // chain rule
+  CUDA_POST_KERNEL_CHECK;
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(LSTMUnitLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/recurrent_layer.cpp b/src/caffe/layers/recurrent_layer.cpp
new file mode 100644
index 00000000000..7dc38fec901
--- /dev/null
+++ b/src/caffe/layers/recurrent_layer.cpp
@@ -0,0 +1,222 @@
+#include <string>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/sequence_layers.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+string RecurrentLayer<Dtype>::int_to_str(const int t) const {
+  ostringstream num;  // used to build blob names such as "h_" + int_to_str(t)
+  num << t;
+  return num.str();
+}
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  CHECK_GE(bottom[0]->num_axes(), 2)
+      << "bottom[0] must have at least 2 axes -- (#timesteps, #streams, ...)";
+  T_ = bottom[0]->shape(0);  // number of timesteps
+  N_ = bottom[0]->shape(1);  // number of independent streams
+  LOG(INFO) << "Initializing recurrent layer: assuming input batch contains "
+            << T_ << " timesteps of " << N_ << " independent streams.";
+
+  CHECK_EQ(bottom[1]->num_axes(), 2)
+      << "bottom[1] must have exactly 2 axes -- (#timesteps, #streams)";
+  CHECK_EQ(T_, bottom[1]->shape(0));
+  CHECK_EQ(N_, bottom[1]->shape(1));
+
+  // If provided, bottom[2] is a static input to the recurrent net.
+  static_input_ = (bottom.size() > 2);
+  if (static_input_) {
+    CHECK_GE(bottom[2]->num_axes(), 1);
+    CHECK_EQ(N_, bottom[2]->shape(0));
+  }
+
+  // Create a NetParameter; setup the inputs that aren't unique to particular
+  // recurrent architectures.
+  NetParameter net_param;
+  net_param.set_force_backward(true);
+
+  net_param.add_input("x");
+  BlobShape input_shape;
+  for (int i = 0; i < bottom[0]->num_axes(); ++i) {
+    input_shape.add_dim(bottom[0]->shape(i));
+  }
+  net_param.add_input_shape()->CopyFrom(input_shape);
+
+  input_shape.Clear();
+  input_shape.add_dim(1);  // "cont" enters the net as (1, T_, N_)
+  for (int i = 0; i < bottom[1]->num_axes(); ++i) {
+    input_shape.add_dim(bottom[1]->shape(i));
+  }
+  net_param.add_input("cont");
+  net_param.add_input_shape()->CopyFrom(input_shape);
+
+  if (static_input_) {
+    input_shape.Clear();
+    for (int i = 0; i < bottom[2]->num_axes(); ++i) {
+      input_shape.add_dim(bottom[2]->shape(i));
+    }
+    net_param.add_input("x_static");
+    net_param.add_input_shape()->CopyFrom(input_shape);
+  }
+
+  // Call the child's FillUnrolledNet implementation to specify the unrolled
+  // recurrent architecture.
+  this->FillUnrolledNet(&net_param);
+
+  // Prepend this layer's name to the names of each layer in the unrolled net.
+  const string& layer_name = this->layer_param_.name();
+  if (layer_name.size() > 0) {
+    for (int i = 0; i < net_param.layer_size(); ++i) {
+      LayerParameter* layer = net_param.mutable_layer(i);
+      layer->set_name(layer_name + "_" + layer->name());
+    }
+  }
+
+  // Create the unrolled net.
+  unrolled_net_.reset(new Net<Dtype>(net_param));
+  unrolled_net_->set_debug_info(
+      this->layer_param_.recurrent_param().debug_info());
+
+  // Setup pointers to the inputs.
+  x_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("x").get());
+  cont_input_blob_ = CHECK_NOTNULL(unrolled_net_->blob_by_name("cont").get());
+  if (static_input_) {
+    x_static_input_blob_ =
+        CHECK_NOTNULL(unrolled_net_->blob_by_name("x_static").get());
+  }
+
+  // Setup pointers to paired recurrent inputs/outputs.
+  vector<string> recur_input_names;
+  RecurrentInputBlobNames(&recur_input_names);
+  vector<string> recur_output_names;
+  RecurrentOutputBlobNames(&recur_output_names);
+  const int num_recur_blobs = recur_input_names.size();
+  CHECK_EQ(num_recur_blobs, recur_output_names.size());
+  recur_input_blobs_.resize(num_recur_blobs);
+  recur_output_blobs_.resize(num_recur_blobs);
+  for (int i = 0; i < recur_input_names.size(); ++i) {
+    recur_input_blobs_[i] =
+        CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_input_names[i]).get());
+    recur_output_blobs_[i] =
+        CHECK_NOTNULL(unrolled_net_->blob_by_name(recur_output_names[i]).get());
+  }
+
+  // Setup pointers to outputs.
+  vector<string> output_names;
+  OutputBlobNames(&output_names);
+  CHECK_EQ(top.size(), output_names.size())
+      << "OutputBlobNames must provide an output blob name for each top.";
+  output_blobs_.resize(output_names.size());
+  for (int i = 0; i < output_names.size(); ++i) {
+    output_blobs_[i] =
+        CHECK_NOTNULL(unrolled_net_->blob_by_name(output_names[i]).get());
+  }
+
+  // We should have 2 inputs (x and cont), plus a number of recurrent inputs,
+  // plus maybe a static input.
+  CHECK_EQ(2 + num_recur_blobs + static_input_,  // the bool adds 0 or 1
+           unrolled_net_->input_blobs().size());
+
+  // This layer's parameters are any parameters in the layers of the unrolled
+  // net. We only want one copy of each parameter, so check that the parameter
+  // is "owned" by the layer, rather than shared with another.
+  this->blobs_.clear();
+  for (int i = 0; i < unrolled_net_->params().size(); ++i) {
+    if (unrolled_net_->param_owners()[i] == -1) {  // -1: not shared
+      LOG(INFO) << "Adding parameter " << i << ": "
+                << unrolled_net_->param_display_names()[i];
+      this->blobs_.push_back(unrolled_net_->params()[i]);
+    }
+  }
+  // Check that param_propagate_down is set for all of the parameters in the
+  // unrolled net; set param_propagate_down to true in this layer.
+  for (int i = 0; i < unrolled_net_->layers().size(); ++i) {
+    for (int j = 0; j < unrolled_net_->layers()[i]->blobs().size(); ++j) {
+      CHECK(unrolled_net_->layers()[i]->param_propagate_down(j))
+          << "param_propagate_down not set for layer " << i << ", param " << j;
+    }
+  }
+  this->param_propagate_down_.clear();
+  this->param_propagate_down_.resize(this->blobs_.size(), true);
+
+  // Set the diffs of recurrent outputs to 0 -- we can't backpropagate across
+  // batches.
+  for (int i = 0; i < recur_output_blobs_.size(); ++i) {
+    caffe_set(recur_output_blobs_[i]->count(), Dtype(0),
+              recur_output_blobs_[i]->mutable_cpu_diff());
+  }
+}
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  CHECK_EQ(top.size(), output_blobs_.size());
+  for (int i = 0; i < top.size(); ++i) {
+    top[i]->ReshapeLike(*output_blobs_[i]);  // tops take the net's shapes
+    output_blobs_[i]->ShareData(*top[i]);
+    output_blobs_[i]->ShareDiff(*top[i]);
+  }
+  x_input_blob_->ShareData(*bottom[0]);
+  x_input_blob_->ShareDiff(*bottom[0]);
+  cont_input_blob_->ShareData(*bottom[1]);  // data only: no diff for cont
+  if (static_input_) {
+    x_static_input_blob_->ShareData(*bottom[2]);
+    x_static_input_blob_->ShareDiff(*bottom[2]);
+  }
+}
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Reset() {
+  // Zero all recurrent outputs; Forward copies them into the recurrent inputs.
+  for (int i = 0; i < recur_output_blobs_.size(); ++i) {
+    caffe_set(recur_output_blobs_[i]->count(), Dtype(0),
+              recur_output_blobs_[i]->mutable_cpu_data());
+  }
+}
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  // Hacky fix for test time... reshare all the shared blobs.
+  // TODO: somehow make this work non-hackily.
+  if (this->phase_ == TEST) {
+    unrolled_net_->ShareWeightData();
+  }
+
+  DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size());
+  for (int i = 0; i < recur_input_blobs_.size(); ++i) {
+    const int count = recur_input_blobs_[i]->count();
+    DCHECK_EQ(count, recur_output_blobs_[i]->count());
+    const Dtype* timestep_T_data = recur_output_blobs_[i]->cpu_data();
+    Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_cpu_data();
+    caffe_copy(count, timestep_T_data, timestep_0_data);  // carry state over
+  }
+
+  unrolled_net_->ForwardPrefilled();  // inputs already share bottom's data
+}
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  CHECK(!propagate_down[1]) << "Cannot backpropagate to sequence indicators.";
+  // Always run the unrolled net backward: its weights need gradients even if
+  // no bottom does, and propagate_down[2] only exists with a static input.
+  unrolled_net_->Backward();
+}
+
+#ifdef CPU_ONLY
+STUB_GPU_FORWARD(RecurrentLayer, Forward);
+#endif
+
+INSTANTIATE_CLASS(RecurrentLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/recurrent_layer.cu b/src/caffe/layers/recurrent_layer.cu
new file mode 100644
index 00000000000..ce4b2f9b77c
--- /dev/null
+++ b/src/caffe/layers/recurrent_layer.cu
@@ -0,0 +1,35 @@
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/sequence_layers.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void RecurrentLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  // Hacky fix for test time... reshare all the shared blobs.
+  // TODO: somehow make this work non-hackily.
+  if (this->phase_ == TEST) {
+    unrolled_net_->ShareWeightData();
+  }
+
+  DCHECK_EQ(recur_input_blobs_.size(), recur_output_blobs_.size());
+  for (int i = 0; i < recur_input_blobs_.size(); ++i) {
+    const int count = recur_input_blobs_[i]->count();
+    DCHECK_EQ(count, recur_output_blobs_[i]->count());
+    const Dtype* timestep_T_data = recur_output_blobs_[i]->gpu_data();
+    Dtype* timestep_0_data = recur_input_blobs_[i]->mutable_gpu_data();
+    caffe_copy(count, timestep_T_data, timestep_0_data);  // carry state over
+  }
+
+  unrolled_net_->ForwardPrefilled();  // inputs already share bottom's data
+}
+
+INSTANTIATE_LAYER_GPU_FORWARD(RecurrentLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/reshape_layer.cpp b/src/caffe/layers/reshape_layer.cpp
new file mode 100644
index 00000000000..f8e5122b297
--- /dev/null
+++ b/src/caffe/layers/reshape_layer.cpp
@@ -0,0 +1,20 @@
+#include <vector>
+
+#include "caffe/common_layers.hpp"
+#include "caffe/layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ReshapeLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  top[0]->Reshape(this->layer_param_.reshape_param().shape());
+  CHECK_EQ(top[0]->count(), bottom[0]->count());  // element count must match
+  top[0]->ShareData(*bottom[0]);  // top shares bottom's data and diff
+  top[0]->ShareDiff(*bottom[0]);
+}
+
+INSTANTIATE_CLASS(ReshapeLayer);
+REGISTER_LAYER_CLASS(Reshape);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/rnn_layer.cpp b/src/caffe/layers/rnn_layer.cpp
new file mode 100644
index 00000000000..a2a22f62819
--- /dev/null
+++ b/src/caffe/layers/rnn_layer.cpp
@@ -0,0 +1,217 @@
+#include <string>
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/sequence_layers.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void RNNLayer<Dtype>::RecurrentInputBlobNames(vector<string>* names) const {
+  names->resize(1);
+  (*names)[0] = "h_0";  // the net input declared in FillUnrolledNet
+}
+
+template <typename Dtype>
+void RNNLayer<Dtype>::RecurrentOutputBlobNames(vector<string>* names) const {
+  names->resize(1);
+  (*names)[0] = "h_" + this->int_to_str(this->T_);  // last timestep's h
+}
+
+template <typename Dtype>
+void RNNLayer<Dtype>::OutputBlobNames(vector<string>* names) const {
+  names->resize(1);
+  (*names)[0] = "o";  // top of the "o_concat" layer
+}
+
+template <typename Dtype>
+void RNNLayer<Dtype>::FillUnrolledNet(NetParameter* net_param) const {
+  const int num_output = this->layer_param_.recurrent_param().num_output();
+  CHECK_GT(num_output, 0) << "num_output must be positive";
+  const FillerParameter& weight_filler =
+      this->layer_param_.recurrent_param().weight_filler();
+  const FillerParameter& bias_filler =
+      this->layer_param_.recurrent_param().bias_filler();
+
+  // Add generic LayerParameter's (without bottoms/tops) of layer types we'll
+  // use to save redundant code.
+  LayerParameter hidden_param;
+  hidden_param.set_type("InnerProduct");
+  hidden_param.mutable_inner_product_param()->set_num_output(num_output);
+  hidden_param.mutable_inner_product_param()->set_bias_term(false);
+  hidden_param.mutable_inner_product_param()->set_axis(2);
+  hidden_param.mutable_inner_product_param()->
+      mutable_weight_filler()->CopyFrom(weight_filler);
+
+  LayerParameter biased_hidden_param(hidden_param);
+  biased_hidden_param.mutable_inner_product_param()->set_bias_term(true);
+  biased_hidden_param.mutable_inner_product_param()->
+      mutable_bias_filler()->CopyFrom(bias_filler);
+
+  LayerParameter sum_param;
+  sum_param.set_type("Eltwise");
+  sum_param.mutable_eltwise_param()->set_operation(
+      EltwiseParameter_EltwiseOp_SUM);
+
+  LayerParameter tanh_param;
+  tanh_param.set_type("TanH");
+
+  LayerParameter slice_param;
+  slice_param.set_type("Slice");
+  slice_param.mutable_slice_param()->set_axis(0);
+
+  BlobShape input_shape;
+  input_shape.add_dim(1);  // h_0 is a single timestep
+  input_shape.add_dim(this->N_);
+  input_shape.add_dim(num_output);
+  net_param->add_input("h_0");
+  net_param->add_input_shape()->CopyFrom(input_shape);
+
+  LayerParameter* cont_slice_param = net_param->add_layer();
+  cont_slice_param->CopyFrom(slice_param);
+  cont_slice_param->set_name("cont_slice");
+  cont_slice_param->add_bottom("cont");
+  cont_slice_param->mutable_slice_param()->set_axis(1);  // cont is (1, T, N)
+
+  // Add layer to transform all timesteps of x to the hidden state dimension.
+  //     W_xh_x = W_xh * x + b_h
+  {
+    LayerParameter* x_transform_param = net_param->add_layer();
+    x_transform_param->CopyFrom(biased_hidden_param);
+    x_transform_param->set_name("x_transform");
+    x_transform_param->add_param()->set_name("W_xh");
+    x_transform_param->add_param()->set_name("b_h");
+    x_transform_param->add_bottom("x");
+    x_transform_param->add_top("W_xh_x");
+  }
+
+  if (this->static_input_) {
+    // Add layer to transform x_static to the hidden state dimension.
+    //     W_xh_x_static = W_xh_static * x_static
+    LayerParameter* x_static_transform_param = net_param->add_layer();
+    x_static_transform_param->CopyFrom(hidden_param);
+    x_static_transform_param->mutable_inner_product_param()->set_axis(1);
+    x_static_transform_param->set_name("W_xh_x_static");
+    x_static_transform_param->add_param()->set_name("W_xh_static");
+    x_static_transform_param->add_bottom("x_static");
+    x_static_transform_param->add_top("W_xh_x_static");
+
+    LayerParameter* reshape_param = net_param->add_layer();
+    reshape_param->set_type("Reshape");
+    BlobShape* new_shape =
+         reshape_param->mutable_reshape_param()->mutable_shape();
+    new_shape->add_dim(1);  // One timestep.
+    new_shape->add_dim(this->N_);
+    new_shape->add_dim(
+        x_static_transform_param->inner_product_param().num_output());
+    reshape_param->set_name("W_xh_x_static_reshape");
+    reshape_param->add_bottom("W_xh_x_static");
+    reshape_param->add_top("W_xh_x_static");
+  }
+
+  LayerParameter* x_slice_param = net_param->add_layer();
+  x_slice_param->CopyFrom(slice_param);
+  x_slice_param->set_name("W_xh_x_slice");
+  x_slice_param->add_bottom("W_xh_x");
+
+  LayerParameter output_concat_layer;
+  output_concat_layer.set_name("o_concat");
+  output_concat_layer.set_type("Concat");
+  output_concat_layer.add_top("o");
+  output_concat_layer.mutable_concat_param()->set_axis(0);
+
+  for (int t = 1; t <= this->T_; ++t) {
+    string tm1s = this->int_to_str(t - 1);
+    string ts = this->int_to_str(t);
+
+    cont_slice_param->add_top("cont_" + ts);
+    x_slice_param->add_top("W_xh_x_" + ts);
+
+    // Add layer to flush the hidden state when beginning a new sequence,
+    // as indicated by cont_t.
+    //     h_conted_{t-1} := cont_t * h_{t-1}
+    //
+    // Normally, cont_t is binary (i.e., 0 or 1), so:
+    //     h_conted_{t-1} := h_{t-1} if cont_t == 1
+    //                       0   otherwise
+    {
+      LayerParameter* cont_h_param = net_param->add_layer();
+      cont_h_param->CopyFrom(sum_param);
+      cont_h_param->mutable_eltwise_param()->set_coeff_blob(true);
+      cont_h_param->set_name("h_conted_" + tm1s);
+      cont_h_param->add_bottom("h_" + tm1s);
+      cont_h_param->add_bottom("cont_" + ts);
+      cont_h_param->add_top("h_conted_" + tm1s);
+    }
+
+    // Add layer to compute
+    //     W_hh_h_{t-1} := W_hh * h_conted_{t-1}
+    {
+      LayerParameter* w_param = net_param->add_layer();
+      w_param->CopyFrom(hidden_param);
+      w_param->set_name("W_hh_h_" + tm1s);
+      w_param->add_param()->set_name("W_hh");
+      w_param->add_bottom("h_conted_" + tm1s);
+      w_param->add_top("W_hh_h_" + tm1s);
+      w_param->mutable_inner_product_param()->set_axis(2);
+    }
+
+    // Add layers to compute
+    //     h_t := \tanh( W_hh * h_conted_{t-1} + W_xh * x_t + b_h )
+    //          = \tanh( W_hh_h_{t-1} + W_xh_x_t [+ W_xh_x_static] )
+    {
+      LayerParameter* h_input_sum_param = net_param->add_layer();
+      h_input_sum_param->CopyFrom(sum_param);
+      h_input_sum_param->set_name("h_input_sum_" + ts);
+      h_input_sum_param->add_bottom("W_hh_h_" + tm1s);
+      h_input_sum_param->add_bottom("W_xh_x_" + ts);
+      if (this->static_input_) {
+        h_input_sum_param->add_bottom("W_xh_x_static");
+      }
+      h_input_sum_param->add_top("h_neuron_input_" + ts);
+    }
+    {
+      LayerParameter* h_neuron_param = net_param->add_layer();
+      h_neuron_param->CopyFrom(tanh_param);
+      h_neuron_param->set_name("h_neuron_" + ts);
+      h_neuron_param->add_bottom("h_neuron_input_" + ts);
+      h_neuron_param->add_top("h_" + ts);
+    }
+
+    // Add layer to compute
+    //     W_ho_h_t := W_ho * h_t + b_o
+    {
+      LayerParameter* w_param = net_param->add_layer();
+      w_param->CopyFrom(biased_hidden_param);
+      w_param->set_name("W_ho_h_" + ts);
+      w_param->add_param()->set_name("W_ho");
+      w_param->add_param()->set_name("b_o");
+      w_param->add_bottom("h_" + ts);
+      w_param->add_top("W_ho_h_" + ts);
+      w_param->mutable_inner_product_param()->set_axis(2);
+    }
+
+    // Add layers to compute
+    //     o_t := \tanh( W_ho h_t + b_o)
+    //          = \tanh( W_ho_h_t )
+    {
+      LayerParameter* o_neuron_param = net_param->add_layer();
+      o_neuron_param->CopyFrom(tanh_param);
+      o_neuron_param->set_name("o_neuron_" + ts);
+      o_neuron_param->add_bottom("W_ho_h_" + ts);
+      o_neuron_param->add_top("o_" + ts);
+    }
+    output_concat_layer.add_bottom("o_" + ts);
+  }  // for (int t = 1; t <= this->T_; ++t)
+
+  net_param->add_layer()->CopyFrom(output_concat_layer);
+}
+
+INSTANTIATE_CLASS(RNNLayer);
+REGISTER_LAYER_CLASS(RNN);
+
+}  // namespace caffe
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index fd00b122630..14c5bfc195e 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -213,6 +213,7 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
     layer_names_index_[layer_names_[layer_id]] = layer_id;
   }
   GetLearningRateAndWeightDecay();
+  ShareWeightData();
   debug_info_ = param.debug_info();
   LOG(INFO) << "Network initialization done.";
   LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
@@ -437,8 +438,6 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
       // Strict dimension checking -- all dims must be the same.
       CHECK(this_blob->shape() == owner_blob->shape());
     }
-    layers_[layer_id]->blobs()[param_id]->ShareData(
-        *layers_[owner_layer_id]->blobs()[owner_param_id]);
   }
 }
 
@@ -741,35 +740,7 @@ void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) const {
 
 template <typename Dtype>
 void Net<Dtype>::Update() {
-  // First, accumulate the diffs of any shared parameters into their owner's
-  // diff. (Assumes that the learning rate, weight decay, etc. have already been
-  // accounted for in the current diff.)
-  for (int i = 0; i < params_.size(); ++i) {
-    if (param_owners_[i] < 0) { continue; }
-    if (debug_info_) { UpdateDebugInfo(i); }
-    const int count = params_[i]->count();
-    const Dtype* this_diff;
-    Dtype* owner_diff;
-    switch (Caffe::mode()) {
-    case Caffe::CPU:
-      this_diff = params_[i]->cpu_diff();
-      owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();
-      caffe_add(count, this_diff, owner_diff, owner_diff);
-      break;
-#ifndef CPU_ONLY
-    case Caffe::GPU:
-      this_diff = params_[i]->gpu_diff();
-      owner_diff = params_[param_owners_[i]]->mutable_gpu_diff();
-      caffe_gpu_add(count, this_diff, owner_diff, owner_diff);
-      break;
-#else
-      NO_GPU;
-#endif
-    default:
-      LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
-    }
-  }
-  // Now, update the owned parameters.
+  // Update only the owned parameters.
   for (int i = 0; i < params_.size(); ++i) {
     if (param_owners_[i] >= 0) { continue; }
     if (debug_info_) { UpdateDebugInfo(i); }
@@ -777,6 +748,15 @@ void Net<Dtype>::Update() {
   }
 }
 
+template <typename Dtype>
+void Net<Dtype>::ShareWeightData() {
+  for (int i = 0; i < params_.size(); ++i) {
+    if (param_owners_[i] < 0) { continue; }  // negative: no owner to alias
+    params_[i]->ShareData(*params_[param_owners_[i]]);
+    params_[i]->ShareDiff(*params_[param_owners_[i]]);
+  }
+}
+
 template <typename Dtype>
 bool Net<Dtype>::has_blob(const string& blob_name) const {
   return blob_names_index_.find(blob_name) != blob_names_index_.end();
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 5b21cf20028..915b0af5d25 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -88,7 +88,7 @@ message NetParameter {
 // NOTE
 // Update the next available ID when you add a new SolverParameter field.
 //
-// SolverParameter next available ID: 36 (last added: clip_gradients)
+// SolverParameter next available ID: 37 (last added: iter_size)
 message SolverParameter {
   //////////////////////////////////////////////////////////////////////////////
   // Specifying the train and test networks
@@ -141,6 +141,7 @@ message SolverParameter {
   // Display the loss averaged over the last average_loss iterations
   optional int32 average_loss = 33 [default = 1];
   optional int32 max_iter = 7; // the maximum number of iterations
+  optional int32 iter_size = 36 [default = 1];
   optional string lr_policy = 8; // The learning rate decay policy.
   optional float gamma = 9; // The parameter to compute the learning rate.
   optional float power = 10; // The parameter to compute the learning rate.
@@ -259,7 +260,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 132 (last added: prelu_param)
+// LayerParameter next available layer-specific ID: 135 (last added: recurrent_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -311,6 +312,7 @@ message LayerParameter {
   optional DropoutParameter dropout_param = 108;
   optional DummyDataParameter dummy_data_param = 109;
   optional EltwiseParameter eltwise_param = 110;
+  optional EmbedParameter embed_param = 132;
   optional ExpParameter exp_param = 111;
   optional HDF5DataParameter hdf5_data_param = 112;
   optional HDF5OutputParameter hdf5_output_param = 113;
@@ -325,7 +327,9 @@ message LayerParameter {
   optional PowerParameter power_param = 122;
   optional PReLUParameter prelu_param = 131;
   optional PythonParameter python_param = 130;
+  optional RecurrentParameter recurrent_param = 134;
   optional ReLUParameter relu_param = 123;
+  optional ReshapeParameter reshape_param = 133;
   optional SigmoidParameter sigmoid_param = 124;
   optional SoftmaxParameter softmax_param = 125;
   optional SliceParameter slice_param = 126;
@@ -378,6 +382,12 @@ message AccuracyParameter {
 
   // If specified, ignore instances with the given label.
   optional int32 ignore_label = 3;
+
+  // Controls the denominator in the computed accuracy = #correct / denominator.
+  // Must be a positive number, or the default of 0, for the total input weight.
+  // If no input weights are used, the denominator is the batch size, as the
+  // weights each default to 1.
+  optional float denominator = 4 [default = 0];
 }
 
 // Message that stores parameters used by ArgMaxLayer
@@ -500,6 +510,24 @@ message EltwiseParameter {
   // Whether to use an asymptotically slower (for >2 inputs) but stabler method
   // of computing the gradient for the PROD operation. (No effect for SUM op.)
   optional bool stable_prod_grad = 3 [default = true];
+
+  // If true and the EltwiseOp is SUM, the last bottom blob is a singleton
+  // coefficient for the first N-1 bottom blobs, with shape (N-1, 1, 1, 1).
+  optional bool coeff_blob = 4 [default = false];
+}
+
+// Message that stores parameters used by EmbedLayer
+message EmbedParameter {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  // The input is given as integers to be interpreted as one-hot
+  // vector indices with dimension input_dim.  Hence input_dim should be
+  // 1 greater than the maximum possible input value.
+  optional uint32 input_dim = 2;
+
+  optional bool bias_term = 3 [default = true]; // Whether to use a bias term
+  optional FillerParameter weight_filler = 4; // The filler for the weight
+  optional FillerParameter bias_filler = 5; // The filler for the bias
+
 }
 
 // Message that stores parameters used by ExpLayer
@@ -666,6 +694,26 @@ message PythonParameter {
   optional string layer = 2;
 }
 
+// Message that stores parameters used by ReshapeLayer.
+message ReshapeParameter {
+  // The shape to give the Blob. Its "count" (the product of all of its
+  // dimensions) must equal the count of the input Blob.
+  optional BlobShape shape = 1;
+}
+
+// Message that stores parameters used by RecurrentLayer.
+message RecurrentParameter {
+  // The dimension of the output (and usually hidden state) representation.
+  // The default of 0 is not usable: it must be explicitly set to non-zero.
+  optional uint32 num_output = 1 [default = 0];
+
+  optional FillerParameter weight_filler = 2; // The filler for the weight
+  optional FillerParameter bias_filler = 3; // The filler for the bias
+
+  // Whether to enable displaying debug_info in the unrolled recurrent net.
+  optional bool debug_info = 4 [default = false];
+}
+
 // Message that stores parameters used by ReLULayer
 message ReLUParameter {
   // Allow non-zero slope for negative inputs to speed up optimization
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 096980dd7af..8e2d2a83a1f 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -168,6 +168,25 @@ void Solver<Dtype>::Step(int iters) {
   Dtype smoothed_loss = 0;
 
   for (; iter_ < stop_iter; ++iter_) {
+    // zero-init the param diffs, which accumulate over iter_size passes
+    for (int i = 0; i < net_->params().size(); ++i) {
+      shared_ptr<Blob<Dtype> > blob = net_->params()[i];
+      switch (Caffe::mode()) {
+      case Caffe::CPU:
+        caffe_set(blob->count(), static_cast<Dtype>(0),
+            blob->mutable_cpu_diff());
+        break;
+      case Caffe::GPU:
+#ifndef CPU_ONLY
+        caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
+            blob->mutable_gpu_diff());
+#else
+        NO_GPU;
+#endif
+        break;
+      }
+    }
+
     if (param_.test_interval() && iter_ % param_.test_interval() == 0
         && (iter_ > 0 || param_.test_initialization())) {
       TestAll();
@@ -175,7 +194,13 @@ void Solver<Dtype>::Step(int iters) {
 
     const bool display = param_.display() && iter_ % param_.display() == 0;
     net_->set_debug_info(display && param_.debug_info());
-    Dtype loss = net_->ForwardBackward(bottom_vec);
+    // accumulate the loss and gradient
+    Dtype loss = 0;
+    for (int i = 0; i < param_.iter_size(); ++i) {
+      loss += net_->ForwardBackward(bottom_vec);
+    }
+    loss /= param_.iter_size();
+    // average the loss across iterations for smoothed reporting
     if (losses.size() < average_loss) {
       losses.push_back(loss);
       int size = losses.size();
@@ -471,7 +496,8 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
   case Caffe::CPU:
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
-      Dtype local_rate = rate * net_params_lr[param_id];
+      Dtype local_rate = rate * net_params_lr[param_id]
+                              / this->param_.iter_size();
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
 
       if (local_decay) {
@@ -507,7 +533,8 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
 #ifndef CPU_ONLY
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
-      Dtype local_rate = rate * net_params_lr[param_id];
+      Dtype local_rate = rate * net_params_lr[param_id]
+                              / this->param_.iter_size();
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
 
       if (local_decay) {
diff --git a/src/caffe/test/test_eltwise_layer.cpp b/src/caffe/test/test_eltwise_layer.cpp
index be0c1347709..85c11341abd 100644
--- a/src/caffe/test/test_eltwise_layer.cpp
+++ b/src/caffe/test/test_eltwise_layer.cpp
@@ -22,14 +22,18 @@ class EltwiseLayerTest : public MultiDeviceTest<TypeParam> {
       : blob_bottom_a_(new Blob<Dtype>(2, 3, 4, 5)),
         blob_bottom_b_(new Blob<Dtype>(2, 3, 4, 5)),
         blob_bottom_c_(new Blob<Dtype>(2, 3, 4, 5)),
+        blob_bottom_coeff_(new Blob<Dtype>()),
         blob_top_(new Blob<Dtype>()) {
-    // fill the values
+    vector<int> coeff_shape(2);
+    coeff_shape[0] = 3; coeff_shape[1] = 2;
+    blob_bottom_coeff_->Reshape(coeff_shape);
     Caffe::set_random_seed(1701);
     FillerParameter filler_param;
     UniformFiller<Dtype> filler(filler_param);
     filler.Fill(this->blob_bottom_a_);
     filler.Fill(this->blob_bottom_b_);
     filler.Fill(this->blob_bottom_c_);
+    filler.Fill(this->blob_bottom_coeff_);
     blob_bottom_vec_.push_back(blob_bottom_a_);
     blob_bottom_vec_.push_back(blob_bottom_b_);
     blob_bottom_vec_.push_back(blob_bottom_c_);
@@ -39,11 +43,13 @@ class EltwiseLayerTest : public MultiDeviceTest<TypeParam> {
     delete blob_bottom_a_;
     delete blob_bottom_b_;
     delete blob_bottom_c_;
+    delete blob_bottom_coeff_;
     delete blob_top_;
   }
   Blob<Dtype>* const blob_bottom_a_;
   Blob<Dtype>* const blob_bottom_b_;
   Blob<Dtype>* const blob_bottom_c_;
+  Blob<Dtype>* const blob_bottom_coeff_;
   Blob<Dtype>* const blob_top_;
   vector<Blob<Dtype>*> blob_bottom_vec_;
   vector<Blob<Dtype>*> blob_top_vec_;
@@ -126,6 +132,37 @@ TYPED_TEST(EltwiseLayerTest, TestSumCoeff) {
   }
 }
 
+TYPED_TEST(EltwiseLayerTest, TestSumBlobCoeff) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param();
+  eltwise_param->set_operation(EltwiseParameter_EltwiseOp_SUM);
+  eltwise_param->set_coeff_blob(true);
+  eltwise_param->add_coeff(1);
+  eltwise_param->add_coeff(-0.5);
+  eltwise_param->add_coeff(2);
+  // The coefficient blob is passed as the final bottom.
+  this->blob_bottom_vec_.push_back(this->blob_bottom_coeff_);
+  EltwiseLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const int num = this->blob_top_->num();
+  const int dim = this->blob_top_->count() / num;
+  const int num_summands = this->blob_bottom_vec_.size() - 1;
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  const Dtype* blob_coeffs = this->blob_bottom_coeff_->cpu_data();
+  for (int n = 0; n < num; ++n) {
+    for (int d = 0; d < dim; ++d) {
+      Dtype expected = 0;
+      for (int i = 0; i < num_summands; ++i) {
+        const Dtype scale = blob_coeffs[i * num + n] * eltwise_param->coeff(i);
+        expected += scale * this->blob_bottom_vec_[i]->cpu_data()[n * dim + d];
+      }
+      EXPECT_NEAR(top_data[n * dim + d], expected, 1e-4);
+    }
+  }
+}
+
 TYPED_TEST(EltwiseLayerTest, TestStableProdGradient) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
@@ -175,6 +212,26 @@ TYPED_TEST(EltwiseLayerTest, TestSumCoeffGradient) {
       this->blob_top_vec_);
 }
 
+TYPED_TEST(EltwiseLayerTest, TestSumBlobCoeffGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  EltwiseParameter* eltwise_param = layer_param.mutable_eltwise_param();
+  eltwise_param->set_operation(EltwiseParameter_EltwiseOp_SUM);
+  eltwise_param->set_coeff_blob(true);
+  eltwise_param->add_coeff(1);
+  eltwise_param->add_coeff(-0.5);
+  eltwise_param->add_coeff(2);
+  EltwiseLayer<Dtype> layer(layer_param);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_coeff_);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  // Run the exhaustive check once per summand bottom (indices 0, 1, 2);
+  // the trailing coefficient bottom (index 3) is not selected here.
+  for (int check_bottom = 0; check_bottom < 3; ++check_bottom) {
+    checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+        this->blob_top_vec_, check_bottom);
+  }
+}
+
 TYPED_TEST(EltwiseLayerTest, TestMax) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
diff --git a/src/caffe/test/test_embed_layer.cpp b/src/caffe/test/test_embed_layer.cpp
new file mode 100644
index 00000000000..7a4fb9800f2
--- /dev/null
+++ b/src/caffe/test/test_embed_layer.cpp
@@ -0,0 +1,183 @@
+#include <cstring>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/vision_layers.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+#ifndef CPU_ONLY
+extern cudaDeviceProp CAFFE_TEST_CUDA_PROP;
+#endif
+
+template <typename TypeParam>
+class EmbedLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+ protected:
+  EmbedLayerTest()
+      : blob_bottom_(new Blob<Dtype>(4, 1, 1, 1)),
+        blob_top_(new Blob<Dtype>()) {
+    // Seed the bottom from a default UniformFiller; tests overwrite as needed.
+    FillerParameter filler_param;
+    UniformFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_);
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+  }
+  virtual ~EmbedLayerTest() { delete blob_bottom_; delete blob_top_; }
+  Blob<Dtype>* const blob_bottom_;  // owned; 4 x 1 x 1 x 1 input
+  Blob<Dtype>* const blob_top_;  // owned; starts as a default-constructed Blob
+  vector<Blob<Dtype>*> blob_bottom_vec_;  // holds only blob_bottom_
+  vector<Blob<Dtype>*> blob_top_vec_;  // holds only blob_top_
+};
+
+TYPED_TEST_CASE(EmbedLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(EmbedLayerTest, TestSetUp) {
+  typedef typename TypeParam::Dtype Dtype;
+  const int kNumOutput = 10;
+  LayerParameter layer_param;
+  layer_param.mutable_embed_param()->set_num_output(kNumOutput);
+  layer_param.mutable_embed_param()->set_input_dim(5);
+  EmbedLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  // The top keeps the 4 bottom axes (4 x 1 x 1 x 1) and appends num_output.
+  ASSERT_EQ(this->blob_top_->num_axes(), 5);
+  for (int axis = 0; axis < 4; ++axis) {
+    EXPECT_EQ(this->blob_top_->shape(axis), axis == 0 ? 4 : 1);
+  }
+  EXPECT_EQ(this->blob_top_->shape(4), kNumOutput);
+}
+
+TYPED_TEST(EmbedLayerTest, TestForward) {
+  typedef typename TypeParam::Dtype Dtype;
+  const int kNumOutput = 10;
+  const int kInputDim = 5;
+  LayerParameter layer_param;
+  EmbedParameter* embed_param = layer_param.mutable_embed_param();
+  embed_param->set_num_output(kNumOutput);
+  embed_param->set_input_dim(kInputDim);
+  embed_param->set_bias_term(false);
+  FillerParameter* weight_filler = embed_param->mutable_weight_filler();
+  weight_filler->set_type("uniform");
+  weight_filler->set_min(-10);
+  weight_filler->set_max(10);
+  shared_ptr<EmbedLayer<Dtype> > layer(new EmbedLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(1, layer->blobs().size());
+  vector<int> weight_shape(2, kNumOutput);
+  weight_shape[0] = kInputDim;
+  ASSERT_TRUE(weight_shape == layer->blobs()[0]->shape());
+  // Overwrite the bottom with random valid indices in [0, kInputDim).
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    this->blob_bottom_->mutable_cpu_data()[i] = caffe_rng_rand() % kInputDim;
+  }
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Each output vector must equal the weight row picked by its input index.
+  vector<int> weight_index(2, 0);
+  vector<int> top_index(5, 0);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    weight_index[0] = static_cast<int>(this->blob_bottom_->cpu_data()[i]);
+    top_index[0] = i;
+    for (int j = 0; j < kNumOutput; ++j) {
+      weight_index[1] = j;
+      top_index[4] = j;
+      EXPECT_EQ(layer->blobs()[0]->data_at(weight_index),
+                this->blob_top_->data_at(top_index));
+    }
+  }
+}
+
+TYPED_TEST(EmbedLayerTest, TestForwardWithBias) {
+  typedef typename TypeParam::Dtype Dtype;
+  const int kNumOutput = 10;
+  const int kInputDim = 5;
+  LayerParameter layer_param;
+  EmbedParameter* embed_param = layer_param.mutable_embed_param();
+  embed_param->set_num_output(kNumOutput);
+  embed_param->set_input_dim(kInputDim);
+  embed_param->set_bias_term(true);
+  FillerParameter* weight_filler = embed_param->mutable_weight_filler();
+  weight_filler->set_type("uniform");
+  weight_filler->set_min(-10);
+  weight_filler->set_max(10);
+  // The bias reuses the weight filler's uniform [-10, 10] settings.
+  embed_param->mutable_bias_filler()->CopyFrom(*weight_filler);
+  shared_ptr<EmbedLayer<Dtype> > layer(new EmbedLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(2, layer->blobs().size());
+  vector<int> weight_shape(2, kNumOutput);
+  weight_shape[0] = kInputDim;
+  ASSERT_TRUE(weight_shape == layer->blobs()[0]->shape());
+  // Overwrite the bottom with random valid indices in [0, kInputDim).
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    this->blob_bottom_->mutable_cpu_data()[i] = caffe_rng_rand() % kInputDim;
+  }
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Each output element is the selected weight entry plus the matching bias.
+  vector<int> bias_index(1, 0);
+  vector<int> weight_index(2, 0);
+  vector<int> top_index(5, 0);
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    weight_index[0] = static_cast<int>(this->blob_bottom_->cpu_data()[i]);
+    top_index[0] = i;
+    for (int j = 0; j < kNumOutput; ++j) {
+      weight_index[1] = j;
+      top_index[4] = j;
+      bias_index[0] = j;
+      EXPECT_EQ(layer->blobs()[0]->data_at(weight_index) +
+                layer->blobs()[1]->data_at(bias_index),
+                this->blob_top_->data_at(top_index));
+    }
+  }
+}
+
+TYPED_TEST(EmbedLayerTest, TestGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  EmbedParameter* embed_param = layer_param.mutable_embed_param();
+  embed_param->set_num_output(10);
+  embed_param->set_input_dim(5);
+  embed_param->set_bias_term(false);
+  embed_param->mutable_weight_filler()->set_type("uniform");
+  embed_param->mutable_weight_filler()->set_min(-10);
+  embed_param->mutable_weight_filler()->set_max(10);
+  EmbedLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  // Fixed valid indices in [0, 5), with index 2 deliberately repeated.
+  const int kIndices[] = {4, 2, 2, 3};
+  Dtype* bottom_data = this->blob_bottom_->mutable_cpu_data();
+  for (int i = 0; i < 4; ++i) { bottom_data[i] = kIndices[i]; }
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_, -2);  // -2: presumably params only -- confirm
+}
+
+TYPED_TEST(EmbedLayerTest, TestGradientWithBias) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  EmbedParameter* embed_param = layer_param.mutable_embed_param();
+  embed_param->set_num_output(10);
+  embed_param->set_input_dim(5);
+  embed_param->set_bias_term(true);
+  embed_param->mutable_weight_filler()->set_type("uniform");
+  embed_param->mutable_weight_filler()->set_min(-10);
+  embed_param->mutable_weight_filler()->set_max(10);
+  embed_param->mutable_bias_filler()->CopyFrom(embed_param->weight_filler());
+  EmbedLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  // Fixed valid indices in [0, 5), with index 2 deliberately repeated.
+  const int kIndices[] = {4, 2, 2, 3};
+  Dtype* bottom_data = this->blob_bottom_->mutable_cpu_data();
+  for (int i = 0; i < 4; ++i) { bottom_data[i] = kIndices[i]; }
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_, -2);  // -2: presumably params only -- confirm
+}
+
+}  // namespace caffe
diff --git a/src/caffe/test/test_lstm_layer.cpp b/src/caffe/test/test_lstm_layer.cpp
new file mode 100644
index 00000000000..a0ce45f6383
--- /dev/null
+++ b/src/caffe/test/test_lstm_layer.cpp
@@ -0,0 +1,265 @@
+#include <cstring>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/sequence_layers.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+template <typename TypeParam>
+class LSTMLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+  // Fixture holding bottoms/tops for both LSTMLayer and LSTMUnitLayer tests.
+ protected:
+  LSTMLayerTest() : num_output_(7) {
+    blob_bottom_vec_.push_back(&blob_bottom_);
+    blob_bottom_vec_.push_back(&blob_bottom_flush_);
+    blob_top_vec_.push_back(&blob_top_);
+    unit_blob_bottom_vec_.push_back(&unit_blob_bottom_c_prev_);  // bottom 0
+    unit_blob_bottom_vec_.push_back(&unit_blob_bottom_x_);  // bottom 1
+    unit_blob_bottom_vec_.push_back(&unit_blob_bottom_flush_);  // bottom 2
+    unit_blob_top_vec_.push_back(&unit_blob_top_c_);
+    unit_blob_top_vec_.push_back(&unit_blob_top_h_);
+
+    ReshapeBlobs(1, 3);  // default geometry: 1 timestep, 3 instances
+
+    layer_param_.mutable_recurrent_param()->set_num_output(num_output_);
+    FillerParameter* weight_filler =
+        layer_param_.mutable_recurrent_param()->mutable_weight_filler();
+    weight_filler->set_type("gaussian");
+    weight_filler->set_std(0.2);
+    FillerParameter* bias_filler =
+        layer_param_.mutable_recurrent_param()->mutable_bias_filler();
+    bias_filler->set_type("gaussian");
+    bias_filler->set_std(0.1);
+
+    layer_param_.set_phase(TEST);
+  }
+
+  void ReshapeBlobs(int num_timesteps, int num_instances) {
+    blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2);
+    vector<int> shape(2);
+    shape[0] = num_timesteps;
+    shape[1] = num_instances;
+    blob_bottom_flush_.Reshape(shape);
+    shape.push_back(num_output_);  // grow to 3 axes for the unit blobs below
+    // With N instances and D outputs: x is 1xNx4D, c_prev 1xNxD, flush 1x1xN.
+    shape[0] = 1; shape[1] = num_instances; shape[2] = 4 * num_output_;
+    unit_blob_bottom_x_.Reshape(shape);
+    shape[0] = 1; shape[1] = num_instances; shape[2] = num_output_;
+    unit_blob_bottom_c_prev_.Reshape(shape);
+    shape[0] = 1; shape[1] = 1; shape[2] = num_instances;
+    unit_blob_bottom_flush_.Reshape(shape);
+    // Only the three blobs below get uniform [-1, 1] noise; no flush blob does.
+    FillerParameter filler_param;
+    filler_param.set_min(-1);
+    filler_param.set_max(1);
+    UniformFiller<Dtype> filler(filler_param);
+    filler.Fill(&blob_bottom_);
+    filler.Fill(&unit_blob_bottom_c_prev_);
+    filler.Fill(&unit_blob_bottom_x_);
+  }
+
+  int num_output_;
+  LayerParameter layer_param_;
+  Blob<Dtype> blob_bottom_;
+  Blob<Dtype> blob_bottom_flush_;
+  Blob<Dtype> blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;  // {blob_bottom_, blob_bottom_flush_}
+  vector<Blob<Dtype>*> blob_top_vec_;  // {blob_top_}
+
+  Blob<Dtype> unit_blob_bottom_flush_;
+  Blob<Dtype> unit_blob_bottom_c_prev_;
+  Blob<Dtype> unit_blob_bottom_x_;
+  Blob<Dtype> unit_blob_top_c_;
+  Blob<Dtype> unit_blob_top_h_;
+  vector<Blob<Dtype>*> unit_blob_bottom_vec_;  // {c_prev, x, flush}
+  vector<Blob<Dtype>*> unit_blob_top_vec_;  // {c, h}
+};
+
+TYPED_TEST_CASE(LSTMLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(LSTMLayerTest, TestSetUp) {
+  typedef typename TypeParam::Dtype Dtype;
+  LSTMLayer<Dtype> lstm(this->layer_param_);
+  lstm.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  vector<int> want = this->blob_bottom_.shape();
+  want.erase(want.begin() + 2, want.end());  // keep the two leading axes
+  want.push_back(this->num_output_);
+  EXPECT_TRUE(this->blob_top_.shape() == want);
+}
+
+TYPED_TEST(LSTMLayerTest, TestForward) {
+  typedef typename TypeParam::Dtype Dtype;
+  const int kNumTimesteps = 3;
+  const int num = this->blob_bottom_.shape(1);
+  this->ReshapeBlobs(kNumTimesteps, num);
+
+  // Fill the flush blob with <0, 1, 1, ..., 1> along the time axis,
+  // indicating a sequence that begins at the first timestep
+  // then continues for the rest of the sequence.
+  for (int t = 0; t < kNumTimesteps; ++t) {
+    for (int n = 0; n < num; ++n) {
+      this->blob_bottom_flush_.mutable_cpu_data()[t * num + n] = t > 0;
+    }
+  }
+
+  // Process the full sequence in a single batch.
+  FillerParameter filler_param;
+  filler_param.set_mean(0);
+  filler_param.set_std(1);
+  GaussianFiller<Dtype> sequence_filler(filler_param);
+  sequence_filler.Fill(&this->blob_bottom_);
+  shared_ptr<LSTMLayer<Dtype> > layer(new LSTMLayer<Dtype>(this->layer_param_));
+  Caffe::set_random_seed(1701);  // the same seed precedes every SetUp below
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  LOG(INFO) << "Calling forward for full sequence LSTM";
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+  // Copy the inputs and outputs to reuse/check them later.
+  Blob<Dtype> bottom_copy(this->blob_bottom_.shape());
+  bottom_copy.CopyFrom(this->blob_bottom_);
+  Blob<Dtype> top_copy(this->blob_top_.shape());
+  top_copy.CopyFrom(this->blob_top_);
+
+  // Process the batch one timestep at a time with a fresh layer;
+  // check that we get the same result as the full-sequence pass.
+  this->ReshapeBlobs(1, num);
+  layer.reset(new LSTMLayer<Dtype>(this->layer_param_));
+  Caffe::set_random_seed(1701);
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  const int bottom_count = this->blob_bottom_.count();  // one timestep's input
+  const int top_count = this->blob_top_.count();  // one timestep's output
+  const Dtype kEpsilon = 1e-5;
+  for (int t = 0; t < kNumTimesteps; ++t) {
+    caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count,
+               this->blob_bottom_.mutable_cpu_data());
+    for (int n = 0; n < num; ++n) {
+      this->blob_bottom_flush_.mutable_cpu_data()[n] = t > 0;
+    }
+    LOG(INFO) << "Calling forward for LSTM timestep " << t;
+    layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+    for (int i = 0; i < top_count; ++i) {
+      ASSERT_LT(t * top_count + i, top_copy.count());
+      EXPECT_NEAR(this->blob_top_.cpu_data()[i],
+                  top_copy.cpu_data()[t * top_count + i], kEpsilon)
+         << "t = " << t << "; i = " << i;
+    }
+  }
+
+  // Process the batch one timestep at a time with every flush value set to 0.
+  // Check that we get a different result, except in the first timestep.
+  Caffe::set_random_seed(1701);
+  layer.reset(new LSTMLayer<Dtype>(this->layer_param_));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int t = 0; t < kNumTimesteps; ++t) {
+    caffe_copy(bottom_count, bottom_copy.cpu_data() + t * bottom_count,
+               this->blob_bottom_.mutable_cpu_data());
+    for (int n = 0; n < num; ++n) {
+      this->blob_bottom_flush_.mutable_cpu_data()[n] = 0;
+    }
+    LOG(INFO) << "Calling forward for LSTM timestep " << t;
+    layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+    for (int i = 0; i < top_count; ++i) {
+      if (t == 0) {
+        EXPECT_NEAR(this->blob_top_.cpu_data()[i],
+                    top_copy.cpu_data()[t * top_count + i], kEpsilon)
+           << "t = " << t << "; i = " << i;
+      } else {
+        EXPECT_NE(this->blob_top_.cpu_data()[i],
+                  top_copy.cpu_data()[t * top_count + i])
+           << "t = " << t << "; i = " << i;
+      }
+    }
+  }
+}
+
+TYPED_TEST(LSTMLayerTest, TestLSTMUnitSetUp) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter unit_param;
+  LSTMUnitLayer<Dtype> unit_layer(unit_param);
+  unit_layer.SetUp(this->unit_blob_bottom_vec_, this->unit_blob_top_vec_);
+  // Both tops (c and h) must mirror the shape of the c_prev bottom.
+  const Blob<Dtype>& c_prev = this->unit_blob_bottom_c_prev_;
+  const int num_axes = c_prev.num_axes();
+  ASSERT_EQ(num_axes, this->unit_blob_top_c_.num_axes());
+  ASSERT_EQ(num_axes, this->unit_blob_top_h_.num_axes());
+  for (int axis = 0; axis < num_axes; ++axis) {
+    EXPECT_EQ(c_prev.shape(axis), this->unit_blob_top_c_.shape(axis));
+    EXPECT_EQ(c_prev.shape(axis), this->unit_blob_top_h_.shape(axis));
+  }
+}
+
+TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  LSTMUnitLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  // Zero the flush values on the blob the unit layer actually receives
+  // (bottom 2 of unit_blob_bottom_vec_), then check bottoms 0 and 1.
+  Dtype* flush_data = this->unit_blob_bottom_flush_.mutable_cpu_data();
+  flush_data[0] = 0; flush_data[1] = 0; flush_data[2] = 0;
+  checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_,
+      this->unit_blob_top_vec_, 0);
+  checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_,
+      this->unit_blob_top_vec_, 1);
+}
+
+TYPED_TEST(LSTMLayerTest, TestLSTMUnitGradientNonZeroFlush) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  LSTMUnitLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  // Write the mixed 1/0/1 flush values into the blob the unit layer actually
+  // receives (bottom 2 of unit_blob_bottom_vec_), then check bottoms 0 and 1.
+  Dtype* flush_data = this->unit_blob_bottom_flush_.mutable_cpu_data();
+  flush_data[0] = 1; flush_data[1] = 0; flush_data[2] = 1;
+  checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_,
+      this->unit_blob_top_vec_, 0);
+  checker.CheckGradientExhaustive(&layer, this->unit_blob_bottom_vec_,
+      this->unit_blob_top_vec_, 1);
+}
+
+TYPED_TEST(LSTMLayerTest, TestGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LSTMLayer<Dtype> lstm(this->layer_param_);
+  GradientChecker<Dtype> gradient_checker(1e-2, 1e-3);
+  gradient_checker.CheckGradientExhaustive(
+      &lstm, this->blob_bottom_vec_, this->blob_top_vec_, 0);  // bottom 0
+}
+
+TYPED_TEST(LSTMLayerTest, TestGradientNonZeroFlush) {
+  typedef typename TypeParam::Dtype Dtype;
+  LSTMLayer<Dtype> lstm(this->layer_param_);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  Dtype* flush = this->blob_bottom_flush_.mutable_cpu_data();
+  const int count = this->blob_bottom_flush_.count();  // 3 in this fixture
+  for (int idx = 0; idx < count; ++idx) { flush[idx] = idx > 2; }  // all 0
+  checker.CheckGradientExhaustive(&lstm, this->blob_bottom_vec_,
+      this->blob_top_vec_, 0);
+}
+
+TYPED_TEST(LSTMLayerTest, TestGradientNonZeroFlushBufferSize2) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->ReshapeBlobs(2, 2);  // 2 timesteps x 2 instances
+  // Refill the input blob from a default-configured uniform filler.
+  FillerParameter default_filler_param;
+  UniformFiller<Dtype> input_filler(default_filler_param);
+  input_filler.Fill(&this->blob_bottom_);
+  LSTMLayer<Dtype> lstm(this->layer_param_);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  Dtype* flush = this->blob_bottom_flush_.mutable_cpu_data();
+  const int count = this->blob_bottom_flush_.count();  // 4 after the reshape
+  for (int idx = 0; idx < count; ++idx) { flush[idx] = idx > 2; }  // last is 1
+  checker.CheckGradientExhaustive(&lstm, this->blob_bottom_vec_,
+      this->blob_top_vec_, 0);
+}
+
+}  // namespace caffe
diff --git a/src/caffe/test/test_net.cpp b/src/caffe/test/test_net.cpp
index 08106e79274..c65116d7eb1 100644
--- a/src/caffe/test/test_net.cpp
+++ b/src/caffe/test/test_net.cpp
@@ -1007,11 +1007,10 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) {
   EXPECT_EQ(this->net_->layer_names()[2], "innerproduct2");
   Blob<Dtype>* ip1_weights = this->net_->layers()[1]->blobs()[0].get();
   Blob<Dtype>* ip2_weights = this->net_->layers()[2]->blobs()[0].get();
-  // Check that data blobs of shared weights share the same location in memory.
+  // Check that data and diff blobs of shared weights share the same memory
+  // locations.
   EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data());
-  // Check that diff blobs of shared weights are at different locations in
-  // memory.  (The diffs should be accumulated at update time.)
-  EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff());
+  EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff());
   this->net_->Forward(bottom);
   this->net_->Backward();
   // Compute the expected update as the data minus the two diffs.
@@ -1024,11 +1023,7 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) {
   // Make sure the diffs are non-trivial.
   for (int i = 0; i < count; ++i) {
     EXPECT_NE(0, ip1_weights->cpu_diff()[i]);
-    EXPECT_NE(0, ip2_weights->cpu_diff()[i]);
-    EXPECT_NE(ip1_weights->cpu_diff()[i], ip2_weights->cpu_diff()[i]);
   }
-  caffe_axpy(count, Dtype(1), ip2_weights->cpu_diff(),
-             shared_params.mutable_cpu_diff());
   caffe_axpy(count, Dtype(-1), shared_params.cpu_diff(),
              shared_params.mutable_cpu_data());
   const Dtype* expected_updated_params = shared_params.cpu_data();
@@ -1065,8 +1060,8 @@ TYPED_TEST(NetTest, TestSharedWeightsUpdate) {
     EXPECT_NE(0, ip1_weights->cpu_diff()[i]);
     EXPECT_NE(0, ip2_weights->cpu_diff()[i]);
     EXPECT_NE(ip1_weights->cpu_diff()[i], ip2_weights->cpu_diff()[i]);
-    EXPECT_EQ(ip1_weights->cpu_diff()[i] + ip2_weights->cpu_diff()[i],
-              shared_params.cpu_diff()[i]);
+    EXPECT_FLOAT_EQ(ip1_weights->cpu_diff()[i] + ip2_weights->cpu_diff()[i],
+                    shared_params.cpu_diff()[i]);
   }
   caffe_axpy(count, Dtype(-1), ip1_weights->cpu_diff(),
              unshared_params1.mutable_cpu_data());
@@ -1096,11 +1091,10 @@ TYPED_TEST(NetTest, TestSharedWeightsResume) {
   EXPECT_EQ(this->net_->layer_names()[2], "innerproduct2");
   Blob<Dtype>* ip1_weights = this->net_->layers()[1]->blobs()[0].get();
   Blob<Dtype>* ip2_weights = this->net_->layers()[2]->blobs()[0].get();
-  // Check that data blobs of shared weights share the same location in memory.
+  // Check that data and diff blobs of shared weights share the same memory
+  // locations.
   EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data());
-  // Check that diff blobs of shared weights are at different locations in
-  // memory.  (The diffs should be accumulated at update time.)
-  EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff());
+  EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff());
   this->net_->ForwardBackward(bottom);
   this->net_->Update();
   Blob<Dtype> shared_params;
@@ -1123,14 +1117,13 @@ TYPED_TEST(NetTest, TestSharedWeightsResume) {
   ASSERT_FALSE(NULL == ip1_weights);
   ASSERT_FALSE(NULL == ip2_weights);
   EXPECT_NE(ip1_weights, ip2_weights);
-  // Check that data blobs of shared weights share the same location in memory.
+  // Check that data and diff blobs of shared weights share the same memory
+  // locations.
   EXPECT_EQ(ip1_weights->cpu_data(), ip2_weights->cpu_data());
+  EXPECT_EQ(ip1_weights->cpu_diff(), ip2_weights->cpu_diff());
   for (int i = 0; i < count; ++i) {
     EXPECT_FLOAT_EQ(shared_params.cpu_data()[i], ip1_weights->cpu_data()[i]);
   }
-  // Check that diff blobs of shared weights are at different locations in
-  // memory.  (The diffs should be accumulated at update time.)
-  EXPECT_NE(ip1_weights->cpu_diff(), ip2_weights->cpu_diff());
 }
 
 TYPED_TEST(NetTest, TestParamPropagateDown) {
diff --git a/src/caffe/test/test_reshape_layer.cpp b/src/caffe/test/test_reshape_layer.cpp
new file mode 100644
index 00000000000..78f157b81a7
--- /dev/null
+++ b/src/caffe/test/test_reshape_layer.cpp
@@ -0,0 +1,138 @@
+#include <cstring>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/vision_layers.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+// Test fixture: a Gaussian-filled 2x3x6x5 bottom blob and an empty top blob.
+template <typename TypeParam>
+class ReshapeLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+ protected:
+  ReshapeLayerTest()
+      : blob_bottom_(new Blob<Dtype>(2, 3, 6, 5)),
+        blob_top_(new Blob<Dtype>()) {
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+    // Fix the seed, then draw the bottom values from a default Gaussian.
+    Caffe::set_random_seed(1701);
+    FillerParameter gaussian_param;
+    GaussianFiller<Dtype>(gaussian_param).Fill(this->blob_bottom_);
+  }
+  virtual ~ReshapeLayerTest() { delete blob_bottom_; delete blob_top_; }
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+TYPED_TEST_CASE(ReshapeLayerTest, TestDtypesAndDevices);
+
+// SetUp must give the top blob exactly the shape named in reshape_param;
+// checked for 1-, 2- and 6-axis targets that each hold 2*3*6*5 values.
+TYPED_TEST(ReshapeLayerTest, TestSetup) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  BlobShape* target = layer_param.mutable_reshape_param()->mutable_shape();
+  shared_ptr<ReshapeLayer<Dtype> > layer;
+  const int kTotal = 2 * 3 * 6 * 5;
+
+  // One axis holding everything.
+  target->Clear();
+  target->add_dim(kTotal);
+  layer.reset(new ReshapeLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_top_->num_axes(), 1);
+  EXPECT_EQ(this->blob_top_->shape(0), kTotal);
+
+  // Two axes: (2*3*6) x 5.
+  target->Clear();
+  target->add_dim(kTotal / 5);
+  target->add_dim(5);
+  layer.reset(new ReshapeLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_top_->num_axes(), 2);
+  EXPECT_EQ(this->blob_top_->shape(0), kTotal / 5);
+  EXPECT_EQ(this->blob_top_->shape(1), 5);
+
+  // Six axes, two of them singletons.
+  const int kSixAxes[] = {6, 1, 2, 3, 1, 5};
+  const int kNumSixAxes = sizeof(kSixAxes) / sizeof(kSixAxes[0]);
+  target->Clear();
+  for (int i = 0; i < kNumSixAxes; ++i) { target->add_dim(kSixAxes[i]); }
+  layer.reset(new ReshapeLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  ASSERT_EQ(this->blob_top_->num_axes(), kNumSixAxes);
+  for (int i = 0; i < kNumSixAxes; ++i) {
+    EXPECT_EQ(this->blob_top_->shape(i), kSixAxes[i]);
+  }
+}
+
+// Forward through a 6x2x3x5 reshape must leave the flat data unchanged.
+TYPED_TEST(ReshapeLayerTest, TestForward) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  BlobShape* target = layer_param.mutable_reshape_param()->mutable_shape();
+  const int kTargetDims[] = {6, 2, 3, 5};
+  for (int axis = 0; axis < 4; ++axis) { target->add_dim(kTargetDims[axis]); }
+  ReshapeLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* const top_data = this->blob_top_->cpu_data();
+  const Dtype* const bottom_data = this->blob_bottom_->cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_EQ(top_data[i], bottom_data[i]);
+  }
+}
+
+// After the bottom is flattened and layer.Reshape is called again, Forward
+// must still expose the (re-filled) bottom data in the top unchanged.
+TYPED_TEST(ReshapeLayerTest, TestForwardAfterReshape) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  BlobShape* target = layer_param.mutable_reshape_param()->mutable_shape();
+  const int kTargetDims[] = {6, 2, 3, 5};
+  for (int axis = 0; axis < 4; ++axis) { target->add_dim(kTargetDims[axis]); }
+  ReshapeLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // TestForward already covers the result above.  Now flatten the bottom to
+  // a single axis, call layer.Reshape, and run Forward again on new data.
+  const vector<int> flat_shape(1, 2 * 3 * 6 * 5);
+  this->blob_bottom_->Reshape(flat_shape);
+  layer.Reshape(this->blob_bottom_vec_, this->blob_top_vec_);
+  FillerParameter gaussian_param;
+  GaussianFiller<Dtype>(gaussian_param).Fill(this->blob_bottom_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  const Dtype* const top_data = this->blob_top_->cpu_data();
+  const Dtype* const bottom_data = this->blob_bottom_->cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_EQ(top_data[i], bottom_data[i]);
+  }
+}
+
+// Element-wise numerical gradient check of a 6x2x3x5 reshape.
+TYPED_TEST(ReshapeLayerTest, TestGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  BlobShape* target = layer_param.mutable_reshape_param()->mutable_shape();
+  const int kTargetDims[] = {6, 2, 3, 5};
+  for (int axis = 0; axis < 4; ++axis) { target->add_dim(kTargetDims[axis]); }
+  ReshapeLayer<Dtype> layer(layer_param);
+  // (1e-2, 1e-2): presumably step size and threshold -- see GradientChecker.
+  GradientChecker<Dtype> checker(1e-2, 1e-2);
+  checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_,
+                               this->blob_top_vec_);
+}
+
+
+}  // namespace caffe
diff --git a/src/caffe/test/test_rnn_layer.cpp b/src/caffe/test/test_rnn_layer.cpp
new file mode 100644
index 00000000000..eab9269ce77
--- /dev/null
+++ b/src/caffe/test/test_rnn_layer.cpp
@@ -0,0 +1,196 @@
+#include <cstring>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/sequence_layers.hpp"
+
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+namespace caffe {
+
+// Fixture for the RNNLayer tests: the bottoms are the input blob and the
+// flush blob (in that order); the single top receives the layer output.
+template <typename TypeParam>
+class RNNLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  RNNLayerTest() : num_output_(7) {
+    layer_param_.set_phase(TEST);
+    layer_param_.mutable_recurrent_param()->set_num_output(num_output_);
+    FillerParameter* const weights =
+        layer_param_.mutable_recurrent_param()->mutable_weight_filler();
+    weights->set_type("gaussian");
+    weights->set_std(0.2);
+    FillerParameter* const biases =
+        layer_param_.mutable_recurrent_param()->mutable_bias_filler();
+    biases->set_type("gaussian");
+    biases->set_std(0.1);
+
+    blob_bottom_vec_.push_back(&blob_bottom_);
+    blob_bottom_vec_.push_back(&blob_bottom_flush_);
+    blob_top_vec_.push_back(&blob_top_);
+    ReshapeBlobs(1, 3);  // default: one timestep, three instances
+  }
+
+  // Resizes the input to (num_timesteps, num_instances, 3, 2) and the flush
+  // blob to (num_timesteps, num_instances), then refills the input with
+  // uniform values in [-1, 1].  The flush blob is reshaped but not written.
+  void ReshapeBlobs(int num_timesteps, int num_instances) {
+    blob_bottom_.Reshape(num_timesteps, num_instances, 3, 2);
+    vector<int> flush_shape(2, num_timesteps);
+    flush_shape[1] = num_instances;
+    blob_bottom_flush_.Reshape(flush_shape);
+    FillerParameter uniform_param;
+    uniform_param.set_min(-1);
+    uniform_param.set_max(1);
+    UniformFiller<Dtype>(uniform_param).Fill(&blob_bottom_);
+  }
+
+  int num_output_;
+  LayerParameter layer_param_;
+  Blob<Dtype> blob_bottom_;
+  Blob<Dtype> blob_bottom_flush_;
+  Blob<Dtype> blob_top_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+TYPED_TEST_CASE(RNNLayerTest, TestDtypesAndDevices);
+
+// The top must keep the bottom's first two axes and end in num_output_.
+TYPED_TEST(RNNLayerTest, TestSetUp) {
+  RNNLayer<typename TypeParam::Dtype> layer(this->layer_param_);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  vector<int> want(this->blob_bottom_.shape());
+  want.resize(2);
+  want.push_back(this->num_output_);
+  EXPECT_TRUE(this->blob_top_.shape() == want);
+}
+
+// Checks that RNNLayer produces the same output whether a 3-timestep
+// sequence runs as one batch or one timestep at a time, and that zeroing
+// every flush entry changes the result after the first timestep.
+TYPED_TEST(RNNLayerTest, TestForward) {
+  typedef typename TypeParam::Dtype Dtype;
+  const int kNumTimesteps = 3;
+  const int num_instances = this->blob_bottom_.shape(1);
+  this->ReshapeBlobs(kNumTimesteps, num_instances);
+
+  // Flush pattern <0, 1, 1, ..., 1> per instance: the sequence starts at the
+  // first timestep and continues through the remaining ones.  Entry (t, n)
+  // sits at index t * num_instances + n, so only the first num_instances
+  // entries (t == 0) are 0.
+  Dtype* full_flush = this->blob_bottom_flush_.mutable_cpu_data();
+  for (int i = 0; i < kNumTimesteps * num_instances; ++i) {
+    full_flush[i] = i >= num_instances;
+  }
+
+  // Run the whole sequence as a single batch on fresh Gaussian input.
+  FillerParameter gaussian_param;
+  gaussian_param.set_mean(0);
+  gaussian_param.set_std(1);
+  GaussianFiller<Dtype>(gaussian_param).Fill(&this->blob_bottom_);
+  shared_ptr<RNNLayer<Dtype> > layer(new RNNLayer<Dtype>(this->layer_param_));
+  Caffe::set_random_seed(1701);
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  LOG(INFO) << "Calling forward for full sequence RNN";
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+  // Keep the full-sequence input and output as the reference.
+  Blob<Dtype> full_input(this->blob_bottom_.shape());
+  full_input.CopyFrom(this->blob_bottom_);
+  Blob<Dtype> full_output(this->blob_top_.shape());
+  full_output.CopyFrom(this->blob_top_);
+
+  // Replay one timestep per Forward call with the same flush pattern; every
+  // step must match the corresponding slice of the reference output.
+  this->ReshapeBlobs(1, num_instances);
+  layer.reset(new RNNLayer<Dtype>(this->layer_param_));
+  Caffe::set_random_seed(1701);
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  const int step_in_count = this->blob_bottom_.count();
+  const int step_out_count = this->blob_top_.count();
+  const Dtype kEpsilon = 1e-5;
+  for (int t = 0; t < kNumTimesteps; ++t) {
+    caffe_copy(step_in_count, full_input.cpu_data() + t * step_in_count,
+               this->blob_bottom_.mutable_cpu_data());
+    // Fetch the pointer after the previous Forward; do not hoist it.
+    Dtype* step_flush = this->blob_bottom_flush_.mutable_cpu_data();
+    for (int n = 0; n < num_instances; ++n) { step_flush[n] = t > 0; }
+    LOG(INFO) << "Calling forward for RNN timestep " << t;
+    layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+    for (int i = 0; i < step_out_count; ++i) {
+      const int ref_index = t * step_out_count + i;
+      ASSERT_LT(ref_index, full_output.count());
+      EXPECT_NEAR(this->blob_top_.cpu_data()[i],
+                  full_output.cpu_data()[ref_index], kEpsilon)
+         << "t = " << t << "; i = " << i;
+    }
+  }
+
+  // Replay again with every flush entry 0: only timestep 0 may still match.
+  Caffe::set_random_seed(1701);
+  layer.reset(new RNNLayer<Dtype>(this->layer_param_));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  for (int t = 0; t < kNumTimesteps; ++t) {
+    caffe_copy(step_in_count, full_input.cpu_data() + t * step_in_count,
+               this->blob_bottom_.mutable_cpu_data());
+    Dtype* step_flush = this->blob_bottom_flush_.mutable_cpu_data();
+    for (int n = 0; n < num_instances; ++n) { step_flush[n] = 0; }
+    LOG(INFO) << "Calling forward for RNN timestep " << t;
+    layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+    for (int i = 0; i < step_out_count; ++i) {
+      const Dtype actual = this->blob_top_.cpu_data()[i];
+      const Dtype reference = full_output.cpu_data()[t * step_out_count + i];
+      if (t > 0) {
+        EXPECT_NE(actual, reference) << "t = " << t << "; i = " << i;
+      } else {
+        EXPECT_NEAR(actual, reference, kEpsilon)
+           << "t = " << t << "; i = " << i;
+      }
+    }
+  }
+}
+
+// Exhaustive gradient check on the fixture's default blobs (last arg is 0).
+TYPED_TEST(RNNLayerTest, TestGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  RNNLayer<Dtype> rnn(this->layer_param_);
+  GradientChecker<Dtype>(1e-2, 1e-3).CheckGradientExhaustive(
+      &rnn, this->blob_bottom_vec_, this->blob_top_vec_, 0);
+}
+
+// NOTE(review): flush defaults to 3 entries, so `i > 2` writes only zeros.
+TYPED_TEST(RNNLayerTest, TestGradientNonZeroFlush) {
+  typedef typename TypeParam::Dtype Dtype;
+  RNNLayer<Dtype> layer(this->layer_param_);
+  Dtype* const flush = this->blob_bottom_flush_.mutable_cpu_data();
+  const int flush_count = this->blob_bottom_flush_.count();
+  for (int i = 0; i < flush_count; ++i) { flush[i] = i > 2; }
+  GradientChecker<Dtype>(1e-2, 1e-3).CheckGradientExhaustive(
+      &layer, this->blob_bottom_vec_, this->blob_top_vec_, 0);
+}
+
+// 2 timesteps x 2 instances: `i > 2` sets only the last flush entry to 1.
+TYPED_TEST(RNNLayerTest, TestGradientNonZeroFlushBufferSize2) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->ReshapeBlobs(2, 2);
+  // Refill the input using a default-constructed FillerParameter.
+  FillerParameter default_param;
+  UniformFiller<Dtype>(default_param).Fill(&this->blob_bottom_);
+  RNNLayer<Dtype> layer(this->layer_param_);
+  Dtype* const flush = this->blob_bottom_flush_.mutable_cpu_data();
+  for (int i = 0; i < this->blob_bottom_flush_.count(); ++i) {
+    flush[i] = i > 2;
+  }
+  GradientChecker<Dtype>(1e-2, 1e-3).CheckGradientExhaustive(
+      &layer, this->blob_bottom_vec_, this->blob_top_vec_, 0);
+}
+
+}  // namespace caffe