[WIP] Image captioning #15

Open · wants to merge 4 commits into master
402 changes: 402 additions & 0 deletions examples/imagecaption/COCO Preprocessing.ipynb

Large diffs are not rendered by default.

446 changes: 446 additions & 0 deletions examples/imagecaption/Caption Generation.ipynb

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions examples/imagecaption/README.md
@@ -0,0 +1,9 @@
# Image Captioning with LSTM

This is a partial implementation of "Show and Tell: A Neural Image Caption Generator" (http://arxiv.org/abs/1411.4555), borrowing heavily from Andrej Karpathy's NeuralTalk (https://github.com/karpathy/neuraltalk).

This example consists of three parts:

1. COCO Preprocessing - prepare the dataset by precomputing image representations using GoogLeNet
2. RNN Training - train a network to predict image captions
3. Caption Generation - use the trained network to caption new images
387 changes: 387 additions & 0 deletions examples/imagecaption/RNN Training.ipynb
@@ -0,0 +1,387 @@
# Image Captioning with LSTM

This is a partial implementation of "Show and Tell: A Neural Image Caption Generator" (http://arxiv.org/abs/1411.4555), borrowing heavily from Andrej Karpathy's NeuralTalk (https://github.com/karpathy/neuraltalk).

This example consists of three parts:
1. COCO Preprocessing - prepare the dataset by precomputing image representations using GoogLeNet
2. RNN Training - train a network to predict image captions
3. Caption Generation - use the trained network to caption new images
### Output
This notebook defines and trains an RNN to predict captions starting from a vector image representation. A link to download the final result is given in the next notebook.

### Prerequisites

To run this notebook, you'll need the output from the previous notebook, 'coco_with_cnn_features.pkl'. It can also be downloaded from https://s3.amazonaws.com/lasagne/recipes/datasets/coco_with_cnn_features.pkl
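If the file is not already present, it can be fetched up front. A minimal sketch (Python 2, matching the notebook's kernel; assumes the S3 URL above is still live):

```python
import os
import urllib

FNAME = 'coco_with_cnn_features.pkl'
URL = 'https://s3.amazonaws.com/lasagne/recipes/datasets/coco_with_cnn_features.pkl'

if not os.path.exists(FNAME):
    urllib.urlretrieve(URL, FNAME)  # downloads once; the file is large
```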
```python
import pickle
import random
import numpy as np

import theano
import theano.tensor as T
import lasagne

from collections import Counter
from lasagne.utils import floatX
```

Output:

```
Using gpu device 0: GeForce GTX 770 (CNMeM is disabled)
```
Load the preprocessed dataset containing features extracted by GoogLeNet:

```python
# binary mode: the pickle was written with a binary protocol
dataset = pickle.load(open('coco_with_cnn_features.pkl', 'rb'))
```
Count the words occurring at least 5 times and construct the mappings int <-> word:

```python
allwords = Counter()
for item in dataset:
    for sentence in item['sentences']:
        allwords.update(sentence['tokens'])

vocab = [k for k, v in allwords.items() if v >= 5]
vocab.insert(0, '#START#')
vocab.append('#END#')

word_to_index = {w: i for i, w in enumerate(vocab)}
index_to_word = {i: w for i, w in enumerate(vocab)}
```
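A quick consistency check (not in the original notebook): the two dictionaries are inverses of each other, with the special tokens at the ends of the vocabulary:

```python
assert word_to_index['#START#'] == 0
assert word_to_index['#END#'] == len(vocab) - 1
assert all(index_to_word[word_to_index[w]] == w for w in vocab)
```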
```python
len(vocab)
```

Output:

```
10370
```
```python
SEQUENCE_LENGTH = 32
MAX_SENTENCE_LENGTH = SEQUENCE_LENGTH - 3  # 1 slot for the image, 1 for #START#, 1 for #END#
BATCH_SIZE = 100
CNN_FEATURE_SIZE = 1000
EMBEDDING_SIZE = 512
```
```python
# Returns a list of tuples (cnn features, list of words, image ID)
def get_data_batch(dataset, size, split='train'):
    items = []

    while len(items) < size:
        item = random.choice(dataset)
        if item['split'] != split:
            continue
        sentence = random.choice(item['sentences'])['tokens']
        if len(sentence) > MAX_SENTENCE_LENGTH:
            continue
        items.append((item['cnn features'], sentence, item['cocoid']))

    return items
```
```python
# Convert a list of tuples into arrays that can be fed into the network
def prep_batch_for_network(batch):
    x_cnn = floatX(np.zeros((len(batch), CNN_FEATURE_SIZE)))
    x_sentence = np.zeros((len(batch), SEQUENCE_LENGTH - 1), dtype='int32')
    y_sentence = np.zeros((len(batch), SEQUENCE_LENGTH), dtype='int32')
    mask = np.zeros((len(batch), SEQUENCE_LENGTH), dtype='bool')

    for j, (cnn_features, sentence, _) in enumerate(batch):
        x_cnn[j] = cnn_features
        i = 0
        for word in ['#START#'] + sentence + ['#END#']:
            if word in word_to_index:
                mask[j, i] = True
                y_sentence[j, i] = word_to_index[word]
                x_sentence[j, i] = word_to_index[word]
                i += 1
        # the first output is conditioned on the image alone and would
        # always be #START#, so it is excluded from the loss
        mask[j, 0] = False

    return x_cnn, x_sentence, y_sentence, mask
```
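To make the alignment concrete, consider a hypothetical two-word caption "a dog" (assuming both words survive the frequency cutoff). The image occupies time step 0 of the RNN input, so the target at each step is the word that should come next:

```python
# t              0        1        2     3      4 ...
# RNN input      image    #START#  a     dog    #END#
# y_sentence[t]  #START#  a        dog   #END#  0 (padding)
# mask[t]        False    True     True  True   False
#
# x_sentence holds [#START#, a, dog, #END#, 0, ...]; prepending the image
# embedding shifts it one step, so the output at step t is scored against
# y_sentence[t], the next word given everything up to step t-1.
```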
```python
# sentence embedding maps integer sequence with dim (BATCH_SIZE, SEQUENCE_LENGTH - 1) to
# (BATCH_SIZE, SEQUENCE_LENGTH - 1, EMBEDDING_SIZE)
l_input_sentence = lasagne.layers.InputLayer((BATCH_SIZE, SEQUENCE_LENGTH - 1))
l_sentence_embedding = lasagne.layers.EmbeddingLayer(l_input_sentence,
                                                     input_size=len(vocab),
                                                     output_size=EMBEDDING_SIZE,
                                                     )

# cnn embedding changes the dimensionality of the representation from 1000 to EMBEDDING_SIZE,
# and reshapes to add the time dimension - final dim (BATCH_SIZE, 1, EMBEDDING_SIZE)
l_input_cnn = lasagne.layers.InputLayer((BATCH_SIZE, CNN_FEATURE_SIZE))
l_cnn_embedding = lasagne.layers.DenseLayer(l_input_cnn, num_units=EMBEDDING_SIZE,
                                            nonlinearity=lasagne.nonlinearities.identity)

l_cnn_embedding = lasagne.layers.ReshapeLayer(l_cnn_embedding, ([0], 1, [1]))

# the two are concatenated to form the RNN input with dim (BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_SIZE)
l_rnn_input = lasagne.layers.ConcatLayer([l_cnn_embedding, l_sentence_embedding])

l_dropout_input = lasagne.layers.DropoutLayer(l_rnn_input, p=0.5)
l_lstm = lasagne.layers.LSTMLayer(l_dropout_input,
                                  num_units=EMBEDDING_SIZE,
                                  unroll_scan=True,
                                  grad_clipping=5.)
l_dropout_output = lasagne.layers.DropoutLayer(l_lstm, p=0.5)

# the RNN output is reshaped to combine the batch and time dimensions
# dim (BATCH_SIZE * SEQUENCE_LENGTH, EMBEDDING_SIZE)
l_shp = lasagne.layers.ReshapeLayer(l_dropout_output, (-1, EMBEDDING_SIZE))

# decoder is a fully connected layer with one output unit for each word in the vocabulary
l_decoder = lasagne.layers.DenseLayer(l_shp, num_units=len(vocab), nonlinearity=lasagne.nonlinearities.softmax)

# finally, the separation between batch and time dimension is restored
l_out = lasagne.layers.ReshapeLayer(l_decoder, (BATCH_SIZE, SEQUENCE_LENGTH, len(vocab)))
```
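As a quick sanity check (not in the original notebook), Lasagne can report the symbolic output shape of each stage:

```python
# expected shapes for BATCH_SIZE=100, SEQUENCE_LENGTH=32, EMBEDDING_SIZE=512
print(lasagne.layers.get_output_shape(l_sentence_embedding))  # (100, 31, 512)
print(lasagne.layers.get_output_shape(l_cnn_embedding))       # (100, 1, 512)
print(lasagne.layers.get_output_shape(l_rnn_input))           # (100, 32, 512)
print(lasagne.layers.get_output_shape(l_out))                 # (100, 32, 10370)
```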
Define symbolic variables for the various inputs:

```python
# cnn feature vector
x_cnn_sym = T.matrix()

# sentence encoded as sequence of integer word tokens
x_sentence_sym = T.imatrix()

# mask defines which elements of the sequence should be predicted
mask_sym = T.imatrix()

# ground truth for the RNN output
y_sentence_sym = T.imatrix()
```
```python
output = lasagne.layers.get_output(l_out, {
        l_input_sentence: x_sentence_sym,
        l_input_cnn: x_cnn_sym
        })
```
```python
def calc_cross_ent(net_output, mask, targets):
    # Helper function to calculate the cross entropy error
    preds = T.reshape(net_output, (-1, len(vocab)))
    targets = T.flatten(targets)
    # keep only the costs at positions the mask marks as real words
    cost = T.nnet.categorical_crossentropy(preds, targets)[T.flatten(mask).nonzero()]
    return cost

loss = T.mean(calc_cross_ent(output, mask_sym, y_sentence_sym))
```
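The last line of `calc_cross_ent` uses mask indexing to drop the padded positions before the mean is taken. The same pattern in plain NumPy (a toy illustration, not from the notebook):

```python
import numpy as np

costs = np.array([9.0, 0.5, 0.7, 0.2, 9.0])        # per-position cross entropies
mask = np.array([False, True, True, True, False])  # True where a real word was present
print(costs[mask.nonzero()])                       # [ 0.5  0.7  0.2]
```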
```python
MAX_GRAD_NORM = 15

all_params = lasagne.layers.get_all_params(l_out, trainable=True)

# clip each gradient elementwise, then rescale so the total norm
# stays below MAX_GRAD_NORM
all_grads = T.grad(loss, all_params)
all_grads = [T.clip(g, -5, 5) for g in all_grads]
all_grads, norm = lasagne.updates.total_norm_constraint(
    all_grads, MAX_GRAD_NORM, return_norm=True)

updates = lasagne.updates.adam(all_grads, all_params, learning_rate=0.001)

f_train = theano.function([x_cnn_sym, x_sentence_sym, mask_sym, y_sentence_sym],
                          [loss, norm],
                          updates=updates
                          )

f_val = theano.function([x_cnn_sym, x_sentence_sym, mask_sym, y_sentence_sym], loss)
```
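Note that `f_val` reuses the same `output` expression, so dropout stays active during validation. If deterministic evaluation is preferred, a variant can be compiled with dropout disabled (a sketch, not part of the original notebook):

```python
# deterministic=True turns off the dropout layers for evaluation
output_det = lasagne.layers.get_output(l_out, {
        l_input_sentence: x_sentence_sym,
        l_input_cnn: x_cnn_sym
        }, deterministic=True)
loss_det = T.mean(calc_cross_ent(output_det, mask_sym, y_sentence_sym))
f_val_det = theano.function([x_cnn_sym, x_sentence_sym, mask_sym, y_sentence_sym], loss_det)
```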
```python
for iteration in range(20000):
    x_cnn, x_sentence, y_sentence, mask = prep_batch_for_network(get_data_batch(dataset, BATCH_SIZE))
    loss_train, norm = f_train(x_cnn, x_sentence, mask, y_sentence)
    if not iteration % 250:
        print('Iteration {}, loss_train: {}, norm: {}'.format(iteration, loss_train, norm))
        try:
            batch = get_data_batch(dataset, BATCH_SIZE, split='val')
            x_cnn, x_sentence, y_sentence, mask = prep_batch_for_network(batch)
            loss_val = f_val(x_cnn, x_sentence, mask, y_sentence)
            print('Val loss: {}'.format(loss_val))
        except IndexError:
            continue
```
```python
param_values = lasagne.layers.get_all_param_values(l_out)
d = {'param values': param_values,
     'vocab': vocab,
     'word_to_index': word_to_index,
     'index_to_word': index_to_word,
     }
# 'wb': HIGHEST_PROTOCOL is a binary pickle format
pickle.dump(d, open('lstm_coco_trained.pkl', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
```
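To restore the model later (a link to download this pickle is given in the next notebook), the saved values can be loaded back into an identically defined network. A minimal sketch:

```python
# rebuild the layers above first, then restore the trained weights
d = pickle.load(open('lstm_coco_trained.pkl', 'rb'))
lasagne.layers.set_all_param_values(l_out, d['param values'])
word_to_index = d['word_to_index']
index_to_word = d['index_to_word']
```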
The notebook metadata declares a Python 2 kernel (IPython, Python 2.7.6), nbformat 4.