[WIP] Image captioning #15

Open · wants to merge 4 commits into master
402 changes: 402 additions & 0 deletions examples/imagecaption/COCO Preprocessing.ipynb

Large diffs are not rendered by default.

446 changes: 446 additions & 0 deletions examples/imagecaption/Caption Generation.ipynb

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions examples/imagecaption/README.md
@@ -0,0 +1,9 @@
# Image Captioning with LSTM

This is a partial implementation of "Show and Tell: A Neural Image Caption Generator" (http://arxiv.org/abs/1411.4555), borrowing heavily from Andrej Karpathy's NeuralTalk (https://github.com/karpathy/neuraltalk).

This example consists of three parts:

1. COCO Preprocessing - prepare the dataset by precomputing image representations using GoogLeNet
2. RNN Training - train a network to predict image captions
3. Caption Generation - use the trained network to caption new images
387 changes: 387 additions & 0 deletions examples/imagecaption/RNN Training.ipynb
@@ -0,0 +1,387 @@
# Image Captioning with LSTM

This is a partial implementation of "Show and Tell: A Neural Image Caption Generator" (http://arxiv.org/abs/1411.4555), borrowing heavily from Andrej Karpathy's NeuralTalk (https://github.com/karpathy/neuraltalk).

This example consists of three parts:
1. COCO Preprocessing - prepare the dataset by precomputing image representations using GoogLeNet
2. RNN Training - train a network to predict image captions
3. Caption Generation - use the trained network to caption new images
### Output
This notebook defines and trains an RNN to predict captions starting from a vector image representation. A link to download the final result is given in the next notebook.

### Prerequisites

To run this notebook, you'll need the output from the previous notebook, 'coco_with_cnn_features.pkl'. It can also be downloaded from https://s3.amazonaws.com/lasagne/recipes/datasets/coco_with_cnn_features.pkl
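If the file is not already present, it can be fetched up front. A minimal sketch (Python 2, matching the notebook's kernel; assumes the S3 URL above is still live):

```python
import os
import urllib

FNAME = 'coco_with_cnn_features.pkl'
URL = 'https://s3.amazonaws.com/lasagne/recipes/datasets/coco_with_cnn_features.pkl'

if not os.path.exists(FNAME):
    urllib.urlretrieve(URL, FNAME)  # downloads once; the file is large
```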
```python
import pickle
import random
import numpy as np

import theano
import theano.tensor as T
import lasagne

from collections import Counter
from lasagne.utils import floatX
```

Output:

```
Using gpu device 0: GeForce GTX 770 (CNMeM is disabled)
```
Load the preprocessed dataset containing features extracted by GoogLeNet:

```python
# binary mode: the pickle was written with a binary protocol
dataset = pickle.load(open('coco_with_cnn_features.pkl', 'rb'))
```
Count the words occurring at least 5 times and construct the mappings int <-> word:

```python
allwords = Counter()
for item in dataset:
    for sentence in item['sentences']:
        allwords.update(sentence['tokens'])

vocab = [k for k, v in allwords.items() if v >= 5]
vocab.insert(0, '#START#')
vocab.append('#END#')

word_to_index = {w: i for i, w in enumerate(vocab)}
index_to_word = {i: w for i, w in enumerate(vocab)}
```
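A quick consistency check (not in the original notebook): the two dictionaries are inverses of each other, with the special tokens at the ends of the vocabulary:

```python
assert word_to_index['#START#'] == 0
assert word_to_index['#END#'] == len(vocab) - 1
assert all(index_to_word[word_to_index[w]] == w for w in vocab)
```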
```python
len(vocab)
```

Output:

```
10370
```
```python
SEQUENCE_LENGTH = 32
MAX_SENTENCE_LENGTH = SEQUENCE_LENGTH - 3  # 1 slot for the image, 1 for #START#, 1 for #END#
BATCH_SIZE = 100
CNN_FEATURE_SIZE = 1000
EMBEDDING_SIZE = 512
```
```python
# Returns a list of tuples (cnn features, list of words, image ID)
def get_data_batch(dataset, size, split='train'):
    items = []

    while len(items) < size:
        item = random.choice(dataset)
        if item['split'] != split:
            continue
        sentence = random.choice(item['sentences'])['tokens']
        if len(sentence) > MAX_SENTENCE_LENGTH:
            continue
        items.append((item['cnn features'], sentence, item['cocoid']))

    return items
```
```python
# Convert a list of tuples into arrays that can be fed into the network
def prep_batch_for_network(batch):
    x_cnn = floatX(np.zeros((len(batch), CNN_FEATURE_SIZE)))
    x_sentence = np.zeros((len(batch), SEQUENCE_LENGTH - 1), dtype='int32')
    y_sentence = np.zeros((len(batch), SEQUENCE_LENGTH), dtype='int32')
    mask = np.zeros((len(batch), SEQUENCE_LENGTH), dtype='bool')

    for j, (cnn_features, sentence, _) in enumerate(batch):
        x_cnn[j] = cnn_features
        i = 0
        for word in ['#START#'] + sentence + ['#END#']:
            if word in word_to_index:
                mask[j, i] = True
                y_sentence[j, i] = word_to_index[word]
                x_sentence[j, i] = word_to_index[word]
                i += 1
        # the first output is conditioned on the image alone and would
        # always be #START#, so it is excluded from the loss
        mask[j, 0] = False

    return x_cnn, x_sentence, y_sentence, mask
```
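To make the alignment concrete, consider a hypothetical two-word caption "a dog" (assuming both words survive the frequency cutoff). The image occupies time step 0 of the RNN input, so the target at each step is the word that should come next:

```python
# t              0        1        2     3      4 ...
# RNN input      image    #START#  a     dog    #END#
# y_sentence[t]  #START#  a        dog   #END#  0 (padding)
# mask[t]        False    True     True  True   False
#
# x_sentence holds [#START#, a, dog, #END#, 0, ...]; prepending the image
# embedding shifts it one step, so the output at step t is scored against
# y_sentence[t], the next word given everything up to step t-1.
```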
```python
# sentence embedding maps integer sequence with dim (BATCH_SIZE, SEQUENCE_LENGTH - 1) to
# (BATCH_SIZE, SEQUENCE_LENGTH - 1, EMBEDDING_SIZE)
l_input_sentence = lasagne.layers.InputLayer((BATCH_SIZE, SEQUENCE_LENGTH - 1))
l_sentence_embedding = lasagne.layers.EmbeddingLayer(l_input_sentence,
                                                     input_size=len(vocab),
                                                     output_size=EMBEDDING_SIZE,
                                                     )

# cnn embedding changes the dimensionality of the representation from 1000 to EMBEDDING_SIZE,
# and reshapes to add the time dimension - final dim (BATCH_SIZE, 1, EMBEDDING_SIZE)
l_input_cnn = lasagne.layers.InputLayer((BATCH_SIZE, CNN_FEATURE_SIZE))
l_cnn_embedding = lasagne.layers.DenseLayer(l_input_cnn, num_units=EMBEDDING_SIZE,
                                            nonlinearity=lasagne.nonlinearities.identity)

l_cnn_embedding = lasagne.layers.ReshapeLayer(l_cnn_embedding, ([0], 1, [1]))

# the two are concatenated to form the RNN input with dim (BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_SIZE)
l_rnn_input = lasagne.layers.ConcatLayer([l_cnn_embedding, l_sentence_embedding])

l_dropout_input = lasagne.layers.DropoutLayer(l_rnn_input, p=0.5)
l_lstm = lasagne.layers.LSTMLayer(l_dropout_input,
                                  num_units=EMBEDDING_SIZE,
                                  unroll_scan=True,
                                  grad_clipping=5.)
l_dropout_output = lasagne.layers.DropoutLayer(l_lstm, p=0.5)

# the RNN output is reshaped to combine the batch and time dimensions
# dim (BATCH_SIZE * SEQUENCE_LENGTH, EMBEDDING_SIZE)
l_shp = lasagne.layers.ReshapeLayer(l_dropout_output, (-1, EMBEDDING_SIZE))

# decoder is a fully connected layer with one output unit for each word in the vocabulary
l_decoder = lasagne.layers.DenseLayer(l_shp, num_units=len(vocab), nonlinearity=lasagne.nonlinearities.softmax)

# finally, the separation between batch and time dimension is restored
l_out = lasagne.layers.ReshapeLayer(l_decoder, (BATCH_SIZE, SEQUENCE_LENGTH, len(vocab)))
```
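As a quick sanity check (not in the original notebook), Lasagne can report the symbolic output shape of each stage:

```python
# expected shapes for BATCH_SIZE=100, SEQUENCE_LENGTH=32, EMBEDDING_SIZE=512
print(lasagne.layers.get_output_shape(l_sentence_embedding))  # (100, 31, 512)
print(lasagne.layers.get_output_shape(l_cnn_embedding))       # (100, 1, 512)
print(lasagne.layers.get_output_shape(l_rnn_input))           # (100, 32, 512)
print(lasagne.layers.get_output_shape(l_out))                 # (100, 32, 10370)
```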
Define symbolic variables for the various inputs:

```python
# cnn feature vector
x_cnn_sym = T.matrix()

# sentence encoded as sequence of integer word tokens
x_sentence_sym = T.imatrix()

# mask defines which elements of the sequence should be predicted
mask_sym = T.imatrix()

# ground truth for the RNN output
y_sentence_sym = T.imatrix()
```
```python
output = lasagne.layers.get_output(l_out, {
        l_input_sentence: x_sentence_sym,
        l_input_cnn: x_cnn_sym
        })
```
```python
def calc_cross_ent(net_output, mask, targets):
    # Helper function to calculate the cross entropy error
    preds = T.reshape(net_output, (-1, len(vocab)))
    targets = T.flatten(targets)
    # keep only the costs at positions the mask marks as real words
    cost = T.nnet.categorical_crossentropy(preds, targets)[T.flatten(mask).nonzero()]
    return cost

loss = T.mean(calc_cross_ent(output, mask_sym, y_sentence_sym))
```
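The last line of `calc_cross_ent` uses mask indexing to drop the padded positions before the mean is taken. The same pattern in plain NumPy (a toy illustration, not from the notebook):

```python
import numpy as np

costs = np.array([9.0, 0.5, 0.7, 0.2, 9.0])        # per-position cross entropies
mask = np.array([False, True, True, True, False])  # True where a real word was present
print(costs[mask.nonzero()])                       # [ 0.5  0.7  0.2]
```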
```python
MAX_GRAD_NORM = 15

all_params = lasagne.layers.get_all_params(l_out, trainable=True)

# clip each gradient elementwise, then rescale so the total norm
# stays below MAX_GRAD_NORM
all_grads = T.grad(loss, all_params)
all_grads = [T.clip(g, -5, 5) for g in all_grads]
all_grads, norm = lasagne.updates.total_norm_constraint(
    all_grads, MAX_GRAD_NORM, return_norm=True)

updates = lasagne.updates.adam(all_grads, all_params, learning_rate=0.001)

f_train = theano.function([x_cnn_sym, x_sentence_sym, mask_sym, y_sentence_sym],
                          [loss, norm],
                          updates=updates
                          )

f_val = theano.function([x_cnn_sym, x_sentence_sym, mask_sym, y_sentence_sym], loss)
```
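Note that `f_val` reuses the same `output` expression, so dropout stays active during validation. If deterministic evaluation is preferred, a variant can be compiled with dropout disabled (a sketch, not part of the original notebook):

```python
# deterministic=True turns off the dropout layers for evaluation
output_det = lasagne.layers.get_output(l_out, {
        l_input_sentence: x_sentence_sym,
        l_input_cnn: x_cnn_sym
        }, deterministic=True)
loss_det = T.mean(calc_cross_ent(output_det, mask_sym, y_sentence_sym))
f_val_det = theano.function([x_cnn_sym, x_sentence_sym, mask_sym, y_sentence_sym], loss_det)
```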
```python
for iteration in range(20000):
    x_cnn, x_sentence, y_sentence, mask = prep_batch_for_network(get_data_batch(dataset, BATCH_SIZE))
    loss_train, norm = f_train(x_cnn, x_sentence, mask, y_sentence)
    if not iteration % 250:
        print('Iteration {}, loss_train: {}, norm: {}'.format(iteration, loss_train, norm))
        try:
            batch = get_data_batch(dataset, BATCH_SIZE, split='val')
            x_cnn, x_sentence, y_sentence, mask = prep_batch_for_network(batch)
            loss_val = f_val(x_cnn, x_sentence, mask, y_sentence)
            print('Val loss: {}'.format(loss_val))
        except IndexError:
            continue
```
```python
param_values = lasagne.layers.get_all_param_values(l_out)
d = {'param values': param_values,
     'vocab': vocab,
     'word_to_index': word_to_index,
     'index_to_word': index_to_word,
     }
# 'wb': HIGHEST_PROTOCOL is a binary pickle format
pickle.dump(d, open('lstm_coco_trained.pkl', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
```
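To restore the model later (a link to download this pickle is given in the next notebook), the saved values can be loaded back into an identically defined network. A minimal sketch:

```python
# rebuild the layers above first, then restore the trained weights
d = pickle.load(open('lstm_coco_trained.pkl', 'rb'))
lasagne.layers.set_all_param_values(l_out, d['param values'])
word_to_index = d['word_to_index']
index_to_word = d['index_to_word']
```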
The notebook metadata declares a Python 2 kernel (IPython, Python 2.7.6), nbformat 4.