diff --git a/examples/coco_caption/lrcn.prototxt b/examples/coco_caption/lrcn.prototxt new file mode 100644 index 00000000000..62d08a2738d --- /dev/null +++ b/examples/coco_caption/lrcn.prototxt @@ -0,0 +1,767 @@ +# The network is used for the image captioning experiments of LRCN [1]. +# Please consider citing LRCN [1] if you use this example in your work. +# +# [1] J. Donahue, L. A. Hendricks, S. Guadarrama, M. Rohrbach, S. Venugopalan, +# K. Saenko, T. Darrell. "Long-term Recurrent Convolutional Networks for +# Visual Recognition and Description." arXiv preprint arXiv:1411.4389 (2014). + +name: "lrcn_caffenet_to_lstm" +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { phase: TRAIN } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { phase: TRAIN } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-train" + } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-train" + } + hdf5_data_param { + source: 
"./examples/coco_caption/h5_data/buffer_100/train_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "data" + type: "ImageData" + top: "data" + top: "label" + include { + phase: TEST + stage: "test-on-val" + } + transform_param { + mirror: true + crop_size: 227 + mean_value: 104 + mean_value: 117 + mean_value: 123 + } + image_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/image_list.with_dummy_labels.txt" + batch_size: 100 + new_height: 256 + new_width: 256 + } +} +layer { + name: "data" + type: "HDF5Data" + top: "cont_sentence" + top: "input_sentence" + top: "target_sentence" + include { + phase: TEST + stage: "test-on-val" + } + hdf5_data_param { + source: "./examples/coco_caption/h5_data/buffer_100/val_aligned_20_batches/hdf5_chunk_list.txt" + batch_size: 20 + } +} +layer { + name: "silence" + type: "Silence" + bottom: "label" +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + } +} +layer { + name: "conv1" + type: "Convolution" + bottom: "data" + top: "conv1" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 96 + kernel_size: 11 + stride: 4 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1" + type: "ReLU" + bottom: "conv1" + top: "conv1" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "norm1" + type: "LRN" + bottom: "pool1" + top: "norm1" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "norm1" + top: "conv2" + param { + 
lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + } +} +layer { + name: "conv2" + type: "Convolution" + bottom: "norm1" + top: "conv2" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 2 + kernel_size: 5 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu2" + type: "ReLU" + bottom: "conv2" + top: "conv2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "norm2" + type: "LRN" + bottom: "pool2" + top: "norm2" + lrn_param { + local_size: 5 + alpha: 0.0001 + beta: 0.75 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "norm2" + top: "conv3" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + } +} +layer { + name: "conv3" + type: "Convolution" + bottom: "norm2" + top: "conv3" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3" + type: "ReLU" + bottom: "conv3" + top: "conv3" +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + } +} +layer { + name: "conv4" + type: "Convolution" + bottom: "conv3" + top: "conv4" + param { + lr_mult: 0.1 + decay_mult: 1 + 
} + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 384 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu4" + type: "ReLU" + bottom: "conv4" + top: "conv4" +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + } +} +layer { + name: "conv5" + type: "Convolution" + bottom: "conv4" + top: "conv5" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + group: 2 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu5" + type: "ReLU" + bottom: "conv5" + top: "conv5" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 2 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc6" + type: "InnerProduct" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "drop6" + type: "Dropout" + bottom: "fc6" + top: "fc6" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + 
name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 0 + } + param { + lr_mult: 0 + } + include { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + } +} +layer { + name: "fc7" + type: "InnerProduct" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 4096 + weight_filler { + type: "gaussian" + std: 0.005 + } + bias_filler { + type: "constant" + value: 1 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "drop7" + type: "Dropout" + bottom: "fc7" + top: "fc7" + dropout_param { + dropout_ratio: 0.5 + } +} +layer { + name: "fc8" + type: "InnerProduct" + bottom: "fc7" + top: "fc8" + param { + lr_mult: 0.1 + decay_mult: 1 + } + param { + lr_mult: 0.2 + decay_mult: 0 + } + # exclude { stage: "freeze-convnet" } + inner_product_param { + num_output: 1000 + weight_filler { + type: "gaussian" + std: 0.01 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "embedding" + type: "Embed" + bottom: "input_sentence" + top: "embedded_input_sentence" + param { + lr_mult: 1 + } + embed_param { + bias_term: false + input_dim: 8801 + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm1" + include { stage: "unfactored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + top: "lstm2" + include { + stage: "unfactored" + stage: "2-layer" + } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: 
"constant" + value: 0 + } + } +} +layer { + name: "lstm1" + type: "LSTM" + bottom: "embedded_input_sentence" + bottom: "cont_sentence" + top: "lstm1" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "lstm2" + type: "LSTM" + bottom: "lstm1" + bottom: "cont_sentence" + bottom: "fc8" + top: "lstm2" + include { stage: "factored" } + recurrent_param { + num_output: 1000 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm1" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + exclude { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "predict" + type: "InnerProduct" + bottom: "lstm2" + top: "predict" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + include { stage: "2-layer" } + inner_product_param { + num_output: 8801 + weight_filler { + type: "uniform" + min: -0.08 + max: 0.08 + } + bias_filler { + type: "constant" + value: 0 + } + axis: 2 + } +} +layer { + name: "cross_entropy_loss" + type: "SoftmaxWithLoss" + bottom: "predict" + bottom: "target_sentence" + top: "cross_entropy_loss" + loss_weight: 20 + loss_param { + ignore_label: -1 + } + softmax_param { + axis: 2 + } +} +layer { + name: "accuracy" + type: "Accuracy" + bottom: "predict" + bottom: "target_sentence" + top: "accuracy" + include { phase: TEST } + accuracy_param { # Accuracy reads accuracy_param (not loss_param); class scores are on axis 2 + axis: 2 ignore_label: -1 + } +} diff --git a/examples/coco_caption/lrcn_solver.prototxt b/examples/coco_caption/lrcn_solver.prototxt new file mode 100644 index 00000000000..65ca272b30c --- /dev/null +++ 
b/examples/coco_caption/lrcn_solver.prototxt @@ -0,0 +1,30 @@ +net: "./examples/coco_caption/lrcn.prototxt" + +# lrcn.prototxt supports three variants of the LRCN architecture: +# (1) stage: 'factored' stage: '2-layer' +# (2) stage: 'unfactored' stage: '1-layer' +# (3) stage: 'unfactored' stage: '2-layer' +# This solver uses variant (1). +# To use a different variant, modify the states (train_state, test_state) +# below as appropriate: + +train_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-train' } +test_iter: 25 +test_state: { stage: 'freeze-convnet' stage: 'factored' stage: '2-layer' stage: 'test-on-val' } +test_interval: 1000 +base_lr: 0.01 +lr_policy: "step" +gamma: 0.5 +stepsize: 20000 +display: 1 +max_iter: 110000 +momentum: 0.9 +weight_decay: 0.0000 +snapshot: 5000 +snapshot_prefix: "./examples/coco_caption/lrcn" +solver_mode: GPU +random_seed: 1701 +average_loss: 100 +clip_gradients: 10 diff --git a/examples/coco_caption/train_lrcn.sh b/examples/coco_caption/train_lrcn.sh new file mode 100755 index 00000000000..5099e762ccd --- /dev/null +++ b/examples/coco_caption/train_lrcn.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +GPU_ID=0 +WEIGHTS=\ +./models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel +DATA_DIR=./examples/coco_caption/h5_data/ +if [ ! -d "$DATA_DIR" ]; then + echo "Data directory not found: $DATA_DIR" + echo "First, download the COCO dataset (follow instructions in data/coco)" + echo "Then, run ./examples/coco_caption/coco_to_hdf5_data.py to create the Caffe input data" + exit 1 +fi + +./build/tools/caffe train \ + -solver ./examples/coco_caption/lrcn_solver.prototxt \ + -weights "$WEIGHTS" \ + -gpu "$GPU_ID"