From c3a1b3b8ea36111d9cfa133c7cd85da0eb3271f6 Mon Sep 17 00:00:00 2001
From: Jason Cramer
Date: Sat, 21 Oct 2017 14:11:06 -0400
Subject: [PATCH 1/2] Add preliminary training code (no eval yet)

---
 l3embedding/model.py | 173 ++++++++++++++++++++++++++++++++++---------
 l3embedding/train.py | 117 +++++++++++++++++++++++++++--
 2 files changed, 252 insertions(+), 38 deletions(-)

diff --git a/l3embedding/model.py b/l3embedding/model.py
index 4bf3874..140a6ff 100644
--- a/l3embedding/model.py
+++ b/l3embedding/model.py
@@ -1,42 +1,149 @@
+from keras.models import Model
+from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling2D,\
+    Flatten, Concatenate, Dense
+from kapre.time_frequency import Spectrogram
-from keras.layers import Input, Convolution2D, BatchNormalization
 
 
 def construct_cnn_L3_orig():
+    """
+    Constructs a model that replicates the one used in Look, Listen and Learn:
+
+    Relja Arandjelović and Andrew Zisserman (2017). Look, Listen and Learn.
+    CoRR, abs/1705.08168.
+
+    Returns
+    -------
+    model: L3 CNN model
+        (Type: keras.models.Model)
+    """
+    ####
+    # Image subnetwork
+    ####
+    # INPUT
+    x_i = Input(shape=(224, 224, 3), dtype='float32')
+
+    # CONV BLOCK 1
+    n_filter_i_1 = 64
+    filt_size_i_1 = (3, 3)
+    pool_size_i_1 = (2, 2)
+    y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same',
+                 activation='relu')(x_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = MaxPooling2D(pool_size=pool_size_i_1, strides=2, padding='same')(y_i)
+
+    # CONV BLOCK 2
+    n_filter_i_2 = 128
+    filt_size_i_2 = (3, 3)
+    pool_size_i_2 = (2, 2)
+    y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = MaxPooling2D(pool_size=pool_size_i_2, strides=2, padding='same')(y_i)
+
+    # CONV BLOCK 3
+    n_filter_i_3 = 256
+    filt_size_i_3 = (3, 3)
+    pool_size_i_3 = (2, 2)
+    y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = MaxPooling2D(pool_size=pool_size_i_3, strides=2, padding='same')(y_i)
+
+    # CONV BLOCK 4
+    n_filter_i_4 = 512
+    filt_size_i_4 = (3, 3)
+    pool_size_i_4 = (28, 28)
+    y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = MaxPooling2D(pool_size=pool_size_i_4, strides=2, padding='same')(y_i)
+    y_i = Flatten()(y_i)
+
+
+    ####
+    # Audio subnetwork
+    ####
+    n_dft = 512
+    n_hop = 16
+    asr = 48000
+    audio_window_dur = 1
     # INPUT
-    x = Input(shape=(n_freq_cnn, n_frames_cnn, 1), dtype='float32')
-
-    # CONV 1
-    y = Convolution2D(n_filters_cnn, filter_size_cnn, padding='valid',
-                      activation='relu')(x)
-    y = MaxPooling2D(pool_size=pool_size_cnn, strides=None, padding='valid')(y)
-    y = BatchNormalization()(y)
-
-    # CONV 2
-    y = Convolution2D(n_filters_cnn, filter_size_cnn, padding='valid',
-                      activation='relu')(y)
-    y = MaxPooling2D(pool_size=pool_size_cnn, strides=None, padding='valid')(y)
-    y = BatchNormalization()(y)
-
-    # CONV 3
-    y = Convolution2D(n_filters_cnn, filter_size_cnn, padding='valid',
-                      activation='relu')(y)
-    # y = MaxPooling2D(pool_size=pool_size_cnn, strides=None, padding='valid')(y)
-    y = BatchNormalization()(y)
-
-    # Flatten for dense layers
-    y = Flatten()(y)
-    y = Dropout(0.5)(y)
-    y = Dense(n_dense_cnn, activation='relu')(y)
-    if large_cnn:
-        y = Dropout(0.5)(y)
-        y = Dense(n_dense_cnn, activation='relu')(y)
-    y = Dropout(0.5)(y)
-    y = Dense(n_classes, activation='sigmoid')(y)
-
-    m = Model(inputs=x, outputs=y)
-    return m
+    x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32')
+
+    # SPECTROGRAM PREPROCESSING
+    # 257 x 199 x 1
+    y_a = Spectrogram(n_dft=n_dft, n_hop=n_hop,
+                      return_decibel_spectrogram=True)(x_a)
+
+    # CONV BLOCK 1
+    n_filter_a_1 = 64
+    filt_size_a_1 = (3, 3)
+    pool_size_a_1 = (2, 2)
+    y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = MaxPooling2D(pool_size=pool_size_a_1, strides=2, padding='same')(y_a)
+
+    # CONV BLOCK 2
+    n_filter_a_2 = 128
+    filt_size_a_2 = (3, 3)
+    pool_size_a_2 = (2, 2)
+    y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = MaxPooling2D(pool_size=pool_size_a_2, strides=2, padding='same')(y_a)
+
+    # CONV BLOCK 3
+    n_filter_a_3 = 256
+    filt_size_a_3 = (3, 3)
+    pool_size_a_3 = (2, 2)
+    y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = MaxPooling2D(pool_size=pool_size_a_3, strides=2, padding='same')(y_a)
+
+    # CONV BLOCK 4
+    n_filter_a_4 = 512
+    filt_size_a_4 = (3, 3)
+    pool_size_a_4 = (32, 24)
+    y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = MaxPooling2D(pool_size=pool_size_a_4, strides=2, padding='same')(y_a)
+
+    y_a = Flatten()(y_a)
+
+
+    # Merge the subnetworks
+    y = Concatenate()([y_i, y_a])
+    y = Dense(128, activation='relu')(y)
+    y = Dense(2, activation='softmax')(y)
+    m = Model(inputs=[x_i, x_a], outputs=y)
+    return m, [x_i, x_a], y
 
 MODELS = {'cnn_L3_orig': construct_cnn_L3_orig}
\ No newline at end of file
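A quick way to sanity-check the merged two-tower model above is to drive it with random inputs; a minimal sketch (assuming keras, kapre, and numpy are installed; the batch size of 8 is arbitrary):

    import numpy as np
    from l3embedding.model import MODELS

    m, inputs, outputs = MODELS['cnn_L3_orig']()
    video_batch = np.random.random((8, 224, 224, 3))  # matches x_i's (224, 224, 3) input
    audio_batch = np.random.random((8, 1, 48000))     # matches x_a's (1, asr * audio_window_dur) input
    preds = m.predict([video_batch, audio_batch])     # -> (8, 2) softmax outputs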
diff --git a/l3embedding/train.py b/l3embedding/train.py
index 40fc385..49e4060 100644
--- a/l3embedding/train.py
+++ b/l3embedding/train.py
@@ -1,7 +1,12 @@
-
+from .model import construct_cnn_L3_orig
+import json
+import os
+import pickle
 import pescador
 import pandas as pd
 from tqdm import tqdm
+import keras
+from keras.optimizers import Adam
 
 
 def sampler(filename, file_list):
@@ -52,11 +57,113 @@ def data_generator(csv_file, batch_size=64):
     return pescador.BufferedStreamer(mux, batch_size)
 
 
-def train(csv_file, batch_size=64, rate=16, seed=20171011):
+class LossHistory(keras.callbacks.Callback):
+
+    def __init__(self, outfile):
+        super().__init__()
+        self.outfile = outfile
+
+    def on_train_begin(self, logs={}):
+        self.loss = []
+        self.val_loss = []
+
+    # def on_batch_end(self, batch, logs={}):
+    def on_epoch_end(self, epoch, logs={}):
+        self.loss.append(logs.get('loss'))
+        self.val_loss.append(logs.get('val_loss'))
+
+        loss_dict = {'loss': self.loss, 'val_loss': self.val_loss}
+        with open(self.outfile, 'wb') as fp:
+            pickle.dump(loss_dict, fp)
+
+
+def train(csv_file, model_id, output_dir, epochs=150, epoch_size=512,
+          batch_size=64, validation_size=1024, rate=16,
+          seed=20171011, verbose=False):
+    m, inputs, outputs = construct_cnn_L3_orig()
+    loss = 'binary_crossentropy'
+    metrics = ['accuracy']
+    #monitor = 'val_loss'
+
+    # Make sure the directories we need exist
+    model_dir = os.path.join(output_dir, model_id)
+    if not os.path.isdir(output_dir):
+        os.makedirs(output_dir)
+    if not os.path.isdir(model_dir):
+        os.makedirs(model_dir)
+
+    print('Compile model...')
+    m.compile(Adam(),
+              loss=loss,
+              metrics=metrics)
+
+    # Save the model
+    model_spec_path = os.path.join(model_dir, 'model_spec.pkl')
+    model_spec = keras.utils.serialize_keras_object(m)
+    with open(model_spec_path, 'wb') as fd:
+        pickle.dump(model_spec, fd)
+    model_json_path = os.path.join(model_dir, 'model.json')
+    model_json = m.to_json()
+    with open(model_json_path, 'w') as fd:
+        json.dump(model_json, fd, indent=2)
+
+    weight_path = os.path.join(model_dir, 'model.h5')
+
+    cb = []
+    cb.append(keras.callbacks.ModelCheckpoint(weight_path,
+                                              save_best_only=True,
+                                              verbose=1,))
+                                              #monitor=monitor))
+
+    history_checkpoint = os.path.join(model_dir, 'history_checkpoint.pkl')
+    cb.append(LossHistory(history_checkpoint))
+
+    history_csvlog = os.path.join(model_dir, 'history_csvlog.csv')
+    cb.append(keras.callbacks.CSVLogger(history_csvlog, append=True,
+                                        separator=','))
+
     train_gen = data_generator(
         csv_file,
         batch_size=batch_size,
-        lam=rate,
-        revive=True,
-        random_state=seed)
\ No newline at end of file
+        random_state=seed).tuples('video', 'audio', 'label')
+
+    train_gen = pescador.maps.keras_tuples(train_gen,
+                                           ['video', 'audio'],
+                                           'label')
+
+    # Fit the model
+    print('Fit model...')
+    if verbose:
+        verbosity = 1
+    else:
+        verbosity = 2
+    history = m.fit_generator(train_gen, epoch_size, epochs,
+                              # validation_data=gen_val,
+                              # validation_steps=validation_size,
+                              callbacks=cb,
+                              verbose=verbosity)
+
+    print('Done training. Saving results to disk...')
+    # Save history
+    with open(os.path.join(model_dir, 'history.pkl'), 'wb') as fd:
+        pickle.dump(history.history, fd)
+
+    # Evaluate model
+    print('Evaluate model...')
+    # Load best params
+    m.load_weights(weight_path)
+    with open(os.path.join(output_dir, 'index_test.json'), 'r') as fp:
+        test_idx = json.load(fp)['id']
+
+    # Compute eval scores
+    #results = score_model(output_dir, pump, model, test_idx, working,
+    #                      strong_label_file, duration, modelid,
+    #                      use_orig_duration=True)
+
+    # Save results to disk
+    #results_file = os.path.join(model_dir, 'results.json')
+    #with open(results_file, 'w') as fp:
+    #    json.dump(results, fp, indent=2)
+
+    #print('Done!')
\ No newline at end of file
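Since LossHistory re-pickles the running loss curves at the end of every epoch, an interrupted run still leaves a readable record. A sketch of inspecting it (the 'models/mymodel' prefix stands in for a hypothetical output_dir/model_id pair):

    import pickle

    with open('models/mymodel/history_checkpoint.pkl', 'rb') as fp:
        loss_dict = pickle.load(fp)

    print(loss_dict['loss'])      # one entry per completed epoch
    print(loss_dict['val_loss'])  # None entries until validation data is hooked up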
From c5bbd03d26c214ed605a86ca6e8458096b5a246f Mon Sep 17 00:00:00 2001
From: Jason Cramer
Date: Wed, 1 Nov 2017 15:36:50 -0400
Subject: [PATCH 2/2] Change image sampling to follow paper

Move script part of train.py to a different file
Add retry loop to opening video files
Add periodic model checkpoints to training
---
 l3embedding/train.py | 150 +++++++++++++------------------------------
 train.py             |  97 ++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+), 104 deletions(-)
 create mode 100644 train.py

diff --git a/l3embedding/train.py b/l3embedding/train.py
index b7c543e..61e0189 100644
--- a/l3embedding/train.py
+++ b/l3embedding/train.py
@@ -1,11 +1,10 @@
-import argparse
 import json
 import os
 import pickle
 import glob
 import random
 import pescador
-from scipy.misc import imresize
+import scipy.misc
 import skvideo.io
 import soundfile as sf
 from tqdm import tqdm
@@ -68,7 +67,23 @@ def sample_one_second(audio_data, sampling_frequency, start, label):
     return audio_data[start:start+sampling_frequency], start / sampling_frequency
 
 
-def sample_one_frame(video_data, fps=30):
+def l3_frame_scaling(frame_data):
+    nx, ny, nc = frame_data.shape
+    scaling = 256.0 / min(nx, ny)
+
+    new_nx, new_ny = int(scaling * nx), int(scaling * ny)
+    assert 256 in (new_nx, new_ny)
+
+    resized_frame_data = scipy.misc.imresize(frame_data, (new_nx, new_ny, nc))
+
+    start_x, start_y = random.randrange(new_nx - 224), random.randrange(new_ny - 224)
+    end_x, end_y = start_x + 224, start_y + 224
+
+    return resized_frame_data[start_x:end_x, start_y:end_y, :]
+
+
+def sample_one_frame(video_data, fps=30, scaling_func=None):
     """Return one frame randomly and time (seconds).
 
     Args:
         video_data: video data to sample from
@@ -79,13 +94,16 @@
     Returns:
         One frame sampled randomly and time in seconds
 
     """
-
+    if not scaling_func:
+        scaling_func = l3_frame_scaling
     num_frames = video_data.shape[0]
     frame = random.randrange(num_frames - fps)
-    return imresize(video_data[frame, :, :, :], (224, 224)), frame / fps
+    frame_data = video_data[frame, :, :, :]
+    frame_data = scaling_func(frame_data)
+    return frame_data, frame / fps
 
 
-def sampler(video_file, audio_files):
+def sampler(video_file, audio_files, io_retries=10):
     """Sample one frame from video_file, with 50% chance sample one second from corresponding audio_file,
        50% chance sample one second from another audio_file in the list of audio_files.
 
@@ -98,8 +116,17 @@
         and label (0: not from corresponding files, 1: from corresponding files)
 
     """
+    for _ in range(io_retries):
+        try:
+            video_data = skvideo.io.vread(video_file)
+            break
+        except Exception as e:
+            print("Could not open {}. Retrying...".format(video_file))
+            continue
+    else:
+        import pdb
+        pdb.set_trace()
 
-    video_data = skvideo.io.vread(video_file)
     audio_file = video_to_audio(video_file)
 
     if random.random() < 0.5:
@@ -113,16 +140,18 @@
     while True:
         sample_video_data, video_start = sample_one_frame(video_data)
         sample_audio_data, audio_start = sample_one_second(audio_data, sampling_frequency, video_start, label)
+        sample_audio_data = sample_audio_data[:,0]
 
-        yield {
+        sample = {
             'video': sample_video_data,
-            'audio': sample_audio_data[:,0],
+            'audio': sample_audio_data,
             'label': label,
             'audio_file': audio_file,
             'video_file': video_file,
             'audio_start': audio_start,
             'video_start': video_start
         }
+        yield sample
 
 
 def data_generator(data_dir, k=32, batch_size=64, random_state=20171021):
@@ -179,7 +208,7 @@ def on_epoch_end(self, epoch, logs=None):
 
 #def train(train_csv_path, model_id, output_dir, num_epochs=150, epoch_size=512,
 def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512,
           batch_size=64, validation_size=1024, num_streamers=16,
-          random_seed=20171021, verbose=False):
+          random_state=20171021, verbose=False, checkpoint_interval=100):
     m, inputs, outputs = construct_cnn_L3_orig()
     loss = 'binary_crossentropy'
     metrics = ['accuracy']
@@ -208,6 +237,7 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512,
         json.dump(model_json, fd, indent=2)
 
     weight_path = os.path.join(model_dir, 'model.h5')
+    checkpoint_weight_path = os.path.join(model_dir, 'model.{epoch:02d}.h5')
 
     cb = []
     cb.append(keras.callbacks.ModelCheckpoint(weight_path,
@@ -215,6 +245,10 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512,
                                               verbose=1,))
                                               #monitor=monitor))
 
+    cb.append(keras.callbacks.ModelCheckpoint(checkpoint_weight_path,
+                                              #monitor=monitor,
+                                              period=checkpoint_interval))
+
     history_checkpoint = os.path.join(model_dir, 'history_checkpoint.pkl')
     cb.append(LossHistory(history_checkpoint))
 
@@ -223,11 +257,12 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512,
                                         separator=','))
 
 
+    print('Setting up data generator...')
     train_gen = data_generator(
         #train_csv_path,
         train_data_dir,
         batch_size=batch_size,
-        random_seed=random_seed,
+        random_state=random_state,
         k=num_streamers)
 
     train_gen = pescador.maps.keras_tuples(train_gen,
@@ -269,96 +304,3 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512,
     #    json.dump(results, fp, indent=2)
 
     print('Done!')
-
-
-def parse_arguments():
-    """
-    Parse arguments from the command line
-
-
-    Returns:
-        args: Argument dictionary
-              (Type: dict[str, *])
-    """
-    parser = argparse.ArgumentParser(description='Train an L3-like audio-visual correspondence model')
-
-    parser.add_argument('-e',
-                        '--num-epochs',
-                        dest='num_epochs',
-                        action='store',
-                        type=int,
-                        default=150,
-                        help='Maximum number of training epochs')
-
-    parser.add_argument('-es',
-                        '--epoch-size',
-                        dest='epoch_size',
-                        action='store',
-                        type=int,
-                        default=512,
-                        help='Number of training batches per epoch')
-
-    parser.add_argument('-b',
-                        '--batch-size',
-                        dest='batch_size',
-                        action='store',
-                        type=int,
-                        default=64,
-                        help='Number of training examples per batch')
-
-    parser.add_argument('-v',
-                        '--validation-size',
-                        dest='validation_size',
-                        action='store',
-                        type=int,
-                        default=1024,
-                        help='Number of trianing examples in the validation set')
-
-    parser.add_argument('-s',
-                        '--num-streamers',
-                        dest='num_streamers',
-                        action='store',
-                        type=int,
-                        default=32,
-                        help='Number of pescador streamers that can be open concurrently')
-
-    parser.add_argument('-r',
-                        '--random-seed',
-                        dest='random_seed',
-                        action='store',
-                        type=int,
-                        default=20171021,
-                        help='Random seed used to set the RNG state')
-
-    parser.add_argument('-v',
-                        '--verbose',
-                        dest='verbose',
-                        action='store_true',
-                        default=False,
-                        help='If True, print detailed messages')
-
-    """
-    parser.add_argument('train_csv_path',
-                        action='store',
-                        type=str,
-                        help='Path to training csv file')
-    """
-    parser.add_argument('train_data_dir',
-                        action='store',
-                        type=str,
-                        help='Path to directory where training subset files are stored')
-
-    parser.add_argument('model_id',
-                        action='store',
-                        type=str,
-                        help='Identifier for this model')
-
-    parser.add_argument('output_dir',
-                        action='store',
-                        type=str,
-                        help='Path to directory where output files will be stored')
-
-
-    return vars(parser.parse_args())
-
-
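For intuition on the new image sampling: l3_frame_scaling resizes the frame so its short side becomes 256 pixels, then takes a random 224x224 crop, per the augmentation in the paper. A worked sketch of the arithmetic on an illustrative 512x768 frame (numbers are hypothetical; no actual resize performed):

    import random

    nx, ny = 512, 768                                      # illustrative frame height/width
    scaling = 256.0 / min(nx, ny)                          # 0.5
    new_nx, new_ny = int(scaling * nx), int(scaling * ny)  # (256, 384); short side is now 256
    start_x = random.randrange(new_nx - 224)               # 0..31
    start_y = random.randrange(new_ny - 224)               # 0..159
    # the crop spans [start_x:start_x+224, start_y:start_y+224]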
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..e0a4447
--- /dev/null
+++ b/train.py
@@ -0,0 +1,97 @@
+import argparse
+from l3embedding.train import *
+
+
+def parse_arguments():
+    """
+    Parse arguments from the command line
+
+
+    Returns:
+        args: Argument dictionary
+              (Type: dict[str, *])
+    """
+    parser = argparse.ArgumentParser(description='Train an L3-like audio-visual correspondence model')
+
+    parser.add_argument('-e',
+                        '--num-epochs',
+                        dest='num_epochs',
+                        action='store',
+                        type=int,
+                        default=150,
+                        help='Maximum number of training epochs')
+
+    parser.add_argument('-es',
+                        '--epoch-size',
+                        dest='epoch_size',
+                        action='store',
+                        type=int,
+                        default=512,
+                        help='Number of training batches per epoch')
+
+    parser.add_argument('-bs',
+                        '--batch-size',
+                        dest='batch_size',
+                        action='store',
+                        type=int,
+                        default=64,
+                        help='Number of training examples per batch')
+
+    parser.add_argument('-vs',
+                        '--validation-size',
+                        dest='validation_size',
+                        action='store',
+                        type=int,
+                        default=1024,
+                        help='Number of training examples in the validation set')
+
+    parser.add_argument('-s',
+                        '--num-streamers',
+                        dest='num_streamers',
+                        action='store',
+                        type=int,
+                        default=32,
+                        help='Number of pescador streamers that can be open concurrently')
+
+    parser.add_argument('-r',
+                        '--random-state',
+                        dest='random_state',
+                        action='store',
+                        type=int,
+                        default=20171021,
+                        help='Random seed used to set the RNG state')
+
+    parser.add_argument('-v',
+                        '--verbose',
+                        dest='verbose',
+                        action='store_true',
+                        default=False,
+                        help='If True, print detailed messages')
+
+    """
+    parser.add_argument('train_csv_path',
+                        action='store',
+                        type=str,
+                        help='Path to training csv file')
+    """
+    parser.add_argument('train_data_dir',
+                        action='store',
+                        type=str,
+                        help='Path to directory where training subset files are stored')
+
+    parser.add_argument('model_id',
+                        action='store',
+                        type=str,
+                        help='Identifier for this model')
+
+    parser.add_argument('output_dir',
+                        action='store',
+                        type=str,
+                        help='Path to directory where output files will be stored')
+
+
+    return vars(parser.parse_args())
+
+
+if __name__ == '__main__':
+    train(**(parse_arguments()))
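With the argparse front end split into its own script, a training run can be launched from the shell, e.g. (paths and the model id are placeholders; the flags mirror the definitions above):

    python train.py -e 150 -es 512 -bs 64 -s 32 -r 20171021 \
        /path/to/train_data my_model /path/to/output

or, equivalently, by calling the library function directly:

    from l3embedding.train import train

    train('/path/to/train_data', 'my_model', '/path/to/output',
          num_epochs=150, epoch_size=512, batch_size=64,
          num_streamers=32, random_state=20171021)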