From c3a1b3b8ea36111d9cfa133c7cd85da0eb3271f6 Mon Sep 17 00:00:00 2001
From: Jason Cramer
Date: Sat, 21 Oct 2017 14:11:06 -0400
Subject: [PATCH 1/2] Add preliminary training code (no eval yet)

---
 l3embedding/model.py | 173 ++++++++++++++++++++++++++++++++++---------
 l3embedding/train.py | 117 +++++++++++++++++++++++++++--
 2 files changed, 252 insertions(+), 38 deletions(-)

diff --git a/l3embedding/model.py b/l3embedding/model.py
index 4bf3874..140a6ff 100644
--- a/l3embedding/model.py
+++ b/l3embedding/model.py
@@ -1,42 +1,149 @@
+from keras.models import Model
+from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling2D,\
+    Flatten, Concatenate, Dense
+from kapre.time_frequency import Spectrogram
-from keras.layers import Input, Convolution2D, BatchNormalization
 
 
 def construct_cnn_L3_orig():
+    """
+    Constructs a model that replicates the one used in Look, Listen and Learn:
+
+    Relja Arandjelović and Andrew Zisserman (2017). Look, Listen and Learn.
+    CoRR, abs/1705.08168.
+
+    Returns
+    -------
+    model: L3 CNN model
+        (Type: keras.models.Model)
+    """
+    ####
+    # Image subnetwork
+    ####
+    # INPUT
+    x_i = Input(shape=(224, 224, 3), dtype='float32')
+
+    # CONV BLOCK 1
+    n_filter_i_1 = 64
+    filt_size_i_1 = (3, 3)
+    pool_size_i_1 = (2, 2)
+    y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same',
+                 activation='relu')(x_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = Conv2D(n_filter_i_1, filt_size_i_1, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = MaxPooling2D(pool_size=pool_size_i_1, strides=2, padding='same')(y_i)
+
+    # CONV BLOCK 2
+    n_filter_i_2 = 128
+    filt_size_i_2 = (3, 3)
+    pool_size_i_2 = (2, 2)
+    y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = Conv2D(n_filter_i_2, filt_size_i_2, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = MaxPooling2D(pool_size=pool_size_i_2, strides=2, padding='same')(y_i)
+
+    # CONV BLOCK 3
+    n_filter_i_3 = 256
+    filt_size_i_3 = (3, 3)
+    pool_size_i_3 = (2, 2)
+    y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = Conv2D(n_filter_i_3, filt_size_i_3, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = MaxPooling2D(pool_size=pool_size_i_3, strides=2, padding='same')(y_i)
+
+    # CONV BLOCK 4
+    n_filter_i_4 = 512
+    filt_size_i_4 = (3, 3)
+    pool_size_i_4 = (28, 28)
+    y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = Conv2D(n_filter_i_4, filt_size_i_4, padding='same',
+                 activation='relu')(y_i)
+    y_i = BatchNormalization()(y_i)
+    y_i = MaxPooling2D(pool_size=pool_size_i_4, strides=2, padding='same')(y_i)
+    y_i = Flatten()(y_i)
+
+
+    ####
+    # Audio subnetwork
+    ####
+    n_dft = 512
+    n_hop = 16
+    asr = 48000
+    audio_window_dur = 1
     # INPUT
-    x = Input(shape=(n_freq_cnn, n_frames_cnn, 1), dtype='float32')
-
-    # CONV 1
-    y = Convolution2D(n_filters_cnn, filter_size_cnn, padding='valid',
-                      activation='relu')(x)
-    y = MaxPooling2D(pool_size=pool_size_cnn, strides=None, padding='valid')(y)
-    y = BatchNormalization()(y)
-
-    # CONV 2
-    y = Convolution2D(n_filters_cnn, filter_size_cnn, padding='valid',
-                      activation='relu')(y)
-    y = MaxPooling2D(pool_size=pool_size_cnn, strides=None, padding='valid')(y)
-    y = BatchNormalization()(y)
-
-    # CONV 3
-    y = Convolution2D(n_filters_cnn, filter_size_cnn, padding='valid',
-                      activation='relu')(y)
-    # y = MaxPooling2D(pool_size=pool_size_cnn, strides=None, padding='valid')(y)
-    y = BatchNormalization()(y)
-
-    # Flatten for dense layers
-    y = Flatten()(y)
-    y = Dropout(0.5)(y)
-    y = Dense(n_dense_cnn, activation='relu')(y)
-    if large_cnn:
-        y = Dropout(0.5)(y)
-        y = Dense(n_dense_cnn, activation='relu')(y)
-    y = Dropout(0.5)(y)
-    y = Dense(n_classes, activation='sigmoid')(y)
-
-    m = Model(inputs=x, outputs=y)
-    return m
+    x_a = Input(shape=(1, asr * audio_window_dur), dtype='float32')
+
+    # SPECTROGRAM PREPROCESSING
+    # 257 x 199 x 1
+    y_a = Spectrogram(n_dft=n_dft, n_hop=n_hop,
+                      return_decibel_spectrogram=True)(x_a)
+
+    # CONV BLOCK 1
+    n_filter_a_1 = 64
+    filt_size_a_1 = (3, 3)
+    pool_size_a_1 = (2, 2)
+    y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = Conv2D(n_filter_a_1, filt_size_a_1, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = MaxPooling2D(pool_size=pool_size_a_1, strides=2, padding='same')(y_a)
+
+    # CONV BLOCK 2
+    n_filter_a_2 = 128
+    filt_size_a_2 = (3, 3)
+    pool_size_a_2 = (2, 2)
+    y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = Conv2D(n_filter_a_2, filt_size_a_2, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = MaxPooling2D(pool_size=pool_size_a_2, strides=2, padding='same')(y_a)
+
+    # CONV BLOCK 3
+    n_filter_a_3 = 256
+    filt_size_a_3 = (3, 3)
+    pool_size_a_3 = (2, 2)
+    y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = Conv2D(n_filter_a_3, filt_size_a_3, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = MaxPooling2D(pool_size=pool_size_a_3, strides=2, padding='same')(y_a)
+
+    # CONV BLOCK 4
+    n_filter_a_4 = 512
+    filt_size_a_4 = (3, 3)
+    pool_size_a_4 = (32, 24)
+    y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = Conv2D(n_filter_a_4, filt_size_a_4, padding='same',
+                 activation='relu')(y_a)
+    y_a = BatchNormalization()(y_a)
+    y_a = MaxPooling2D(pool_size=pool_size_a_4, strides=2, padding='same')(y_a)
+
+    y_a = Flatten()(y_a)
+
+
+    # Merge the subnetworks
+    y = Concatenate()([y_i, y_a])
+    y = Dense(128, activation='relu')(y)
+    y = Dense(2, activation='softmax')(y)
+    m = Model(inputs=[x_i, x_a], outputs=y)
+    return m, [x_i, x_a], y
 
 MODELS = {'cnn_L3_orig': construct_cnn_L3_orig}
\ No newline at end of file
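A quick way to sanity-check the merged two-tower model above is to drive it with random inputs; a minimal sketch (assuming keras, kapre, and numpy are installed; the batch size of 8 is arbitrary):

    import numpy as np
    from l3embedding.model import MODELS

    m, inputs, outputs = MODELS['cnn_L3_orig']()
    video_batch = np.random.random((8, 224, 224, 3))  # matches x_i's (224, 224, 3) input
    audio_batch = np.random.random((8, 1, 48000))     # matches x_a's (1, asr * audio_window_dur) input
    preds = m.predict([video_batch, audio_batch])     # -> (8, 2) softmax outputs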
diff --git a/l3embedding/train.py b/l3embedding/train.py
index 40fc385..49e4060 100644
--- a/l3embedding/train.py
+++ b/l3embedding/train.py
@@ -1,7 +1,12 @@
-
+from .model import construct_cnn_L3_orig
+import json
+import os
+import pickle
 import pescador
 import pandas as pd
 from tqdm import tqdm
+import keras
+from keras.optimizers import Adam
 
 
 def sampler(filename, file_list):
@@ -52,11 +57,113 @@ def data_generator(csv_file, batch_size=64):
     return pescador.BufferedStreamer(mux, batch_size)
 
 
-def train(csv_file, batch_size=64, rate=16, seed=20171011):
+class LossHistory(keras.callbacks.Callback):
+
+    def __init__(self, outfile):
+        super().__init__()
+        self.outfile = outfile
+
+    def on_train_begin(self, logs={}):
+        self.loss = []
+        self.val_loss = []
+
+    # def on_batch_end(self, batch, logs={}):
+    def on_epoch_end(self, epoch, logs={}):
+        self.loss.append(logs.get('loss'))
+        self.val_loss.append(logs.get('val_loss'))
+
+        loss_dict = {'loss': self.loss, 'val_loss': self.val_loss}
+        with open(self.outfile, 'wb') as fp:
+            pickle.dump(loss_dict, fp)
+
+
+def train(csv_file, model_id, output_dir, epochs=150, epoch_size=512,
+          batch_size=64, validation_size=1024, rate=16,
+          seed=20171011, verbose=False):
+    m, inputs, outputs = construct_cnn_L3_orig()
+    loss = 'binary_crossentropy'
+    metrics = ['accuracy']
+    #monitor = 'val_loss'
+
+    # Make sure the directories we need exist
+    model_dir = os.path.join(output_dir, model_id)
+    if not os.path.isdir(output_dir):
+        os.makedirs(output_dir)
+    if not os.path.isdir(model_dir):
+        os.makedirs(model_dir)
+
+    print('Compile model...')
+    m.compile(Adam(),
+              loss=loss,
+              metrics=metrics)
+
+    # Save the model
+    model_spec_path = os.path.join(model_dir, 'model_spec.pkl')
+    model_spec = keras.utils.serialize_keras_object(m)
+    with open(model_spec_path, 'wb') as fd:
+        pickle.dump(model_spec, fd)
+    model_json_path = os.path.join(model_dir, 'model.json')
+    model_json = m.to_json()
+    with open(model_json_path, 'w') as fd:
+        json.dump(model_json, fd, indent=2)
+
+    weight_path = os.path.join(model_dir, 'model.h5')
+
+    cb = []
+    cb.append(keras.callbacks.ModelCheckpoint(weight_path,
+                                              save_best_only=True,
+                                              verbose=1,))
+                                              #monitor=monitor))
+
+    history_checkpoint = os.path.join(model_dir, 'history_checkpoint.pkl')
+    cb.append(LossHistory(history_checkpoint))
+
+    history_csvlog = os.path.join(model_dir, 'history_csvlog.csv')
+    cb.append(keras.callbacks.CSVLogger(history_csvlog, append=True,
+                                        separator=','))
+
     train_gen = data_generator(
         csv_file,
         batch_size=batch_size,
-        lam=rate,
-        revive=True,
-        random_state=seed)
\ No newline at end of file
+        random_state=seed).tuples('video', 'audio', 'label')
+
+    train_gen = pescador.maps.keras_tuples(train_gen,
+                                           ['video', 'audio'],
+                                           'label')
+
+    # Fit the model
+    print('Fit model...')
+    if verbose:
+        verbosity = 1
+    else:
+        verbosity = 2
+    history = m.fit_generator(train_gen, epoch_size, epochs,
+                              # validation_data=gen_val,
+                              # validation_steps=validation_size,
+                              callbacks=cb,
+                              verbose=verbosity)
+
+    print('Done training. Saving results to disk...')
+    # Save history
+    with open(os.path.join(model_dir, 'history.pkl'), 'wb') as fd:
+        pickle.dump(history.history, fd)
+
+    # Evaluate model
+    print('Evaluate model...')
+    # Load best params
+    m.load_weights(weight_path)
+    with open(os.path.join(output_dir, 'index_test.json'), 'r') as fp:
+        test_idx = json.load(fp)['id']
+
+    # Compute eval scores
+    #results = score_model(output_dir, pump, model, test_idx, working,
+    #                      strong_label_file, duration, modelid,
+    #                      use_orig_duration=True)
+
+    # Save results to disk
+    #results_file = os.path.join(model_dir, 'results.json')
+    #with open(results_file, 'w') as fp:
+    #    json.dump(results, fp, indent=2)
+
+    #print('Done!')
\ No newline at end of file
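Since LossHistory re-pickles the running loss curves at the end of every epoch, an interrupted run still leaves a readable record. A sketch of inspecting it (the 'models/mymodel' prefix stands in for a hypothetical output_dir/model_id pair):

    import pickle

    with open('models/mymodel/history_checkpoint.pkl', 'rb') as fp:
        loss_dict = pickle.load(fp)

    print(loss_dict['loss'])      # one entry per completed epoch
    print(loss_dict['val_loss'])  # None entries until validation data is hooked up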
From c5bbd03d26c214ed605a86ca6e8458096b5a246f Mon Sep 17 00:00:00 2001
From: Jason Cramer
Date: Wed, 1 Nov 2017 15:36:50 -0400
Subject: [PATCH 2/2] Change image sampling to follow paper

Move script part of train.py to a different file
Add retry loop to opening video files
Add periodic model checkpoints to training
---
 l3embedding/train.py | 150 +++++++++++++------------------------------
 train.py             |  97 ++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+), 104 deletions(-)
 create mode 100644 train.py

diff --git a/l3embedding/train.py b/l3embedding/train.py
index b7c543e..61e0189 100644
--- a/l3embedding/train.py
+++ b/l3embedding/train.py
@@ -1,11 +1,10 @@
-import argparse
 import json
 import os
 import pickle
 import glob
 import random
 import pescador
-from scipy.misc import imresize
+import scipy.misc
 import skvideo.io
 import soundfile as sf
 from tqdm import tqdm
@@ -68,7 +67,23 @@ def sample_one_second(audio_data, sampling_frequency, start, label):
     return audio_data[start:start+sampling_frequency], start / sampling_frequency
 
 
-def sample_one_frame(video_data, fps=30):
+def l3_frame_scaling(frame_data):
+    nx, ny, nc = frame_data.shape
+    scaling = 256.0 / min(nx, ny)
+
+    new_nx, new_ny = int(scaling * nx), int(scaling * ny)
+    assert 256 in (new_nx, new_ny)
+
+    resized_frame_data = scipy.misc.imresize(frame_data, (new_nx, new_ny, nc))
+
+    start_x, start_y = random.randrange(new_nx - 224), random.randrange(new_ny - 224)
+    end_x, end_y = start_x + 224, start_y + 224
+
+    return resized_frame_data[start_x:end_x, start_y:end_y, :]
+
+
+def sample_one_frame(video_data, fps=30, scaling_func=None):
     """Return one frame randomly and time (seconds).
 
     Args:
         video_data: video data to sample from
@@ -79,13 +94,16 @@
     Returns:
         One frame sampled randomly and time in seconds
 
     """
-
+    if not scaling_func:
+        scaling_func = l3_frame_scaling
     num_frames = video_data.shape[0]
     frame = random.randrange(num_frames - fps)
-    return imresize(video_data[frame, :, :, :], (224, 224)), frame / fps
+    frame_data = video_data[frame, :, :, :]
+    frame_data = scaling_func(frame_data)
+    return frame_data, frame / fps
 
 
-def sampler(video_file, audio_files):
+def sampler(video_file, audio_files, io_retries=10):
     """Sample one frame from video_file, with 50% chance sample one second from corresponding audio_file,
        50% chance sample one second from another audio_file in the list of audio_files.
 
@@ -98,8 +116,17 @@
         and label (0: not from corresponding files, 1: from corresponding files)
 
     """
+    for _ in range(io_retries):
+        try:
+            video_data = skvideo.io.vread(video_file)
+            break
+        except Exception as e:
+            print("Could not open {}. Retrying...".format(video_file))
+            continue
+    else:
+        import pdb
+        pdb.set_trace()
 
-    video_data = skvideo.io.vread(video_file)
     audio_file = video_to_audio(video_file)
 
     if random.random() < 0.5:
@@ -113,16 +140,18 @@
     while True:
         sample_video_data, video_start = sample_one_frame(video_data)
         sample_audio_data, audio_start = sample_one_second(audio_data, sampling_frequency, video_start, label)
+        sample_audio_data = sample_audio_data[:,0]
 
-        yield {
+        sample = {
             'video': sample_video_data,
-            'audio': sample_audio_data[:,0],
+            'audio': sample_audio_data,
             'label': label,
             'audio_file': audio_file,
             'video_file': video_file,
             'audio_start': audio_start,
             'video_start': video_start
         }
+        yield sample
 
 
 def data_generator(data_dir, k=32, batch_size=64, random_state=20171021):
@@ -179,7 +208,7 @@ def on_epoch_end(self, epoch, logs=None):
 
 #def train(train_csv_path, model_id, output_dir, num_epochs=150, epoch_size=512,
 def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512,
           batch_size=64, validation_size=1024, num_streamers=16,
-          random_seed=20171021, verbose=False):
+          random_state=20171021, verbose=False, checkpoint_interval=100):
     m, inputs, outputs = construct_cnn_L3_orig()
     loss = 'binary_crossentropy'
     metrics = ['accuracy']
@@ -208,6 +237,7 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512,
         json.dump(model_json, fd, indent=2)
 
     weight_path = os.path.join(model_dir, 'model.h5')
+    checkpoint_weight_path = os.path.join(model_dir, 'model.{epoch:02d}.h5')
 
     cb = []
     cb.append(keras.callbacks.ModelCheckpoint(weight_path,
@@ -215,6 +245,10 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512,
                                               verbose=1,))
                                               #monitor=monitor))
 
+    cb.append(keras.callbacks.ModelCheckpoint(checkpoint_weight_path,
+                                              #monitor=monitor,
+                                              period=checkpoint_interval))
+
     history_checkpoint = os.path.join(model_dir, 'history_checkpoint.pkl')
     cb.append(LossHistory(history_checkpoint))
 
@@ -223,11 +257,12 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512,
                                         separator=','))
 
 
+    print('Setting up data generator...')
     train_gen = data_generator(
         #train_csv_path,
         train_data_dir,
         batch_size=batch_size,
-        random_seed=random_seed,
+        random_state=random_state,
         k=num_streamers)
 
     train_gen = pescador.maps.keras_tuples(train_gen,
@@ -269,96 +304,3 @@ def train(train_data_dir, model_id, output_dir, num_epochs=150, epoch_size=512,
     #    json.dump(results, fp, indent=2)
 
     print('Done!')
-
-
-def parse_arguments():
-    """
-    Parse arguments from the command line
-
-
-    Returns:
-        args: Argument dictionary
-              (Type: dict[str, *])
-    """
-    parser = argparse.ArgumentParser(description='Train an L3-like audio-visual correspondence model')
-
-    parser.add_argument('-e',
-                        '--num-epochs',
-                        dest='num_epochs',
-                        action='store',
-                        type=int,
-                        default=150,
-                        help='Maximum number of training epochs')
-
-    parser.add_argument('-es',
-                        '--epoch-size',
-                        dest='epoch_size',
-                        action='store',
-                        type=int,
-                        default=512,
-                        help='Number of training batches per epoch')
-
-    parser.add_argument('-b',
-                        '--batch-size',
-                        dest='batch_size',
-                        action='store',
-                        type=int,
-                        default=64,
-                        help='Number of training examples per batch')
-
-    parser.add_argument('-v',
-                        '--validation-size',
-                        dest='validation_size',
-                        action='store',
-                        type=int,
-                        default=1024,
-                        help='Number of trianing examples in the validation set')
-
-    parser.add_argument('-s',
-                        '--num-streamers',
-                        dest='num_streamers',
-                        action='store',
-                        type=int,
-                        default=32,
-                        help='Number of pescador streamers that can be open concurrently')
-
-    parser.add_argument('-r',
-                        '--random-seed',
-                        dest='random_seed',
-                        action='store',
-                        type=int,
-                        default=20171021,
-                        help='Random seed used to set the RNG state')
-
-    parser.add_argument('-v',
-                        '--verbose',
-                        dest='verbose',
-                        action='store_true',
-                        default=False,
-                        help='If True, print detailed messages')
-
-    """
-    parser.add_argument('train_csv_path',
-                        action='store',
-                        type=str,
-                        help='Path to training csv file')
-    """
-    parser.add_argument('train_data_dir',
-                        action='store',
-                        type=str,
-                        help='Path to directory where training subset files are stored')
-
-    parser.add_argument('model_id',
-                        action='store',
-                        type=str,
-                        help='Identifier for this model')
-
-    parser.add_argument('output_dir',
-                        action='store',
-                        type=str,
-                        help='Path to directory where output files will be stored')
-
-
-    return vars(parser.parse_args())
-
-
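For intuition on the new image sampling: l3_frame_scaling resizes the frame so its short side becomes 256 pixels, then takes a random 224x224 crop, per the augmentation in the paper. A worked sketch of the arithmetic on an illustrative 512x768 frame (numbers are hypothetical; no actual resize performed):

    import random

    nx, ny = 512, 768                                      # illustrative frame height/width
    scaling = 256.0 / min(nx, ny)                          # 0.5
    new_nx, new_ny = int(scaling * nx), int(scaling * ny)  # (256, 384); short side is now 256
    start_x = random.randrange(new_nx - 224)               # 0..31
    start_y = random.randrange(new_ny - 224)               # 0..159
    # the crop spans [start_x:start_x+224, start_y:start_y+224]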
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..e0a4447
--- /dev/null
+++ b/train.py
@@ -0,0 +1,97 @@
+import argparse
+from l3embedding.train import *
+
+
+def parse_arguments():
+    """
+    Parse arguments from the command line
+
+
+    Returns:
+        args: Argument dictionary
+              (Type: dict[str, *])
+    """
+    parser = argparse.ArgumentParser(description='Train an L3-like audio-visual correspondence model')
+
+    parser.add_argument('-e',
+                        '--num-epochs',
+                        dest='num_epochs',
+                        action='store',
+                        type=int,
+                        default=150,
+                        help='Maximum number of training epochs')
+
+    parser.add_argument('-es',
+                        '--epoch-size',
+                        dest='epoch_size',
+                        action='store',
+                        type=int,
+                        default=512,
+                        help='Number of training batches per epoch')
+
+    parser.add_argument('-bs',
+                        '--batch-size',
+                        dest='batch_size',
+                        action='store',
+                        type=int,
+                        default=64,
+                        help='Number of training examples per batch')
+
+    parser.add_argument('-vs',
+                        '--validation-size',
+                        dest='validation_size',
+                        action='store',
+                        type=int,
+                        default=1024,
+                        help='Number of training examples in the validation set')
+
+    parser.add_argument('-s',
+                        '--num-streamers',
+                        dest='num_streamers',
+                        action='store',
+                        type=int,
+                        default=32,
+                        help='Number of pescador streamers that can be open concurrently')
+
+    parser.add_argument('-r',
+                        '--random-state',
+                        dest='random_state',
+                        action='store',
+                        type=int,
+                        default=20171021,
+                        help='Random seed used to set the RNG state')
+
+    parser.add_argument('-v',
+                        '--verbose',
+                        dest='verbose',
+                        action='store_true',
+                        default=False,
+                        help='If True, print detailed messages')
+
+    """
+    parser.add_argument('train_csv_path',
+                        action='store',
+                        type=str,
+                        help='Path to training csv file')
+    """
+    parser.add_argument('train_data_dir',
+                        action='store',
+                        type=str,
+                        help='Path to directory where training subset files are stored')
+
+    parser.add_argument('model_id',
+                        action='store',
+                        type=str,
+                        help='Identifier for this model')
+
+    parser.add_argument('output_dir',
+                        action='store',
+                        type=str,
+                        help='Path to directory where output files will be stored')
+
+
+    return vars(parser.parse_args())
+
+
+if __name__ == '__main__':
+    train(**(parse_arguments()))
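With the argparse front end split into its own script, a training run can be launched from the shell, e.g. (paths and the model id are placeholders; the flags mirror the definitions above):

    python train.py -e 150 -es 512 -bs 64 -s 32 -r 20171021 \
        /path/to/train_data my_model /path/to/output

or, equivalently, by calling the library function directly:

    from l3embedding.train import train

    train('/path/to/train_data', 'my_model', '/path/to/output',
          num_epochs=150, epoch_size=512, batch_size=64,
          num_streamers=32, random_state=20171021)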