diff --git a/cortex/datasets/__init__.py b/cortex/datasets/__init__.py index f7efe01..354c36f 100644 --- a/cortex/datasets/__init__.py +++ b/cortex/datasets/__init__.py @@ -42,12 +42,16 @@ def resolve(c): from .basic.caltech import CALTECH from .basic.uci import UCI from .basic.cifar import CIFAR + from .basic.europarl import Europarl + from .basic.voc import VOC r_dict = { 'mnist': MNIST, 'cifar': CIFAR, 'caltech': CALTECH, - 'uci': UCI + 'uci': UCI, + 'europarl': Europarl, + 'voc': VOC } C = r_dict.get(c, None) @@ -70,7 +74,7 @@ def make_one_hot(Y, n_classes=None): class_list = np.unique(Y).tolist() n_classes = len(class_list) else: - class_list = range(n_classes) + class_list = range(0, n_classes) if Y.ndim == 2: reshape = Y.shape @@ -86,7 +90,7 @@ def make_one_hot(Y, n_classes=None): i = class_list.index(Y[idx]) except ValueError: raise ValueError('Class list is missing elements') - O[idx, i] = 1.; + O[idx, i] = 1. if reshape is not None: O = O.reshape(reshape + (n_classes,)) @@ -176,7 +180,7 @@ def dataset_factory(resolve_dataset, dataset=None, split=[0.7, 0.2, 0.1], valid_batch_size=valid_batch_size, test_batch_size=test_batch_size, **dataset_args) else: - train, valid, test, idx = C.factory( + train, valid, test, idx = C.factory( split=split, idx=idx, batch_sizes=[train_batch_size, valid_batch_size, test_batch_size], **dataset_args) @@ -367,6 +371,7 @@ def __init__(self, data, distributions=None, labels='label', name=None, self.balance = balance self.dims = dict() + self.dimsall = dict() if distributions is None: self.distributions = dict() else: @@ -391,6 +396,7 @@ def __init__(self, data, distributions=None, labels='label', name=None, 'number of samples (shape[0]), ' '(%d vs %d)' % (self.n, v.shape[0])) self.dims[k] = v.shape[1] + self.dimsall[k] = v.shape if not k in self.distributions.keys(): self.distributions[k] = 'binomial' diff --git a/cortex/datasets/basic/europarl.py b/cortex/datasets/basic/europarl.py new file mode 100644 index 0000000..5e66197 --- /dev/null +++ b/cortex/datasets/basic/europarl.py @@ -0,0 +1,411 @@ +''' +Europarl dataset for machine translation. + +Currently only supports fr-en datasets. +''' + +from collections import defaultdict +from functools import partial +from guppy import hpy +import logging +import numpy as np +from os import path +from progressbar import ( + Bar, + Percentage, + ProgressBar, + Timer +) +import string + +from .. import BasicDataset, make_one_hot +from ...utils import floatX, intX +from ...utils.tools import resolve_path + +np.set_printoptions(threshold=np.nan) +logger = logging.getLogger(__name__) + + +class Europarl(BasicDataset): + '''Europarl dataset itterator. + + Attributes: + max_sentence (int): Maximum sentence length. + max_length (int): Maximum number of sentences. + max_words (int): Maximum size of vocabulary. + english_to_french (bool): If true English is under name key, and French under label key, else reversed. + debug (bool): If true restricts max_length to 1000. + + ''' + _PAD = 0 + _BEG = 1 + _END = 2 + _UNK = 3 + table = string.maketrans('', '') + + def __init__(self, source=None, english_to_french=True, + name='europarl', out_path=None, max_words=5000, + max_sentence=30, max_length=7000, debug=False, **kwargs): + """ + Args: + source (str): Path to where the europarl data is stored. + english_to_french (bool): True for English input French labels, False for reverse. + name (str): Name of dataset. + out_path (str): Path to save outs. + max_words (int): Maximum vocab size, extra words are marked unknown. 
+            max_sentence (int): Maximum sentence length; longer sentences are ignored.
+            max_length (int): Maximum number of sentences.
+            debug (bool): If True, restricts max_length to 1000.
+            **kwargs:
+        """
+
+        self.logger = logging.getLogger(
+            '.'.join([self.__module__, self.__class__.__name__]))
+        self.logger.info('Loading %s from %s' % (name, source))
+
+        if source is None:
+            raise ValueError('No source file provided.')
+        source = resolve_path(source)
+
+        self.max_sentence = max_sentence
+        self.max_length = max_length
+        self.max_words = max_words
+        self.english_to_french = english_to_french
+
+        if debug:
+            self.max_length = 1000
+
+        X, Y, Mx, My = self.get_data(source)
+        data = {name: X,
+                'label': Y,
+                'mask_in': Mx,
+                'mask_out': My}
+        distributions = {name: 'multinomial',
+                         'label': 'multinomial',
+                         'mask_in': None,
+                         'mask_out': None}
+
+        super(Europarl, self).__init__(data, distributions=distributions,
+                                       name=name, one_hot=False, **kwargs)
+
+        self.dimsall['europarl'] = self.dimsall['europarl'][0], self.dimsall['europarl'][1], self.nX_tokens
+        self.dimsall['label'] = self.dimsall['label'][0], self.dimsall['label'][1], self.nY_tokens
+
+        self.out_path = out_path
+
+        if self.shuffle:
+            self.randomize()
+
+    def slice_data(self, idx, data=None):
+        '''Restricts the dataset instance to the given indices.
+
+        Args:
+            idx (list): Indices of data to be kept.
+            data (dict): Data to be sliced and kept.
+
+        '''
+        if data is None: data = self.data
+        for k, v in data.iteritems():
+            self.data[k] = v[idx]
+        self.n = len(idx)
+
+    def get_data(self, source):
+        # String forms of the special tokens; their ids are fixed by
+        # _PAD, _BEG, _END and _UNK above.
+        special_tokens = {
+            '<pad>': self._PAD, '<beg>': self._BEG,
+            '<end>': self._END, '<unk>': self._UNK}
+
+        def preprocess(s):
+            '''Preprocesses a string.
+
+            Args:
+                s (str): string to be preprocessed.
+
+            Returns:
+                list: lowercased words with punctuation stripped.
+
+            '''
+            return s.lower().translate(self.table, string.punctuation).split()
+
+        def find_long_sentences(epath, fpath, n_lines):
+            with open(epath) as e:
+                with open(fpath) as f:
+                    e.seek(0)
+                    f.seek(0)
+                    too_long_indices = []
+                    for i, eSentence, fSentence in zip(range(0, n_lines), e, f):
+                        if len(preprocess(eSentence)) > self.max_sentence or len(preprocess(fSentence)) > self.max_sentence:
+                            too_long_indices.append(i)
+            return too_long_indices
+
+        def make_dictionary(sentences, n_lines, max_words=None, too_long_indices=None):
+            '''Forms a dictionary from the words in sentences.
+
+            If there are more words than max_words, only the most frequent
+            ones are kept; the rest map to <unk>.
+
+            Args:
+                sentences (file handle): iterable of sentences.
+                n_lines (int): number of lines in the file.
+                max_words (Optional[int]): maximum number of words. Default
+                    is self.max_words.
+                too_long_indices (Optional[list]): line indices to skip.
+
+            Returns:
+                dict: word string to token dictionary.
+                int: length of the longest kept sentence.
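+
+            Example (illustrative sketch; the file name and the exact
+            special-token strings are assumptions, not from the original)::
+
+                with open('sentences.en') as f:
+                    d, max_len = make_dictionary(f, n_lines=1000)
+                # d maps special tokens to ids 0-3 and the most frequent
+                # words to ids 4 .. max_words + 3, e.g.
+                # {'<pad>': 0, '<beg>': 1, '<end>': 2, '<unk>': 3, 'the': 4, ...}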
+
+            '''
+            self.logger.info('Forming dictionary')
+            if max_words is None: max_words = self.max_words
+            if too_long_indices is None: too_long_indices = []
+
+            count_dict = defaultdict(int)
+
+            widgets = ['Counting words', ' (', Timer(), ') [', Percentage(), ']']
+            pbar = ProgressBar(widgets=widgets, maxval=n_lines).start()
+
+            max_len = 0
+            for i, sentence in zip(range(0, n_lines), sentences):
+                ps = preprocess(sentence)
+                l = len(ps)
+                if l <= self.max_sentence and i not in too_long_indices:
+                    for word in ps:
+                        count_dict[word] += 1
+                    max_len = max(l, max_len)
+                pbar.update(i)
+
+            count_keys_sorted = sorted(
+                count_dict, key=count_dict.get, reverse=True)
+            vals_sorted = sorted(count_dict.values(), reverse=True)
+            keys = count_keys_sorted[:max_words]
+            omit_freq = sum(vals_sorted[max_words:]) / float(sum(vals_sorted))
+            self.logger.info('Setting %d words as <unk> with total frequency '
+                             '%.3g.'
+                             % (max(0, len(count_keys_sorted) - max_words),
+                                omit_freq))
+            values = range(4, len(keys) + 4)
+
+            d = dict()
+            d.update(**special_tokens)
+            d.update(**dict(zip(keys, values)))
+            return d, max_len
+
+        def tokenize(sentence, d, pad_length):
+            '''Tokenizes a sentence using the dictionary.
+
+            If the sentence is longer than max_sentence, returns [].
+
+            Args:
+                sentence (str): sentence to be tokenized.
+                d (dict): token dictionary.
+                pad_length (int): sentences are padded to pad_length + 2
+                    tokens (the extra 2 are the <beg> and <end> tokens).
+
+            Returns:
+                list: tokenized sentence as a list.
+
+            '''
+            ps = preprocess(sentence)
+            if len(ps) > self.max_sentence:
+                return []
+            s = [self._BEG] + [d.get(w, self._UNK) for w in ps] + [self._END]
+            s += [self._PAD] * max(0, pad_length + 2 - len(s))
+            return s
+
+        def read_and_tokenize(file_path, max_length, too_long_indices=None):
+            '''Reads and tokenizes a file of sentences.
+
+            Args:
+                file_path (str): path to file.
+                max_length (int): maximum number of lines to read.
+                too_long_indices (Optional[list]): line indices to skip when
+                    building the dictionary.
+
+            Returns:
+                list: list of tokenized sentences.
+                dict: token dictionary.
+                dict: reverse dictionary.
+
+            '''
+            self.logger.info('Reading sentences from %s' % file_path)
+            with open(file_path) as f:
+                n_lines = min(sum(1 for line in f), max_length)
+                f.seek(0)
+                d, max_len = make_dictionary(f, n_lines, too_long_indices=too_long_indices)
+                r_d = dict((v, k) for k, v in d.iteritems())
+                tokenized_sentences = []
+
+                f.seek(0)
+                self.logger.info('Tokenizing sentences from %s' % file_path)
+                widgets = ['Tokenizing sentences',
+                           ' (', Timer(), ') [', Percentage(), ']']
+                pbar = ProgressBar(widgets=widgets, maxval=n_lines).start()
+                for i, sentence in zip(range(0, n_lines), f):
+                    ts = tokenize(sentence, d, max_len)
+                    assert len(ts) <= self.max_sentence + 2, (ts, len(ts))
+                    tokenized_sentences.append(ts)
+                    pbar.update(i)
+            return tokenized_sentences, d, r_d
+
+        def match_and_trim(sentences_a, sentences_b):
+            '''Matches two lists of sentences and removes incomplete pairs.
+
+            If either sentence of a pair is `[]`, the pair is removed.
+
+            Args:
+                sentences_a (list).
+                sentences_b (list).
+ + Returns: + list: new sentences_a + list: new sentences_b + + ''' + self.logger.info('Matching datasets and trimming') + if len(sentences_a) != len(sentences_b): + raise TypeError('Sentence lists are different lengths.') + + sentences_a_tr = [] + sentences_b_tr = [] + widgets = ['Matching sentences', + ' (', Timer(), ') [', Percentage(), ']'] + trimmed = 0 + pbar = ProgressBar(widgets=widgets, maxval=len(sentences_a)).start() + for i, (s_a, s_b) in enumerate(zip(sentences_a, sentences_b)): + if len(s_a) > 0 and len(s_b) > 0: + sentences_a_tr.append(s_a) + sentences_b_tr.append(s_b) + else: + trimmed += 1 + pbar.update(i) + self.logger.debug('Trimmed %d sentences' % trimmed) + + return sentences_a_tr, sentences_b_tr + + too_long_indices = find_long_sentences(path.join(path.join(source, 'europarl-v7.fr-en.en')), + path.join(path.join(source, 'europarl-v7.fr-en.fr')), self.max_length) + + fr_sentences, self.fr_dict, self.fr_dict_r = read_and_tokenize( + path.join(path.join(source, 'europarl-v7.fr-en.fr')), self.max_length, too_long_indices=too_long_indices) + + en_sentences, self.en_dict, self.en_dict_r = read_and_tokenize( + path.join(path.join(source, 'europarl-v7.fr-en.en')), self.max_length, too_long_indices=too_long_indices) + + fr_sentences, en_sentences = match_and_trim(fr_sentences, en_sentences) + + if self.english_to_french: + X = np.array(en_sentences).astype(intX) + Y = np.array(fr_sentences).astype(intX) + else: + X = np.array(fr_sentences).astype(intX) + Y = np.array(en_sentences).astype(intX) + + self.nX_tokens = len(np.unique(X).tolist()) + int(3 not in np.unique(X).tolist()) + self.nY_tokens = len(np.unique(Y).tolist()) + int(3 not in np.unique(Y).tolist()) + + self.logger.info('Creating masks') + Mx = (X != 0).astype(intX) + My = (Y != 0).astype(intX) + + return X, Y, Mx, My + + @staticmethod + def factory(C=None, split=None, idx=None, batch_sizes=None, **kwargs): + ''' + + Args: + C: Data iterator to use, defaults to Europarl. + split: List of percentage values for train, valid, and test datasets respectively. + idx: List of indices for train, valid and test datasets respectively. + batch_sizes: List of batch sizes for train, valid, and test datasets respectively. + **kwargs: Other arguments to be passed to the data iterator. + + Returns: Train, valid, test,(datasets) indices(list of indices for data of each). + + ''' + + if C is None: + C = Europarl + europarl = C(batch_size=10, **kwargs) + if hasattr(europarl, 'logger'): + logger = europarl.logger + europarl.logger = None + + if idx is None: + logger.info('Splitting dataset into ratios %r' % split) + if round(np.sum(split), 5) != 1. 
or len(split) != 3:
+                raise ValueError(split)
+
+            if europarl.balance:
+                raise NotImplementedError()
+            else:
+                split_idx = []
+                accum = 0
+                for s in split:  # Create split indices from the percentages
+                    s_i = int(s * europarl.n + accum)
+                    split_idx.append(s_i)
+                    accum += s_i
+                idx = range(europarl.n)
+
+                train_idx = idx[:split_idx[0]]
+                valid_idx = idx[split_idx[0]:split_idx[1]]
+                test_idx = idx[split_idx[1]:]
+                idx = [train_idx, valid_idx, test_idx]
+        else:
+            logger.info('Splitting dataset into ratios %.2f / %.2f / %.2f '
+                        'using given indices'
+                        % tuple(len(idx[i]) / float(europarl.n)
+                                for i in range(3)))
+
+        # Shouldn't have a different number of batch sizes than datasets
+        assert len(batch_sizes) == len(idx)
+
+        datasets = []
+        modes = ['train', 'valid', 'test']
+        data = europarl.data
+        europarl.data = dict()
+        # Create correctly restricted copies of the dataset
+        for i, bs, mode in zip(idx, batch_sizes, modes):
+            if bs is None:
+                dataset = None
+            else:
+                dataset = europarl.copy()
+                dataset.slice_data(i, data=data)
+                dataset.batch_size = bs
+                dataset.logger = logger
+                dataset.mode = mode
+            datasets.append(dataset)
+
+        return datasets + [idx]
+
+    def next(self, batch_size=None):
+        rval = super(Europarl, self).next(batch_size=batch_size)
+        rval[self.name] = make_one_hot(rval[self.name],
+                                       n_classes=self.nX_tokens)
+        rval['label'] = make_one_hot(rval['label'],
+                                     n_classes=self.nY_tokens)
+        return rval
+
+    def save_images(self, image, english=True, out_path=None):
+        '''Prints or writes tokenized sentences back as words.
+
+        Uses the reverse dictionary.
+
+        '''
+        print image.shape
+        sentences = []
+        for sentence in image:
+            sen = []
+            for token in sentence:
+                if english:
+                    sen.append(self.en_dict_r[token.argmax()])
+                else:
+                    sen.append(self.fr_dict_r[token.argmax()])
+            sentences.append(sen)
+
+        if out_path is None:
+            for sentence in sentences:
+                print ' '.join(sentence) + '.'
+
+        else:
+            with open(out_path, 'w') as f:
+                for sentence in sentences:
+                    f.write(' '.join(sentence) + '.')
+
+        return sentences
diff --git a/cortex/datasets/basic/tests/__init__.py b/cortex/datasets/basic/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/cortex/datasets/basic/tests/test_europarl.py b/cortex/datasets/basic/tests/test_europarl.py
new file mode 100644
index 0000000..ba13101
--- /dev/null
+++ b/cortex/datasets/basic/tests/test_europarl.py
@@ -0,0 +1,27 @@
+"""
+Tests europarl.py; try with `nosetests test_europarl.py`.
+
+Checks that the shapes split properly and that the masks line up.
+"""
+
+from cortex.datasets.basic.europarl import Europarl
+
+
+def test_europarl(split=[0.7, 0.2, 0.1], batch_sizes=[10, 10, 10]):
+    train, valid, test, idx = Europarl.factory(split=split, batch_sizes=batch_sizes,
+                                               debug=True, source='/export/mialab/users/jjohnson/data/basic/europarl')
+
+    for i, dataset in enumerate([train, valid, test]):
+
+        for key in ['europarl', 'mask_in', 'label', 'mask_out']:  # Test shapes.
+            assert dataset.data[key].shape == (int(idx[2][-1] * split[i]) + 1, 32)
+
+        for data in dataset:  # Test batch sizes.
+            assert len(data[key]) == batch_sizes[i]
+
+        for k in idx[i]:  # Test masks.
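+            # get_data builds the masks as (tokens != 0), so every mask entry
+            # should be truthy exactly where the corresponding token id is.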
+            relative_k = k - idx[i][0]
+            for j in range(0, len(dataset.data['europarl'][relative_k])):
+                assert bool(dataset.data['europarl'][relative_k][j]) == bool(dataset.data['mask_in'][relative_k][j])
+            for j in range(0, len(dataset.data['label'][relative_k])):
+                assert bool(dataset.data['label'][relative_k][j]) == bool(dataset.data['mask_out'][relative_k][j])
diff --git a/cortex/datasets/basic/tests/test_voc.py b/cortex/datasets/basic/tests/test_voc.py
new file mode 100644
index 0000000..356b1a1
--- /dev/null
+++ b/cortex/datasets/basic/tests/test_voc.py
@@ -0,0 +1,9 @@
+"""
+Tests voc.py.
+"""
+
+from ..voc import VOC
+
+
+def test_voc():
+    test = VOC(source='$data', batch_size=10, chunk_size=15)
diff --git a/cortex/datasets/basic/voc.py b/cortex/datasets/basic/voc.py
new file mode 100644
index 0000000..b234a71
--- /dev/null
+++ b/cortex/datasets/basic/voc.py
@@ -0,0 +1,219 @@
+"""
+Data iterator for VOC classification data.
+"""
+
+from .. import BasicDataset
+from os import path
+import logging
+from ...utils.tools import resolve_path
+from PIL import Image
+import random
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+class VOC(BasicDataset):
+    """Dataset iterator for VOC classification data (designed for use with Pyramid RNNs).
+
+    Attributes:
+        mode (str): resolved VOC image-set name ('train', 'trainval' or 'val').
+        chunk_size (int): side length of the square chunks (always odd).
+        chunks (int): number of chunks to draw.
+        images_loaded (int): number of images to load.
+        start_image (int): index of the first image to load.
+
+    """
+
+    def __init__(self, images_loaded=10, chunk_size=5, out_path=None, chunks=1000,
+                 start_image=0, mode='train', source=None, name='voc', **kwargs):
+        """
+
+        Args:
+            images_loaded (int): How many images to load.
+            chunk_size (int): Dimension of the chunks to be made.
+            mode (str): Type of data to load: train, valid or test.
+            source (str): Path to the directory containing VOCdevkit.
+            name (str): Name of the iterator.
+            **kwargs:
+        """
+
+        self.mode_resolve = {'train': 'train', 'valid': 'trainval', 'test': 'val'}
+        self.mode = self.mode_resolve[mode]
+
+        self.logger = logging.getLogger(
+            '.'.join([self.__module__, self.__class__.__name__]))
+        self.logger.info('Loading %s from %s as %s' % (name, source, self.mode))
+
+        if source is None:
+            raise ValueError('No source file provided.')
+        source = resolve_path(source)
+
+        self.start_image = start_image
+        self.chunks = chunks
+        self.images_loaded = images_loaded
+        if chunk_size % 2:
+            self.chunk_size = chunk_size
+        else:
+            self.logger.info('Using %d + 1 to get an odd chunk size.'
% chunk_size) + self.chunk_size = chunk_size + 1 + + X, Y = self.get_data(source, self.mode) + data = {name: X, 'label': Y} + distributions = {name: 'multinomial', 'label': 'multinomial'} + + super(VOC, self).__init__(data, distributions=distributions, + name=name, **kwargs) + + self.out_path = out_path + + if self.shuffle: + self.randomize() + + @staticmethod + def factory(split=None, idx=None, batch_sizes=None, **kwargs): + if split is None: + raise NotImplementedError('Idx are not supported for this dataset yet.') + if batch_sizes is None: + raise ValueError('Need batch sizes') + + chunks = kwargs['chunks'] + chunk_ammounts = [] + for val in split: + chunk_ammounts.append(int(chunks * val)) + + train = VOC(images_loaded=10, start_image=0, chunk_size=kwargs['chunk_size'], + chunks=chunk_ammounts[0], mode='train', source=kwargs['source'], + batch_size=batch_sizes[0]) + valid = VOC(images_loaded=5, start_image=10, chunk_size=kwargs['chunk_size'], + chunks=chunk_ammounts[1], mode='valid', source=kwargs['source'], + batch_size=batch_sizes[1]) + test = VOC(images_loaded=5, start_image=15, chunk_size=kwargs['chunk_size'], + chunks=chunk_ammounts[2], mode='test', source=kwargs['source'], + batch_size=batch_sizes[2]) + + accum = 0 + idx = [] + for val in chunk_ammounts: + idx.append(range(accum, accum + val)) + accum += val + + return train, valid, test, idx + + def get_data(self, source, mode): + """Gets data given source, chunks it, and returns chunks with center labels. + + Args: + source (str): File path to directory containing VOCdevkit. + mode (str): Mode of data, eg. train, valid, test. + + Returns: + + """ + rand = random.Random() + buff_dist = (self.chunk_size + 1)/2 + + def get_unique(pixels): + """Helper function for get_data, returns the number of unique classifiers in an image. + + Args: + pixels (list of lists): Pixel classifier values. + + Returns (int): Number of unique classifiers in image. + + """ + unique = [] + for l in pixels: + for j in l: + if j not in unique: + unique.append(j) + return len(unique) + + def image_to_pixels(im): + """ + + Args: + im (Image): Image object form PIL + + Returns (list of lists): Pixels + + """ + pixels = list(im.getdata()) + width, height, = im.size + return [pixels[i * width:(i + 1) * width] for i in xrange(height)] + + def project_to_binary(pixels): + """Helper function for get_data, returns binary version of input pixels. + + Args: + pixels (list of lists of ints): pixels of an image. + + Returns: Pixels projected to binary. + + """ + retval = [] + for ln in pixels: + retval.append([int(bool(val)) for val in ln]) + return retval + + def get_random_chunk(pixels_data, pixels_label): + """Helper function for get_data, gets random chunk from data, and returns label for center. + + Args:self, VOC + pixels_data (list of lists): Image pixels of data. + pixels_label (list of lists): Image pixels of label. 
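+
+            With buff_dist = (chunk_size + 1) / 2, the slices below give a
+            chunk_size x chunk_size window centred on the random point (y, x),
+            and label_val is the binary label of that centre pixel.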
+ + Returns: data_chunk (list of lists subsection of pixels_data), label_val (value of label at center of chunk) + + """ + y = rand.randint(buff_dist, len(pixels_data) - buff_dist) + x = rand.randint(buff_dist, len(pixels_data[0]) - buff_dist) + data_chunk = [] + label_val = pixels_label[y][x] + for index in range(y - buff_dist + 1, y + buff_dist): + data_chunk.append(pixels_data[index][x - buff_dist + 1:x + buff_dist]) + assert len(data_chunk) == self.chunk_size and len(data_chunk[0]) == self.chunk_size + return data_chunk, label_val + + names = [] + with open(source + '/basic/VOCdevkit/VOC2010/ImageSets/Segmentation/' + mode + '.txt') as f: + for line in f: + names.append(line[:-1]) + + self.data_images = [] + self.label_images = [] + images_loaded = 0 + for name in names: + if images_loaded < (self.images_loaded + self.start_image) and images_loaded >= self.start_image: + label_im = Image.open(source + '/basic/VOCdevkit/VOC2010/SegmentationObject/' + name + '.png') + label_pixels = image_to_pixels(label_im) + if get_unique(label_pixels) == 3: + self.label_images.append(project_to_binary(label_pixels)) + + data_im = Image.open(source + '/basic/VOCdevkit/VOC2010/JPEGImages/' + name + '.jpg').convert('L') + self.data_images.append(image_to_pixels(data_im)) + + images_loaded += 1 + elif images_loaded < self.start_image: + images_loaded += 1 + else: + break + + X = [] + Y = [] + for i in xrange(0, self.chunks): + k = rand.randint(0, len(self.data_images) - 1) + x, y = get_random_chunk(self.data_images[k], self.label_images[k]) + X.append(np.array(x, dtype='float32')/255.0) # Normalize + Y.append(np.array(y, dtype='float32')) + + assert len(X) == self.chunks and len(Y) == self.chunks + + return np.array(X), np.array(Y) + + def next(self): + rval = super(VOC, self).next() + + rval['label'] = np.array([b[1] for b in rval['label']]) + + return rval + + def save_images(self, image, ith_image, out_path=None): + + plt.matshow(image) + + plt.savefig(out_path + ith_image + '.png') diff --git a/cortex/demos/demos_basic/pyramid_voc.py b/cortex/demos/demos_basic/pyramid_voc.py new file mode 100644 index 0000000..dbeb736 --- /dev/null +++ b/cortex/demos/demos_basic/pyramid_voc.py @@ -0,0 +1,120 @@ +""" +Demo for Pyramid RNN on VOC classification dataset. + +Try with 'cortex-run pyramid_voc.py' +""" + +""" +Demo for next word guessing using an RNN. 
+ +Try with cortex-run 'rnn_europarl.py ' +""" + +from collections import OrderedDict +import theano +import numpy as np +from cortex.models.pyramid_rnn import Pyramid_RNN +from cortex.utils import intX, floatX, logger +from cortex.datasets import resolve as resolve_dataset +import theano.tensor as T +import matplotlib.pyplot as plt + + +# Default arguments +_learning_args = dict( + learning_rate=0.01, + learning_rate_scheduler=None, + optimizer='rmsprop', + optimizer_args=dict(), + epochs=15, + valid_key='-sum log p(x | y)', + valid_sign='+', + excludes=[] +) + +_dataset_args = dict( + train_batch_size=10, + valid_batch_size=10, + # test_batch_size=10, + debug=False, + dataset='voc', + chunks=1000, + distribution='multinomial', + chunk_size=25, + source='$data' +) + +_model_args = dict( + dim_h=None, + l2_decay=None, + mask_in='mask_in' +) + +pyramid_args = dict( + dim_hs=[25], + input_layer='voc', + output='label', +) + +extra_arg_keys = ['pyramid_args'] + +theano.config.on_unused_input = 'ignore' +# theano.config.optimizer = 'None' +# theano.config.exception_verbosity = 'high' +# theano.config.compute_test_value = 'warn' + + +def _build(module): + models = OrderedDict() + dataset = module.dataset + pyramid_args = module.pyramid_args + width = dataset.chunk_size + dim_in = 1 + dim_out = 1 + distribution = dataset.distributions[pyramid_args['output']] + + model = Pyramid_RNN.factory(dim_in=dim_in, dim_out=dim_out, distribution=distribution, + width=width, **pyramid_args) + + models['pyramid_rnn'] = model + return models + + +def _cost(module): + models = module.models + + X = module.inputs[module.dataset.name].swapaxes(0, 1) + Y = module.inputs['label'] + used_inputs = ['label', module.dataset.name] + + model = models['pyramid_rnn'] + + outputs, preact, updates = model(X) + + results = OrderedDict() + p = outputs['p'] + base_cost = model.neg_log_prob(Y, p[:, 0]).mean() + cost = base_cost + + constants = [] + + l2_decay = module.l2_decay + if l2_decay is not False and l2_decay > 0.: + module.logger.info('Adding %.5f L2 weight decay' % l2_decay) + l2_rval = model.l2_decay(l2_decay) + l2_cost = l2_rval.pop('cost') + cost += l2_cost + results['l2_cost'] = l2_cost + + # results['error'] = (Y * (1 - p)).sum(axis=1).mean() + results['-sum log p(x | y)'] = base_cost + results['cost'] = cost + + return used_inputs, results, updates, constants, outputs + + +def _vis(module, outputs, results): + out_path = module.out_path + + plt.matshow(np.zeros((10, 10))) + plt.savefig('Testp_plot.png') diff --git a/cortex/demos/demos_basic/rnn_europarl.py b/cortex/demos/demos_basic/rnn_europarl.py new file mode 100644 index 0000000..d2f764a --- /dev/null +++ b/cortex/demos/demos_basic/rnn_europarl.py @@ -0,0 +1,122 @@ +""" +Demo for next word guessing using an RNN. 
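+
+The RNN consumes one-hot Europarl tokens and is trained to predict each next
+token from the preceding ones; the 'mask_in' input flags the real
+(non-padding) positions of each sentence so padding can be ignored during the
+recurrence.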
+ +Try with cortex-run 'rnn_europarl.py ' +""" + +from collections import OrderedDict +import theano +import numpy as np +from cortex.models.rnn import SimpleRNN +from cortex.utils import intX, floatX +from cortex.datasets import resolve as resolve_dataset + + +# Default arguments +_learning_args = dict( + learning_rate=0.01, + learning_rate_scheduler=None, + optimizer='rmsprop', + optimizer_args=dict(), + epochs=100, + valid_key='-sum log p(x | y)', + valid_sign='+', + excludes=[] +) + +_dataset_args = dict( + train_batch_size=10, + valid_batch_size=10, + debug=False, + dataset='europarl', + distribution='multinomial', + source='$data/basic/europarl' +) + +_model_args = dict( + dim_h=None, + l2_decay=None, + mask_in='mask_in' +) + +simple_rnn_args = dict( + dim_h=1000, + input_layer='europarl', + output='label', + dropout=None +) + +extra_arg_keys = ['simple_rnn_args'] + +#theano.config.compute_test_value = 'warn' + +#theano.config.exception_verbosity = 'high' + +#theano.config.optimizer = 'None' + + +def _build(module): + models = OrderedDict() + dataset = module.dataset + simple_rnn_args = module.simple_rnn_args + dim_in = dataset.dimsall[dataset.name][2] + dim_out = dataset.dimsall[dataset.name][2] + distribution = dataset.distributions[simple_rnn_args['output']] + + model = SimpleRNN.factory(dim_in=dim_in, dim_out=dim_out, distribution=distribution, **simple_rnn_args) + + models['rnn'] = model + return models + + +def _cost(module): + models = module.models + + mask_in = module.inputs['mask_in'].transpose(1, 0) + X = module.inputs[module.dataset.name].transpose(1, 0, 2) + used_inputs = [module.dataset.name, 'mask_in'] + + model = models['rnn'] + outputs, preact, updates = model(X, m=mask_in) + + results = OrderedDict() + p = outputs['p'] + base_cost = model.neg_log_prob(X[1:], p[:-1]).sum(0).mean() + cost = base_cost + + constants = [] + + l2_decay = module.l2_decay + if l2_decay is not False and l2_decay > 0.: + module.logger.info('Adding %.5f L2 weight decay' % l2_decay) + l2_rval = model.l2_decay(l2_decay) + l2_cost = l2_rval.pop('cost') + cost += l2_cost + results['l2_cost'] = l2_cost + + # results['error'] = (Y * (1 - p)).sum(axis=1).mean() + results['-sum log p(x | y)'] = base_cost + results['cost'] = cost + + return used_inputs, results, updates, constants, outputs + + +def _viz(module, outputs, results, n_samples=10, n_steps=10): + out_path = module.out_path + out_path = None #### For testing purposes + n_tokens = int(module.dataset.dimsall[module.dataset.name][2]) + + pvals = np.zeros((n_samples, n_tokens)) + 1./float(n_tokens) + x0 = module.models['rnn'].trng.multinomial(pvals=pvals, dtype=floatX) + + outputs, updates = module.models['rnn'].sample(x0=x0, n_steps=n_steps) + + updates = theano.OrderedUpdates(updates) + + f_vis = theano.function([], outputs['x'], updates=updates) + + def f_analysis(): + out = f_vis() + return module.dataset.save_images(out, out_path=out_path) + + return f_analysis diff --git a/cortex/demos/demos_basic/tests/__init__.py b/cortex/demos/demos_basic/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cortex/models/pyramid_rnn.py b/cortex/models/pyramid_rnn.py new file mode 100644 index 0000000..a8cfe33 --- /dev/null +++ b/cortex/models/pyramid_rnn.py @@ -0,0 +1,233 @@ +""" +Module for 2 dimensional pyramid RNN layers. 
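+
+A Pyramid_RNN takes a square image chunk, rotates it through the four
+directions (see `rotate`), scans each rotation row by row with hidden units
+coupled to their left and right neighbours (the `T.roll` terms in `_step`),
+then sums the four directional outputs and feeds them to the output network
+to predict a label for the chunk (the centre pixel's class in the VOC demo).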
+""" + +from .rnn import RNN +import collections +import theano.tensor as T +import theano +import numpy as np +from ..utils import tools +import collections as coll +from ..utils import floatX + + +class Pyramid_RNN(RNN): + + def __init__(self, dim_in, dim_hs, width, dim_out=None, + output_net=None, input_net=None, name='pyramid', **kwargs): + + if dim_out is None: + self.dim_out = 1 + + self.width = width + + super(Pyramid_RNN, self).__init__(dim_in=dim_in, dim_hs=dim_hs, name=name, + output_net=output_net, input_net=input_net, **kwargs) + + @staticmethod + def factory(dim_in=None, dim_out=None, dim_hs=None, width=None, **kwargs): + '''Factory for creating MLPs for Pyramid_RNN and returning . + + Convenience to quickly create MLPs from dictionaries, linking all + relevant dimensions and distributions. + + Args: + dim_in (int): input dimension. + dim_hs (list): dimensions of recurrent units. + dim_out (Optional[int]): output dimension. If not provided, assumed + to be dim_in. + width (int): How wide the input block is. + + Returns: + RNN + + ''' + assert len(dim_hs) > 0 + if dim_out is None: + dim_out = 1 + mlps, kwargs = Pyramid_RNN.mlp_factory(dim_in, dim_out, dim_hs, **kwargs) + kwargs.update(**mlps) + + return Pyramid_RNN(dim_in, dim_hs, width, dim_out=dim_out, **kwargs) + + @staticmethod + def mlp_factory(dim_in, dim_out, dim_hs, o_dim_in=None, i_net=None, + o_net=None, c_net=None, data_distribution='binomial', + initialization=None, init_args=None, **kwargs): + '''Factory for creating MLPs for Pyramid RNN. + + Args: + dim_in (int): input dimension. + dim_out (int): output dimension. If not provided, assumed + to be dim_in. + dim_hs (list): dimensions of recurrent units. + o_dim_in (Optional[int]): optional input dimension for output + net. If not provided, then use the last hidden dim. + i_net (dict): input network args. + o_net (dict): output network args. + c_net (dict): conditional network args. + data_distribution (str): distribution of the output. + initialization (str): type of initialization. + init_args (dict): initialization keyword arguments. + **kwargs: extra keyword arguments. + + Returns: + dict: MLPs. + dict: extra keyword arguments. + + ''' + + mlps, kwargs = RNN.mlp_factory(dim_in=dim_in, dim_out=dim_out, dim_hs=dim_hs, **kwargs) + + return mlps, kwargs + + def set_params(self): + '''Initialize RNN parameters. + + ''' + self.params = collections.OrderedDict() + for i, dim_h in enumerate(self.dim_hs): + Ur = tools.norm_weight(3 * dim_h, dim_h) + self.params['Ur%d' % i] = Ur + + self.set_net_params() + + def _step(self, m, y, h_, Ur): + '''Step function for RNN call. + + Args: + m (T.tensor): masks. + y (T.tensor): inputs. + h_ (T.tensor): recurrent state. + Ur (theano.shared): recurrent connection. + + Returns: + T.tensor: next recurrent state. + + ''' + H_t = T.concatenate((h_, T.roll(h_, 1, 2), T.roll(h_, -1, 2)), 2) + preact = y + T.dot(H_t, Ur) + h = T.tanh(preact) + h = m * h + (1 - m) * h_ + return h + + def step_call(self, x, m, h0s, *params): + '''Step version of __call__ for scan + + Args: + x (T.tensor): input. + m (T.tensor): mask. + h0s (list): list of recurrent initial states. Calculated in this function now, ie NOT IMPLEMENTED + *params: list of theano.shared. + + Returns: + OrderedDict: dictionary of results. and now calculated h0s + + ''' + n_steps = (x.shape[0] + 1)/2 + n_samples = x.shape[1] + input = x + + updates = theano.OrderedUpdates() + + h0s = [] + hs = [] + output = [] + for k in range(0, 4): # Iterate through directions. 
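+            # Each pass rotates the chunk 90 degrees (via self.rotate below), so the
+            # same scan over the first (width + 1) / 2 rows runs once per direction.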
+ x = self.rotate(input, k)[:(self.width + 1)/2, :, :].astype('float32') # Rotates input + h0s.append([T.alloc(0, x.shape[1], self.width, dim_h).astype(floatX) for dim_h in self.dim_hs]) # Make h0 + for i, h0 in enumerate(h0s[k]): + seqs = [m[:, :, :, None]] + self.call_seqs(x, None, i, *params) + outputs_info = [h0] + non_seqs = [self.get_recurrent_args(*params)[i]] + h, updates_ = theano.scan( + self._step, + sequences=seqs, + outputs_info=outputs_info, + non_sequences=non_seqs, + name=self.name + '_recurrent_steps_%d' % i, + n_steps=n_steps) + hs.append(h) + x = h + updates += updates_ + output.append(h[-1, :, (self.width + 1)/2, :]) # Collect directional outputs. + + o_params = self.get_output_args(*params) + out_net_out = self.output_net.step_call(T.sum(output, 0), *o_params) # Sum different directions. + preact = out_net_out['z'] + p = out_net_out['p'] + + return coll.OrderedDict(hs=hs, p=p, z=preact), updates, h0s[0] + + def __call__(self, x, m=None, h0s=None, condition_on=None): + '''Call function. + + For learning RNNs. + + Args: + x (T.tensor): input sequence. window x batch x dim (a x b x a) where a is chunk size, b is batch size. + m (T.tensor): mask. window x batch. For masking in recurrent steps. NOT IMPLEMENTED + h0s (Optional[list]): initial h0s. NOT IMPLEMENTED + condition_on (Optional[T.tensor]): conditional for recurrent step. + + Returns: + OrderedDict: dictionary of results: hiddens, probabilities, and + preacts. + theano.OrderedUpdates. + + ''' + constants = [] + + if m is None: + m = T.ones(x.shape).astype(floatX) + + params = self.get_sample_params() + + results, updates, h0s = self.step_call(x, m, h0s, *params) + results['h0s'] = h0s + return results, updates, constants + + def call_seqs(self, x, condition_on, level, *params): + '''Prepares the input for `__call__`. + + Args: + x (T.tensor): input + condtion_on (T.tensor or None): tensor to condition recurrence on. + level (int): reccurent level. + *params: list of theano.shared. + + Returns: + list: list of scan inputs. + + ''' + + x = x[:, :, :, None] + if level == 0: + i_params = self.get_input_args(*params) + a = self.input_net.step_preact(x, *i_params) + else: + i_params = self.get_inter_args(level - 1, *params) + a = self.inter_nets[level - 1].step_preact(x, *i_params) + + return [a] + + def rotate(self, tensor, n_times): + """ + + Args: + tensor (theano tensor): Tensor to rotate along first and third axes. + n_times (int): Number of times to rotate. + + Returns: Tensor rotated n times about its first and third axes. + + """ + if n_times == 0: + return tensor + + retval = tensor.swapaxes(0, 2)[::-1] + + if n_times == 1: + return retval + elif n_times > 1: + return self.rotate(retval, n_times - 1) diff --git a/cortex/models/tests/test_pyramid_rnn.py b/cortex/models/tests/test_pyramid_rnn.py new file mode 100644 index 0000000..0bcc21b --- /dev/null +++ b/cortex/models/tests/test_pyramid_rnn.py @@ -0,0 +1,80 @@ +''' +Module for testing 2D pyramid RNN. 
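+
+Checks `_step` against a NumPy re-implementation and the full forward pass
+against a hand-rolled four-direction sweep built from the model's own weights.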
+''' + +from cortex.models.pyramid_rnn import Pyramid_RNN +import numpy as np +import theano +import theano.tensor as T + +theano.config.optimizer = 'None' + + +def test_build(dim_in=1, dim_h=17, width=13): + pyramid = Pyramid_RNN.factory(dim_in=dim_in, dim_hs=[dim_h], + width=width, dim_out=1) + pyramid.set_tparams() + + return pyramid + + +def test_step(pyramid=None, dim_in=1, dim_h=17, width=13): + if pyramid is None: + pyramid = test_build(dim_in=dim_in, dim_h=dim_h, width=width) + + m = theano.tensor.tensor3() + y = theano.tensor.tensor3() + h_ = theano.tensor.tensor3() + Ur = theano.tensor.matrix() + + activation = pyramid._step(m, y, h_, Ur) + f = theano.function([m, y, h_, Ur], activation) + + t = f(np.ones((10, width, dim_h), dtype='float32'), np.ones((10, width, dim_h), dtype='float32'), + np.ones((10, width, dim_h), dtype='float32'), pyramid.params['Ur0']) + + preact = np.ones((10, width, dim_h), dtype='float32') + \ + np.dot(np.ones((10, width, 3*dim_h), dtype='float32'), pyramid.params['Ur0']) + + n = np.tanh(preact) + + np.testing.assert_almost_equal(t, n) + + +def test_call(pyramid=None, dim_in=1, dim_h=17, width=13): + if pyramid is None: + pyramid = test_build(dim_in=dim_in, dim_h=dim_h, width=width) + + rng = np.random.RandomState() + + x = rng.randn(13, 10, 13) + + y = pyramid(x) + + f = theano.function([], y[0]['z']) + + # Now calculate what f should be using Numpy. + + outs =[] + for k in range(0, 4): # Iterate through directions + x = np.rot90(x.swapaxes(1, 2)).swapaxes(1, 2) # Rotate input + + dir_input = x[:(width + 1)/2, :, :, None] + dir_input = pyramid.input_net.params['b0'] + np.dot(dir_input, pyramid.input_net.params['W0']) + + h = np.zeros((x.shape[1], width, dim_h)) + Ur = pyramid.params['Ur0'] + + for layer in dir_input: # Iterate through height of pyramid + h_t = np.concatenate((h, np.roll(h, 1, 2), np.roll(h, -1, 2)), 2) + preact = layer + np.dot(h_t, Ur) + h = np.tanh(preact) + + outs.append(h[:, (width + 1)/2, :]) # Remember output for direction + + output = pyramid.output_net.params['b0'] + np.dot(sum(outs), pyramid.output_net.params['W0']) # Sum over direction\ + # and apply output network. + + # Test for equality. + + np.testing.assert_almost_equal(output, f()) # Check if they match. diff --git a/cortex/utils/trainer.py b/cortex/utils/trainer.py index 6f2ba19..c9f759c 100644 --- a/cortex/utils/trainer.py +++ b/cortex/utils/trainer.py @@ -29,6 +29,7 @@ logger = logging.getLogger('cortex') + def setup(module): '''Sets up module. @@ -37,6 +38,7 @@ def setup(module): print_section('Running setup') module.setup() + def set_data(module): '''Sets the datasets. @@ -61,6 +63,7 @@ def set_data(module): module.valid_dataset = datasets['valid'] module.test_dataset = datasets['test'] + def make_inputs(module): '''Forms the inputs from the dataset @@ -95,6 +98,7 @@ def make_inputs(module): dataset.reset() module.inputs = inps + def build(module, model_to_load=None): '''Forms the models. @@ -109,6 +113,7 @@ def build(module, model_to_load=None): module.models = models return set_tparams(module) + def set_tparams(module): tparams = OrderedDict() for k, v in module.models.iteritems(): @@ -116,6 +121,7 @@ def set_tparams(module): module.tparams = tparams return tparams + def set_cost(module): '''Sets costs. @@ -135,6 +141,7 @@ def set_cost(module): module.inputs = inputs return results, updates, constants, outputs + def set_test_function(module, results, outputs): '''Sets the test function of a module. 
@@ -142,9 +149,10 @@ def set_test_function(module, results, outputs): if hasattr(module, 'test'): f_test = module.test(results, outputs) else: - f_test = theano.function(module.inputs.values(), results) + f_test = theano.function(module.inputs.values(), results) # Not sure why this change was made, but it seems necessary. return f_test + def set_out_function(module, results, outputs): '''Sets function for outputs. ''' @@ -158,6 +166,7 @@ def set_out_function(module, results, outputs): f_outs = theano.function(module.inputs.values(), outs) return f_outs + def set_save_function(module, tparams): '''Sets the save function of a module. @@ -174,6 +183,7 @@ def save(outfile): f_save = save return f_save + def set_viz_function(module, results, outputs): '''Sets the visualization function of a module. @@ -188,12 +198,14 @@ def set_viz_function(module, results, outputs): else: return None + def set_eval_functions(module, **kwargs): if hasattr(module, 'eval'): return module.eval(**kwargs) else: return OrderedDict() + def check(module): '''Runs checks. @@ -202,6 +214,7 @@ def check(module): logger.info('Checking experiment.') module.check() + def finish(module): '''Extra finishing-up. @@ -210,6 +223,7 @@ def finish(module): logger.info('Finishing up setup') module.finish() + def train(module, cost, tparams, updates, constants, f_test=None, f_save=None, f_viz=None, f_outs=None, test_every=10, show_every=10, monitor_gradients=False): @@ -289,6 +303,7 @@ def show(self): for k in self.eval_keys: self.__dict__[k]() + class ModuleContainer(object): __required_methods = ['_build', '_cost'] __optional_methods = ['_setup', '_data', '_test', '_save', '_viz', '_check', @@ -411,6 +426,7 @@ def add_component(component): if len(component_list) > 0: flatten_component_layers(component_list, model_dict) + def load_module(model_file, strict=True): '''Loads pretrained model. @@ -503,6 +519,7 @@ def load_module(model_file, strict=True): set_tparams(module) return module + def main(args=None): if args is None: args = sys.argv[1:] diff --git a/cortex/utils/training.py b/cortex/utils/training.py index 4796806..1f279fc 100644 --- a/cortex/utils/training.py +++ b/cortex/utils/training.py @@ -12,7 +12,7 @@ from collections import OrderedDict from glob import glob import logging -if not 'matplotlib' in sys.modules: +if 'matplotlib' not in sys.modules: import matplotlib matplotlib.use('Agg') import numpy as np @@ -47,6 +47,7 @@ np.set_printoptions(threshold=np.nan) logger = logging.getLogger(__name__) + def make_argument_parser(): '''Generic experiment parser. @@ -69,6 +70,7 @@ def make_argument_parser(): help='Verbosity of the logging. (0, 1, 2)') return parser + def make_argument_parser_trainer(): '''Generic experiment parser for a trainer. @@ -93,6 +95,7 @@ def make_argument_parser_trainer(): help='Verbosity of the logging. (0, 1, 2)') return parser + def make_argument_parser_test(): '''Generic experiment parser for testing. @@ -112,6 +115,7 @@ def make_argument_parser_test(): help='Verbosity of the logging. (0, 1, 2)') return parser + def set_experiment(args): '''Generic experiment setup method. @@ -186,6 +190,7 @@ def set_experiment(args): exp_dict['model_to_load'] = model_to_load return exp_dict + def reload_model(args): '''Reloads a model from argparse args. @@ -237,6 +242,7 @@ def reload_model(args): exp_dict.update(**args) return exp_dict + def set_model(create_model, model_to_load, unpack, **kwargs): '''Convenience method for creating new or loading old model. 
@@ -267,6 +273,7 @@ def create_model(): models = create_model() return models + def set_tparams(model_dict): '''Generic tparams setter. @@ -279,6 +286,7 @@ def set_tparams(model_dict): tparams.update(**model.set_tparams()) return tparams + def set_params(tparams, updates, excludes=[]): '''Sets params, removing updates from tparams. @@ -307,6 +315,7 @@ def set_params(tparams, updates, excludes=[]): return tparams, all_params + def set_optimizer(inputs, cost, tparams, constants, updates, extra_outs, optimizer='sgd', optimizer_args=None, **learning_args): @@ -343,6 +352,7 @@ def set_optimizer(inputs, cost, tparams, constants, updates, extra_outs, return f_grad_shared, f_grad_updates, learning_args + def test(data_iter, f_test, f_test_keys, input_keys, n_samples=None): '''Tests the model using a data iterator. @@ -406,6 +416,7 @@ def test(data_iter, f_test, f_test_keys, input_keys, n_samples=None): return results + def validate(results, best_valid, e, best_epoch, save=None, valid_key=None, valid_sign=None, bestfile=None, **kwargs): '''Generic validation method. @@ -444,6 +455,7 @@ def validate(results, best_valid, e, best_epoch, save=None, valid_key=None, return best_valid, best_epoch + def main_loop(train, valid, f_grad_shared, f_grad_updates, f_test, f_test_keys=None,