diff --git a/Readme.md b/Readme.md
index 48a6ba7c..2856660b 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,7 +1,7 @@
 # char-rnn
 
-This code implements **multi-layer Recurrent Neural Network** (RNN, LSTM, and GRU) for training/sampling from character-level language models. The input is a single text file and the model learns to predict the next character in the sequence. 
+This code implements **multi-layer Recurrent Neural Network** (RNN, LSTM, and GRU) for training/sampling from character-level language models. The input is a single text file and the model learns to predict the next character in the sequence.
 
 The context of this code base is described in detail in my [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/).
 
@@ -15,7 +15,7 @@ This code is written in Lua and requires [Torch](http://torch.ch/).
 Additionally, you need to install the `nngraph` and `optim` packages using [LuaRocks](https://luarocks.org/) which you will be able to do after installing Torch
 
 ```bash
-$ luarocks install nngraph 
+$ luarocks install nngraph
 $ luarocks install optim
 ```
diff --git a/inspect_checkpoint.lua b/inspect_checkpoint.lua
index 6a027b1f..b53b27e8 100644
--- a/inspect_checkpoint.lua
+++ b/inspect_checkpoint.lua
@@ -32,4 +32,3 @@ print('opt:')
 print(model.opt)
 print('val losses:')
 print(model.val_losses)
-
diff --git a/model/LSTM.lua b/model/LSTM.lua
index aa9dd682..92865ede 100644
--- a/model/LSTM.lua
+++ b/model/LSTM.lua
@@ -1,7 +1,7 @@
 local LSTM = {}
 
-function LSTM.lstm(input_size, rnn_size, n, dropout)
-  dropout = dropout or 0 
+function LSTM.lstm(input_size, rnn_size, n, dropout, words)
+  dropout = dropout or 0
 
   -- there will be 2*n+1 inputs
   local inputs = {}
@@ -18,11 +18,17 @@ function LSTM.lstm(input_size, rnn_size, n, dropout)
     local prev_h = inputs[L*2+1]
     local prev_c = inputs[L*2]
     -- the input to this layer
-    if L == 1 then 
-      x = OneHot(input_size)(inputs[1])
-      input_size_L = input_size
-    else 
-      x = outputs[(L-1)*2]
+    if L == 1 then
+      if not words then
+        x = OneHot(input_size)(inputs[1])
+        input_size_L = input_size
+      else
+        x = Embedding(input_size, rnn_size)(inputs[1])
+        input_size_L = rnn_size
+      end
+
+    else
+      x = outputs[(L-1)*2]
       input_size_L = rnn_size
     end
     -- evaluate the input sums at once for efficiency
@@ -47,7 +53,7 @@ function LSTM.lstm(input_size, rnn_size, n, dropout)
     local next_h = nn.CMulTable()({out_gate, nn.Tanh()(next_c)})
     -- add dropout to output, if desired
     if dropout > 0 then next_h = nn.Dropout(dropout)(next_h) end
-    
+
     table.insert(outputs, next_c)
     table.insert(outputs, next_h)
   end
@@ -61,4 +67,3 @@ function LSTM.lstm(input_size, rnn_size, n, dropout)
 end
 
 return LSTM
-
diff --git a/sample.lua b/sample.lua
index 1cb12866..fb3978c3 100644
--- a/sample.lua
+++ b/sample.lua
@@ -3,7 +3,7 @@
 
 This file samples characters from a trained model
 
-Code is based on implementation in 
+Code is based on implementation in
 https://github.com/oxford-cs-ml-2015/practical6
 
 ]]--
@@ -14,6 +14,7 @@
 require 'nngraph'
 require 'optim'
 require 'lfs'
 
+require 'util.Embedding'
 require 'util.OneHot'
 require 'util.misc'
@@ -111,4 +112,3 @@ for i=1, opt.length do
     io.write(ivocab[prev_char[1]])
 end
 io.write('\n') io.flush()
-
diff --git a/train.lua b/train.lua
index 11436c3d..1c2ddae3 100644
--- a/train.lua
+++ b/train.lua
@@ -3,11 +3,11 @@
 
 This file trains a character-level multi-layer RNN on text data
 
-Code is based on implementation in 
+Code is based on implementation in
 https://github.com/oxford-cs-ml-2015/practical6
 but modified to have multi-layer support, GPU support, as well as many
 other common model/optimization bells and whistles.
-The practical6 code is in turn based on 
+The practical6 code is in turn based on
 https://github.com/wojciechz/learning_to_execute
 which is turn based on other stuff in Torch, etc... (long lineage)
 
@@ -20,8 +20,9 @@ require 'optim'
 require 'lfs'
 
 require 'util.OneHot'
+require 'util.Embedding'
 require 'util.misc'
-local CharSplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader'
+
 local model_utils = require 'util.model_utils'
 local LSTM = require 'model.LSTM'
@@ -35,6 +36,7 @@ cmd:option('-data_dir','data/tinyshakespeare','data directory. Should contain th
 -- model params
 cmd:option('-rnn_size', 128, 'size of LSTM internal state')
 cmd:option('-num_layers', 2, 'number of layers in the LSTM')
+cmd:option('-words', false, 'whether the model operates on words (as opposed to chars)')
 cmd:option('-model', 'lstm', 'for now only lstm is supported. keep fixed')
 -- optimization
 cmd:option('-learning_rate',2e-3,'learning rate')
@@ -61,10 +63,17 @@ cmd:text()
 
 -- parse input params
 opt = cmd:parse(arg)
+local SplitLMMinibatchLoader
+if opt.words then
+    SplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader'
+else
+    SplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader'
+end
+
 torch.manualSeed(opt.seed)
 -- train / val / test split for data, in fractions
 local test_frac = math.max(0, 1 - opt.train_frac - opt.val_frac)
-local split_sizes = {opt.train_frac, opt.val_frac, test_frac} 
+local split_sizes = {opt.train_frac, opt.val_frac, test_frac}
 
 if opt.gpuid >= 0 then
     print('using CUDA on GPU ' .. opt.gpuid .. '...')
@@ -73,7 +82,7 @@ if opt.gpuid >= 0 then
     cutorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua
 end
 -- create the data loader class
-local loader = CharSplitLMMinibatchLoader.create(opt.data_dir, opt.batch_size, opt.seq_length, split_sizes)
+local loader = SplitLMMinibatchLoader.create(opt.data_dir, opt.batch_size, opt.seq_length, split_sizes)
 local vocab_size = loader.vocab_size -- the number of distinct characters
 print('vocab size: ' .. vocab_size)
 -- make sure output directory exists
@@ -81,8 +90,27 @@
 if not path.exists(opt.checkpoint_dir) then lfs.mkdir(opt.checkpoint_dir) end
 
 -- define the model: prototypes for one timestep, then clone them in time
 protos = {}
+-- local embeded_size = 100
+-- local input_size, embeded_size
+-- if opt.words then
+--     print('using an embedding transform for input...')
+--     embeded_size = 100
+--     protos.embed = Embedding(vocab_size, embeded_size)
+-- else
+--     print('using one-hot for input...')
+--     embeded_size = vocab_size
+--     protos.embed = OneHot(vocab_size)
+-- end
+if not opt.words then
+    print('using one-hot for input...')
+    protos.embed = OneHot(vocab_size)
+else
+    print('using an embedding transform of size ' .. opt.rnn_size)
+    protos.embed = Embedding(vocab_size, opt.rnn_size)
+end
 print('creating an LSTM with ' .. opt.num_layers .. ' layers')
-protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout)
+protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout, opt.words)
+
 -- the initial state of the cell/hidden states
 init_state = {}
@@ -91,6 +119,7 @@ for L=1,opt.num_layers do
     table.insert(init_state, h_init:clone())
     table.insert(init_state, h_init:clone())
 end
+
 -- training criterion (negative log likelihood)
 protos.criterion = nn.ClassNLLCriterion()
 
@@ -122,7 +151,7 @@ function eval_split(split_index, max_batches)
     loader:reset_batch_pointer(split_index) -- move batch iteration pointer for this split to front
     local loss = 0
     local rnn_state = {[0] = init_state}
-    
+
     for i = 1,n do -- iterate over batches in the split
         -- fetch a batch
         local x, y = loader:next_batch(split_index)
@@ -136,9 +165,11 @@ function eval_split(split_index, max_batches)
             clones.rnn[t]:evaluate() -- for dropout proper functioning
             local lst = clones.rnn[t]:forward{x[{{}, t}], unpack(rnn_state[t-1])}
             rnn_state[t] = {}
+
            for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end
-            prediction = lst[#lst] 
+            prediction = lst[#lst]
             loss = loss + clones.criterion[t]:forward(prediction, y[{{}, t}])
+
         end
         -- carry over lstm state
         rnn_state[0] = rnn_state[#rnn_state]
@@ -150,6 +181,7 @@ function eval_split(split_index, max_batches)
 end
 
 -- do fwd/bwd and return loss, grad_params
+
 local init_state_global = clone_list(init_state)
 function feval(x)
     if x ~= params then
@@ -188,7 +220,7 @@ function feval(x)
         drnn_state[t-1] = {}
         for k,v in pairs(dlst) do
             if k > 1 then -- k == 1 is gradient on x, which we dont need
                -- note we do k-1 because first item is dembeddings, and then follow the 
+                -- note we do k-1 because first item is dembeddings, and then follow the
                 -- derivatives of the state, starting at index 2. I know...
                 drnn_state[t-1][k-1] = v
             end
@@ -209,16 +241,18 @@ local optim_state = {learningRate = opt.learning_rate, alpha = opt.decay_rate}
 local iterations = opt.max_epochs * loader.ntrain
 local iterations_per_epoch = loader.ntrain
 local loss0 = nil
+
 for i = 1, iterations do
+
     local epoch = i / loader.ntrain
 
     local timer = torch.Timer()
+
     local _, loss = optim.rmsprop(feval, params, optim_state)
     local time = timer:time().real
 
     local train_loss = loss[1] -- the loss is inside a list, pop it
     train_losses[i] = train_loss
-
     -- exponential learning rate decay
     if i % loader.ntrain == 0 and opt.learning_rate_decay < 1 then
         if epoch >= opt.learning_rate_decay_after then
@@ -251,7 +285,7 @@ for i = 1, iterations do
     if i % opt.print_every == 0 then
         print(string.format("%d/%d (epoch %.3f), train_loss = %6.8f, grad/param norm = %6.4e, time/batch = %.2fs", i, iterations, epoch, train_loss, grad_params:norm() / params:norm(), time))
     end
-    
+
     if i % 10 == 0 then collectgarbage() end
 
     -- handle early stopping if things are going really bad
@@ -261,5 +295,3 @@ for i = 1, iterations do
         break -- halt
     end
 end
-
-
diff --git a/util/CharSplitLMMinibatchLoader.lua b/util/CharSplitLMMinibatchLoader.lua
index 1fafe398..3bc2795f 100644
--- a/util/CharSplitLMMinibatchLoader.lua
+++ b/util/CharSplitLMMinibatchLoader.lua
@@ -29,14 +29,14 @@ function CharSplitLMMinibatchLoader.create(data_dir, batch_size, seq_length, spl
     local len = data:size(1)
     if len % (batch_size * seq_length) ~= 0 then
         print('cutting off end of data so that the batches/sequences divide evenly')
-        data = data:sub(1, batch_size * seq_length 
+        data = data:sub(1, batch_size * seq_length
                     * math.floor(len / (batch_size * seq_length)))
     end
 
     -- count vocab
     self.vocab_size = 0
-    for _ in pairs(self.vocab_mapping) do 
-        self.vocab_size = self.vocab_size + 1 
+    for _ in pairs(self.vocab_mapping) do
+        self.vocab_size = self.vocab_size + 1
     end
 
     -- self.batches is a table of tensors
@@ -122,4 +122,3 @@ function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, o
 end
 
 return CharSplitLMMinibatchLoader
-
diff --git a/util/Embedding.lua b/util/Embedding.lua
new file mode 100644
index 00000000..a3c96663
--- /dev/null
+++ b/util/Embedding.lua
@@ -0,0 +1,53 @@
+--[[
+ Copyright 2014 Google Inc. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+]]--
+
+local Embedding, parent = torch.class('Embedding', 'nn.Module')
+
+function Embedding:__init(inputSize, outputSize)
+  parent.__init(self)
+  self.outputSize = outputSize
+  self.weight = torch.Tensor(inputSize, outputSize)
+  self.gradWeight = torch.Tensor(inputSize, outputSize)
+end
+
+function Embedding:updateOutput(input)
+  self.output:resize(input:size(1), self.outputSize)
+  for i = 1, input:size(1) do
+    self.output[i]:copy(self.weight[input[i]])
+  end
+  return self.output
+end
+
+function Embedding:updateGradInput(input, gradOutput)
+  if self.gradInput then
+    self.gradInput:resize(input:size())
+    return self.gradInput
+  end
+end
+
+function Embedding:accGradParameters(input, gradOutput, scale)
+  scale = scale or 1
+  if scale == 0 then
+    self.gradWeight:zero()
+  end
+  for i = 1, input:size(1) do
+    local word = input[i]
+    self.gradWeight[word]:add(gradOutput[i])
+  end
+end
+
+-- we do not need to accumulate parameters when sharing
+Embedding.sharedAccUpdateGradParameters = Embedding.accUpdateGradParameters
diff --git a/util/WordSplitLMMinibatchLoader.lua b/util/WordSplitLMMinibatchLoader.lua
new file mode 100644
index 00000000..9701f5f0
--- /dev/null
+++ b/util/WordSplitLMMinibatchLoader.lua
@@ -0,0 +1,142 @@
+
+-- Modified from https://github.com/oxford-cs-ml-2015/practical6
+-- the modification included support for train/val/test splits
+-- Further modified from CharSplitLMMiniBatchLoader to separate by word by Jacob Edelman
+
+local CharSplitLMMinibatchLoader = {}
+CharSplitLMMinibatchLoader.__index = CharSplitLMMinibatchLoader
+
+function CharSplitLMMinibatchLoader.create(data_dir, batch_size, seq_length, split_fractions)
+    -- split_fractions is e.g. {0.9, 0.05, 0.05}
+
+    local self = {}
+    setmetatable(self, CharSplitLMMinibatchLoader)
+
+    local input_file = path.join(data_dir, 'input.txt')
+    local vocab_file = path.join(data_dir, 'vocab.t7')
+    local tensor_file = path.join(data_dir, 'data.t7')
+
+    -- construct a tensor with all the data
+    if not (path.exists(vocab_file) or path.exists(tensor_file)) then
+        print('one-time setup: preprocessing input text file ' .. input_file .. '...')
+        CharSplitLMMinibatchLoader.text_to_tensor(input_file, vocab_file, tensor_file)
+    end
+
+    print('loading data files...')
+    local data = torch.load(tensor_file)
+    self.vocab_mapping = torch.load(vocab_file)
+
+    -- cut off the end so that it divides evenly
+    local len = data:size(1)
+    if len % (batch_size * seq_length) ~= 0 then
+        print('cutting off end of data so that the batches/sequences divide evenly')
+        data = data:sub(1, batch_size * seq_length
+                    * math.floor(len / (batch_size * seq_length)))
+    end
+
+    -- count vocab
+    self.vocab_size = 0
+    for _ in pairs(self.vocab_mapping) do
+        self.vocab_size = self.vocab_size + 1
+    end
+
+    -- self.batches is a table of tensors
+    print('reshaping tensor...')
+    self.batch_size = batch_size
+    self.seq_length = seq_length
+
+    local ydata = data:clone()
+    ydata:sub(1,-2):copy(data:sub(2,-1))
+    ydata[-1] = data[1]
+    self.x_batches = data:view(batch_size, -1):split(seq_length, 2) -- #rows = #batches
+    self.nbatches = #self.x_batches
+    self.y_batches = ydata:view(batch_size, -1):split(seq_length, 2) -- #rows = #batches
+    assert(#self.x_batches == #self.y_batches)
+
+    self.ntrain = math.floor(self.nbatches * split_fractions[1])
+    self.nval = math.floor(self.nbatches * split_fractions[2])
+    self.ntest = self.nbatches - self.nval - self.ntrain -- the rest goes to test (to ensure this adds up exactly)
+
+    self.split_sizes = {self.ntrain, self.nval, self.ntest}
+    self.batch_ix = {0,0,0}
+
+    print(string.format('data load done. Number of batches in train: %d, val: %d, test: %d', self.ntrain, self.nval, self.ntest))
+    collectgarbage()
+    return self
+end
+
+function CharSplitLMMinibatchLoader:reset_batch_pointer(split_index, batch_index)
+    batch_index = batch_index or 0
+    self.batch_ix[split_index] = batch_index
+end
+
+function CharSplitLMMinibatchLoader:next_batch(split_index)
+    -- split_index is integer: 1 = train, 2 = val, 3 = test
+    self.batch_ix[split_index] = self.batch_ix[split_index] + 1
+    if self.batch_ix[split_index] > self.split_sizes[split_index] then
+        self.batch_ix[split_index] = 1 -- cycle around to beginning
+    end
+    -- pull out the correct next batch
+    local ix = self.batch_ix[split_index]
+    if split_index == 2 then ix = ix + self.ntrain end -- offset by train set size
+    if split_index == 3 then ix = ix + self.ntrain + self.nval end -- offset by train + test
+    return self.x_batches[ix], self.y_batches[ix]
+end
+
+-- *** STATIC method ***
+function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, out_tensorfile)
+    local timer = torch.Timer()
+
+    print('loading text file...')
+    local f = torch.DiskFile(in_textfile)
+    local rawdata = f:readString('*a') -- NOTE: this reads the whole file at once
+    f:close()
+
+    -- create vocabulary if it doesn't exist yet
+    print('creating vocabulary mapping...')
+    -- record all characters to a set
+    local unordered = {}
+    local length = 0
+    for char1,char2 in rawdata:gmatch'(%a*)(.?)' do
+        if char1 ~= "" then
+            if not unordered[char1] then unordered[char1] = true end
+            length = length + 1
+        end
+        if char2 ~= "" then
+            if not unordered[char2] then unordered[char2] = true end
+            length = length + 1
+        end
+    end
+
+    -- sort into a table (i.e. keys become 1..N)
+    local ordered = {}
+    for char in pairs(unordered) do ordered[#ordered + 1] = char end
+    table.sort(ordered)
+    -- invert `ordered` to create the char->int mapping
+    local vocab_mapping = {}
+    for i, char in ipairs(ordered) do
+        vocab_mapping[char] = i
+    end
+    -- construct a tensor with all the data
+    print('putting data into tensor...')
+    local data = torch.Tensor(length) -- store it into 1D first, then rearrange
+    local i = 1
+    for char1,char2 in rawdata:gmatch'(%a*)(.?)' do
+        if char1 ~= "" then
+            data[i] = vocab_mapping[char1]
+            i = i + 1
+        end
+        if char2 ~= "" then
+            data[i] = vocab_mapping[char2]
+            i = i + 1
+        end
+    end
+
+    -- save output preprocessed files
+    print('saving ' .. out_vocabfile)
+    torch.save(out_vocabfile, vocab_mapping)
+    print('saving ' .. out_tensorfile)
+    torch.save(out_tensorfile, data)
+end
+
+return CharSplitLMMinibatchLoader
diff --git a/util/model_utils.lua b/util/model_utils.lua
index 2edbed16..923bdd28 100644
--- a/util/model_utils.lua
+++ b/util/model_utils.lua
@@ -1,7 +1,7 @@
 
 -- adapted from https://github.com/wojciechz/learning_to_execute
 -- utilities for combining/flattening parameters in a model
--- the code in this script is more general than it needs to be, which is 
+-- the code in this script is more general than it needs to be, which is
 -- why it is kind of a large
 
 require 'torch'
@@ -120,7 +120,6 @@ function model_utils.clone_many_times(net, T)
             params = {}
         end
     end
-
     local paramsNoGrad
     if net.parametersNoGrad then
         paramsNoGrad = net:parametersNoGrad()
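For reference, a minimal, illustrative sketch (not taken from the patch itself) of how the `Embedding` layer introduced in `util/Embedding.lua` can be exercised in isolation. It assumes Torch with the `nn` package is installed and that the script is run from the repository root so that `require 'util.Embedding'` resolves; the sizes and indices below are made up for illustration.

```lua
-- Illustrative sketch only (not part of the patch): exercising util/Embedding.lua on its own.
-- Assumes Torch with the 'nn' package, run from the repository root; sizes are hypothetical.
require 'torch'
require 'nn'
require 'util.Embedding'

local vocab_size, embed_size = 10, 4
local embed = Embedding(vocab_size, embed_size)
embed.weight:uniform(-0.08, 0.08)   -- __init allocates the weight matrix but does not initialize it

-- a batch of three 1-based word indices, as produced by the loader's vocab_mapping
local input = torch.Tensor{2, 5, 9}
local output = embed:forward(input) -- returns a 3 x embed_size tensor: one row per index
print(output:size())
```

In `train.lua` above, the same module is constructed as `Embedding(vocab_size, opt.rnn_size)`, so in word mode each vocabulary index is mapped to a dense vector of the same size as the LSTM input, in place of the one-hot encoding used for characters.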