Added Words #23

Open

wants to merge 15 commits into master

4 changes: 2 additions & 2 deletions Readme.md
@@ -1,7 +1,7 @@

# char-rnn

This code implements **multi-layer Recurrent Neural Network** (RNN, LSTM, and GRU) for training/sampling from character-level language models. The input is a single text file and the model learns to predict the next character in the sequence.
This code implements **multi-layer Recurrent Neural Network** (RNN, LSTM, and GRU) for training/sampling from character-level language models. The input is a single text file and the model learns to predict the next character in the sequence.

The context of this code base is described in detail in my [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/).

@@ -15,7 +15,7 @@ This code is written in Lua and requires [Torch](http://torch.ch/).
Additionally, you need to install the `nngraph` and `optim` packages using [LuaRocks](https://luarocks.org/) which you will be able to do after installing Torch

```bash
$ luarocks install nngraph
$ luarocks install nngraph
$ luarocks install optim
```

1 change: 0 additions & 1 deletion inspect_checkpoint.lua
@@ -32,4 +32,3 @@ print('opt:')
print(model.opt)
print('val losses:')
print(model.val_losses)

23 changes: 14 additions & 9 deletions model/LSTM.lua
@@ -1,7 +1,7 @@

local LSTM = {}
function LSTM.lstm(input_size, rnn_size, n, dropout)
dropout = dropout or 0
function LSTM.lstm(input_size, rnn_size, n, dropout, words)
dropout = dropout or 0

-- there will be 2*n+1 inputs
local inputs = {}
@@ -18,11 +18,17 @@ function LSTM.lstm(input_size, rnn_size, n, dropout)
local prev_h = inputs[L*2+1]
local prev_c = inputs[L*2]
-- the input to this layer
if L == 1 then
x = OneHot(input_size)(inputs[1])
input_size_L = input_size
else
x = outputs[(L-1)*2]
if L == 1 then
if not words then
x = OneHot(input_size)(inputs[1])
input_size_L = input_size
else
x = Embedding(input_size, rnn_size)(inputs[1])
input_size_L = rnn_size
end

else
x = outputs[(L-1)*2]
input_size_L = rnn_size
end
-- evaluate the input sums at once for efficiency
@@ -47,7 +53,7 @@ function LSTM.lstm(input_size, rnn_size, n, dropout)
local next_h = nn.CMulTable()({out_gate, nn.Tanh()(next_c)})
-- add dropout to output, if desired
if dropout > 0 then next_h = nn.Dropout(dropout)(next_h) end

table.insert(outputs, next_c)
table.insert(outputs, next_h)
end
@@ -61,4 +67,3 @@ function LSTM.lstm(input_size, rnn_size, n, dropout)
end

return LSTM
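
The `words` flag only changes how the first layer consumes its input: `OneHot(input_size)` in character mode versus `Embedding(input_size, rnn_size)` in word mode, with the deeper layers untouched. Below is a minimal sketch of building the prototype both ways, assuming it is run from the repo root so the `require`s resolve; the vocabulary size is illustrative, not taken from the PR.

```lua
-- Sketch only: vocab_size is illustrative; run from the repo root.
require 'nn'
require 'nngraph'
require 'util.OneHot'     -- defines the global OneHot module
require 'util.Embedding'  -- defines the global Embedding module (added by this PR)
local LSTM = require 'model.LSTM'

local vocab_size = 65
-- character mode: the first layer one-hot encodes the input index
local char_proto = LSTM.lstm(vocab_size, 128, 2, 0)
-- word mode: the first layer looks the index up in an Embedding of width rnn_size
local word_proto = LSTM.lstm(vocab_size, 128, 2, 0, true)
```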

4 changes: 2 additions & 2 deletions sample.lua
@@ -3,7 +3,7 @@

This file samples characters from a trained model

Code is based on implementation in
Code is based on implementation in
https://github.com/oxford-cs-ml-2015/practical6

]]--
@@ -14,6 +14,7 @@ require 'nngraph'
require 'optim'
require 'lfs'

require 'util.Embedding'
require 'util.OneHot'
require 'util.misc'

@@ -111,4 +112,3 @@ for i=1, opt.length do
io.write(ivocab[prev_char[1]])
end
io.write('\n') io.flush()
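
The only functional change to sampling is the extra `require 'util.Embedding'`, so that a checkpoint trained in word mode can be deserialized. A hypothetical invocation follows; the checkpoint filename is purely illustrative, and any options beyond what this diff shows are assumed from stock char-rnn rather than from this PR.

```bash
# Hypothetical: the checkpoint name is illustrative; sample.lua's CLI is assumed unchanged
$ th sample.lua cv/lm_lstm_epoch10.00_1.2345.t7
```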

58 changes: 45 additions & 13 deletions train.lua
@@ -3,11 +3,11 @@

This file trains a character-level multi-layer RNN on text data

Code is based on implementation in
Code is based on implementation in
https://github.com/oxford-cs-ml-2015/practical6
but modified to have multi-layer support, GPU support, as well as
many other common model/optimization bells and whistles.
The practical6 code is in turn based on
The practical6 code is in turn based on
https://github.com/wojciechz/learning_to_execute
which is turn based on other stuff in Torch, etc... (long lineage)

@@ -20,8 +20,9 @@ require 'optim'
require 'lfs'

require 'util.OneHot'
require 'util.Embedding'
require 'util.misc'
local CharSplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader'

local model_utils = require 'util.model_utils'
local LSTM = require 'model.LSTM'

@@ -35,6 +36,7 @@ cmd:option('-data_dir','data/tinyshakespeare','data directory. Should contain th
-- model params
cmd:option('-rnn_size', 128, 'size of LSTM internal state')
cmd:option('-num_layers', 2, 'number of layers in the LSTM')
cmd:option('-words', false, 'whether the model operates on words (as opposed to chars)')
cmd:option('-model', 'lstm', 'for now only lstm is supported. keep fixed')
-- optimization
cmd:option('-learning_rate',2e-3,'learning rate')
@@ -61,10 +63,17 @@ cmd:text()

-- parse input params
opt = cmd:parse(arg)
local SplitLMMinibatchLoader
if opt.words then
SplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader'
else
SplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader'
end

torch.manualSeed(opt.seed)
-- train / val / test split for data, in fractions
local test_frac = math.max(0, 1 - opt.train_frac - opt.val_frac)
local split_sizes = {opt.train_frac, opt.val_frac, test_frac}
local split_sizes = {opt.train_frac, opt.val_frac, test_frac}

if opt.gpuid >= 0 then
print('using CUDA on GPU ' .. opt.gpuid .. '...')
@@ -73,16 +82,35 @@ if opt.gpuid >= 0 then
cutorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua
end
-- create the data loader class
local loader = CharSplitLMMinibatchLoader.create(opt.data_dir, opt.batch_size, opt.seq_length, split_sizes)
local loader = SplitLMMinibatchLoader.create(opt.data_dir, opt.batch_size, opt.seq_length, split_sizes)
local vocab_size = loader.vocab_size -- the number of distinct characters
print('vocab size: ' .. vocab_size)
-- make sure output directory exists
if not path.exists(opt.checkpoint_dir) then lfs.mkdir(opt.checkpoint_dir) end

-- define the model: prototypes for one timestep, then clone them in time
protos = {}
-- local embeded_size = 100
-- local input_size, embeded_size
-- if opt.words then
-- print('using an embedding transform for input...')
-- embeded_size = 100
-- protos.embed = Embedding(vocab_size, embeded_size)
-- else
-- print('using one-hot for input...')
-- embeded_size = vocab_size
-- protos.embed = OneHot(vocab_size)
-- end
if not opt.words then
print('using one-hot for input...')
protos.embed = OneHot(vocab_size)
else
print('using an embedding transform of size ' .. opt.rnn_size)
protos.embed = Embedding(vocab_size, opt.rnn_size)
end
print('creating an LSTM with ' .. opt.num_layers .. ' layers')
protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout)
protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout, opt.words)

-- the initial state of the cell/hidden states
init_state = {}
for L=1,opt.num_layers do
@@ -91,6 +119,7 @@ for L=1,opt.num_layers do
table.insert(init_state, h_init:clone())
table.insert(init_state, h_init:clone())
end

-- training criterion (negative log likelihood)
protos.criterion = nn.ClassNLLCriterion()

@@ -122,7 +151,7 @@ function eval_split(split_index, max_batches)
loader:reset_batch_pointer(split_index) -- move batch iteration pointer for this split to front
local loss = 0
local rnn_state = {[0] = init_state}

for i = 1,n do -- iterate over batches in the split
-- fetch a batch
local x, y = loader:next_batch(split_index)
@@ -136,9 +165,11 @@ function eval_split(split_index, max_batches)
clones.rnn[t]:evaluate() -- for dropout proper functioning
local lst = clones.rnn[t]:forward{x[{{}, t}], unpack(rnn_state[t-1])}
rnn_state[t] = {}

for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end
prediction = lst[#lst]
prediction = lst[#lst]
loss = loss + clones.criterion[t]:forward(prediction, y[{{}, t}])

end
-- carry over lstm state
rnn_state[0] = rnn_state[#rnn_state]
@@ -150,6 +181,7 @@ end
end

-- do fwd/bwd and return loss, grad_params

local init_state_global = clone_list(init_state)
function feval(x)
if x ~= params then
@@ -188,7 +220,7 @@ function feval(x)
drnn_state[t-1] = {}
for k,v in pairs(dlst) do
if k > 1 then -- k == 1 is gradient on x, which we dont need
-- note we do k-1 because first item is dembeddings, and then follow the
-- note we do k-1 because first item is dembeddings, and then follow the
-- derivatives of the state, starting at index 2. I know...
drnn_state[t-1][k-1] = v
end
@@ -209,16 +241,18 @@ local optim_state = {learningRate = opt.learning_rate, alpha = opt.decay_rate}
local iterations = opt.max_epochs * loader.ntrain
local iterations_per_epoch = loader.ntrain
local loss0 = nil

for i = 1, iterations do

local epoch = i / loader.ntrain

local timer = torch.Timer()

local _, loss = optim.rmsprop(feval, params, optim_state)
local time = timer:time().real

local train_loss = loss[1] -- the loss is inside a list, pop it
train_losses[i] = train_loss

-- exponential learning rate decay
if i % loader.ntrain == 0 and opt.learning_rate_decay < 1 then
if epoch >= opt.learning_rate_decay_after then
@@ -251,7 +285,7 @@ for i = 1, iterations do
if i % opt.print_every == 0 then
print(string.format("%d/%d (epoch %.3f), train_loss = %6.8f, grad/param norm = %6.4e, time/batch = %.2fs", i, iterations, epoch, train_loss, grad_params:norm() / params:norm(), time))
end

if i % 10 == 0 then collectgarbage() end

-- handle early stopping if things are going really bad
@@ -261,5 +295,3 @@ for i = 1, iterations do
break -- halt
end
end
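
Taken together, word-level training is toggled by the new `-words` flag: it switches the data loader to `util.WordSplitLMMinibatchLoader`, the input transform to `Embedding(vocab_size, rnn_size)`, and passes `words` through to `LSTM.lstm`. A hedged example of both modes follows; only `-data_dir`, `-rnn_size`, `-num_layers`, and `-words` are visible in this diff, the word-level loader file is assumed to be added elsewhere in the branch, and the data directory layout is assumed to match the character loader (a plain-text `input.txt`).

```bash
# Character-level training, unchanged default behaviour
$ th train.lua -data_dir data/tinyshakespeare

# Word-level training via the new flag (assumes util/WordSplitLMMinibatchLoader.lua
# exists in this branch and data/tinyshakespeare/input.txt is the corpus)
$ th train.lua -data_dir data/tinyshakespeare -words -rnn_size 128 -num_layers 2
```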


7 changes: 3 additions & 4 deletions util/CharSplitLMMinibatchLoader.lua
@@ -29,14 +29,14 @@ function CharSplitLMMinibatchLoader.create(data_dir, batch_size, seq_length, spl
local len = data:size(1)
if len % (batch_size * seq_length) ~= 0 then
print('cutting off end of data so that the batches/sequences divide evenly')
data = data:sub(1, batch_size * seq_length
data = data:sub(1, batch_size * seq_length
* math.floor(len / (batch_size * seq_length)))
end

-- count vocab
self.vocab_size = 0
for _ in pairs(self.vocab_mapping) do
self.vocab_size = self.vocab_size + 1
for _ in pairs(self.vocab_mapping) do
self.vocab_size = self.vocab_size + 1
end

-- self.batches is a table of tensors
@@ -122,4 +122,3 @@ function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, o
end

return CharSplitLMMinibatchLoader

53 changes: 53 additions & 0 deletions util/Embedding.lua
@@ -0,0 +1,53 @@
--[[
Copyright 2014 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]--

local Embedding, parent = torch.class('Embedding', 'nn.Module')

function Embedding:__init(inputSize, outputSize)
parent.__init(self)
self.outputSize = outputSize
self.weight = torch.Tensor(inputSize, outputSize)
self.gradWeight = torch.Tensor(inputSize, outputSize)
end

function Embedding:updateOutput(input)
self.output:resize(input:size(1), self.outputSize)
for i = 1, input:size(1) do
self.output[i]:copy(self.weight[input[i]])
end
return self.output
end

function Embedding:updateGradInput(input, gradOutput)
if self.gradInput then
self.gradInput:resize(input:size())
return self.gradInput
end
end

function Embedding:accGradParameters(input, gradOutput, scale)
scale = scale or 1
if scale == 0 then
self.gradWeight:zero()
end
for i = 1, input:size(1) do
local word = input[i]
self.gradWeight[word]:add(gradOutput[i])
end
end

-- we do not need to accumulate parameters when sharing
Embedding.sharedAccUpdateGradParameters = Embedding.accUpdateGradParameters
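
`Embedding` is a plain lookup table: `updateOutput` copies one weight row per input index, `accGradParameters` adds the corresponding `gradOutput` rows back into those same rows, and no gradient is propagated to the integer input. Below is a standalone sketch of the module on its own, assuming Torch is installed and this file lives at `util/Embedding.lua` in the repo root.

```lua
-- Standalone sketch; run from the repo root so require 'util.Embedding' resolves.
require 'nn'
require 'util.Embedding'

local vocab_size, embed_size = 10, 4
local embed = Embedding(vocab_size, embed_size)
embed.weight:uniform(-0.08, 0.08)        -- the module leaves its weights uninitialized

local idx = torch.Tensor{3, 7, 3}        -- three token indices (1-based)
local vecs = embed:forward(idx)          -- 3x4 tensor: weight rows 3, 7 and 3
print(vecs)

embed.gradWeight:zero()
embed:backward(idx, torch.ones(3, embed_size))  -- row 3 accumulates twice, row 7 once
print(embed.gradWeight:sum(2):view(-1))
```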