From 95aac48daa1dec35264030ce9ecfdef03c74cd5c Mon Sep 17 00:00:00 2001 From: Jacob Edelman Date: Sun, 31 May 2015 11:07:36 -0400 Subject: [PATCH 01/12] Changed the CharSplitLMMinibatchLoader to words Changed the CharSplitLMMinibatchLoader to words where words are continuous strings of letters or any non-letter symbol. --- ...der.lua => WordSplitLMMinibatchLoader.lua} | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) rename util/{CharSplitLMMinibatchLoader.lua => WordSplitLMMinibatchLoader.lua} (86%) diff --git a/util/CharSplitLMMinibatchLoader.lua b/util/WordSplitLMMinibatchLoader.lua similarity index 86% rename from util/CharSplitLMMinibatchLoader.lua rename to util/WordSplitLMMinibatchLoader.lua index 1fafe398..e382fe92 100644 --- a/util/CharSplitLMMinibatchLoader.lua +++ b/util/WordSplitLMMinibatchLoader.lua @@ -95,9 +95,18 @@ function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, o print('creating vocabulary mapping...') -- record all characters to a set local unordered = {} - for char in rawdata:gmatch'.' do - if not unordered[char] then unordered[char] = true end + local length = 0 + for char1,char2 in rawdata:gmatch'(%a*)(.?)' do + if char1 ~= "" then + if not unordered[char1] then unordered[cha1] = true end + length = length + 1 + end + if char2 ~= "" then + if not unordered[char2] then unordered[char2] = true end + length = length + 1 + end end + -- sort into a table (i.e. keys become 1..N) local ordered = {} for char in pairs(unordered) do ordered[#ordered + 1] = char end @@ -109,9 +118,17 @@ function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, o end -- construct a tensor with all the data print('putting data into tensor...') - local data = torch.ByteTensor(#rawdata) -- store it into 1D first, then rearrange - for i=1, #rawdata do - data[i] = vocab_mapping[rawdata:sub(i, i)] -- lua has no string indexing using [] + local data = torch.ByteTensor(length) -- store it into 1D first, then rearrange + local i = 0 + for char1,char2 in rawdata:gmatch'(%a*)(.?)' do + if char1 ~= "" then + data[i] = char1 + i = i + 1 + end + if char2 ~= "" then + data[i] = char2 + i= i + 1 + end end -- save output preprocessed files From 44f4e4f520c3970d6570624f2667011ebc4e49d2 Mon Sep 17 00:00:00 2001 From: Jacob Edelman Date: Sun, 31 May 2015 11:12:09 -0400 Subject: [PATCH 02/12] Update train.lua --- train.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.lua b/train.lua index 3136acb6..1d013d3d 100644 --- a/train.lua +++ b/train.lua @@ -21,7 +21,7 @@ require 'lfs' require 'util.OneHot' require 'util.misc' -local CharSplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader' +local CharSplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader' local model_utils = require 'util.model_utils' local LSTM = require 'model.LSTM' From d6e05b95f809a81a827d9ec3eca5d658e18f20d4 Mon Sep 17 00:00:00 2001 From: Jacob Edelman Date: Sun, 31 May 2015 11:14:05 -0400 Subject: [PATCH 03/12] Update WordSplitLMMinibatchLoader.lua --- util/WordSplitLMMinibatchLoader.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/WordSplitLMMinibatchLoader.lua b/util/WordSplitLMMinibatchLoader.lua index e382fe92..7d9b30ca 100644 --- a/util/WordSplitLMMinibatchLoader.lua +++ b/util/WordSplitLMMinibatchLoader.lua @@ -97,11 +97,11 @@ function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, o local unordered = {} local length = 0 for char1,char2 in rawdata:gmatch'(%a*)(.?)' do - 
if char1 ~= "" then + if char1 ~= nil then if not unordered[char1] then unordered[cha1] = true end length = length + 1 end - if char2 ~= "" then + if char2 ~= nil then if not unordered[char2] then unordered[char2] = true end length = length + 1 end From 2f55b6e46aa34a2f50b07594fda983340f34e5cb Mon Sep 17 00:00:00 2001 From: Jacob Edelman Date: Sun, 31 May 2015 11:14:46 -0400 Subject: [PATCH 04/12] Update WordSplitLMMinibatchLoader.lua --- util/WordSplitLMMinibatchLoader.lua | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/util/WordSplitLMMinibatchLoader.lua b/util/WordSplitLMMinibatchLoader.lua index 7d9b30ca..fda85be0 100644 --- a/util/WordSplitLMMinibatchLoader.lua +++ b/util/WordSplitLMMinibatchLoader.lua @@ -97,11 +97,13 @@ function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, o local unordered = {} local length = 0 for char1,char2 in rawdata:gmatch'(%a*)(.?)' do - if char1 ~= nil then + if char1 ~= "" then + print(char1) if not unordered[char1] then unordered[cha1] = true end length = length + 1 end - if char2 ~= nil then + if char2 ~= "" then + print(char2) if not unordered[char2] then unordered[char2] = true end length = length + 1 end From b391315dec2cc7a5ec9db7057efca86b9a616445 Mon Sep 17 00:00:00 2001 From: Jacob Edelman Date: Sun, 31 May 2015 11:15:55 -0400 Subject: [PATCH 05/12] Update WordSplitLMMinibatchLoader.lua --- util/WordSplitLMMinibatchLoader.lua | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/util/WordSplitLMMinibatchLoader.lua b/util/WordSplitLMMinibatchLoader.lua index fda85be0..927a5143 100644 --- a/util/WordSplitLMMinibatchLoader.lua +++ b/util/WordSplitLMMinibatchLoader.lua @@ -98,12 +98,10 @@ function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, o local length = 0 for char1,char2 in rawdata:gmatch'(%a*)(.?)' do if char1 ~= "" then - print(char1) - if not unordered[char1] then unordered[cha1] = true end + if not unordered[char1] then unordered[char1] = true end length = length + 1 end if char2 ~= "" then - print(char2) if not unordered[char2] then unordered[char2] = true end length = length + 1 end From 30d02206745e81c9e828aa4e0222345fd58dab77 Mon Sep 17 00:00:00 2001 From: JacobEdelman Date: Mon, 1 Jun 2015 08:19:41 -0400 Subject: [PATCH 06/12] Made work but has ram overload. 
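
A minimal standalone sketch of the tokenization this series introduces: a "word" is either a run of letters or a single non-letter symbol (spaces included), which is exactly what the (%a*)(.?) pattern above yields. The tokenize and build_vocab helpers are illustrative only, not part of the patch:

    -- Split raw text into runs of letters and single non-letter symbols,
    -- then build a token -> index vocabulary, mirroring text_to_tensor.
    local function tokenize(rawdata)
        local tokens = {}
        for word, symbol in rawdata:gmatch('(%a*)(.?)') do
            if word ~= "" then tokens[#tokens + 1] = word end
            if symbol ~= "" then tokens[#tokens + 1] = symbol end
        end
        return tokens
    end

    local function build_vocab(tokens)
        local unordered = {}
        for _, tok in ipairs(tokens) do unordered[tok] = true end
        local ordered = {}
        for tok in pairs(unordered) do ordered[#ordered + 1] = tok end
        table.sort(ordered)
        local vocab_mapping = {}
        for i, tok in ipairs(ordered) do vocab_mapping[tok] = i end
        return vocab_mapping
    end

    local tokens = tokenize("the cat sat.")
    -- tokens == {"the", " ", "cat", " ", "sat", "."}
    local vocab  = build_vocab(tokens)
    -- vocab  == { [" "] = 1, ["."] = 2, cat = 3, sat = 4, the = 5 }

A word-level vocabulary usually has far more than 255 entries, so the indices no longer fit in the 8-bit torch.ByteTensor used for characters; a later patch in this series switches the data tensor to a plain torch.Tensor (one double per token instead of one byte), which is one likely source of the RAM pressure these commit messages mention.
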
--- util/WordSplitLMMinibatchLoader.lua | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/util/WordSplitLMMinibatchLoader.lua b/util/WordSplitLMMinibatchLoader.lua index 927a5143..4d3097a7 100644 --- a/util/WordSplitLMMinibatchLoader.lua +++ b/util/WordSplitLMMinibatchLoader.lua @@ -29,14 +29,14 @@ function CharSplitLMMinibatchLoader.create(data_dir, batch_size, seq_length, spl local len = data:size(1) if len % (batch_size * seq_length) ~= 0 then print('cutting off end of data so that the batches/sequences divide evenly') - data = data:sub(1, batch_size * seq_length + data = data:sub(1, batch_size * seq_length * math.floor(len / (batch_size * seq_length))) end -- count vocab self.vocab_size = 0 - for _ in pairs(self.vocab_mapping) do - self.vocab_size = self.vocab_size + 1 + for _ in pairs(self.vocab_mapping) do + self.vocab_size = self.vocab_size + 1 end -- self.batches is a table of tensors @@ -119,14 +119,14 @@ function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, o -- construct a tensor with all the data print('putting data into tensor...') local data = torch.ByteTensor(length) -- store it into 1D first, then rearrange - local i = 0 + local i = 1 for char1,char2 in rawdata:gmatch'(%a*)(.?)' do if char1 ~= "" then - data[i] = char1 + data[i] = vocab_mapping[char1] i = i + 1 end if char2 ~= "" then - data[i] = char2 + data[i] = vocab_mapping[char2] i= i + 1 end end @@ -139,4 +139,3 @@ function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, o end return CharSplitLMMinibatchLoader - From 587674a55370d040446226c33f7aaf6ac8ecf3fc Mon Sep 17 00:00:00 2001 From: JacobEdelman Date: Mon, 1 Jun 2015 13:11:34 -0400 Subject: [PATCH 07/12] Made work fully but has ram overload. --- Readme.md | 4 +- train.lua | 34 +++++--- util/CharSplitLMMinibatchLoader.lua | 124 ++++++++++++++++++++++++++++ util/WordSplitLMMinibatchLoader.lua | 3 +- util/model_utils.lua | 2 +- 5 files changed, 151 insertions(+), 16 deletions(-) create mode 100644 util/CharSplitLMMinibatchLoader.lua diff --git a/Readme.md b/Readme.md index 2c6a3d2b..272c02bd 100644 --- a/Readme.md +++ b/Readme.md @@ -1,7 +1,7 @@ # char-rnn -This code implements **multi-layer Recurrent Neural Network** (RNN, LSTM, and GRU) for training/sampling from character-level language models. The input is a single text file and the model learns to predict the next character in the sequence. +This code implements **multi-layer Recurrent Neural Network** (RNN, LSTM, and GRU) for training/sampling from character-level language models. The input is a single text file and the model learns to predict the next character in the sequence. The context of this code base is described in detail in my [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/). @@ -16,7 +16,7 @@ This code is written in Lua and requires [Torch](http://torch.ch/). Additionally, you need to install the `nngraph` and `optim` packages using [LuaRocks](https://luarocks.org/) ```bash -$ luarocks install nngraph +$ luarocks install nngraph $ luarocks install optim ``` diff --git a/train.lua b/train.lua index 1d013d3d..e3e93f8b 100644 --- a/train.lua +++ b/train.lua @@ -3,11 +3,11 @@ This file trains a character-level multi-layer RNN on text data -Code is based on implementation in +Code is based on implementation in https://github.com/oxford-cs-ml-2015/practical6 but modified to have multi-layer support, GPU support, as well as many other common model/optimization bells and whistles. 
-The practical6 code is in turn based on +The practical6 code is in turn based on https://github.com/wojciechz/learning_to_execute which is turn based on other stuff in Torch, etc... (long lineage) @@ -21,7 +21,7 @@ require 'lfs' require 'util.OneHot' require 'util.misc' -local CharSplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader' + local model_utils = require 'util.model_utils' local LSTM = require 'model.LSTM' @@ -35,6 +35,7 @@ cmd:option('-data_dir','data/tinyshakespeare','data directory. Should contain th -- model params cmd:option('-rnn_size', 100, 'size of LSTM internal state') cmd:option('-num_layers', 2, 'number of layers in the LSTM') +cmd:option('-words', false, 'whether the model operates on words (as opposed to chars)') cmd:option('-model', 'lstm', 'for now only lstm is supported. keep fixed') -- optimization cmd:option('-learning_rate',2e-3,'learning rate') @@ -59,10 +60,17 @@ cmd:text() -- parse input params opt = cmd:parse(arg) + +if opt.words then + local SplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader' +else + local SplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader' +end + torch.manualSeed(opt.seed) -- train / val / test split for data, in fractions local test_frac = math.max(0, 1 - opt.train_frac - opt.val_frac) -local split_sizes = {opt.train_frac, opt.val_frac, test_frac} +local split_sizes = {opt.train_frac, opt.val_frac, test_frac} if opt.gpuid >= 0 then print('using CUDA on GPU ' .. opt.gpuid .. '...') @@ -71,7 +79,7 @@ if opt.gpuid >= 0 then cutorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua end -- create the data loader class -local loader = CharSplitLMMinibatchLoader.create(opt.data_dir, opt.batch_size, opt.seq_length, split_sizes) +local loader = SplitLMMinibatchLoader.create(opt.data_dir, opt.batch_size, opt.seq_length, split_sizes) local vocab_size = loader.vocab_size -- the number of distinct characters print('vocab size: ' .. vocab_size) -- make sure output directory exists @@ -121,7 +129,7 @@ function eval_split(split_index, max_batches) loader:reset_batch_pointer(split_index) -- move batch iteration pointer for this split to front local loss = 0 local rnn_state = {[0] = init_state} - + for i = 1,n do -- iterate over batches in the split -- fetch a batch local x, y = loader:next_batch(split_index) @@ -193,10 +201,10 @@ function feval(x) local dlst = clones.rnn[t]:backward({embeddings[t], unpack(rnn_state[t-1])}, drnn_statet_passin) drnn_state[t-1] = {} for k,v in pairs(dlst) do - if k == 1 then + if k == 1 then dembeddings[t] = v else - -- note we do k-1 because first item is dembeddings, and then follow the + -- note we do k-1 because first item is dembeddings, and then follow the -- derivatives of the state, starting at index 2. I know... 
drnn_state[t-1][k-1] = v end @@ -251,9 +259,13 @@ for i = 1, iterations do end if i % opt.print_every == 0 then - print(string.format("%d/%d (epoch %.3f), train_bpc = %6.8f, grad/param norm = %6.4e, time/batch = %.2fs", i, iterations, epoch, train_bpc, grad_params:norm() / params:norm(), time)) + if op.words then + print(string.format("%d/%d (epoch %.3f), train_bpc = %6.8f, grad/param norm = %6.4e, time/batch = %.2fs", i, iterations, epoch, train_bpc, grad_params:norm() / params:norm(), time)) + else + print(string.format("%d/%d (epoch %.3f), train_bpw = %6.8f, grad/param norm = %6.4e, time/batch = %.2fs", i, iterations, epoch, train_bpc, grad_params:norm() / params:norm(), time)) + end end - + if i % 10 == 0 then collectgarbage() end -- handle early stopping if things are going really bad @@ -263,5 +275,3 @@ for i = 1, iterations do break -- halt end end - - diff --git a/util/CharSplitLMMinibatchLoader.lua b/util/CharSplitLMMinibatchLoader.lua new file mode 100644 index 00000000..3bc2795f --- /dev/null +++ b/util/CharSplitLMMinibatchLoader.lua @@ -0,0 +1,124 @@ + +-- Modified from https://github.com/oxford-cs-ml-2015/practical6 +-- the modification included support for train/val/test splits + +local CharSplitLMMinibatchLoader = {} +CharSplitLMMinibatchLoader.__index = CharSplitLMMinibatchLoader + +function CharSplitLMMinibatchLoader.create(data_dir, batch_size, seq_length, split_fractions) + -- split_fractions is e.g. {0.9, 0.05, 0.05} + + local self = {} + setmetatable(self, CharSplitLMMinibatchLoader) + + local input_file = path.join(data_dir, 'input.txt') + local vocab_file = path.join(data_dir, 'vocab.t7') + local tensor_file = path.join(data_dir, 'data.t7') + + -- construct a tensor with all the data + if not (path.exists(vocab_file) or path.exists(tensor_file)) then + print('one-time setup: preprocessing input text file ' .. input_file .. '...') + CharSplitLMMinibatchLoader.text_to_tensor(input_file, vocab_file, tensor_file) + end + + print('loading data files...') + local data = torch.load(tensor_file) + self.vocab_mapping = torch.load(vocab_file) + + -- cut off the end so that it divides evenly + local len = data:size(1) + if len % (batch_size * seq_length) ~= 0 then + print('cutting off end of data so that the batches/sequences divide evenly') + data = data:sub(1, batch_size * seq_length + * math.floor(len / (batch_size * seq_length))) + end + + -- count vocab + self.vocab_size = 0 + for _ in pairs(self.vocab_mapping) do + self.vocab_size = self.vocab_size + 1 + end + + -- self.batches is a table of tensors + print('reshaping tensor...') + self.batch_size = batch_size + self.seq_length = seq_length + + local ydata = data:clone() + ydata:sub(1,-2):copy(data:sub(2,-1)) + ydata[-1] = data[1] + self.x_batches = data:view(batch_size, -1):split(seq_length, 2) -- #rows = #batches + self.nbatches = #self.x_batches + self.y_batches = ydata:view(batch_size, -1):split(seq_length, 2) -- #rows = #batches + assert(#self.x_batches == #self.y_batches) + + self.ntrain = math.floor(self.nbatches * split_fractions[1]) + self.nval = math.floor(self.nbatches * split_fractions[2]) + self.ntest = self.nbatches - self.nval - self.ntrain -- the rest goes to test (to ensure this adds up exactly) + + self.split_sizes = {self.ntrain, self.nval, self.ntest} + self.batch_ix = {0,0,0} + + print(string.format('data load done. 
Number of batches in train: %d, val: %d, test: %d', self.ntrain, self.nval, self.ntest)) + collectgarbage() + return self +end + +function CharSplitLMMinibatchLoader:reset_batch_pointer(split_index, batch_index) + batch_index = batch_index or 0 + self.batch_ix[split_index] = batch_index +end + +function CharSplitLMMinibatchLoader:next_batch(split_index) + -- split_index is integer: 1 = train, 2 = val, 3 = test + self.batch_ix[split_index] = self.batch_ix[split_index] + 1 + if self.batch_ix[split_index] > self.split_sizes[split_index] then + self.batch_ix[split_index] = 1 -- cycle around to beginning + end + -- pull out the correct next batch + local ix = self.batch_ix[split_index] + if split_index == 2 then ix = ix + self.ntrain end -- offset by train set size + if split_index == 3 then ix = ix + self.ntrain + self.nval end -- offset by train + test + return self.x_batches[ix], self.y_batches[ix] +end + +-- *** STATIC method *** +function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, out_tensorfile) + local timer = torch.Timer() + + print('loading text file...') + local f = torch.DiskFile(in_textfile) + local rawdata = f:readString('*a') -- NOTE: this reads the whole file at once + f:close() + + -- create vocabulary if it doesn't exist yet + print('creating vocabulary mapping...') + -- record all characters to a set + local unordered = {} + for char in rawdata:gmatch'.' do + if not unordered[char] then unordered[char] = true end + end + -- sort into a table (i.e. keys become 1..N) + local ordered = {} + for char in pairs(unordered) do ordered[#ordered + 1] = char end + table.sort(ordered) + -- invert `ordered` to create the char->int mapping + local vocab_mapping = {} + for i, char in ipairs(ordered) do + vocab_mapping[char] = i + end + -- construct a tensor with all the data + print('putting data into tensor...') + local data = torch.ByteTensor(#rawdata) -- store it into 1D first, then rearrange + for i=1, #rawdata do + data[i] = vocab_mapping[rawdata:sub(i, i)] -- lua has no string indexing using [] + end + + -- save output preprocessed files + print('saving ' .. out_vocabfile) + torch.save(out_vocabfile, vocab_mapping) + print('saving ' .. 
out_tensorfile) + torch.save(out_tensorfile, data) +end + +return CharSplitLMMinibatchLoader diff --git a/util/WordSplitLMMinibatchLoader.lua b/util/WordSplitLMMinibatchLoader.lua index 4d3097a7..9701f5f0 100644 --- a/util/WordSplitLMMinibatchLoader.lua +++ b/util/WordSplitLMMinibatchLoader.lua @@ -1,6 +1,7 @@ -- Modified from https://github.com/oxford-cs-ml-2015/practical6 -- the modification included support for train/val/test splits +-- Further modified from CharSplitLMMiniBatchLoader to seperate by word by Jacob Edelman local CharSplitLMMinibatchLoader = {} CharSplitLMMinibatchLoader.__index = CharSplitLMMinibatchLoader @@ -118,7 +119,7 @@ function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, o end -- construct a tensor with all the data print('putting data into tensor...') - local data = torch.ByteTensor(length) -- store it into 1D first, then rearrange + local data = torch.Tensor(length) -- store it into 1D first, then rearrange local i = 1 for char1,char2 in rawdata:gmatch'(%a*)(.?)' do if char1 ~= "" then diff --git a/util/model_utils.lua b/util/model_utils.lua index 2edbed16..99422630 100644 --- a/util/model_utils.lua +++ b/util/model_utils.lua @@ -1,7 +1,7 @@ -- adapted from https://github.com/wojciechz/learning_to_execute -- utilities for combining/flattening parameters in a model --- the code in this script is more general than it needs to be, which is +-- the code in this script is more general than it needs to be, which is -- why it is kind of a large require 'torch' From bf7b56acd4a8b6a30c7961a44bb968337ac4df14 Mon Sep 17 00:00:00 2001 From: JacobEdelman Date: Mon, 1 Jun 2015 13:15:15 -0400 Subject: [PATCH 08/12] Fix commands. --- train.lua | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/train.lua b/train.lua index e3e93f8b..461d6ab0 100644 --- a/train.lua +++ b/train.lua @@ -60,11 +60,11 @@ cmd:text() -- parse input params opt = cmd:parse(arg) - +local SplitLMMinibatchLoader if opt.words then - local SplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader' + SplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader' else - local SplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader' + SplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader' end torch.manualSeed(opt.seed) @@ -259,10 +259,10 @@ for i = 1, iterations do end if i % opt.print_every == 0 then - if op.words then - print(string.format("%d/%d (epoch %.3f), train_bpc = %6.8f, grad/param norm = %6.4e, time/batch = %.2fs", i, iterations, epoch, train_bpc, grad_params:norm() / params:norm(), time)) - else + if opt.words then print(string.format("%d/%d (epoch %.3f), train_bpw = %6.8f, grad/param norm = %6.4e, time/batch = %.2fs", i, iterations, epoch, train_bpc, grad_params:norm() / params:norm(), time)) + else + print(string.format("%d/%d (epoch %.3f), train_bpc = %6.8f, grad/param norm = %6.4e, time/batch = %.2fs", i, iterations, epoch, train_bpc, grad_params:norm() / params:norm(), time)) end end From 477b19a1cbb0b130cd1a12f32d3d50803a731403 Mon Sep 17 00:00:00 2001 From: JacobEdelman Date: Wed, 3 Jun 2015 09:39:59 -0400 Subject: [PATCH 09/12] Tried making more RAM friendly, is now broken. 
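
The "broken" part is most likely the classifier: the change below sizes the output of the final nn.Linear to embeded_size instead of vocab_size, so nn.ClassNLLCriterion can be handed target indices larger than the number of predicted classes whenever the word vocabulary exceeds the embedding width. The embedding idea itself is the standard remedy for wide one-hot inputs; a minimal sketch of the trade-off, run from the repo root, with sizes that are illustrative assumptions rather than values from the patch:

    require 'torch'
    require 'nn'
    require 'util.Embedding'          -- lookup-table module added by this patch

    local vocab_size     = 10000      -- distinct word tokens (assumed)
    local embedding_size = 100        -- dense vector width (assumed)

    local embed = Embedding(vocab_size, embedding_size)
    embed.weight:uniform(-0.08, 0.08) -- the weight table is created uninitialised

    local ids  = torch.Tensor{42, 7, 1993}  -- a mini-batch of word indices
    local vecs = embed:forward(ids)         -- 3 x 100 dense matrix
    print(vecs:size())

    -- The OneHot module used for char models would produce a 3 x 10000
    -- matrix here instead; nn.LookupTable(vocab_size, embedding_size) is
    -- the stock nn analogue of the Embedding class defined in this patch.
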
--- train.lua | 24 ++++++++++++++++---- util/Embedding.lua | 53 ++++++++++++++++++++++++++++++++++++++++++++ util/model_utils.lua | 1 - 3 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 util/Embedding.lua diff --git a/train.lua b/train.lua index 8b09e98c..de2cabdf 100644 --- a/train.lua +++ b/train.lua @@ -20,6 +20,7 @@ require 'optim' require 'lfs' require 'util.OneHot' +require 'util.Embedding' require 'util.misc' local model_utils = require 'util.model_utils' @@ -87,9 +88,19 @@ if not path.exists(opt.checkpoint_dir) then lfs.mkdir(opt.checkpoint_dir) end -- define the model: prototypes for one timestep, then clone them in time protos = {} -protos.embed = OneHot(vocab_size) +local embeded_size = 100 +local input_size, embeded_size +if opt.words then + print('using an embedding transform for input...') + embeded_size = 100 + protos.embed = Embedding(vocab_size, embeded_size) +else + print('using one-hot for input...') + embeded_size = vocab_size + protos.embed = OneHot(vocab_size) +end print('creating an LSTM with ' .. opt.num_layers .. ' layers') -protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout) +protos.rnn = LSTM.lstm(embeded_size, opt.rnn_size, opt.num_layers, opt.dropout) -- the initial state of the cell/hidden states init_state = {} for L=1,opt.num_layers do @@ -100,7 +111,7 @@ for L=1,opt.num_layers do end state_predict_index = #init_state -- index of blob to make prediction from -- classifier on top -protos.softmax = nn.Sequential():add(nn.Linear(opt.rnn_size, vocab_size)):add(nn.LogSoftMax()) +protos.softmax = nn.Sequential():add(nn.Linear(opt.rnn_size, embeded_size)):add(nn.LogSoftMax()) -- training criterion (negative log likelihood) protos.criterion = nn.ClassNLLCriterion() @@ -182,7 +193,10 @@ function feval(x) rnn_state[t] = clones.rnn[t]:forward{embeddings[t], unpack(rnn_state[t-1])} -- the following line is needed because nngraph tries to be clever if type(rnn_state[t]) ~= 'table' then rnn_state[t] = {rnn_state[t]} end + predictions[t] = clones.softmax[t]:forward(rnn_state[t][state_predict_index]) + + -- predictions should be 200 me thinks loss = loss + clones.criterion[t]:forward(predictions[t], y[{{}, t}]) end loss = loss / opt.seq_length @@ -227,16 +241,18 @@ local optim_state = {learningRate = opt.learning_rate, alpha = opt.decay_rate} local iterations = opt.max_epochs * loader.ntrain local iterations_per_epoch = loader.ntrain local loss0 = nil + for i = 1, iterations do + local epoch = i / loader.ntrain local timer = torch.Timer() + local _, loss = optim.rmsprop(feval, params, optim_state) local time = timer:time().real local train_loss = loss[1] -- the loss is inside a list, pop it train_losses[i] = train_loss - -- every now and then or on last iteration if i % opt.eval_val_every == 0 or i == iterations then -- evaluate loss on validation data diff --git a/util/Embedding.lua b/util/Embedding.lua new file mode 100644 index 00000000..a3c96663 --- /dev/null +++ b/util/Embedding.lua @@ -0,0 +1,53 @@ +--[[ + Copyright 2014 Google Inc. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +]]-- + +local Embedding, parent = torch.class('Embedding', 'nn.Module') + +function Embedding:__init(inputSize, outputSize) + parent.__init(self) + self.outputSize = outputSize + self.weight = torch.Tensor(inputSize, outputSize) + self.gradWeight = torch.Tensor(inputSize, outputSize) +end + +function Embedding:updateOutput(input) + self.output:resize(input:size(1), self.outputSize) + for i = 1, input:size(1) do + self.output[i]:copy(self.weight[input[i]]) + end + return self.output +end + +function Embedding:updateGradInput(input, gradOutput) + if self.gradInput then + self.gradInput:resize(input:size()) + return self.gradInput + end +end + +function Embedding:accGradParameters(input, gradOutput, scale) + scale = scale or 1 + if scale == 0 then + self.gradWeight:zero() + end + for i = 1, input:size(1) do + local word = input[i] + self.gradWeight[word]:add(gradOutput[i]) + end +end + +-- we do not need to accumulate parameters when sharing +Embedding.sharedAccUpdateGradParameters = Embedding.accUpdateGradParameters diff --git a/util/model_utils.lua b/util/model_utils.lua index 99422630..923bdd28 100644 --- a/util/model_utils.lua +++ b/util/model_utils.lua @@ -120,7 +120,6 @@ function model_utils.clone_many_times(net, T) params = {} end end - local paramsNoGrad if net.parametersNoGrad then paramsNoGrad = net:parametersNoGrad() From e4bf6514236a65de9d2643ba6c07fb1381ba8695 Mon Sep 17 00:00:00 2001 From: JacobEdelman Date: Wed, 3 Jun 2015 17:07:26 -0400 Subject: [PATCH 10/12] Fixed and updated to match size of rnn to encoding size. --- model/LSTM.lua | 23 ++++++++++++++--------- train.lua | 11 +++++++++++ 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/model/LSTM.lua b/model/LSTM.lua index aa9dd682..92865ede 100644 --- a/model/LSTM.lua +++ b/model/LSTM.lua @@ -1,7 +1,7 @@ local LSTM = {} -function LSTM.lstm(input_size, rnn_size, n, dropout) - dropout = dropout or 0 +function LSTM.lstm(input_size, rnn_size, n, dropout, words) + dropout = dropout or 0 -- there will be 2*n+1 inputs local inputs = {} @@ -18,11 +18,17 @@ function LSTM.lstm(input_size, rnn_size, n, dropout) local prev_h = inputs[L*2+1] local prev_c = inputs[L*2] -- the input to this layer - if L == 1 then - x = OneHot(input_size)(inputs[1]) - input_size_L = input_size - else - x = outputs[(L-1)*2] + if L == 1 then + if not words then + x = OneHot(input_size)(inputs[1]) + input_size_L = input_size + else + x = Embedding(input_size, rnn_size)(inputs[1]) + input_size_L = rnn_size + end + + else + x = outputs[(L-1)*2] input_size_L = rnn_size end -- evaluate the input sums at once for efficiency @@ -47,7 +53,7 @@ function LSTM.lstm(input_size, rnn_size, n, dropout) local next_h = nn.CMulTable()({out_gate, nn.Tanh()(next_c)}) -- add dropout to output, if desired if dropout > 0 then next_h = nn.Dropout(dropout)(next_h) end - + table.insert(outputs, next_c) table.insert(outputs, next_h) end @@ -61,4 +67,3 @@ function LSTM.lstm(input_size, rnn_size, n, dropout) end return LSTM - diff --git a/train.lua b/train.lua index 44f7ca8c..0d4a9960 100644 --- a/train.lua +++ b/train.lua @@ -99,8 +99,16 @@ protos = {} -- embeded_size = vocab_size -- protos.embed = OneHot(vocab_size) -- end +if not opt.words then + print('using one-hot for input...') + protos.embed = OneHot(vocab_size) +else + print('using an embedding transform of size', opt.rnn_size) + protos.embed = Embedding(vocab_size, opt.rnn_size) +end print('creating an LSTM with 
' .. opt.num_layers .. ' layers') protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout, opt.words) + -- the initial state of the cell/hidden states init_state = {} for L=1,opt.num_layers do @@ -155,9 +163,11 @@ function eval_split(split_index, max_batches) clones.rnn[t]:evaluate() -- for dropout proper functioning local lst = clones.rnn[t]:forward{x[{{}, t}], unpack(rnn_state[t-1])} rnn_state[t] = {} + for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end prediction = lst[#lst] loss = loss + clones.criterion[t]:forward(prediction, y[{{}, t}]) + end -- carry over lstm state rnn_state[0] = rnn_state[#rnn_state] @@ -169,6 +179,7 @@ function eval_split(split_index, max_batches) end -- do fwd/bwd and return loss, grad_params + local init_state_global = clone_list(init_state) function feval(x) if x ~= params then From 53ae17592e608d2f27959856cbab399905c41a10 Mon Sep 17 00:00:00 2001 From: JacobEdelman Date: Wed, 3 Jun 2015 18:43:13 -0400 Subject: [PATCH 11/12] Made sampling work by adding in Embedding require. --- sample.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sample.lua b/sample.lua index 1cb12866..fb3978c3 100644 --- a/sample.lua +++ b/sample.lua @@ -3,7 +3,7 @@ This file samples characters from a trained model -Code is based on implementation in +Code is based on implementation in https://github.com/oxford-cs-ml-2015/practical6 ]]-- @@ -14,6 +14,7 @@ require 'nngraph' require 'optim' require 'lfs' +require 'util.Embedding' require 'util.OneHot' require 'util.misc' @@ -111,4 +112,3 @@ for i=1, opt.length do io.write(ivocab[prev_char[1]]) end io.write('\n') io.flush() - From 6a3aa3b3b4e21695063afc624b5195793eacd7c5 Mon Sep 17 00:00:00 2001 From: JacobEdelman Date: Wed, 3 Jun 2015 20:42:13 -0400 Subject: [PATCH 12/12] Reformatting some printing. --- train.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.lua b/train.lua index 0d4a9960..d07c96db 100644 --- a/train.lua +++ b/train.lua @@ -103,7 +103,7 @@ if not opt.words then print('using one-hot for input...') protos.embed = OneHot(vocab_size) else - print('using an embedding transform of size', opt.rnn_size) + print('using an embedding transform of size ' .. opt.rnn_size) protos.embed = Embedding(vocab_size, opt.rnn_size) end print('creating an LSTM with ' .. opt.num_layers .. ' layers')
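
Taken together, the series leaves train.lua choosing both the loader and the input transform from the -words flag. A condensed restatement of that wiring, simplified from the train.lua hunks above (opt, split_sizes and protos are set up earlier in that file):

    local SplitLMMinibatchLoader
    if opt.words then
        SplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader'
    else
        SplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader'
    end
    local loader = SplitLMMinibatchLoader.create(opt.data_dir, opt.batch_size,
                                                 opt.seq_length, split_sizes)

    if opt.words then
        -- learned word vectors, sized to match the RNN input
        protos.embed = Embedding(loader.vocab_size, opt.rnn_size)
    else
        -- one-hot character vectors
        protos.embed = OneHot(loader.vocab_size)
    end
    -- LSTM.lstm also receives opt.words so the graph can embed word
    -- indices internally (patch 10).
    protos.rnn = LSTM.lstm(loader.vocab_size, opt.rnn_size, opt.num_layers,
                           opt.dropout, opt.words)

Sampling then only needs the extra require 'util.Embedding' added in patch 11: ivocab maps indices back to whole words and to the single non-letter symbols (spaces included) that the loader treats as tokens, so the existing printing loop in sample.lua emits readable text unchanged.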