diff --git a/Readme.md b/Readme.md
index 48a6ba7c..2856660b 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,7 +1,7 @@
 # char-rnn
 
-This code implements **multi-layer Recurrent Neural Network** (RNN, LSTM, and GRU) for training/sampling from character-level language models. The input is a single text file and the model learns to predict the next character in the sequence. 
+This code implements **multi-layer Recurrent Neural Network** (RNN, LSTM, and GRU) for training/sampling from character-level language models. The input is a single text file and the model learns to predict the next character in the sequence.
 
 The context of this code base is described in detail in my [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/).
 
@@ -15,7 +15,7 @@ This code is written in Lua and requires [Torch](http://torch.ch/).
 Additionally, you need to install the `nngraph` and `optim` packages using [LuaRocks](https://luarocks.org/) which you will be able to do after installing Torch
 
 ```bash
-$ luarocks install nngraph 
+$ luarocks install nngraph
 $ luarocks install optim
 ```
diff --git a/inspect_checkpoint.lua b/inspect_checkpoint.lua
index 6a027b1f..b53b27e8 100644
--- a/inspect_checkpoint.lua
+++ b/inspect_checkpoint.lua
@@ -32,4 +32,3 @@ print('opt:')
 print(model.opt)
 print('val losses:')
 print(model.val_losses)
-
diff --git a/model/LSTM.lua b/model/LSTM.lua
index aa9dd682..92865ede 100644
--- a/model/LSTM.lua
+++ b/model/LSTM.lua
@@ -1,7 +1,7 @@
 local LSTM = {}
 
-function LSTM.lstm(input_size, rnn_size, n, dropout)
-  dropout = dropout or 0 
+function LSTM.lstm(input_size, rnn_size, n, dropout, words)
+  dropout = dropout or 0
 
   -- there will be 2*n+1 inputs
   local inputs = {}
@@ -18,11 +18,17 @@ function LSTM.lstm(input_size, rnn_size, n, dropout)
     local prev_h = inputs[L*2+1]
     local prev_c = inputs[L*2]
     -- the input to this layer
-    if L == 1 then 
-      x = OneHot(input_size)(inputs[1])
-      input_size_L = input_size
-    else 
-      x = outputs[(L-1)*2]
+    if L == 1 then
+      if not words then
+        x = OneHot(input_size)(inputs[1])
+        input_size_L = input_size
+      else
+        x = Embedding(input_size, rnn_size)(inputs[1])
+        input_size_L = rnn_size
+      end
+
+    else
+      x = outputs[(L-1)*2]
       input_size_L = rnn_size
     end
     -- evaluate the input sums at once for efficiency
@@ -47,7 +53,7 @@ function LSTM.lstm(input_size, rnn_size, n, dropout)
     local next_h = nn.CMulTable()({out_gate, nn.Tanh()(next_c)})
     -- add dropout to output, if desired
     if dropout > 0 then next_h = nn.Dropout(dropout)(next_h) end
-    
+
     table.insert(outputs, next_c)
     table.insert(outputs, next_h)
   end
@@ -61,4 +67,3 @@ function LSTM.lstm(input_size, rnn_size, n, dropout)
 end
 
 return LSTM
-
diff --git a/sample.lua b/sample.lua
index 1cb12866..fb3978c3 100644
--- a/sample.lua
+++ b/sample.lua
@@ -3,7 +3,7 @@
 
 This file samples characters from a trained model
 
-Code is based on implementation in 
+Code is based on implementation in
 https://github.com/oxford-cs-ml-2015/practical6
 
 ]]--
@@ -14,6 +14,7 @@
 require 'nngraph'
 require 'optim'
 require 'lfs'
 
+require 'util.Embedding'
 require 'util.OneHot'
 require 'util.misc'
@@ -111,4 +112,3 @@ for i=1, opt.length do
     io.write(ivocab[prev_char[1]])
 end
 io.write('\n') io.flush()
-
diff --git a/train.lua b/train.lua
index 11436c3d..1c2ddae3 100644
--- a/train.lua
+++ b/train.lua
@@ -3,11 +3,11 @@
 
 This file trains a character-level multi-layer RNN on text data
 
-Code is based on implementation in 
+Code is based on implementation in
 https://github.com/oxford-cs-ml-2015/practical6
 but modified to have multi-layer support, GPU support, as well as many
 other common model/optimization bells and whistles.
-The practical6 code is in turn based on 
+The practical6 code is in turn based on
 https://github.com/wojciechz/learning_to_execute
 which is turn based on other stuff in Torch, etc... (long lineage)
 
@@ -20,8 +20,9 @@ require 'optim'
 require 'lfs'
 
 require 'util.OneHot'
+require 'util.Embedding'
 require 'util.misc'
-local CharSplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader'
+
 local model_utils = require 'util.model_utils'
 local LSTM = require 'model.LSTM'
@@ -35,6 +36,7 @@ cmd:option('-data_dir','data/tinyshakespeare','data directory. Should contain th
 -- model params
 cmd:option('-rnn_size', 128, 'size of LSTM internal state')
 cmd:option('-num_layers', 2, 'number of layers in the LSTM')
+cmd:option('-words', false, 'whether the model operates on words (as opposed to chars)')
 cmd:option('-model', 'lstm', 'for now only lstm is supported. keep fixed')
 -- optimization
 cmd:option('-learning_rate',2e-3,'learning rate')
@@ -61,10 +63,17 @@ cmd:text()
 
 -- parse input params
 opt = cmd:parse(arg)
+local SplitLMMinibatchLoader
+if opt.words then
+    SplitLMMinibatchLoader = require 'util.WordSplitLMMinibatchLoader'
+else
+    SplitLMMinibatchLoader = require 'util.CharSplitLMMinibatchLoader'
+end
+
 torch.manualSeed(opt.seed)
 -- train / val / test split for data, in fractions
 local test_frac = math.max(0, 1 - opt.train_frac - opt.val_frac)
-local split_sizes = {opt.train_frac, opt.val_frac, test_frac} 
+local split_sizes = {opt.train_frac, opt.val_frac, test_frac}
 
 if opt.gpuid >= 0 then
     print('using CUDA on GPU ' .. opt.gpuid .. '...')
@@ -73,7 +82,7 @@ if opt.gpuid >= 0 then
     cutorch.setDevice(opt.gpuid + 1) -- note +1 to make it 0 indexed! sigh lua
 end
 -- create the data loader class
-local loader = CharSplitLMMinibatchLoader.create(opt.data_dir, opt.batch_size, opt.seq_length, split_sizes)
+local loader = SplitLMMinibatchLoader.create(opt.data_dir, opt.batch_size, opt.seq_length, split_sizes)
 local vocab_size = loader.vocab_size -- the number of distinct characters
 print('vocab size: ' .. vocab_size)
 -- make sure output directory exists
@@ -81,8 +90,27 @@
 if not path.exists(opt.checkpoint_dir) then lfs.mkdir(opt.checkpoint_dir) end
 
 -- define the model: prototypes for one timestep, then clone them in time
 protos = {}
+-- local embeded_size = 100
+-- local input_size, embeded_size
+-- if opt.words then
+--     print('using an embedding transform for input...')
+--     embeded_size = 100
+--     protos.embed = Embedding(vocab_size, embeded_size)
+-- else
+--     print('using one-hot for input...')
+--     embeded_size = vocab_size
+--     protos.embed = OneHot(vocab_size)
+-- end
+if not opt.words then
+    print('using one-hot for input...')
+    protos.embed = OneHot(vocab_size)
+else
+    print('using an embedding transform of size ' .. opt.rnn_size)
+    protos.embed = Embedding(vocab_size, opt.rnn_size)
+end
 print('creating an LSTM with ' .. opt.num_layers .. ' layers')
-protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout)
+protos.rnn = LSTM.lstm(vocab_size, opt.rnn_size, opt.num_layers, opt.dropout, opt.words)
+
 -- the initial state of the cell/hidden states
 init_state = {}
@@ -91,6 +119,7 @@ for L=1,opt.num_layers do
     table.insert(init_state, h_init:clone())
     table.insert(init_state, h_init:clone())
 end
+
 -- training criterion (negative log likelihood)
 protos.criterion = nn.ClassNLLCriterion()
 
@@ -122,7 +151,7 @@ function eval_split(split_index, max_batches)
     loader:reset_batch_pointer(split_index) -- move batch iteration pointer for this split to front
     local loss = 0
     local rnn_state = {[0] = init_state}
-    
+
     for i = 1,n do -- iterate over batches in the split
         -- fetch a batch
         local x, y = loader:next_batch(split_index)
@@ -136,9 +165,11 @@ function eval_split(split_index, max_batches)
             clones.rnn[t]:evaluate() -- for dropout proper functioning
             local lst = clones.rnn[t]:forward{x[{{}, t}], unpack(rnn_state[t-1])}
             rnn_state[t] = {}
+
            for i=1,#init_state do table.insert(rnn_state[t], lst[i]) end
-            prediction = lst[#lst] 
+            prediction = lst[#lst]
             loss = loss + clones.criterion[t]:forward(prediction, y[{{}, t}])
+
         end
         -- carry over lstm state
         rnn_state[0] = rnn_state[#rnn_state]
@@ -150,6 +181,7 @@ function eval_split(split_index, max_batches)
 end
 
 -- do fwd/bwd and return loss, grad_params
+
 local init_state_global = clone_list(init_state)
 function feval(x)
     if x ~= params then
@@ -188,7 +220,7 @@ function feval(x)
         drnn_state[t-1] = {}
         for k,v in pairs(dlst) do
             if k > 1 then -- k == 1 is gradient on x, which we dont need
                -- note we do k-1 because first item is dembeddings, and then follow the 
+                -- note we do k-1 because first item is dembeddings, and then follow the
                 -- derivatives of the state, starting at index 2. I know...
                 drnn_state[t-1][k-1] = v
             end
@@ -209,16 +241,18 @@ local optim_state = {learningRate = opt.learning_rate, alpha = opt.decay_rate}
 local iterations = opt.max_epochs * loader.ntrain
 local iterations_per_epoch = loader.ntrain
 local loss0 = nil
+
 for i = 1, iterations do
+
     local epoch = i / loader.ntrain
 
     local timer = torch.Timer()
+
     local _, loss = optim.rmsprop(feval, params, optim_state)
     local time = timer:time().real
 
     local train_loss = loss[1] -- the loss is inside a list, pop it
     train_losses[i] = train_loss
-
     -- exponential learning rate decay
     if i % loader.ntrain == 0 and opt.learning_rate_decay < 1 then
         if epoch >= opt.learning_rate_decay_after then
@@ -251,7 +285,7 @@ for i = 1, iterations do
     if i % opt.print_every == 0 then
         print(string.format("%d/%d (epoch %.3f), train_loss = %6.8f, grad/param norm = %6.4e, time/batch = %.2fs", i, iterations, epoch, train_loss, grad_params:norm() / params:norm(), time))
     end
-    
+
     if i % 10 == 0 then collectgarbage() end
 
     -- handle early stopping if things are going really bad
@@ -261,5 +295,3 @@ for i = 1, iterations do
         break -- halt
     end
 end
-
-
diff --git a/util/CharSplitLMMinibatchLoader.lua b/util/CharSplitLMMinibatchLoader.lua
index 1fafe398..3bc2795f 100644
--- a/util/CharSplitLMMinibatchLoader.lua
+++ b/util/CharSplitLMMinibatchLoader.lua
@@ -29,14 +29,14 @@ function CharSplitLMMinibatchLoader.create(data_dir, batch_size, seq_length, spl
     local len = data:size(1)
     if len % (batch_size * seq_length) ~= 0 then
         print('cutting off end of data so that the batches/sequences divide evenly')
-        data = data:sub(1, batch_size * seq_length 
+        data = data:sub(1, batch_size * seq_length
                     * math.floor(len / (batch_size * seq_length)))
     end
 
     -- count vocab
     self.vocab_size = 0
-    for _ in pairs(self.vocab_mapping) do 
-        self.vocab_size = self.vocab_size + 1 
+    for _ in pairs(self.vocab_mapping) do
+        self.vocab_size = self.vocab_size + 1
     end
 
     -- self.batches is a table of tensors
@@ -122,4 +122,3 @@ function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, o
 end
 
 return CharSplitLMMinibatchLoader
-
diff --git a/util/Embedding.lua b/util/Embedding.lua
new file mode 100644
index 00000000..a3c96663
--- /dev/null
+++ b/util/Embedding.lua
@@ -0,0 +1,53 @@
+--[[
+ Copyright 2014 Google Inc. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+]]--
+
+local Embedding, parent = torch.class('Embedding', 'nn.Module')
+
+function Embedding:__init(inputSize, outputSize)
+  parent.__init(self)
+  self.outputSize = outputSize
+  self.weight = torch.Tensor(inputSize, outputSize)
+  self.gradWeight = torch.Tensor(inputSize, outputSize)
+end
+
+function Embedding:updateOutput(input)
+  self.output:resize(input:size(1), self.outputSize)
+  for i = 1, input:size(1) do
+    self.output[i]:copy(self.weight[input[i]])
+  end
+  return self.output
+end
+
+function Embedding:updateGradInput(input, gradOutput)
+  if self.gradInput then
+    self.gradInput:resize(input:size())
+    return self.gradInput
+  end
+end
+
+function Embedding:accGradParameters(input, gradOutput, scale)
+  scale = scale or 1
+  if scale == 0 then
+    self.gradWeight:zero()
+  end
+  for i = 1, input:size(1) do
+    local word = input[i]
+    self.gradWeight[word]:add(gradOutput[i])
+  end
+end
+
+-- we do not need to accumulate parameters when sharing
+Embedding.sharedAccUpdateGradParameters = Embedding.accUpdateGradParameters
diff --git a/util/WordSplitLMMinibatchLoader.lua b/util/WordSplitLMMinibatchLoader.lua
new file mode 100644
index 00000000..9701f5f0
--- /dev/null
+++ b/util/WordSplitLMMinibatchLoader.lua
@@ -0,0 +1,142 @@
+
+-- Modified from https://github.com/oxford-cs-ml-2015/practical6
+-- the modification included support for train/val/test splits
+-- Further modified from CharSplitLMMiniBatchLoader to separate by word by Jacob Edelman
+
+local CharSplitLMMinibatchLoader = {}
+CharSplitLMMinibatchLoader.__index = CharSplitLMMinibatchLoader
+
+function CharSplitLMMinibatchLoader.create(data_dir, batch_size, seq_length, split_fractions)
+    -- split_fractions is e.g. {0.9, 0.05, 0.05}
+
+    local self = {}
+    setmetatable(self, CharSplitLMMinibatchLoader)
+
+    local input_file = path.join(data_dir, 'input.txt')
+    local vocab_file = path.join(data_dir, 'vocab.t7')
+    local tensor_file = path.join(data_dir, 'data.t7')
+
+    -- construct a tensor with all the data
+    if not (path.exists(vocab_file) or path.exists(tensor_file)) then
+        print('one-time setup: preprocessing input text file ' .. input_file .. '...')
+        CharSplitLMMinibatchLoader.text_to_tensor(input_file, vocab_file, tensor_file)
+    end
+
+    print('loading data files...')
+    local data = torch.load(tensor_file)
+    self.vocab_mapping = torch.load(vocab_file)
+
+    -- cut off the end so that it divides evenly
+    local len = data:size(1)
+    if len % (batch_size * seq_length) ~= 0 then
+        print('cutting off end of data so that the batches/sequences divide evenly')
+        data = data:sub(1, batch_size * seq_length
+                    * math.floor(len / (batch_size * seq_length)))
+    end
+
+    -- count vocab
+    self.vocab_size = 0
+    for _ in pairs(self.vocab_mapping) do
+        self.vocab_size = self.vocab_size + 1
+    end
+
+    -- self.batches is a table of tensors
+    print('reshaping tensor...')
+    self.batch_size = batch_size
+    self.seq_length = seq_length
+
+    local ydata = data:clone()
+    ydata:sub(1,-2):copy(data:sub(2,-1))
+    ydata[-1] = data[1]
+    self.x_batches = data:view(batch_size, -1):split(seq_length, 2) -- #rows = #batches
+    self.nbatches = #self.x_batches
+    self.y_batches = ydata:view(batch_size, -1):split(seq_length, 2) -- #rows = #batches
+    assert(#self.x_batches == #self.y_batches)
+
+    self.ntrain = math.floor(self.nbatches * split_fractions[1])
+    self.nval = math.floor(self.nbatches * split_fractions[2])
+    self.ntest = self.nbatches - self.nval - self.ntrain -- the rest goes to test (to ensure this adds up exactly)
+
+    self.split_sizes = {self.ntrain, self.nval, self.ntest}
+    self.batch_ix = {0,0,0}
+
+    print(string.format('data load done. Number of batches in train: %d, val: %d, test: %d', self.ntrain, self.nval, self.ntest))
+    collectgarbage()
+    return self
+end
+
+function CharSplitLMMinibatchLoader:reset_batch_pointer(split_index, batch_index)
+    batch_index = batch_index or 0
+    self.batch_ix[split_index] = batch_index
+end
+
+function CharSplitLMMinibatchLoader:next_batch(split_index)
+    -- split_index is integer: 1 = train, 2 = val, 3 = test
+    self.batch_ix[split_index] = self.batch_ix[split_index] + 1
+    if self.batch_ix[split_index] > self.split_sizes[split_index] then
+        self.batch_ix[split_index] = 1 -- cycle around to beginning
+    end
+    -- pull out the correct next batch
+    local ix = self.batch_ix[split_index]
+    if split_index == 2 then ix = ix + self.ntrain end -- offset by train set size
+    if split_index == 3 then ix = ix + self.ntrain + self.nval end -- offset by train + test
+    return self.x_batches[ix], self.y_batches[ix]
+end
+
+-- *** STATIC method ***
+function CharSplitLMMinibatchLoader.text_to_tensor(in_textfile, out_vocabfile, out_tensorfile)
+    local timer = torch.Timer()
+
+    print('loading text file...')
+    local f = torch.DiskFile(in_textfile)
+    local rawdata = f:readString('*a') -- NOTE: this reads the whole file at once
+    f:close()
+
+    -- create vocabulary if it doesn't exist yet
+    print('creating vocabulary mapping...')
+    -- record all characters to a set
+    local unordered = {}
+    local length = 0
+    for char1,char2 in rawdata:gmatch'(%a*)(.?)' do
+        if char1 ~= "" then
+            if not unordered[char1] then unordered[char1] = true end
+            length = length + 1
+        end
+        if char2 ~= "" then
+            if not unordered[char2] then unordered[char2] = true end
+            length = length + 1
+        end
+    end
+
+    -- sort into a table (i.e. keys become 1..N)
+    local ordered = {}
+    for char in pairs(unordered) do ordered[#ordered + 1] = char end
+    table.sort(ordered)
+    -- invert `ordered` to create the char->int mapping
+    local vocab_mapping = {}
+    for i, char in ipairs(ordered) do
+        vocab_mapping[char] = i
+    end
+    -- construct a tensor with all the data
+    print('putting data into tensor...')
+    local data = torch.Tensor(length) -- store it into 1D first, then rearrange
+    local i = 1
+    for char1,char2 in rawdata:gmatch'(%a*)(.?)' do
+        if char1 ~= "" then
+            data[i] = vocab_mapping[char1]
+            i = i + 1
+        end
+        if char2 ~= "" then
+            data[i] = vocab_mapping[char2]
+            i = i + 1
+        end
+    end
+
+    -- save output preprocessed files
+    print('saving ' .. out_vocabfile)
+    torch.save(out_vocabfile, vocab_mapping)
+    print('saving ' .. out_tensorfile)
+    torch.save(out_tensorfile, data)
+end
+
+return CharSplitLMMinibatchLoader
diff --git a/util/model_utils.lua b/util/model_utils.lua
index 2edbed16..923bdd28 100644
--- a/util/model_utils.lua
+++ b/util/model_utils.lua
@@ -1,7 +1,7 @@
 
 -- adapted from https://github.com/wojciechz/learning_to_execute
 -- utilities for combining/flattening parameters in a model
--- the code in this script is more general than it needs to be, which is 
+-- the code in this script is more general than it needs to be, which is
 -- why it is kind of a large
 
 require 'torch'
@@ -120,7 +120,6 @@ function model_utils.clone_many_times(net, T)
             params = {}
         end
     end
-
     local paramsNoGrad
     if net.parametersNoGrad then
         paramsNoGrad = net:parametersNoGrad()
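For reference, a minimal, illustrative sketch (not taken from the patch itself) of how the `Embedding` layer introduced in `util/Embedding.lua` can be exercised in isolation. It assumes Torch with the `nn` package is installed and that the script is run from the repository root so that `require 'util.Embedding'` resolves; the sizes and indices below are made up for illustration.

```lua
-- Illustrative sketch only (not part of the patch): exercising util/Embedding.lua on its own.
-- Assumes Torch with the 'nn' package, run from the repository root; sizes are hypothetical.
require 'torch'
require 'nn'
require 'util.Embedding'

local vocab_size, embed_size = 10, 4
local embed = Embedding(vocab_size, embed_size)
embed.weight:uniform(-0.08, 0.08)   -- __init allocates the weight matrix but does not initialize it

-- a batch of three 1-based word indices, as produced by the loader's vocab_mapping
local input = torch.Tensor{2, 5, 9}
local output = embed:forward(input) -- returns a 3 x embed_size tensor: one row per index
print(output:size())
```

In `train.lua` above, the same module is constructed as `Embedding(vocab_size, opt.rnn_size)`, so in word mode each vocabulary index is mapped to a dense vector of the same size as the LSTM input, in place of the one-hot encoding used for characters.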