diff --git a/.gitignore b/.gitignore
index 7f46698..e3093a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ snapshots
 mnist.hdf5
 *.swp
 cache
+*.bak
diff --git a/residual-layers.lua b/residual-layers.lua
index 92f0ac1..01a3021 100644
--- a/residual-layers.lua
+++ b/residual-layers.lua
@@ -24,7 +24,7 @@
 require 'cudnn'
 require 'cunn'
 local nninit = require 'nninit'
 
-function addResidualLayer2(input, nChannels, nOutChannels, stride)
+function ResidualLayer(nChannels, nOutChannels, stride)
    --[[
    Residual layers! Implements option (A) from Section 3.3. The input
@@ -50,46 +50,52 @@ function addResidualLayer2(input, nChannels, nOutChannels, stride)
    stride = stride or 1
    -- Path 1: Convolution
    --   The first layer does the downsampling and the striding
-   local net = cudnn.SpatialConvolution(nChannels, nOutChannels,
+   local path1 = nn.Sequential()
+   path1:add(cudnn.SpatialConvolution(nChannels, nOutChannels,
                             3,3, stride,stride, 1,1)
                             :init('weight', nninit.kaiming, {gain = 'relu'})
-                            :init('bias', nninit.constant, 0)(input)
-   net = cudnn.SpatialBatchNormalization(nOutChannels)
+                            :init('bias', nninit.constant, 0))
+   path1:add(cudnn.SpatialBatchNormalization(nOutChannels)
                             :init('weight', nninit.normal, 1.0, 0.002)
-                            :init('bias', nninit.constant, 0)(net)
-   net = cudnn.ReLU(true)(net)
-   net = cudnn.SpatialConvolution(nOutChannels, nOutChannels,
+                            :init('bias', nninit.constant, 0))
+   path1:add(cudnn.ReLU(true))
+   path1:add(cudnn.SpatialConvolution(nOutChannels, nOutChannels,
                             3,3, 1,1, 1,1)
                             :init('weight', nninit.kaiming, {gain = 'relu'})
-                            :init('bias', nninit.constant, 0)(net)
+                            :init('bias', nninit.constant, 0))
 
    -- Should we put Batch Normalization here? I think not, because
    -- BN would force the output to have unit variance, which breaks the residual
    -- property of the network.
    -- What about ReLU here? I think maybe not for the same reason. Figure 2
    -- implies that they don't use it here
+   path1:add(cudnn.SpatialBatchNormalization(nOutChannels))
 
    -- Path 2: Identity / skip connection
-   local skip = input
+   local path2 = nn.Sequential()
+   path2:add(nn.Identity())
    if stride > 1 then
       -- optional downsampling
-      skip = nn.SpatialAveragePooling(1, 1, stride,stride)(skip)
+      path2:add(nn.SpatialAveragePooling(1, 1, stride,stride))
    end
    if nOutChannels > nChannels then
       -- optional padding
-      skip = nn.Padding(1, (nOutChannels - nChannels), 3)(skip)
+      path2:add(nn.Padding(1, (nOutChannels - nChannels), 3))
    elseif nOutChannels < nChannels then
       -- optional narrow, ugh.
-      skip = nn.Narrow(2, 1, nOutChannels)(skip)
+      path2:add(nn.Narrow(2, 1, nOutChannels))
       -- NOTE this BREAKS with non-batch inputs!!
    end
 
    -- Add them together
-   net = cudnn.SpatialBatchNormalization(nOutChannels)(net)
-   net = nn.CAddTable(){net, skip}
-   net = cudnn.ReLU(true)(net)
+   local layer = nn.Sequential()
+   local concat = nn.ConcatTable(2)
+   concat:add(path1):add(path2)
+   layer:add(concat)
+   layer:add(nn.CAddTable())
+   layer:add(cudnn.ReLU(true))
    -- ^ don't put a ReLU here! see http://gitxiv.com/comments/7rffyqcPLirEEsmpX
-   return net
+   return layer
 end
 
 --[[
diff --git a/train-cifar.lua b/train-cifar.lua
index e114829..5a384b2 100644
--- a/train-cifar.lua
+++ b/train-cifar.lua
@@ -71,13 +71,13 @@ if opt.loadFrom == "" then
    model = cudnn.SpatialBatchNormalization(16)(model)
    model = cudnn.ReLU(true)(model)
    ------> 16, 32,32   First Group
-   for i=1,N do   model = addResidualLayer2(model, 16)   end
+   for i=1,N do   model = ResidualLayer(16)(model)   end
    ------> 32, 16,16   Second Group
-   model = addResidualLayer2(model, 16, 32, 2)
-   for i=1,N-1 do   model = addResidualLayer2(model, 32)   end
+   model = ResidualLayer(16, 32, 2)(model)
+   for i=1,N-1 do   model = ResidualLayer(32)(model)   end
    ------> 64, 8,8     Third Group
-   model = addResidualLayer2(model, 32, 64, 2)
-   for i=1,N-1 do   model = addResidualLayer2(model, 64)   end
+   model = ResidualLayer(32, 64, 2)(model)
+   for i=1,N-1 do   model = ResidualLayer(64)(model)   end
    ------> 10, 8,8     Pooling, Linear, Softmax
    model = nn.SpatialAveragePooling(8,8)(model)
    model = nn.Reshape(64)(model)
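Because ResidualLayer now returns a plain nn container instead of applying itself to an nngraph node, it can be exercised on its own. Below is a hypothetical smoke test, not part of this commit; it assumes a CUDA-capable machine and that residual-layers.lua (which itself requires cudnn, cunn, and nninit) sits in the working directory.

-- Hypothetical smoke test for the refactored residual block (not part of this diff).
dofile('residual-layers.lua')   -- defines the global ResidualLayer and pulls in cudnn/cunn/nninit

-- Downsampling block: 16 -> 32 channels, stride 2, as used at the start of the Second Group.
local block  = ResidualLayer(16, 32, 2):cuda()
local input  = torch.randn(8, 16, 32, 32):cuda()   -- batch of 8 CIFAR-sized feature maps
local output = block:forward(input)
print(output:size())   -- expected: 8 x 32 x 16 x 16

The same block still splices into the nngraph model in train-cifar.lua by calling the returned module on a node, e.g. ResidualLayer(16)(model), which is exactly how the updated training script uses it.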