-- train.lua

--
--  Copyright (c) 2016, Facebook, Inc.
--  All rights reserved.
--
--  This source code is licensed under the BSD-style license found in the
--  LICENSE file in the root directory of this source tree. An additional grant
--  of patent rights can be found in the PATENTS file in the same directory.
--
--  The training loop and learning rate schedule
--


local optim = require 'optim'

-- M is the table handed to torch.class as the place to store the new class;
-- the class is registered under the type name 'resnet.Trainer' and the file
-- returns M.Trainer at the end.
local M = {}
local Trainer = torch.class('resnet.Trainer', M)


function Trainer:__init(model, criterion, opt, optimState)
   -- Builds a trainer around a model/criterion pair.
   --   model      network to optimise; its flattened parameters are cached
   --   criterion  loss module
   --   opt        options table (LR, momentum, weightDecay, save are read here)
   --   optimState optional optimiser state to reuse; when it is nil/false a
   --              fresh state is created from opt
   self.opt = opt
   self.model = model
   self.criterion = criterion

   if optimState then
      self.optimState = optimState
   else
      -- Default state: Nesterov momentum with no dampening and no built-in
      -- learning-rate decay (the schedule is applied per epoch instead).
      self.optimState = {
         learningRate = opt.LR,
         learningRateDecay = 0.0,
         momentum = opt.momentum,
         nesterov = true,
         dampening = 0.0,
         weightDecay = opt.weightDecay,
      }
   end

   -- Flattened views over all parameters and their gradients.
   self.params, self.gradParams = self.model:getParameters()

   -- Per-epoch metric logs written into the save directory.
   local saveDir = self.opt.save
   self.trainLogger = optim.Logger(paths.concat(saveDir, 'train.log'))
   self.testLogger = optim.Logger(paths.concat(saveDir, 'test.log'))
end

function Trainer:train(epoch, dataloader)
   -- Trains the model for a single epoch.
   --   epoch       epoch number, used to pick the learning rate
   --   dataloader  object providing :size() and :run(); :run() yields
   --               (batch index, sample) pairs with sample.input/sample.target
   -- Returns the top-1 error (%), top-5 error (%) and loss, each averaged over
   -- the samples processed. If a NaN loss is hit the epoch stops early and the
   -- averages cover only the batches seen so far (NaN when there were none).
   -- Raises an error when opt.optimType is not one of the supported optimisers.

   -- Resolve the optimiser up front so that an unknown name fails loudly
   -- instead of silently running an epoch without any parameter update.
   -- NOTE: adamW is not part of the stock optim package; adamW.lua has to be
   -- added to the optim install directory for it to be available.
   local supported = { sgd = true, adam = true, adamW = true, adamax = true }
   local optimType = self.opt.optimType
   local optimize = supported[optimType] and optim[optimType]
   if not optimize then
      error(("unsupported or unavailable optimType '%s' " ..
         "(expected sgd, adam, adamW or adamax)"):format(tostring(optimType)))
   end

   -- Apply this epoch's learning rate and scale the weight decay by the same
   -- factor relative to the base LR. The scaled value is stored separately in
   -- weightDecaycurrent (presumably read by the custom adamW -- confirm).
   self.optimState.learningRate = self:learningRate(epoch)
   local learningRateMultiplier = self.optimState.learningRate / self.opt.LR
   self.optimState.weightDecaycurrent =
      learningRateMultiplier * self.optimState.weightDecay

   local timer = torch.Timer()
   local dataTimer = torch.Timer()

   -- Forward/backward are run explicitly in the loop below, so the closure
   -- given to the optimiser only hands back the cached loss and gradients.
   local function feval()
      return self.criterion.output, self.gradParams
   end

   local trainSize = dataloader:size()
   local top1Sum, top5Sum, lossSum = 0.0, 0.0, 0.0
   local N = 0

   print('=> Training epoch # ' .. epoch)
   -- set the batch norm to training mode
   self.model:training()
   for n, sample in dataloader:run() do
      collectgarbage()
      local dataTime = dataTimer:time().real

      -- Copy input and target to the GPU
      self:copyInputs(sample)

      local output = self.model:forward(self.input)
      local batchSize = output:size(1)
      local loss = self.criterion:forward(self.model.output, self.target)

      -- NaN is the only value that differs from itself; abort the epoch.
      if loss ~= loss then
         print('Err NaN!!')
         break
      end

      self.model:zeroGradParameters()
      self.criterion:backward(self.model.output, self.target)
      self.model:backward(self.input, self.criterion.gradInput)

      optimize(feval, self.params, self.optimState)

      local top1, top5 = self:computeScore(output, sample.target, 1)
      top1Sum = top1Sum + top1*batchSize
      top5Sum = top5Sum + top5*batchSize
      lossSum = lossSum + loss*batchSize

      N = N + batchSize

      print((' | Epoch: [%d][%d/%d]    Time %.3f  Data %.3f  Err %1.4f  top1 %7.3f  top5 %7.3f  LR %.0e'):format(
         epoch, n, trainSize, timer:time().real, dataTime, loss, 100-top1, 100-top5, self.optimState.learningRate))

      -- check that the storage didn't get changed due to an unfortunate getParameters call
      assert(self.params:storage() == self.model:parameters()[1]:storage())

      timer:reset()
      dataTimer:reset()
   end

   -- computeScore reports error rates, so 100 - error is the accuracy.
   self.trainLogger:add{
         ['% top1 accuracy (train set)'] = 100-(top1Sum/N),
         ['% top5 accuracy (train set)'] = 100-(top5Sum/N),
         ['loss (train set)'] = lossSum/N
      }
   self.model:clearState()
   collectgarbage()
   return top1Sum / N, top5Sum / N, lossSum / N

end

function Trainer:test(epoch, dataloader)
   -- Computes the top-1 and top-5 err on the validation set.
   --   epoch       epoch number, used only in the progress/summary output
   --   dataloader  object providing :size() and :run(); :run() yields
   --               (batch index, sample) pairs with sample.input/sample.target
   -- Returns the top-1 and top-5 error (%), averaged over all samples.
   -- The model is switched to evaluate mode for the pass and put back into
   -- training mode afterwards.
   local timer = torch.Timer()
   local dataTimer = torch.Timer()
   local size = dataloader:size()

   -- With ten-crop testing every sample contributes 10 rows to the output.
   local nCrops = self.opt.tenCrop and 10 or 1
   local top1Sum, top5Sum = 0.0, 0.0
   local N = 0

   self.model:evaluate()
   for n, sample in dataloader:run() do
      collectgarbage()
      local dataTime = dataTimer:time().real

      -- Copy input and target to the GPU
      self:copyInputs(sample)

      -- No explicit :float() here: computeScore converts the scores itself.
      local output = self.model:forward(self.input)
      local batchSize = output:size(1) / nCrops

      -- The loss value is not reported during testing; the forward call is
      -- kept so the criterion is still evaluated on every batch.
      self.criterion:forward(self.model.output, self.target)

      local top1, top5 = self:computeScore(output, sample.target, nCrops)
      top1Sum = top1Sum + top1*batchSize
      top5Sum = top5Sum + top5*batchSize

      N = N + batchSize

      -- Per-batch accuracy followed by the running average in parentheses.
      print((' | Test: [%d][%d/%d]    Time %.3f  Data %.3f  top1 %7.3f (%7.3f)  top5 %7.3f (%7.3f)'):format(
         epoch, n, size, timer:time().real, dataTime, 100-top1, 100-(top1Sum / N), 100- top5, 100-(top5Sum / N)))

      timer:reset()
      dataTimer:reset()
   end
   self.model:training()

   -- computeScore reports error rates, so 100 - error is the accuracy.
   self.testLogger:add{
         ['% top1 accuracy (test set) '] = 100-(top1Sum/N),
         ['% top5 accuracy (test set) '] = 100-(top5Sum/N)
      }
   print((' * Finished epoch # %d     top1: %7.3f  top5: %7.3f\n'):format(
      epoch, 100-(top1Sum / N), 100-(top5Sum / N)))

   return top1Sum / N, top5Sum / N

end

function Trainer:computeScore(output, target, nCrops)
   -- Computes the top-1 and top-5 error rates of a batch.
   --   output  2D score tensor with one row per (sample, crop) and one column
   --           per class
   --   target  tensor holding one class index per sample
   --   nCrops  number of crops per sample; when > 1 the scores of the crops
   --           belonging to a sample are summed before ranking
   -- Returns the top-1 and top-5 error as percentages (0..100). When there are
   -- fewer than 5 classes the "top-5" error is taken over all classes.
   if nCrops > 1 then
      -- Sum over crops
      output = output:view(output:size(1) / nCrops, nCrops, output:size(2))
         --:exp()
         :sum(2):squeeze(2)
   end

   local batchSize = output:size(1)

   -- topk raises an error when asked for more entries than there are classes,
   -- so clamp k to the number of classes.
   local k = math.min(5, output:size(2))
   local _ , predictions = output:float():topk(k, 2, true, true) -- descending

   -- Find which predictions match the target
   local correct = predictions:eq(
      target:long():view(batchSize, 1):expandAs(predictions))

   -- Top-1 score: only the best-ranked prediction counts
   local top1 = 1.0 - (correct:narrow(2, 1, 1):sum() / batchSize)

   -- Top-k score over all k retained predictions (k = 5 unless there are
   -- fewer classes)
   local top5 = 1.0 - (correct:sum() / batchSize)

   return top1 * 100, top5 * 100

end

-- Creates an empty host-side tensor through the cutorch createCudaHost*
-- factory that corresponds to the given CUDA tensor type name. Any name other
-- than the half/double variants falls back to the default factory.
local function getCudaTensorType(tensorType)
   local factoryName = 'createCudaHostTensor'
   if tensorType == 'torch.CudaDoubleTensor' then
      factoryName = 'createCudaHostDoubleTensor'
   elseif tensorType == 'torch.CudaHalfTensor' then
      factoryName = 'createCudaHostHalfTensor'
   end
   return cutorch[factoryName]()
end

function Trainer:copyInputs(sample)
   -- Stages a batch in the trainer's reusable buffers (self.input and
   -- self.target), allocating them on first use. With a single GPU the input
   -- buffer is a tensor of the configured type; otherwise it is the host
   -- tensor used for DataParallelTable. The target buffer is a CUDA long
   -- tensor in both cases.
   if not self.input then
      if self.opt.nGPU == 1 then
         -- 'torch.CudaTensor' -> torch.CudaTensor()
         local className = self.opt.tensorType:match('torch.(%a+)')
         self.input = torch[className]()
      else
         self.input = getCudaTensorType(self.opt.tensorType)
      end
   end

   if not self.target then
      self.target = torch.CudaLongTensor and torch.CudaLongTensor()
   end

   -- Resize the buffers to the incoming batch and copy the data over.
   local batchInput, batchTarget = sample.input, sample.target
   self.input:resize(batchInput:size()):copy(batchInput)
   self.target:resize(batchTarget:size()):copy(batchTarget)
end

function Trainer:learningRate(epoch)
   -- Training schedule: returns the learning rate to use for `epoch`.
   -- The base rate opt.LR is divided by 10 once for every milestone epoch
   -- that has been reached. The milestones depend on opt.dataset, on whether
   -- opt.depth is 0 and on whether opt.deploy differs from 'none'. Datasets
   -- without a schedule keep the base rate.
   local cfg = self.opt
   local deploying = cfg.deploy ~= 'none'

   -- Epochs (ascending) at which another x0.1 step is applied.
   local milestones
   if cfg.dataset == 'imagenet' then
      if cfg.depth == 0 then
         milestones = deploying and {16, 26, 36} or {41, 71, 91}
      else
         milestones = deploying and {21, 31, 36} or {71, 91, 111}
      end
   elseif cfg.dataset == 'cifar10' then
      if cfg.depth == 0 then
         milestones = {121, 161}
      elseif deploying then
         milestones = {31, 61}
      else
         milestones = {61, 121}
      end
   elseif cfg.dataset == 'cifar100' then
      milestones = {81, 122}
   else
      milestones = {}
   end

   -- Count how many milestones the current epoch has passed.
   local decay = 0
   for _, firstEpoch in ipairs(milestones) do
      if epoch >= firstEpoch then
         decay = decay + 1
      end
   end

   return self.opt.LR * math.pow(0.1, decay)
end


-- Export the Trainer class itself (stored in M by torch.class above).
return M.Trainer