This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Gluon image-classification example improvement #9633

Merged: 9 commits, Mar 3, 2018
82 changes: 38 additions & 44 deletions example/gluon/data.py
@@ -19,8 +19,14 @@
""" data iterator for mnist """
import os
import random
import logging
logging.basicConfig(level=logging.INFO)

import mxnet as mx
from mxnet.test_utils import get_cifar10
from mxnet.gluon.data.vision import ImageFolderDataset
from mxnet.gluon.data import DataLoader
from mxnet.contrib.io import DataLoaderIter

def get_cifar10_iterator(batch_size, data_shape, resize=-1, num_parts=1, part_index=0):
get_cifar10()
@@ -49,50 +55,38 @@ def get_cifar10_iterator(batch_size, data_shape, resize=-1, num_parts=1, part_index=0):

return train, val
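The CIFAR-10 path is unchanged in substance and still returns a pair of mx.io.DataIter objects. A minimal call might look like this (hypothetical batch size; the shape matches what get_data_iters in image_classification.py passes):

from data import get_cifar10_iterator

# Hypothetical batch size; (3, 32, 32) is the CIFAR-10 shape used by the script.
train, val = get_cifar10_iterator(batch_size=128, data_shape=(3, 32, 32))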


def get_imagenet_iterator(train_data, val_data, batch_size, data_shape, resize=-1, num_parts=1, part_index=0):
train = mx.io.ImageRecordIter(
path_imgrec = train_data,
data_shape = data_shape,
mean_r = 123.68,
mean_g = 116.779,
mean_b = 103.939,
std_r = 58.395,
std_g = 57.12,
std_b = 57.375,
preprocess_threads = 32,
shuffle = True,
batch_size = batch_size,
rand_crop = True,
resize = resize,
random_mirror = True,
max_random_h = 36,
max_random_s = 50,
max_random_l = 50,
max_random_rotate_angle = 10,
max_random_shear_ratio = 0.1,
max_random_aspect_ratio = 0.25,
fill_value = 127,
min_random_scale = 0.533,
num_parts = num_parts,
part_index = part_index)

val = mx.io.ImageRecordIter(
path_imgrec = val_data,
data_shape = data_shape,
mean_r = 123.68,
mean_g = 116.779,
mean_b = 103.939,
std_r = 58.395,
std_g = 57.12,
std_b = 57.375,
preprocess_threads = 32,
batch_size = batch_size,
resize = resize,
num_parts = num_parts,
part_index = part_index)

return train, val
def get_imagenet_transforms(data_shape=224, dtype='float32'):
def train_transform(image, label):
image, _ = mx.image.random_size_crop(image, (data_shape, data_shape), 0.08, (3/4., 4/3.))
image = mx.nd.image.random_flip_left_right(image)
image = mx.nd.image.to_tensor(image)
image = mx.nd.image.normalize(image, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
return mx.nd.cast(image, dtype), label

def val_transform(image, label):
image = mx.image.resize_short(image, data_shape + 32)
image, _ = mx.image.center_crop(image, (data_shape, data_shape))
image = mx.nd.image.to_tensor(image)
image = mx.nd.image.normalize(image, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
return mx.nd.cast(image, dtype), label
return train_transform, val_transform
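Both transforms take a decoded HWC uint8 image and its label, and return a normalized CHW tensor cast to dtype. A minimal sanity check, with a random array standing in for a decoded JPEG (hypothetical shape; assumes an MXNet build that ships the mx.nd.image operators used above):

import mxnet as mx
from data import get_imagenet_transforms

# Stand-in for a decoded image as ImageFolderDataset would yield it: HWC, uint8.
image = mx.nd.random.uniform(0, 255, shape=(256, 256, 3)).astype('uint8')
train_transform, _ = get_imagenet_transforms(data_shape=224, dtype='float32')
out, label = train_transform(image, 0)
print(out.shape, out.dtype)  # expected (3, 224, 224) float32; to_tensor flips HWC to CHW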

def get_imagenet_iterator(root, batch_size, num_workers, data_shape=224, dtype='float32'):
"""Dataset loader with preprocessing."""
train_dir = os.path.join(root, 'train')
train_transform, val_transform = get_imagenet_transforms(data_shape, dtype)
logging.info("Loading image folder %s, this may take a bit long...", train_dir)
train_dataset = ImageFolderDataset(train_dir, transform=train_transform)
train_data = DataLoader(train_dataset, batch_size, shuffle=True,
last_batch='discard', num_workers=num_workers)
val_dir = os.path.join(root, 'val')
if not os.path.isdir(os.path.join(os.path.expanduser(root), 'val', 'n01440764')):
user_warning = 'Make sure validation images are stored in one subdirectory per category; a helper script is available at https://git.io/vNQv1'
raise ValueError(user_warning)
logging.info("Loading image folder %s, this may take a bit long...", val_dir)
val_dataset = ImageFolderDataset(val_dir, transform=val_transform)
val_data = DataLoader(val_dataset, batch_size, last_batch='keep', num_workers=num_workers)
return DataLoaderIter(train_data, dtype), DataLoaderIter(val_data, dtype)
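End to end, the loader assumes the usual ImageNet folder layout (root/train/<synset>/..., root/val/<synset>/...) and hands back classic mx.io.DataIter objects via DataLoaderIter, so the existing training loop with .reset() and batch.data/batch.label keeps working. A hypothetical invocation (paths and sizes illustrative only):

from data import get_imagenet_iterator

# Hypothetical root; ImageFolderDataset expands '~' itself.
train_iter, val_iter = get_imagenet_iterator(root='~/data/imagenet', batch_size=128,
                                             num_workers=16, data_shape=224, dtype='float32')
for batch in train_iter:
    data, label = batch.data[0], batch.label[0]  # (128, 3, 224, 224) and (128,)
    break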


class DummyIter(mx.io.DataIter):
162 changes: 108 additions & 54 deletions example/gluon/image_classification.py
@@ -17,36 +17,49 @@

from __future__ import division

import argparse, time
import argparse, time, os
import logging
logging.basicConfig(level=logging.INFO)

import mxnet as mx
from mxnet import gluon
from mxnet.gluon import nn
from mxnet.gluon.model_zoo import vision as models
from mxnet import autograd as ag
from mxnet.test_utils import get_mnist_iterator
from mxnet.metric import Accuracy, TopKAccuracy, CompositeEvalMetric
import numpy as np

from data import *

# logging
logging.basicConfig(level=logging.INFO)
fh = logging.FileHandler('image-classification.log')
logger = logging.getLogger()
logger.addHandler(fh)
formatter = logging.Formatter('%(message)s')
fh.setFormatter(formatter)
fh.setLevel(logging.DEBUG)
logging.debug('\n%s', '-' * 100)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
fh.setFormatter(formatter)

# CLI
parser = argparse.ArgumentParser(description='Train a model for image classification.')
parser.add_argument('--dataset', type=str, default='cifar10',
help='dataset to use. options are mnist, cifar10, and dummy.')
parser.add_argument('--train-data', type=str, default='',
help='training record file to use, required for imagenet.')
parser.add_argument('--val-data', type=str, default='',
help='validation record file to use, required for imagenet.')
help='dataset to use. options are mnist, cifar10, imagenet and dummy.')
parser.add_argument('--data-dir', type=str, default='',
help='training directory of imagenet images, contains train/val subdirs.')
parser.add_argument('--batch-size', type=int, default=32,
help='training batch size per device (CPU/GPU).')
parser.add_argument('--num-gpus', type=int, default=0,
help='number of gpus to use.')
parser.add_argument('--epochs', type=int, default=3,
parser.add_argument('--num-worker', '-j', dest='num_workers', default=4, type=int,
help='number of DataLoader workers.')
parser.add_argument('--gpus', type=str, default='',
help='ordinates of gpus to use, can be "0,1,2" or empty for cpu only.')
parser.add_argument('--epochs', type=int, default=120,
help='number of training epochs.')
parser.add_argument('--lr', type=float, default=0.01,
help='learning rate. default is 0.01.')
parser.add_argument('-momentum', type=float, default=0.9,
parser.add_argument('--lr', type=float, default=0.1,
help='learning rate. default is 0.1.')
parser.add_argument('--momentum', type=float, default=0.9,
help='momentum value for optimizer, default is 0.9.')
parser.add_argument('--wd', type=float, default=0.0001,
help='weight decay rate. default is 0.0001.')
@@ -62,54 +75,77 @@
help='enable batch normalization or not in vgg. default is false.')
parser.add_argument('--use-pretrained', action='store_true',
help='enable using pretrained model from gluon.')
parser.add_argument('--prefix', default='', type=str,
help='path to checkpoint prefix, default is current working dir')
parser.add_argument('--start-epoch', default=0, type=int,
help='starting epoch, 0 for fresh training, > 0 to resume')
parser.add_argument('--resume', type=str, default='',
help='path to saved weights from which to resume.')
parser.add_argument('--lr-factor', default=0.1, type=float,
help='learning rate decay ratio')
parser.add_argument('--lr-steps', default='30,60,90', type=str,
help='comma-separated list of learning rate decay epochs.')
parser.add_argument('--dtype', default='float32', type=str,
help='data type, float32 or float16 if applicable')
parser.add_argument('--save-frequency', default=10, type=int,
help='how often (in epochs) to save the model; the best model is always saved.')
parser.add_argument('--kvstore', type=str, default='device',
help='kvstore to use for trainer/module.')
parser.add_argument('--log-interval', type=int, default=50, help='Number of batches to wait before logging.')
parser.add_argument('--log-interval', type=int, default=50,
help='Number of batches to wait before logging.')
parser.add_argument('--profile', action='store_true',
help='Option to turn on memory profiling for the front-end, '\
'and print out memory usage by Python function at the end.')
opt = parser.parse_args()

logging.info(opt)

# global variables
logger.info('Starting new image-classification task: %s', opt)
mx.random.seed(opt.seed)
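With the record-file flags gone, a typical run under the new interface might be invoked as below (a sketch only; paths and GPU ordinals are hypothetical):

# python image_classification.py --dataset imagenet --data-dir ~/data/imagenet \
#     --gpus 0,1 --batch-size 64 --dtype float16 --mode hybrid -j 16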

model_name = opt.model
dataset_classes = {'mnist': 10, 'cifar10': 10, 'imagenet': 1000, 'dummy': 1000}

batch_size, dataset, classes = opt.batch_size, opt.dataset, dataset_classes[opt.dataset]

num_gpus = opt.num_gpus

context = [mx.gpu(int(i)) for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()]
num_gpus = len(context)
batch_size *= max(1, num_gpus)
context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
lr_steps = [int(x) for x in opt.lr_steps.split(',') if x.strip()]
metric = CompositeEvalMetric([Accuracy(), TopKAccuracy(5)])

model_name = opt.model
def get_model(model, ctx, opt):
"""Model initialization."""
kwargs = {'ctx': ctx, 'pretrained': opt.use_pretrained, 'classes': classes}
if model.startswith('resnet'):
kwargs['thumbnail'] = opt.use_thumbnail
elif model.startswith('vgg'):
kwargs['batch_norm'] = opt.batch_norm

kwargs = {'ctx': context, 'pretrained': opt.use_pretrained, 'classes': classes}
if model_name.startswith('resnet'):
kwargs['thumbnail'] = opt.use_thumbnail
elif model_name.startswith('vgg'):
kwargs['batch_norm'] = opt.batch_norm
net = models.get_model(model, **kwargs)
if opt.resume:
net.load_params(opt.resume)
elif not opt.use_pretrained:
if model in ['alexnet']:
net.initialize(mx.init.Normal())
else:
net.initialize(mx.init.Xavier(magnitude=2))
net.cast(opt.dtype)
return net

net = models.get_model(opt.model, **kwargs)
net = get_model(opt.model, context, opt)

def get_data_iters(dataset, batch_size, num_workers=1, rank=0):
# get dataset iterators
"""get dataset iterators"""
if dataset == 'mnist':
train_data, val_data = get_mnist_iterator(batch_size, (1, 28, 28),
num_parts=num_workers, part_index=rank)
elif dataset == 'cifar10':
train_data, val_data = get_cifar10_iterator(batch_size, (3, 32, 32),
num_parts=num_workers, part_index=rank)
elif dataset == 'imagenet':
if not opt.data_dir:
raise ValueError('Directory with raw images in train/val subdirs is required for imagenet; please specify "--data-dir"')
if model_name == 'inceptionv3':
train_data, val_data = get_imagenet_iterator(opt.train_data, opt.val_data,
batch_size, (3, 299, 299),
num_parts=num_workers, part_index=rank)
train_data, val_data = get_imagenet_iterator(opt.data_dir, batch_size, opt.num_workers, 299, opt.dtype)
else:
train_data, val_data = get_imagenet_iterator(opt.train_data, opt.val_data,
batch_size, (3, 224, 224),
num_parts=num_workers, part_index=rank)
train_data, val_data = get_imagenet_iterator(opt.data_dir, batch_size, opt.num_workers, 224, opt.dtype)
elif dataset == 'dummy':
if model_name == 'inceptionv3':
train_data, val_data = dummy_iterator(batch_size, (3, 299, 299))
@@ -118,7 +154,7 @@ def get_data_iters(dataset, batch_size, num_workers=1, rank=0):
return train_data, val_data

def test(ctx, val_data):
metric = mx.metric.Accuracy()
metric.reset()
val_data.reset()
for batch in val_data:
data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
@@ -129,27 +165,45 @@ def test(ctx, val_data):
metric.update(label, outputs)
return metric.get()
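Because metric is now the module-level CompositeEvalMetric created above, metric.get() returns parallel lists of names and values rather than scalars, which is why the logging in train indexes name[0]/acc[0] (top-1) and name[1]/acc[1] (top-5). A minimal illustration:

from mxnet.metric import Accuracy, TopKAccuracy, CompositeEvalMetric

m = CompositeEvalMetric([Accuracy(), TopKAccuracy(5)])
names, values = m.get()
print(names)  # ['accuracy', 'top_k_accuracy_5']; values are nan until the first update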

def update_learning_rate(lr, trainer, epoch, ratio, steps):
"""Set the learning rate to the initial value decayed by ratio every N epochs."""
new_lr = lr * (ratio ** int(np.sum(np.array(steps) < epoch)))
trainer.set_learning_rate(new_lr)
return trainer
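The new rate is the initial lr multiplied by ratio once for every entry of steps strictly below the current epoch, i.e. new_lr = lr * ratio**k. A standalone sketch of the arithmetic:

import numpy as np

def decayed(lr, epoch, ratio=0.1, steps=(30, 60, 90)):
    # one factor of `ratio` per boundary strictly below the current epoch
    return lr * ratio ** int(np.sum(np.array(steps) < epoch))

assert abs(decayed(0.1, epoch=45) - 1e-2) < 1e-9  # passed 30 only
assert abs(decayed(0.1, epoch=95) - 1e-4) < 1e-9  # passed 30, 60 and 90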

def save_checkpoint(epoch, top1, best_acc):
if opt.save_frequency and (epoch + 1) % opt.save_frequency == 0:
fname = os.path.join(opt.prefix, '%s_%d_acc_%.4f.params' % (opt.model, epoch, top1))
net.save_params(fname)
logger.info('[Epoch %d] Saving checkpoint to %s with Accuracy: %.4f', epoch, fname, top1)
if top1 > best_acc[0]:
best_acc[0] = top1
fname = os.path.join(opt.prefix, '%s_best.params' % (opt.model))
net.save_params(fname)
logger.info('[Epoch %d] Saving checkpoint to %s with Accuracy: %.4f', epoch, fname, top1)
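Under the defaults (--save-frequency 10, empty --prefix), a hypothetical --model resnet50_v1 run reaching top-1 accuracy 0.7012 at epoch 9 would write:

# resnet50_v1_9_acc_0.7012.params  -- periodic checkpoint, every save-frequency epochs
# resnet50_v1_best.params          -- rewritten whenever validation top-1 improves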

def train(epochs, ctx):
def train(opt, ctx):
if isinstance(ctx, mx.Context):
ctx = [ctx]
net.initialize(mx.init.Xavier(magnitude=2), ctx=ctx)
kv = mx.kv.create(opt.kvstore)
train_data, val_data = get_data_iters(dataset, batch_size, kv.num_workers, kv.rank)
net.collect_params().reset_ctx(ctx)
trainer = gluon.Trainer(net.collect_params(), 'sgd',
{'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum},
{'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum,
'multi_precision': True},
kvstore = kv)
metric = mx.metric.Accuracy()
loss = gluon.loss.SoftmaxCrossEntropyLoss()

for epoch in range(epochs):
best_acc = [0]
for epoch in range(opt.start_epoch, opt.epochs):
trainer = update_learning_rate(opt.lr, trainer, epoch, opt.lr_factor, lr_steps)
tic = time.time()
train_data.reset()
metric.reset()
btic = time.time()
for i, batch in enumerate(train_data):
data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
data = gluon.utils.split_and_load(batch.data[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0)
label = gluon.utils.split_and_load(batch.label[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0)
outputs = []
Ls = []
with ag.record():
@@ -160,23 +214,23 @@ def train(epochs, ctx):
# on all GPUs for better speed on multiple GPUs.
Ls.append(L)
outputs.append(z)
for L in Ls:
L.backward()
ag.backward(Ls)
trainer.step(batch.data[0].shape[0])
metric.update(label, outputs)
if opt.log_interval and not (i+1)%opt.log_interval:
name, acc = metric.get()
logging.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f'%(
epoch, i, batch_size/(time.time()-btic), name, acc))
logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f, %s=%f'%(
epoch, i, batch_size/(time.time()-btic), name[0], acc[0], name[1], acc[1]))
btic = time.time()

name, acc = metric.get()
logging.info('[Epoch %d] training: %s=%f'%(epoch, name, acc))
logging.info('[Epoch %d] time cost: %f'%(epoch, time.time()-tic))
logger.info('[Epoch %d] training: %s=%f, %s=%f'%(epoch, name[0], acc[0], name[1], acc[1]))
logger.info('[Epoch %d] time cost: %f'%(epoch, time.time()-tic))
name, val_acc = test(ctx, val_data)
logging.info('[Epoch %d] validation: %s=%f'%(epoch, name, val_acc))
logger.info('[Epoch %d] validation: %s=%f, %s=%f'%(epoch, name[0], val_acc[0], name[1], val_acc[1]))

net.save_params('image-classifier-%s-%d.params'%(opt.model, epochs))
# save checkpoint if it meets the save-frequency / best-accuracy criteria
save_checkpoint(epoch, val_acc[0], best_acc)

def main():
if opt.mode == 'symbolic':
@@ -193,13 +247,13 @@ def main():
batch_end_callback = mx.callback.Speedometer(batch_size, max(1, opt.log_interval)),
epoch_end_callback = mx.callback.do_checkpoint('image-classifier-%s'% opt.model),
optimizer = 'sgd',
optimizer_params = {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum},
optimizer_params = {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum, 'multi_precision': True},
initializer = mx.init.Xavier(magnitude=2))
mod.save_params('image-classifier-%s-%d-final.params'%(opt.model, opt.epochs))
else:
if opt.mode == 'hybrid':
net.hybridize()
train(opt.epochs, context)
train(opt, context)

if __name__ == '__main__':
if opt.profile:
2 changes: 2 additions & 0 deletions python/mxnet/contrib/__init__.py
@@ -28,3 +28,5 @@
from . import tensorboard

from . import text

from . import io