Support NHWC format to improve speed (PaddlePaddle#99)
GuoxiaWang authored Jan 11, 2022
1 parent 3a09f50 commit b10401a
Showing 28 changed files with 456 additions and 182 deletions.
198 changes: 121 additions & 77 deletions README.md

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion configs/argparser.py
@@ -71,6 +71,11 @@ def parse_args():
     parser.parse_known_args(namespace=user_namespace)
     cfg = get_config(user_namespace.config_file)
 
+    parser.add_argument(
+        '--seed',
+        type=int,
+        default=cfg.seed,
+        help='global seed, None means do not fix seed, int value means to run reproduction')
     # Model setting
     parser.add_argument(
         '--is_static',
@@ -81,7 +86,7 @@
         '--data_format',
         type=str,
         default=cfg.data_format,
-        help='model data layout, "NCHW" or "NHWC"')
+        help='model data layout, "NCHW" for FP32 or "NHWC" for FP16')
     parser.add_argument(
         '--backbone', type=str, default=cfg.backbone, help='backbone network')
     parser.add_argument(
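
The new `--seed` flag defaults to `cfg.seed` (now `None` in configs/config.py below), so existing launch commands keep today's non-deterministic behavior. A hedged sketch of how the three kinds of values behave once parsed; the launch command shown in the comment is illustrative:

from configs.argparser import parse_args

# e.g. python -m dynamic.train --config_file configs/ms1mv3_mobileface.py --seed 42
args = parse_args()
if args.seed is None:
    print("seed not fixed: fast, non-reproducible run")
elif args.seed == 0:
    print("seed 0: fully deterministic run (see dynamic/train.py below)")
else:
    print("seed %d: reproducible run, offset by rank on each GPU" % args.seed)
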
4 changes: 3 additions & 1 deletion configs/config.py
@@ -15,8 +15,10 @@
 from easydict import EasyDict as edict
 
 config = edict()
+config.seed = None # global seed, None means do not fix seed, int value means to run reproduction
+
 config.is_static = True
-config.data_format = 'NCHW' # 'NCHW' or 'NHWC'
+config.data_format = 'NHWC' # 'NCHW' for FP32 or 'NHWC' for FP16
 config.backbone = 'FresResNet100'
 config.classifier = 'LargeScaleClassifier'
 config.embedding_size = 512
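
Note the default layout flips to NHWC here, which pays off only for FP16 kernels on tensor cores; FP32 jobs should set it back to NCHW, as the help string above says. A minimal guard for that pairing, an assumption rather than code from this commit:

from easydict import EasyDict as edict

config = edict()
config.fp16 = True
# NHWC targets cuDNN's channels-last FP16 kernels; FP32 math stays fastest in NCHW
config.data_format = 'NHWC' if config.fp16 else 'NCHW'
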
15 changes: 9 additions & 6 deletions configs/ms1mv2_mobileface.py → configs/ms1mv3_mobileface.py
@@ -16,14 +16,17 @@
 
 config = edict()
 config.is_static = False
+config.data_format = 'NCHW'
 config.backbone = 'MobileFaceNet_128'
 config.classifier = 'LargeScaleClassifier'
 config.embedding_size = 128
 config.model_parallel = True
-config.sample_ratio = 1.0
+config.sample_ratio = 0.1
 config.loss = 'ArcFace'
 config.dropout = 0.0
 
+config.fp16 = False
+
 config.lr = 0.1 # for global batch size = 512
 config.lr_decay = 0.1
 config.weight_decay = 5e-4
@@ -34,11 +37,11 @@
 config.decay_boundaries = [10, 16, 22]
 
 config.use_synthetic_dataset = False
-config.dataset = "MS1M_v2"
-config.data_dir = "./MS1M_v2"
-config.label_file = "./MS1M_v2/label.txt"
+config.dataset = "MS1M_v3"
+config.data_dir = "./MS1M_v3"
+config.label_file = "./MS1M_v3/label.txt"
 config.is_bin = False
-config.num_classes = 85742 # 85742 for MS1M_v2, 93431 for MS1M_v3
+config.num_classes = 93431 # 85742 for MS1M_v2, 93431 for MS1M_v3
 config.batch_size = 128 # global batch size 1024 of 8 GPU
 config.num_workers = 8
 
@@ -48,7 +51,7 @@
 
 config.logdir = './log'
 config.log_interval_step = 100
-config.output = './MS1M_v2_arcface_MobileFaceNet_128_0.1'
+config.output = './MS1M_v3_arcface_MobileFaceNet_128_0.1'
 config.resume = False
 config.checkpoint_dir = None
 config.max_num_last_checkpoint = 1
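
The drop from `sample_ratio = 1.0` to `0.1` means the classifier now keeps only about a tenth of the class centers per step (PartialFC-style sampling). A rough sketch of the arithmetic under that assumption; the repo's actual sampling code may differ:

import numpy as np

num_classes, sample_ratio, batch_size = 93431, 0.1, 128
num_sample = int(num_classes * sample_ratio)     # class centers kept per step
labels = np.unique(np.random.randint(0, num_classes, batch_size))
num_negative = max(num_sample - len(labels), 0)  # random negative centers fill the rest
print(num_sample, len(labels), num_negative)
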
12 changes: 9 additions & 3 deletions datasets/common_dataset.py
@@ -26,7 +26,6 @@
 
 from datasets.kv_helper import read_img_from_bin
 
-
 def transform(img):
     # random horizontal flip
     if random.randint(0, 1) == 0:
@@ -40,11 +39,14 @@ def transform(img):
 
 
 class CommonDataset(paddle.io.Dataset):
-    def __init__(self, root_dir, label_file, rank=0, world_size=1, fp16=False, is_bin=True):
+    def __init__(self, root_dir, label_file, rank=0, world_size=1, fp16=False, is_bin=True, seed=0):
         super(CommonDataset, self).__init__()
         self.root_dir = root_dir
         self.label_file = label_file
         self.fp16 = fp16
+        self.seed = seed
+        if self.seed != 0:
+            random.seed(self.seed)
         with open(label_file, "r") as fin:
             self.full_lines = fin.readlines()
 
@@ -78,13 +80,17 @@ def __len__(self):
         return self.num_samples
 
 class SplitDataset(paddle.io.Dataset):
-    def __init__(self, root_dir, label_file, rank=0, world_size=1, fp16=False, is_bin=True):
+    def __init__(self, root_dir, label_file, rank=0, world_size=1, fp16=False, is_bin=True, seed=0):
         super(SplitDataset, self).__init__()
         self.root_dir = root_dir
         self.label_file = label_file
         self.rank = rank
         self.world_size = world_size
         self.fp16 = fp16
+        self.seed = seed
+        if self.seed != 0:
+            random.seed(self.seed)
+
         with open(label_file, "r") as fin:
             self.full_lines = fin.readlines()
 
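
Both dataset classes now seed Python's `random` module, which drives the flip augmentation in `transform`, and treat `seed == 0` as a sentinel for "do not seed here". An illustrative standalone version of the pattern (`maybe_seed` is a hypothetical helper; in the repo, train.py adds the rank offset before constructing the dataset):

import random

def maybe_seed(seed, rank):
    # rank offset keeps augmentation streams distinct across GPUs
    if seed is not None and seed != 0:
        random.seed(seed + rank)

maybe_seed(42, rank=0)        # worker process on GPU 0
print(random.randint(0, 1))   # flip decision is now reproducible run-to-run
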
10 changes: 7 additions & 3 deletions dynamic/backbones/iresnet.py
@@ -101,7 +101,7 @@ def __init__(self,
             act=None,
             name=name + "_branch2a",
             data_format=data_format)
-        self.prelu = PReLU(num_parameters=num_filters, name=name + "_branch2a_prelu")
+        self.prelu = PReLU(num_parameters=num_filters, data_format=data_format, name=name + "_branch2a_prelu")
         self.conv1 = ConvBNLayer(
             num_channels=num_filters,
             num_filters=num_filters,
@@ -283,7 +283,7 @@ def __init__(self,
             act=None,
             name="conv1",
             data_format=self.data_format)
-        self.prelu = PReLU(num_parameters=64, name="prelu1")
+        self.prelu = PReLU(num_parameters=64, data_format=self.data_format, name="prelu1")
 
         self.block_list = paddle.nn.LayerList()
         for block in range(len(units)):
@@ -308,13 +308,15 @@ def __init__(self,
         feat_w = input_image_width // 16
         feat_h = input_image_height // 16
         self.fc_channels = num_filters[-1] * feat_w * feat_h
+        #NOTE(GuoxiaWang): don't use NHWC for last fc,
+        # thus we can train using NHWC and test using NCHW
         self.fc = FC(num_filters[-1],
                      self.fc_channels,
                      num_features,
                      fc_type,
                      dropout,
                      name='fc',
-                     data_format=self.data_format)
+                     data_format="NCHW")
 
     def forward(self, inputs):
         if self.data_format == "NHWC":
@@ -324,6 +326,8 @@
         y = self.prelu(y)
         for block in self.block_list:
             y = block(y)
+        if self.data_format == "NHWC":
+            y = paddle.tensor.transpose(y, [0, 3, 1, 2])
         y = self.fc(y)
         return y
 
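
The NOTE in the hunk above is the heart of the change: convolutions and PReLU run in NHWC to hit the fast FP16 path, then the feature map is transposed back to NCHW before the final FC, so the FC weights keep a single layout and an NHWC-trained model can still be validated or exported as NCHW. A standalone sketch of that step, with assumed shapes:

import paddle

y = paddle.randn([8, 7, 7, 512])        # conv-stack output in NHWC
y = paddle.transpose(y, [0, 3, 1, 2])   # back to NCHW: [8, 512, 7, 7]
feat = paddle.flatten(y, start_axis=1)  # [8, 512*7*7], layout-independent FC input
print(feat.shape)
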
8 changes: 5 additions & 3 deletions dynamic/backbones/mobilefacenet.py
@@ -42,7 +42,7 @@ def __init__(self, inp, oup, stride, expansion, data_format="NCHW"):
             nn.Conv2D(
                 inp, inp * expansion, 1, 1, 0, bias_attr=False, data_format=data_format),
             nn.BatchNorm2D(inp * expansion, data_format=data_format),
-            nn.PReLU(inp * expansion),
+            nn.PReLU(inp * expansion, data_format=data_format),
 
             # 3*3 depth wise conv
             nn.Conv2D(
@@ -56,7 +56,7 @@ def __init__(self, inp, oup, stride, expansion, data_format="NCHW"):
                 data_format=data_format
             ),
             nn.BatchNorm2D(inp * expansion, data_format=data_format),
-            nn.PReLU(inp * expansion),
+            nn.PReLU(inp * expansion, data_format=data_format),
 
             # 1*1 conv
             nn.Conv2D(
@@ -82,7 +82,7 @@ def __init__(self, inp, oup, k, s, p, dw=False, linear=False, data_format="NCHW"
 
         self.bn = nn.BatchNorm2D(oup, data_format=data_format)
         if not linear:
-            self.prelu = nn.PReLU(oup)
+            self.prelu = nn.PReLU(oup, data_format=data_format)
 
     def forward(self, x):
         x = self.conv(x)
@@ -155,6 +155,8 @@ def forward(self, x):
         x = self.conv2(x)
         x = self.linear7(x)
         x = self.linear1(x)
+        if self.data_format == "NHWC":
+            x = paddle.tensor.transpose(x, [0, 3, 1, 2])
         x = x.reshape([x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]])
         return x
 
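
PReLU learns one slope per channel, so it has to be told where the channel axis lives; without `data_format` it would try to broadcast its per-channel weights along the wrong axis of a channels-last tensor. A quick check with assumed shapes:

import paddle
import paddle.nn as nn

act = nn.PReLU(num_parameters=64, data_format='NHWC')
x = paddle.randn([4, 56, 56, 64])  # channels-last feature map
print(act(x).shape)                # [4, 56, 56, 64], slopes applied on the last axis
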
2 changes: 1 addition & 1 deletion dynamic/export.py
@@ -30,7 +30,7 @@ def export(args):
 
     backbone = eval("backbones.{}".format(args.backbone))(
         num_features=args.embedding_size)
-    checkpoint.load(backbone, for_train=False, dtype='float32')
+    checkpoint.load(backbone, for_train=False)
 
     print("Load checkpoint from '{}'.".format(args.checkpoint_dir))
     backbone.eval()
46 changes: 33 additions & 13 deletions dynamic/train.py
@@ -17,6 +17,7 @@
 import sys
 import numpy as np
 import logging
+import random
 
 import paddle
 from visualdl import LogWriter
@@ -33,24 +34,33 @@
 from . import classifiers
 from . import backbones
 
-RELATED_FLAGS_SETTING = {
-    'FLAGS_cudnn_exhaustive_search': 1,
-    'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
-    'FLAGS_max_inplace_grad_add': 8,
-    'FLAGS_fraction_of_gpu_memory_to_use': 0.9999,
-}
-paddle.fluid.set_flags(RELATED_FLAGS_SETTING)
-
 
 def train(args):
     writer = LogWriter(logdir=args.logdir)
 
     rank = int(os.getenv("PADDLE_TRAINER_ID", 0))
     world_size = int(os.getenv("PADDLE_TRAINERS_NUM", 1))
 
     gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
     place = paddle.CUDAPlace(gpu_id)
 
+    RELATED_FLAGS_SETTING = {}
+    if args.seed == 0:
+        RELATED_FLAGS_SETTING['FLAGS_cudnn_deterministic'] = 1
+        RELATED_FLAGS_SETTING['FLAGS_benchmark'] = 1
+        args.num_workers = 0
+    else:
+        # args.seed == None or args.seed != 0
+        RELATED_FLAGS_SETTING['FLAGS_cudnn_exhaustive_search'] = 1
+        RELATED_FLAGS_SETTING['FLAGS_cudnn_batchnorm_spatial_persistent'] = 1
+        RELATED_FLAGS_SETTING['FLAGS_max_inplace_grad_add'] = 8
+    paddle.fluid.set_flags(RELATED_FLAGS_SETTING)
+
+    if args.seed is not None:
+        args.seed = args.seed + rank
+        paddle.seed(args.seed)
+        np.random.seed(args.seed)
+        random.seed(args.seed)
 
     if world_size > 1:
         import paddle.distributed.fleet as fleet
 
@@ -67,7 +77,8 @@ def train(args):
         rank=rank,
         world_size=world_size,
         fp16=args.fp16,
-        is_bin=args.is_bin)
+        is_bin=args.is_bin,
+        seed=args.seed)
 
     num_image = trainset.total_num_samples
     total_batch_size = args.batch_size * world_size
@@ -139,14 +150,15 @@ def train(args):
     callback_verification = CallBackVerification(
         args.validation_interval_step,
         rank,
+        world_size,
         args.batch_size,
         args.val_targets,
         args.data_dir,
         fp16=args.fp16, )
 
     callback_logging = CallBackLogging(args.log_interval_step, rank,
                                        world_size, total_steps,
-                                       args.batch_size, writer)
+                                       args.batch_size)
 
     checkpoint = Checkpoint(
         rank=rank,
@@ -213,7 +225,16 @@ def train(args):
             loss_avg.update(loss_v.item(), 1)
             callback_logging(global_step, loss_avg, epoch, lr_value)
             if args.do_validation_while_train:
-                callback_verification(global_step, backbone)
+                best_metric = callback_verification(global_step, backbone)
+                if best_metric is not None and len(best_metric) > 0:
+                    for ver_dataset in best_metric:
+                        checkpoint.save(
+                            backbone,
+                            classifier,
+                            optimizer,
+                            epoch=epoch,
+                            for_train=True,
+                            best_metric=best_metric[ver_dataset])
         lr_scheduler.step()
 
         if global_step >= total_steps:
Expand All @@ -222,4 +243,3 @@ def train(args):

     checkpoint.save(
         backbone, classifier, optimizer, epoch=epoch, for_train=True)
-    writer.close()
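
Read together with the dataset changes, the seed convention this commit settles on appears to be: unset (None) keeps the fast autotuned cuDNN path, 0 requests a bitwise-deterministic run (deterministic kernels, `num_workers = 0`), and any other integer gives a reproducible-but-fast run seeded per rank. A condensed illustration of that mapping, not code from the diff:

for seed, rank in [(None, 0), (0, 1), (1234, 3)]:
    effective = None if seed is None else seed + rank
    mode = ('fast, not reproducible' if seed is None else
            'deterministic kernels' if seed == 0 else
            'reproducible, fast kernels')
    print('--seed %r on rank %d -> paddle.seed(%r), %s' % (seed, rank, effective, mode))
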
