From f278c53d0514a729d34af1a3f377d656358f8dff Mon Sep 17 00:00:00 2001
From: "p.vytovtov"
Date: Fri, 24 May 2019 17:35:36 +0300
Subject: [PATCH 1/4] Added support of periodically testing during training.
---
...rcnn_R_50_FPN_1x_periodically_testing.yaml | 42 +++++++++++++++++++
maskrcnn_benchmark/config/defaults.py | 1 +
maskrcnn_benchmark/engine/trainer.py | 31 +++++++++++++-
tools/train_net.py | 9 ++++
4 files changed, 82 insertions(+), 1 deletion(-)
create mode 100644 configs/e2e_mask_rcnn_R_50_FPN_1x_periodically_testing.yaml
diff --git a/configs/e2e_mask_rcnn_R_50_FPN_1x_periodically_testing.yaml b/configs/e2e_mask_rcnn_R_50_FPN_1x_periodically_testing.yaml
new file mode 100644
index 000000000..b34471007
--- /dev/null
+++ b/configs/e2e_mask_rcnn_R_50_FPN_1x_periodically_testing.yaml
@@ -0,0 +1,42 @@
+MODEL:
+ META_ARCHITECTURE: "GeneralizedRCNN"
+ WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
+ BACKBONE:
+ CONV_BODY: "R-50-FPN"
+ RESNETS:
+ BACKBONE_OUT_CHANNELS: 256
+ RPN:
+ USE_FPN: True
+ ANCHOR_STRIDE: (4, 8, 16, 32, 64)
+ PRE_NMS_TOP_N_TRAIN: 2000
+ PRE_NMS_TOP_N_TEST: 1000
+ POST_NMS_TOP_N_TEST: 1000
+ FPN_POST_NMS_TOP_N_TEST: 1000
+ ROI_HEADS:
+ USE_FPN: True
+ ROI_BOX_HEAD:
+ POOLER_RESOLUTION: 7
+ POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
+ POOLER_SAMPLING_RATIO: 2
+ FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor"
+ PREDICTOR: "FPNPredictor"
+ ROI_MASK_HEAD:
+ POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125)
+ FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor"
+ PREDICTOR: "MaskRCNNC4Predictor"
+ POOLER_RESOLUTION: 14
+ POOLER_SAMPLING_RATIO: 2
+ RESOLUTION: 28
+ SHARE_BOX_FEATURE_EXTRACTOR: False
+ MASK_ON: True
+DATASETS:
+ TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
+ TEST: ("coco_2014_minival",)
+DATALOADER:
+ SIZE_DIVISIBILITY: 32
+SOLVER:
+ BASE_LR: 0.02
+ WEIGHT_DECAY: 0.0001
+ STEPS: (60000, 80000)
+ MAX_ITER: 90000
+ TEST_PERIOD: 20
diff --git a/maskrcnn_benchmark/config/defaults.py b/maskrcnn_benchmark/config/defaults.py
index beae4070a..260b48474 100644
--- a/maskrcnn_benchmark/config/defaults.py
+++ b/maskrcnn_benchmark/config/defaults.py
@@ -406,6 +406,7 @@
_C.SOLVER.WARMUP_METHOD = "linear"
_C.SOLVER.CHECKPOINT_PERIOD = 2500
+_C.SOLVER.TEST_PERIOD = 0
# Number of images per batch
# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py
index 281d91339..d7ea288d4 100644
--- a/maskrcnn_benchmark/engine/trainer.py
+++ b/maskrcnn_benchmark/engine/trainer.py
@@ -1,13 +1,15 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import datetime
import logging
+import os
import time
import torch
import torch.distributed as dist
-from maskrcnn_benchmark.utils.comm import get_world_size
+from maskrcnn_benchmark.utils.comm import get_world_size, synchronize
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
+from maskrcnn_benchmark.engine.inference import inference
from apex import amp
@@ -37,13 +39,16 @@ def reduce_loss_dict(loss_dict):
def do_train(
+ cfg,
model,
data_loader,
+ data_loaders_val,
optimizer,
scheduler,
checkpointer,
device,
checkpoint_period,
+ test_period,
arguments,
):
logger = logging.getLogger("maskrcnn_benchmark.trainer")
@@ -54,6 +59,14 @@ def do_train(
model.train()
start_training_time = time.time()
end = time.time()
+
+ iou_types = ("bbox",)
+ if cfg.MODEL.MASK_ON:
+ iou_types = iou_types + ("segm",)
+ if cfg.MODEL.KEYPOINT_ON:
+ iou_types = iou_types + ("keypoints",)
+ dataset_names = cfg.DATASETS.TEST
+
for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
data_time = time.time() - end
iteration = iteration + 1
@@ -107,6 +120,22 @@ def do_train(
)
if iteration % checkpoint_period == 0:
checkpointer.save("model_{:07d}".format(iteration), **arguments)
+ if data_loaders_val is not None and test_period > 0 and iteration % test_period == 0:
+ synchronize()
+ for dataset_name, data_loader_val in zip(dataset_names, data_loaders_val):
+ inference(
+ model,
+ data_loader_val,
+ dataset_name=dataset_name,
+ iou_types=iou_types,
+ box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
+ device=cfg.MODEL.DEVICE,
+ expected_results=cfg.TEST.EXPECTED_RESULTS,
+ expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
+ output_folder=None,
+ )
+ synchronize()
+ model.train()
if iteration == max_iter:
checkpointer.save("model_final", **arguments)
diff --git a/tools/train_net.py b/tools/train_net.py
index 9f4761b3f..926493b69 100644
--- a/tools/train_net.py
+++ b/tools/train_net.py
@@ -72,16 +72,25 @@ def train(cfg, local_rank, distributed):
start_iter=arguments["iteration"],
)
+ test_period = cfg.SOLVER.TEST_PERIOD
+ if test_period > 0:
+ data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
+ else:
+ data_loaders_val = None
+
checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
do_train(
+ cfg,
model,
data_loader,
+ data_loaders_val,
optimizer,
scheduler,
checkpointer,
device,
checkpoint_period,
+ test_period,
arguments,
)
From 10b11c49c2c1c728421b8794b5436cdc2c04fea4 Mon Sep 17 00:00:00 2001
From: "p.vytovtov"
Date: Mon, 27 May 2019 10:57:04 +0300
Subject: [PATCH 2/4] Added losses logging periodically.
---
maskrcnn_benchmark/engine/trainer.py | 33 ++++++++++++++++++++++++++--
1 file changed, 31 insertions(+), 2 deletions(-)
diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py
index d7ea288d4..6d77531a9 100644
--- a/maskrcnn_benchmark/engine/trainer.py
+++ b/maskrcnn_benchmark/engine/trainer.py
@@ -123,7 +123,7 @@ def do_train(
if data_loaders_val is not None and test_period > 0 and iteration % test_period == 0:
synchronize()
for dataset_name, data_loader_val in zip(dataset_names, data_loaders_val):
- inference(
+ _ = inference( # The result can be used for additional loggin, e. g. to TensorBoard
model,
data_loader_val,
dataset_name=dataset_name,
@@ -134,8 +134,37 @@ def do_train(
expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
output_folder=None,
)
- synchronize()
+ synchronize()
model.train()
+ meters_val = MetricLogger(delimiter=" ")
+ with torch.no_grad():
+ for idx_val, (images_val, targets_val, _) in enumerate(data_loaders_val[0]):
+ images_val = images_val.to(device)
+ targets_val = [target.to(device) for target in targets_val]
+ loss_dict = model(images_val, targets_val)
+ losses = sum(loss for loss in loss_dict.values())
+ loss_dict_reduced = reduce_loss_dict(loss_dict)
+ losses_reduced = sum(loss for loss in loss_dict_reduced.values())
+ meters_val.update(loss=losses_reduced, **loss_dict_reduced)
+ synchronize()
+ logger.info(
+ meters.delimiter.join(
+ [
+ "[Validation]: ",
+ "eta: {eta}",
+ "iter: {iter}",
+ "{meters}",
+ "lr: {lr:.6f}",
+ "max mem: {memory:.0f}",
+ ]
+ ).format(
+ eta=eta_string,
+ iter=iteration,
+ meters=str(meters_val),
+ lr=optimizer.param_groups[0]["lr"],
+ memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
+ )
+ )
if iteration == max_iter:
checkpointer.save("model_final", **arguments)
From 69c46ba3eff3843b12f4d69c6d56f892f2a0f2ef Mon Sep 17 00:00:00 2001
From: "p.vytovtov"
Date: Mon, 27 May 2019 12:54:01 +0300
Subject: [PATCH 3/4] Getting correct data for evaluation.
---
configs/e2e_mask_rcnn_R_50_FPN_1x_periodically_testing.yaml | 2 +-
maskrcnn_benchmark/data/build.py | 4 ++--
tools/train_net.py | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/configs/e2e_mask_rcnn_R_50_FPN_1x_periodically_testing.yaml b/configs/e2e_mask_rcnn_R_50_FPN_1x_periodically_testing.yaml
index b34471007..03ddc5058 100644
--- a/configs/e2e_mask_rcnn_R_50_FPN_1x_periodically_testing.yaml
+++ b/configs/e2e_mask_rcnn_R_50_FPN_1x_periodically_testing.yaml
@@ -39,4 +39,4 @@ SOLVER:
WEIGHT_DECAY: 0.0001
STEPS: (60000, 80000)
MAX_ITER: 90000
- TEST_PERIOD: 20
+ TEST_PERIOD: 2500
diff --git a/maskrcnn_benchmark/data/build.py b/maskrcnn_benchmark/data/build.py
index b0ce3c348..f0ff2a791 100644
--- a/maskrcnn_benchmark/data/build.py
+++ b/maskrcnn_benchmark/data/build.py
@@ -104,7 +104,7 @@ def make_batch_data_sampler(
return batch_sampler
-def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0):
+def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0, is_for_period=False):
num_gpus = get_world_size()
if is_train:
images_per_batch = cfg.SOLVER.IMS_PER_BATCH
@@ -152,7 +152,7 @@ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0):
# If bbox aug is enabled in testing, simply set transforms to None and we will apply transforms later
transforms = None if not is_train and cfg.TEST.BBOX_AUG.ENABLED else build_transforms(cfg, is_train)
- datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train)
+ datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train or is_for_period)
data_loaders = []
for dataset in datasets:
diff --git a/tools/train_net.py b/tools/train_net.py
index 926493b69..76d621747 100644
--- a/tools/train_net.py
+++ b/tools/train_net.py
@@ -74,7 +74,7 @@ def train(cfg, local_rank, distributed):
test_period = cfg.SOLVER.TEST_PERIOD
if test_period > 0:
- data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
+ data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, is_for_period=True)
else:
data_loaders_val = None
From e000d4c4ea524388dde65c26e69b3cb9a162e4d5 Mon Sep 17 00:00:00 2001
From: "p.vytovtov"
Date: Mon, 27 May 2019 17:25:04 +0300
Subject: [PATCH 4/4] Fixed validation dataset forming.
---
maskrcnn_benchmark/data/build.py | 2 +-
maskrcnn_benchmark/engine/trainer.py | 40 +++++++++++++++-------------
tools/train_net.py | 6 ++---
3 files changed, 26 insertions(+), 22 deletions(-)
diff --git a/maskrcnn_benchmark/data/build.py b/maskrcnn_benchmark/data/build.py
index f0ff2a791..07291019f 100644
--- a/maskrcnn_benchmark/data/build.py
+++ b/maskrcnn_benchmark/data/build.py
@@ -170,7 +170,7 @@ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0, is_
collate_fn=collator,
)
data_loaders.append(data_loader)
- if is_train:
+ if is_train or is_for_period:
# during training, a single (possibly concatenated) data_loader is returned
assert len(data_loaders) == 1
return data_loaders[0]
diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py
index 6d77531a9..7870e1a28 100644
--- a/maskrcnn_benchmark/engine/trainer.py
+++ b/maskrcnn_benchmark/engine/trainer.py
@@ -6,7 +6,9 @@
import torch
import torch.distributed as dist
+from tqdm import tqdm
+from maskrcnn_benchmark.data import make_data_loader
from maskrcnn_benchmark.utils.comm import get_world_size, synchronize
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
from maskrcnn_benchmark.engine.inference import inference
@@ -42,7 +44,7 @@ def do_train(
cfg,
model,
data_loader,
- data_loaders_val,
+ data_loader_val,
optimizer,
scheduler,
checkpointer,
@@ -120,25 +122,27 @@ def do_train(
)
if iteration % checkpoint_period == 0:
checkpointer.save("model_{:07d}".format(iteration), **arguments)
- if data_loaders_val is not None and test_period > 0 and iteration % test_period == 0:
+ if data_loader_val is not None and test_period > 0 and iteration % test_period == 0:
+ meters_val = MetricLogger(delimiter=" ")
+ synchronize()
+ _ = inference( # The result can be used for additional logging, e. g. for TensorBoard
+ model,
+ # The method changes the segmentation mask format in a data loader,
+ # so every time a new data loader is created:
+ make_data_loader(cfg, is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True),
+ dataset_name="[Validation]",
+ iou_types=iou_types,
+ box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
+ device=cfg.MODEL.DEVICE,
+ expected_results=cfg.TEST.EXPECTED_RESULTS,
+ expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
+ output_folder=None,
+ )
synchronize()
- for dataset_name, data_loader_val in zip(dataset_names, data_loaders_val):
- _ = inference( # The result can be used for additional loggin, e. g. to TensorBoard
- model,
- data_loader_val,
- dataset_name=dataset_name,
- iou_types=iou_types,
- box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
- device=cfg.MODEL.DEVICE,
- expected_results=cfg.TEST.EXPECTED_RESULTS,
- expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
- output_folder=None,
- )
- synchronize()
model.train()
- meters_val = MetricLogger(delimiter=" ")
with torch.no_grad():
- for idx_val, (images_val, targets_val, _) in enumerate(data_loaders_val[0]):
+ # Should be one image for each GPU:
+ for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)):
images_val = images_val.to(device)
targets_val = [target.to(device) for target in targets_val]
loss_dict = model(images_val, targets_val)
@@ -148,7 +152,7 @@ def do_train(
meters_val.update(loss=losses_reduced, **loss_dict_reduced)
synchronize()
logger.info(
- meters.delimiter.join(
+ meters_val.delimiter.join(
[
"[Validation]: ",
"eta: {eta}",
diff --git a/tools/train_net.py b/tools/train_net.py
index 76d621747..6b7f6222b 100644
--- a/tools/train_net.py
+++ b/tools/train_net.py
@@ -74,9 +74,9 @@ def train(cfg, local_rank, distributed):
test_period = cfg.SOLVER.TEST_PERIOD
if test_period > 0:
- data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, is_for_period=True)
+ data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, is_for_period=True)
else:
- data_loaders_val = None
+ data_loader_val = None
checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
@@ -84,7 +84,7 @@ def train(cfg, local_rank, distributed):
cfg,
model,
data_loader,
- data_loaders_val,
+ data_loader_val,
optimizer,
scheduler,
checkpointer,