Description
sys.platform: linux
Python: 3.7.7 (default, May 7 2020, 21:25:33) [GCC 7.3.0]
CUDA available: True
CUDA_HOME: /usr/local/cuda
NVCC: Cuda compilation tools, release 10.0, V10.0.130
GPU 0,1: GeForce GTX 1080 Ti
GCC: gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
PyTorch: 1.4.0
PyTorch compiling details: PyTorch built with:
- GCC 7.3
- Intel(R) Math Kernel Library Version 2020.0.1 Product Build 20200208 for Intel(R) 64 architecture applications
- Intel(R) MKL-DNN v0.21.1 (Git Hash 7d2fd500bc78936d1d648ca713b901012f470dbc)
- OpenMP 201511 (a.k.a. OpenMP 4.5)
- NNPACK is enabled
- CUDA Runtime 10.0
- NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_37,code=compute_37
- CuDNN 7.6.3
- Magma 2.5.1
- Build settings: BLAS=MKL, BUILD_NAMEDTENSOR=OFF, BUILD_TYPE=Release, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -fopenmp -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -O2 -fPIC -Wno-narrowing -Wall -Wextra -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Wno-stringop-overflow, DISABLE_NUMA=1, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, USE_CUDA=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, USE_STATIC_DISPATCH=OFF,
TorchVision: 0.5.0
OpenCV: 4.2.0
MMCV: 1.0.4
MMSegmentation: 0.5.0+b57fb2b
MMCV Compiler: GCC 7.5
MMCV CUDA Compiler: 10.0
The error below was encountered during the training process with the following config:
Config:
norm_cfg = dict(type='BN', requires_grad=True)
model = dict(
type='EncoderDecoder',
pretrained='open-mmlab://resnet50_v1c',
backbone=dict(
type='ResNetV1c',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
dilations=(1, 1, 2, 4),
strides=(1, 2, 1, 1),
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
style='pytorch',
contract_dilation=True),
decode_head=dict(
type='PSPHead',
in_channels=2048,
in_index=3,
channels=512,
pool_scales=(1, 2, 3, 6),
dropout_ratio=0.1,
num_classes=9,
norm_cfg=dict(type='BN', requires_grad=True),
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
auxiliary_head=dict(
type='FCNHead',
in_channels=1024,
in_index=2,
channels=256,
num_convs=1,
concat_input=False,
dropout_ratio=0.1,
num_classes=9,
norm_cfg=dict(type='BN', requires_grad=True),
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)))
train_cfg = dict()
test_cfg = dict(mode='whole')
dataset_type = 'Aircraft'
data_root = '/mmdetection_aircraft/data/segm2/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations'),
dict(type='Resize', img_scale=(640, 480), ratio_range=(0.5, 2.0)),
dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='PhotoMetricDistortion'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True),
dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(640, 480),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=1,
train=dict(
type='Aircraft',
data_root='/mmdetection_aircraft/data/segm2/',
img_dir='JPEGImages',
ann_dir='PaletteClass',
pipeline=[
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations'),
dict(type='Resize', img_scale=(640, 480), ratio_range=(0.5, 2.0)),
dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='PhotoMetricDistortion'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True),
dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_semantic_seg'])
],
split='train.txt'),
val=dict(
type='Aircraft',
data_root='/mmdetection_aircraft/data/segm2/',
img_dir='JPEGImages',
ann_dir='PaletteClass',
pipeline=[
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(640, 480),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
])
],
split='val.txt'),
test=dict(
type='Aircraft',
data_root='/mmdetection_aircraft/data/segm2/',
img_dir='JPEGImages',
ann_dir='PaletteClass',
pipeline=[
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(640, 480),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(
type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img'])
])
],
split='val.txt'))
log_config = dict(
interval=1, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = 'checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
lr_config = dict(policy='poly', power=0.9, min_lr=0.0001, by_epoch=False)
total_iters = 400
checkpoint_config = dict(by_epoch=False, interval=200)
evaluation = dict(interval=1, metric='mIoU')
work_dir = './work_dirs/pspnet'
seed = 0
gpu_ids = [1]
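For context, the custom dataset type 'Aircraft' referenced above is registered following the mmsegmentation CustomDataset tutorial pattern. A minimal sketch of such a class is shown below (the class names, palette, and file suffixes are placeholders, not my actual values):

import os.path as osp

from mmseg.datasets.builder import DATASETS
from mmseg.datasets.custom import CustomDataset


@DATASETS.register_module()
class Aircraft(CustomDataset):
    """Aircraft segmentation dataset with 9 classes (placeholder metadata)."""

    CLASSES = ('background', 'part1', 'part2', 'part3', 'part4',
               'part5', 'part6', 'part7', 'part8')
    PALETTE = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0],
               [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128],
               [64, 0, 0]]

    def __init__(self, split, **kwargs):
        # Images live in img_dir (JPEGImages), masks in ann_dir (PaletteClass);
        # the split file lists sample names without suffixes.
        super(Aircraft, self).__init__(
            img_suffix='.jpg', seg_map_suffix='.png', split=split, **kwargs)
        assert osp.exists(self.img_dir) and self.split is not None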
The script takes approximately 4-5 GB of the 11 GB of available GPU memory and returns this error:
#ERROR
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-8-fec2661e1f4c> in <module>
16 mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
17 train_segmentor(model, datasets, cfg, distributed=False, validate=True,
---> 18 meta=dict())
~/mmsegmentation/mmseg/apis/train.py in train_segmentor(model, dataset, cfg, distributed, validate, timestamp, meta)
104 elif cfg.load_from:
105 runner.load_checkpoint(cfg.load_from)
--> 106 runner.run(data_loaders, cfg.workflow, cfg.total_iters)
~/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py in run(self, data_loaders, workflow, max_iters, **kwargs)
117 if mode == 'train' and self.iter >= max_iters:
118 break
--> 119 iter_runner(iter_loaders[i], **kwargs)
120
121 time.sleep(1) # wait for some hooks like loggers to finish
~/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py in train(self, data_loader, **kwargs)
53 self.call_hook('before_train_iter')
54 data_batch = next(data_loader)
---> 55 outputs = self.model.train_step(data_batch, self.optimizer, **kwargs)
56 if not isinstance(outputs, dict):
57 raise TypeError('model.train_step() must return a dict')
~/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/parallel/data_parallel.py in train_step(self, *inputs, **kwargs)
29
30 inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
---> 31 return self.module.train_step(*inputs[0], **kwargs[0])
32
33 def val_step(self, *inputs, **kwargs):
~/mmsegmentation/mmseg/models/segmentors/base.py in train_step(self, data_batch, optimizer, **kwargs)
150 #data_batch['gt_semantic_seg'] = data_batch['gt_semantic_seg'][:,:,:,:,0]
151 #print(data_batch['gt_semantic_seg'].shape)
--> 152 losses = self.forward_train(**data_batch, **kwargs)
153 loss, log_vars = self._parse_losses(losses)
154
~/mmsegmentation/mmseg/models/segmentors/encoder_decoder.py in forward_train(self, img, img_metas, gt_semantic_seg)
155
156 loss_decode = self._decode_head_forward_train(x, img_metas,
--> 157 gt_semantic_seg)
158 losses.update(loss_decode)
159
~/mmsegmentation/mmseg/models/segmentors/encoder_decoder.py in _decode_head_forward_train(self, x, img_metas, gt_semantic_seg)
99 loss_decode = self.decode_head.forward_train(x, img_metas,
100 gt_semantic_seg,
--> 101 self.train_cfg)
102
103 losses.update(add_prefix(loss_decode, 'decode'))
~/mmsegmentation/mmseg/models/decode_heads/decode_head.py in forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg)
184 """
185 seg_logits = self.forward(inputs)
--> 186 losses = self.losses(seg_logits, gt_semantic_seg)
187 return losses
188
~/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/fp16_utils.py in new_func(*args, **kwargs)
162 'method of nn.Module')
163 if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled):
--> 164 return old_func(*args, **kwargs)
165 # get the arg spec of the decorated method
166 args_info = getfullargspec(old_func)
~/mmsegmentation/mmseg/models/decode_heads/decode_head.py in losses(self, seg_logit, seg_label)
229 seg_label,
230 weight=seg_weight,
--> 231 ignore_index=self.ignore_index)
232 loss['acc_seg'] = accuracy(seg_logit, seg_label)
233 return loss
~/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
530 result = self._slow_forward(*input, **kwargs)
531 else:
--> 532 result = self.forward(*input, **kwargs)
533 for hook in self._forward_hooks.values():
534 hook_result = hook(self, input, result)
~/mmsegmentation/mmseg/models/losses/cross_entropy_loss.py in forward(self, cls_score, label, weight, avg_factor, reduction_override, **kwargs)
175 class_weight=class_weight,
176 reduction=reduction,
--> 177 avg_factor=avg_factor)
178 return loss_cls
~/mmsegmentation/mmseg/models/losses/cross_entropy_loss.py in cross_entropy(pred, label, weight, class_weight, reduction, avg_factor, ignore_index)
28 weight = weight.float()
29 loss = weight_reduce_loss(
---> 30 loss, weight=weight, reduction=reduction, avg_factor=avg_factor)
31
32 return loss
~/mmsegmentation/mmseg/models/losses/utils.py in weight_reduce_loss(loss, weight, reduction, avg_factor)
45 # if avg_factor is not specified, just reduce the loss
46 if avg_factor is None:
---> 47 loss = reduce_loss(loss, reduction)
48 else:
49 # if reduction is mean, then average the loss by avg_factor
~/mmsegmentation/mmseg/models/losses/utils.py in reduce_loss(loss, reduction)
19 return loss
20 elif reduction_enum == 1:
---> 21 return loss.mean()
22 elif reduction_enum == 2:
23 return loss.sum()
RuntimeError: CUDA error: an illegal memory access was encountered
However, if I halve the image size while keeping the same number of images per GPU (2), the script takes approximately 2 GB of GPU memory and everything works fine.
I also want to add that with another PyTorch training script using my own DataLoader, I am able to fill the GPU completely (11 GB) during training, with the same PyTorch version and the same hardware.
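Since the failure happens inside the CUDA cross-entropy kernel, I suspect it may be related to label values falling outside [0, num_classes - 1] (apart from the ignore index 255). A quick sanity check over the masks could look like the sketch below (this assumes the PaletteClass annotations are palette-mode PNGs, so PIL returns the raw class indices; for plain RGB masks the check would need adapting):

import glob
import os.path as osp

import numpy as np
from PIL import Image

NUM_CLASSES = 9      # num_classes in the config
IGNORE_INDEX = 255   # default ignore index in mmsegmentation
ann_dir = '/mmdetection_aircraft/data/segm2/PaletteClass'

bad = []
for path in glob.glob(osp.join(ann_dir, '*.png')):
    # Collect the unique pixel values of each mask and drop the ignore index.
    labels = np.unique(np.array(Image.open(path)))
    labels = labels[labels != IGNORE_INDEX]
    if labels.size and labels.max() >= NUM_CLASSES:
        bad.append((osp.basename(path), labels.tolist()))

print('masks with out-of-range labels: %d' % len(bad))
print(bad[:10])

Running the training with CUDA_LAUNCH_BLOCKING=1 set in the environment should also make the traceback point at the kernel that actually fails, since CUDA errors are reported asynchronously.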