From 8146214df97ba2ec04cc02169303847bb1afeba5 Mon Sep 17 00:00:00 2001
From: Dima
Date: Sat, 25 Dec 2021 05:50:10 +0200
Subject: [PATCH 1/9] add parameters to make custom backbone for detr

---
 .../models/detr/configuration_detr.py         |  8 ++++++++
 src/transformers/models/detr/modeling_detr.py | 20 ++++++++++++-------
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py
index 3edc9da80e5c1c..c6bdaa82b67c31 100644
--- a/src/transformers/models/detr/configuration_detr.py
+++ b/src/transformers/models/detr/configuration_detr.py
@@ -155,6 +155,10 @@ def __init__(
         bbox_loss_coefficient=5,
         giou_loss_coefficient=2,
         eos_coefficient=0.1,
+        in_chans=3,
+        pretrained=True,
+        freeze_layers=True,
+        replace_batch_norm=True,
         **kwargs
     ):
         self.num_queries = num_queries
@@ -190,6 +194,10 @@ def __init__(
         self.bbox_loss_coefficient = bbox_loss_coefficient
         self.giou_loss_coefficient = giou_loss_coefficient
         self.eos_coefficient = eos_coefficient
+        self.in_chans = in_chans
+        self.pretrained = pretrained
+        self.freeze_layers = freeze_layers
+        self.replace_batch_norm = replace_batch_norm
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 
     @property
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index 7d1140577a01a3..316f903627b245 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -315,23 +315,27 @@ class DetrTimmConvEncoder(nn.Module):
 
     """
 
-    def __init__(self, name: str, dilation: bool):
+    def __init__(self, name: str, dilation: bool, in_chans: int,
+                 pretrained=True, freeze_layers=True, replace_batch_norm=True):
         super().__init__()
 
-        kwargs = {}
+        kwargs = {
+            'in_chans': in_chans
+        }
         if dilation:
             kwargs["output_stride"] = 16
 
         requires_backends(self, ["timm"])
 
-        backbone = create_model(name, pretrained=True, features_only=True, out_indices=(1, 2, 3, 4), **kwargs)
+        backbone = create_model(name, pretrained=pretrained, features_only=True, out_indices=(1, 2, 3, 4), **kwargs)
         # replace batch norm by frozen batch norm
-        with torch.no_grad():
-            replace_batch_norm(backbone)
+        if replace_batch_norm:
+            with torch.no_grad():
+                replace_batch_norm(backbone)
         self.model = backbone
         self.intermediate_channel_sizes = self.model.feature_info.channels()
 
-        if "resnet" in name:
+        if "resnet" in name and freeze_layers:
             for name, parameter in self.model.named_parameters():
                 if "layer2" not in name and "layer3" not in name and "layer4" not in name:
                     parameter.requires_grad_(False)
@@ -1159,7 +1163,9 @@ def __init__(self, config: DetrConfig):
         super().__init__(config)
 
         # Create backbone + positional encoding
-        backbone = DetrTimmConvEncoder(config.backbone, config.dilation)
+        backbone = DetrTimmConvEncoder(config.backbone, config.dilation, config.in_chans,
+                                       pretrained=config.pretrained, freeze_layers=config.freeze_layers,
+                                       replace_batch_norm=config.replace_batch_norm)
         position_embeddings = build_position_encoding(config)
         self.backbone = DetrConvModel(backbone, position_embeddings)
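Aside (not part of the patch): with these parameters in place, a DETR model over a custom timm backbone can be configured roughly as below. This is a minimal sketch against the patch-1 keyword names (`in_chans`, `pretrained`, `freeze_layers`, `replace_batch_norm`); later commits in this series rename them.

from transformers import DetrConfig, DetrForObjectDetection

# Sketch only: these keyword names match PATCH 1 and are renamed by
# patches 2, 4 and 5 (replace_batch_norm -> fix_batch_norm,
# in_chans -> num_channels, pretrained -> use_pretrained_backbone).
config = DetrConfig(
    backbone="resnet101",        # any timm backbone usable with features_only=True
    dilation=False,
    in_chans=1,                  # e.g. single-channel (grayscale) inputs
    pretrained=False,            # skip timm's pretrained weights
    freeze_layers=False,         # keep the early ResNet stages trainable
    replace_batch_norm=True,     # frozen batch norm, as in the original DETR
)
model = DetrForObjectDetection(config)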
From 15ba33101ae94d9f58626a509e8736a49085a455 Mon Sep 17 00:00:00 2001
From: Dima
Date: Sun, 26 Dec 2021 17:04:23 +0200
Subject: [PATCH 2/9] rename `replace_batch_norm` to `fix_batch_norm`

---
 src/transformers/models/detr/configuration_detr.py | 4 ++--
 src/transformers/models/detr/modeling_detr.py      | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py
index c6bdaa82b67c31..e22ca92871e877 100644
--- a/src/transformers/models/detr/configuration_detr.py
+++ b/src/transformers/models/detr/configuration_detr.py
@@ -158,7 +158,7 @@ def __init__(
         in_chans=3,
         pretrained=True,
         freeze_layers=True,
-        replace_batch_norm=True,
+        fix_batch_norm=True,
         **kwargs
     ):
         self.num_queries = num_queries
@@ -197,7 +197,7 @@ def __init__(
         self.in_chans = in_chans
         self.pretrained = pretrained
         self.freeze_layers = freeze_layers
-        self.replace_batch_norm = replace_batch_norm
+        self.fix_batch_norm = fix_batch_norm
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
 
     @property
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index 316f903627b245..d7169cab8e4366 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -316,7 +316,7 @@ class DetrTimmConvEncoder(nn.Module):
     """
 
     def __init__(self, name: str, dilation: bool, in_chans: int,
-                 pretrained=True, freeze_layers=True, replace_batch_norm=True):
+                 pretrained=True, freeze_layers=True, fix_batch_norm=True):
         super().__init__()
 
         kwargs = {
@@ -329,7 +329,7 @@ def __init__(self, name: str, dilation: bool, in_chans: int,
 
         backbone = create_model(name, pretrained=pretrained, features_only=True, out_indices=(1, 2, 3, 4), **kwargs)
         # replace batch norm by frozen batch norm
-        if replace_batch_norm:
+        if fix_batch_norm:
             with torch.no_grad():
                 replace_batch_norm(backbone)
         self.model = backbone
@@ -1165,7 +1165,7 @@ def __init__(self, config: DetrConfig):
         # Create backbone + positional encoding
         backbone = DetrTimmConvEncoder(config.backbone, config.dilation, config.in_chans,
                                        pretrained=config.pretrained, freeze_layers=config.freeze_layers,
-                                       replace_batch_norm=config.replace_batch_norm)
+                                       fix_batch_norm=config.fix_batch_norm)
         position_embeddings = build_position_encoding(config)
         self.backbone = DetrConvModel(backbone, position_embeddings)

From 2815479406c0bd308ed4be0884388822a6c7455c Mon Sep 17 00:00:00 2001
From: Dima
Date: Sun, 26 Dec 2021 17:14:03 +0200
Subject: [PATCH 3/9] reformat

---
 src/transformers/models/detr/modeling_detr.py | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index d7169cab8e4366..8967eb7a2d7198 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -315,13 +315,12 @@ class DetrTimmConvEncoder(nn.Module):
 
     """
 
-    def __init__(self, name: str, dilation: bool, in_chans: int,
-                 pretrained=True, freeze_layers=True, fix_batch_norm=True):
+    def __init__(
+        self, name: str, dilation: bool, in_chans: int, pretrained=True, freeze_layers=True, fix_batch_norm=True
+    ):
         super().__init__()
 
-        kwargs = {
-            'in_chans': in_chans
-        }
+        kwargs = {"in_chans": in_chans}
         if dilation:
             kwargs["output_stride"] = 16
 
@@ -1163,9 +1162,14 @@ def __init__(self, config: DetrConfig):
         super().__init__(config)
 
         # Create backbone + positional encoding
-        backbone = DetrTimmConvEncoder(config.backbone, config.dilation, config.in_chans,
-                                       pretrained=config.pretrained, freeze_layers=config.freeze_layers,
-                                       fix_batch_norm=config.fix_batch_norm)
+        backbone = DetrTimmConvEncoder(
+            config.backbone,
+            config.dilation,
+            config.in_chans,
+            pretrained=config.pretrained,
+            freeze_layers=config.freeze_layers,
+            fix_batch_norm=config.fix_batch_norm,
+        )
         position_embeddings = build_position_encoding(config)
         self.backbone = DetrConvModel(backbone, position_embeddings)
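Aside (not part of the patch): the `fix_batch_norm` flag gates the call to `replace_batch_norm` from modeling_detr.py, which walks the timm backbone and swaps every `nn.BatchNorm2d` for DETR's frozen variant, fixing the affine parameters and running statistics. A rough sketch of that substitution follows; it shows the idea, not the exact upstream implementation, and assumes `DetrFrozenBatchNorm2d` is importable from its (private) module path.

import torch
from torch import nn
from transformers.models.detr.modeling_detr import DetrFrozenBatchNorm2d

def replace_batch_norm_sketch(module: nn.Module):
    # Recursively swap BatchNorm2d submodules for the frozen variant,
    # copying the learned scale/shift and running statistics so the
    # normalization becomes a fixed, non-trainable transform.
    for child_name, child in module.named_children():
        if isinstance(child, nn.BatchNorm2d):
            frozen = DetrFrozenBatchNorm2d(child.num_features)
            frozen.weight.data.copy_(child.weight)
            frozen.bias.data.copy_(child.bias)
            frozen.running_mean.data.copy_(child.running_mean)
            frozen.running_var.data.copy_(child.running_var)
            setattr(module, child_name, frozen)
        else:
            replace_batch_norm_sketch(child)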
From 42624b2e7f8b64a27cfe7a2f91104a3d8e5c19a1 Mon Sep 17 00:00:00 2001
From: Dima
Date: Mon, 27 Dec 2021 09:41:13 +0200
Subject: [PATCH 4/9] rename parameters and resolve tensor device placement issue

---
 src/transformers/image_utils.py                    | 2 ++
 src/transformers/models/detr/configuration_detr.py | 4 ++--
 src/transformers/models/detr/modeling_detr.py      | 8 ++++----
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
index 773f3e1cad3914..78f34406824608 100644
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -177,8 +177,10 @@ def normalize(self, image, mean, std):
 
         if not isinstance(mean, torch.Tensor):
             mean = torch.tensor(mean)
+            mean.to(image.device)
         if not isinstance(std, torch.Tensor):
             std = torch.tensor(std)
+            std.to(image.device)
 
         if image.ndim == 3 and image.shape[0] in [1, 3]:
             return (image - mean[:, None, None]) / std[:, None, None]
diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py
index e22ca92871e877..3ed7002b353652 100644
--- a/src/transformers/models/detr/configuration_detr.py
+++ b/src/transformers/models/detr/configuration_detr.py
@@ -194,8 +194,8 @@ def __init__(
         self.bbox_loss_coefficient = bbox_loss_coefficient
         self.giou_loss_coefficient = giou_loss_coefficient
         self.eos_coefficient = eos_coefficient
-        self.in_chans = in_chans
-        self.pretrained = pretrained
+        self.num_channels = num_channels
+        self.use_pretrained_backbone = use_pretrained_backbone
         self.freeze_layers = freeze_layers
         self.fix_batch_norm = fix_batch_norm
         super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index 8967eb7a2d7198..e2e1e153a367a3 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -316,11 +316,11 @@ class DetrTimmConvEncoder(nn.Module):
     """
 
     def __init__(
-        self, name: str, dilation: bool, in_chans: int, pretrained=True, freeze_layers=True, fix_batch_norm=True
+        self, name: str, dilation: bool, num_channels: int, pretrained=True, freeze_layers=True, fix_batch_norm=True
     ):
         super().__init__()
 
-        kwargs = {"in_chans": in_chans}
+        kwargs = {"in_chans": num_channels}
         if dilation:
             kwargs["output_stride"] = 16
 
@@ -1165,8 +1165,8 @@ def __init__(self, config: DetrConfig):
         backbone = DetrTimmConvEncoder(
             config.backbone,
             config.dilation,
-            config.in_chans,
-            pretrained=config.pretrained,
+            config.num_channels,
+            pretrained=config.use_pretrained_backbone,
             freeze_layers=config.freeze_layers,
             fix_batch_norm=config.fix_batch_norm,
         )

From 9880a194eb8804ad614d8a097d93ebac141f10e8 Mon Sep 17 00:00:00 2001
From: Dima
Date: Mon, 27 Dec 2021 09:49:24 +0200
Subject: [PATCH 5/9] bugfix: change names in constructor

---
 src/transformers/models/detr/configuration_detr.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py
index 3ed7002b353652..feaff7a3418cc9 100644
--- a/src/transformers/models/detr/configuration_detr.py
+++ b/src/transformers/models/detr/configuration_detr.py
@@ -155,8 +155,8 @@ def __init__(
         bbox_loss_coefficient=5,
         giou_loss_coefficient=2,
         eos_coefficient=0.1,
-        in_chans=3,
-        pretrained=True,
+        num_channels=3,
+        use_pretrained_backbone=True,
         freeze_layers=True,
         fix_batch_norm=True,
         **kwargs
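Aside (not part of the patch): after the renames in patches 4 and 5, the config-level interface reads as below. This sketch assumes the series is applied in full and supersedes the earlier patch-1 example.

from transformers import DetrConfig, DetrModel

config = DetrConfig(
    backbone="resnet50",
    num_channels=1,                 # grayscale input
    use_pretrained_backbone=False,  # do not load timm's pretrained weights
    freeze_layers=False,
    fix_batch_norm=True,
)
model = DetrModel(config)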
From 23986a33ad0d65b7357cdad339dfa5095fb1f0b7 Mon Sep 17 00:00:00 2001
From: Dima
Date: Mon, 27 Dec 2021 10:42:53 +0200
Subject: [PATCH 6/9] bugfix

---
 src/transformers/image_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
index 78f34406824608..b292971ff86560 100644
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -177,10 +177,10 @@ def normalize(self, image, mean, std):
 
         if not isinstance(mean, torch.Tensor):
             mean = torch.tensor(mean)
-            mean.to(image.device)
+            mean = mean.to(image.device)
         if not isinstance(std, torch.Tensor):
             std = torch.tensor(std)
-            std.to(image.device)
+            std = std.to(image.device)
 
         if image.ndim == 3 and image.shape[0] in [1, 3]:
             return (image - mean[:, None, None]) / std[:, None, None]
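Aside (not part of the patch): the reassignment matters because `Tensor.to` is not in-place, so patch 4's bare `mean.to(image.device)` discarded its result and `mean` stayed on CPU. A minimal repro, assuming a CUDA device is available:

import torch

image = torch.rand(3, 4, 4, device="cuda")
mean = torch.tensor([0.485, 0.456, 0.406])  # created on CPU

mean.to(image.device)               # no-op for `mean`: .to() returns a new tensor
# image - mean[:, None, None]      -> RuntimeError: operands on different devices

mean = mean.to(image.device)        # the patch-6 fix keeps the moved tensor
out = image - mean[:, None, None]   # both operands now live on the same device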
From aaacc2b82fae4c44a4f643b79a5127e0259b26c9 Mon Sep 17 00:00:00 2001
From: Dima
Date: Mon, 27 Dec 2021 21:03:43 +0200
Subject: [PATCH 7/9] fix PIL grayscale handling

---
 .../models/detr/feature_extraction_detr.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py
index a2f93ac2a26096..709b12876e4800 100644
--- a/src/transformers/models/detr/feature_extraction_detr.py
+++ b/src/transformers/models/detr/feature_extraction_detr.py
@@ -581,13 +581,20 @@ def __call__(
         if pad_and_return_pixel_mask:
             # pad images up to largest image in batch and create pixel_mask
             max_size = self._max_by_axis([list(image.shape) for image in images])
-            c, h, w = max_size
+            if len(max_size) != 2:
+                c, h, w = max_size
+            else:
+                h, w = max_size
+                c = 1
             padded_images = []
             pixel_mask = []
             for image in images:
                 # create padded image
                 padded_image = np.zeros((c, h, w), dtype=np.float32)
-                padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image)
+                if c != 1:
+                    padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = np.copy(image)
+                else:
+                    padded_image[0, : image.shape[0], : image.shape[1]] = np.copy(image)
                 padded_images.append(padded_image)
                 # create pixel mask
                 mask = np.zeros((h, w), dtype=np.int64)

From f2677478c038a3be5f3245c6319130ca5a3abfd7 Mon Sep 17 00:00:00 2001
From: Dima
Date: Mon, 27 Dec 2021 21:23:30 +0200
Subject: [PATCH 8/9] fix mask for grayscale images

---
 src/transformers/models/detr/feature_extraction_detr.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py
index 709b12876e4800..aebaba12f47188 100644
--- a/src/transformers/models/detr/feature_extraction_detr.py
+++ b/src/transformers/models/detr/feature_extraction_detr.py
@@ -598,7 +598,8 @@ def __call__(
                 padded_images.append(padded_image)
                 # create pixel mask
                 mask = np.zeros((h, w), dtype=np.int64)
-                mask[: image.shape[1], : image.shape[2]] = True
+                if c == 1:
+                    mask[: image.shape[0], : image.shape[1]] = True
                 pixel_mask.append(mask)
 
             images = padded_images

From 8ac8b665e62da0079096f1467704cbbc8497875e Mon Sep 17 00:00:00 2001
From: Dima
Date: Tue, 28 Dec 2021 00:48:04 +0200
Subject: [PATCH 9/9] bugfix

---
 src/transformers/models/detr/feature_extraction_detr.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py
index aebaba12f47188..8775f0f200ae01 100644
--- a/src/transformers/models/detr/feature_extraction_detr.py
+++ b/src/transformers/models/detr/feature_extraction_detr.py
@@ -600,6 +600,8 @@ def __call__(
                 mask = np.zeros((h, w), dtype=np.int64)
                 if c == 1:
                     mask[: image.shape[0], : image.shape[1]] = True
+                else:
+                    mask[: image.shape[1], : image.shape[2]] = True
                 pixel_mask.append(mask)
 
             images = padded_images
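Aside (not part of the patch): taken together, patches 7-9 make the padding path shape-aware. 2D (grayscale) arrays are padded into a single-channel batch, and the pixel mask is indexed by the correct axes in both cases. A standalone sketch of the resulting behavior, using hypothetical input shapes:

import numpy as np

# Two grayscale images of different sizes, shaped (H, W).
images = [np.ones((48, 64), dtype=np.float32), np.ones((32, 80), dtype=np.float32)]

# Stand-in for DetrFeatureExtractor._max_by_axis: per-axis maximum.
max_size = [max(sizes) for sizes in zip(*(image.shape for image in images))]
if len(max_size) != 2:
    c, h, w = max_size      # (C, H, W) color batch
else:
    h, w = max_size         # (H, W) grayscale batch
    c = 1

padded_images, pixel_mask = [], []
for image in images:
    padded_image = np.zeros((c, h, w), dtype=np.float32)
    mask = np.zeros((h, w), dtype=np.int64)
    if c == 1:
        padded_image[0, : image.shape[0], : image.shape[1]] = image
        mask[: image.shape[0], : image.shape[1]] = True
    else:
        padded_image[: image.shape[0], : image.shape[1], : image.shape[2]] = image
        mask[: image.shape[1], : image.shape[2]] = True
    padded_images.append(padded_image)
    pixel_mask.append(mask)

# Each padded_image is (1, 48, 80); each mask marks its un-padded region.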