Skip to content

Commit

Permalink
fix bugs in detection
Browse files Browse the repository at this point in the history
  • Loading branch information
zkyseu committed Nov 14, 2023
1 parent d074f26 commit 9b94e0e
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 27 deletions.
58 changes: 44 additions & 14 deletions flowvision/models/detection/fcos.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,22 @@
from flowvision.layers import boxes as box_ops
from flowvision.layers import misc as misc_nn_ops
from flowvision.layers import LastLevelP6P7
from ..utils import load_state_dict_from_url
from .transform import _resize_boxes,paste_masks_in_image,_resize_keypoints
from . import det_utils
from .anchor_utils import AnchorGenerator
from .backbone_utils import resnet_fpn_backbone, _validate_trainable_layers
from .transform import GeneralizedRCNNTransform
from ..registry import ModelCreator


# Download locations of pretrained checkpoints, keyed by weights name.
# The builder looks up ``model_urls[weights_name]`` and hands the URL to
# ``load_state_dict_from_url``.
model_urls={
"fcos_resnet50_fpn_coco":"http://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/fcos_resnet50_fpn/model.pth"
}

class FCOSHead(nn.Module):
"""
A regression and classification head for use in FCOS.
Args:
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
Expand Down Expand Up @@ -153,7 +158,6 @@ def forward(self, x: List[Tensor]) -> Dict[str, Tensor]:
class FCOSClassificationHead(nn.Module):
"""
A classification head for use in FCOS.
Args:
in_channels (int): number of channels of the input feature.
num_anchors (int): number of anchors to be predicted.
Expand Down Expand Up @@ -225,7 +229,6 @@ def forward(self, x: List[Tensor]) -> Tensor:
class FCOSRegressionHead(nn.Module):
"""
A regression head for use in FCOS.
Args:
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
Expand Down Expand Up @@ -297,29 +300,23 @@ def forward(self, x: List[Tensor]) -> Tuple[Tensor, Tensor]:
class FCOS(nn.Module):
"""
Implements FCOS.
The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
image, and should be in 0-1 range. Different images can have different sizes.
The behavior of the model changes depending on whether it is in training or evaluation mode.
During training, the model expects both the input tensors and the targets (a list of dictionaries),
containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (Int64Tensor[N]): the class label for each ground-truth box
The model returns a Dict[Tensor] during training, containing the classification, regression
and centerness losses.
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
follows:
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (Int64Tensor[N]): the predicted labels for each image
- scores (Tensor[N]): the scores for each prediction
Args:
backbone (nn.Module): the network used to compute the features for the model.
It should contain an out_channels attribute, which indicates the number of output
Expand Down Expand Up @@ -534,6 +531,8 @@ def postprocess_detections(

# keep only topk scoring predictions
num_topk = min(self.topk_candidates, topk_idxs.size(0))
if num_topk <= 0:
continue
scores_per_level, idxs = scores_per_level.topk(num_topk)
topk_idxs = topk_idxs[idxs]

Expand All @@ -542,7 +541,7 @@ def postprocess_detections(

boxes_per_level = self.box_coder.decode_single(
box_regression_per_level[anchor_idxs],
anchors_per_level[anchors_idxs],
anchors_per_level[anchor_idxs],
)
boxes_per_level = box_ops.clip_boxes_to_image(
boxes_per_level, image_shape
Expand All @@ -552,6 +551,16 @@ def postprocess_detections(
image_scores.append(scores_per_level)
image_labels.append(labels_per_level)

if len(image_boxes) <= 0:
detections.append(
{
"boxes": flow.tensor(image_boxes),
"scores": flow.tensor(image_scores),
"labels": flow.tensor(image_labels),
}
)
continue

image_boxes = flow.cat(image_boxes, dim=0)
image_scores = flow.cat(image_scores, dim=0)
image_labels = flow.cat(image_labels, dim=0)
Expand All @@ -572,14 +581,35 @@ def postprocess_detections(

return detections

def postprocess_bbox(self, result, image_shapes, original_image_sizes):
    """
    Map predictions from the resized-image frame back to the original images.

    In training mode ``result`` is returned untouched. Otherwise every
    prediction dict in ``result`` is updated in place and ``result`` itself
    is returned.

    Args:
        result: per-image prediction dicts; each must hold a ``"boxes"`` entry
            and may hold ``"masks"`` and/or ``"keypoints"``.
        image_shapes: per-image sizes passed to the resize helpers as the
            current size (presumably the post-transform sizes -- confirm
            against the caller).
        original_image_sizes: per-image sizes passed to the helpers as the
            target size.

    Returns:
        The same ``result`` object that was passed in.
    """
    if self.training:
        return result

    per_image = zip(result, image_shapes, original_image_sizes)
    for idx, (detection, current_size, original_size) in enumerate(per_image):
        rescaled = detection["boxes"]
        has_boxes = len(rescaled) > 0
        if has_boxes:
            rescaled = _resize_boxes(rescaled, current_size, original_size)
        result[idx]["boxes"] = rescaled

        # With no boxes there is nothing to rescale; masks and keypoints
        # are deliberately left as they are.
        if not has_boxes:
            continue

        if "masks" in detection:
            result[idx]["masks"] = paste_masks_in_image(
                detection["masks"], rescaled, original_size
            )
        if "keypoints" in detection:
            result[idx]["keypoints"] = _resize_keypoints(
                detection["keypoints"], current_size, original_size
            )
    return result

def forward(
self, images: List[Tensor], targets: Optional[List[Dict[str, Tensor]]] = None,
) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
"""
Args:
images (list[Tensor]): images to be processed
targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional)
Returns:
result (list[BoxList] or dict[Tensor]): the output from the model.
During training, it returns a dict[Tensor] which contains the losses.
Expand Down Expand Up @@ -668,9 +698,9 @@ def forward(

# compute the detections
detections = self.postprocess_detections(
split_head_outputs, split_anchors, images.image_size
split_head_outputs, split_anchors, images.image_sizes
)
detections = self.transform.postprocess(
detections = self.postprocess_bbox(
detections, images.image_sizes, original_image_sizes
)

Expand Down Expand Up @@ -710,7 +740,7 @@ def _fcos_resnet_fpn(
state_dict = load_state_dict_from_url(
model_urls[weights_name], progress=progress
)
model.load_state_dict(state_dict)
model.load_state_dict(state_dict['model'])
det_utils.overwrite_eps(model, 0.0)
return model

Expand Down
48 changes: 36 additions & 12 deletions flowvision/models/detection/retinanet.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from ..utils import load_state_dict_from_url

from .transform import _resize_boxes,paste_masks_in_image,_resize_keypoints
from . import det_utils
from .anchor_utils import AnchorGenerator
from .transform import GeneralizedRCNNTransform
Expand Down Expand Up @@ -39,7 +40,6 @@ def _sum(x: List[Tensor]) -> Tensor:
class RetinaNetHead(nn.Module):
"""
A regression and classification head for use in RetinaNet.
Args:
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
Expand Down Expand Up @@ -79,7 +79,6 @@ def forward(self, x: List[Tensor]) -> Dict[str, Tensor]:
class RetinaNetClassificationHead(nn.Module):
"""
A classification head for use in RetinaNet.
Args:
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
Expand Down Expand Up @@ -177,7 +176,6 @@ def forward(self, x: List[Tensor]) -> Tensor:
class RetinaNetRegressionHead(nn.Module):
"""
A regression head for use in RetinaNet.
Args:
in_channels (int): number of channels of the input feature
num_anchors (int): number of anchors to be predicted
Expand Down Expand Up @@ -277,29 +275,23 @@ def forward(self, x: List[Tensor]) -> Tensor:
class RetinaNet(nn.Module):
"""
Implements RetinaNet.
The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
image, and should be in 0-1 range. Different images can have different sizes.
The behavior of the model changes depending on whether it is in training or evaluation mode.
During training, the model expects both the input tensors and the targets (a list of dictionaries),
containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (Int64Tensor[N]): the class label for each ground-truth box
The model returns a Dict[Tensor] during training, containing the classification and regression
losses.
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
follows:
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- labels (Int64Tensor[N]): the predicted labels for each image
- scores (Tensor[N]): the scores for each prediction
Args:
backbone (nn.Module): the network used to compute the features for the model.
It should contain an out_channels attribute, which indicates the number of output
Expand Down Expand Up @@ -467,6 +459,8 @@ def postprocess_detections(

# keep only topk scoring predictions
num_topk = min(self.topk_candidates, topk_idxs.size(0))
if num_topk <= 0:
continue
scores_per_level, idxs = scores_per_level.topk(num_topk)
topk_idxs = topk_idxs[idxs]

Expand All @@ -485,6 +479,15 @@ def postprocess_detections(
image_scores.append(scores_per_level)
image_labels.append(labels_per_level)

if len(image_boxes) <= 0:
detections.append(
{
"boxes": flow.tensor(image_boxes),
"scores": flow.tensor(image_scores),
"labels": flow.tensor(image_labels),
}
)
continue
image_boxes = flow.cat(image_boxes, dim=0)
image_scores = flow.cat(image_scores, dim=0)
image_labels = flow.cat(image_labels, dim=0)
Expand All @@ -505,20 +508,41 @@ def postprocess_detections(

return detections


def postprocess_bbox(self, result, image_shapes, original_image_sizes):
    """
    Rescale inference outputs so they refer to the original input images.

    When the module is in training mode, ``result`` is handed back as-is.
    In evaluation mode each prediction dict is modified in place and the
    same ``result`` container is returned.

    Args:
        result: per-image prediction dicts with a ``"boxes"`` entry and
            optional ``"masks"`` / ``"keypoints"`` entries.
        image_shapes: per-image sizes given to the resize helpers as the
            source size (presumably the sizes after the input transform --
            confirm against the caller).
        original_image_sizes: per-image sizes given to the helpers as the
            destination size.

    Returns:
        ``result``, the object received as input.
    """
    if self.training:
        return result

    triples = zip(result, image_shapes, original_image_sizes)
    for idx, (detection, current_size, original_size) in enumerate(triples):
        mapped_boxes = detection["boxes"]
        nonempty = len(mapped_boxes) > 0
        if nonempty:
            mapped_boxes = _resize_boxes(mapped_boxes, current_size, original_size)
        result[idx]["boxes"] = mapped_boxes

        # Images without any detection skip the mask/keypoint handling.
        if not nonempty:
            continue

        if "masks" in detection:
            result[idx]["masks"] = paste_masks_in_image(
                detection["masks"], mapped_boxes, original_size
            )
        if "keypoints" in detection:
            result[idx]["keypoints"] = _resize_keypoints(
                detection["keypoints"], current_size, original_size
            )
    return result

def forward(
self, images: List[Tensor], targets: Optional[List[Dict[str, Tensor]]] = None
) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
"""
Args:
images (list[Tensor]): images to be processed
targets (list[Dict[Tensor]]): ground-truth boxes present in the image (optional)
Returns:
result (list[BoxList] or dict[Tensor]): the output from the model.
During training, it returns a dict[Tensor] which contains the losses.
During testing, it returns a list[BoxList] that contains additional fields
like `scores`, `labels` and `mask` (for Mask R-CNN models).
"""
if self.training and targets is None:
raise ValueError("In training mode, targets should be passed")
Expand Down Expand Up @@ -611,7 +635,7 @@ def forward(
detections = self.postprocess_detections(
split_head_outputs, split_anchors, images.image_sizes
)
detections = self.transform.postprocess(
detections = self.postprocess_bbox(
detections, images.image_sizes, original_image_sizes
)

Expand Down
2 changes: 2 additions & 0 deletions flowvision/version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Version string of the flowvision package.
__version__ = '0.2.1'
# Full git commit hash associated with this version
# (presumably recorded when the package was built -- TODO confirm).
git_version = 'd074f26909e427c7ba19c5f1d4c8ff97fa6a4ea9'
2 changes: 1 addition & 1 deletion projects/detection/coco_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def prepare(self, predictions, iou_type):
def prepare_for_coco_detection(self, predictions):
coco_results = []
for original_id, prediction in predictions.items():
if len(prediction) == 0:
if len(prediction) == 0 or len(prediction['boxes'])==0:
continue

boxes = prediction["boxes"]
Expand Down

0 comments on commit 9b94e0e

Please sign in to comment.