From e8c60437096229d6fd24958cc44a9cb8d5885b13 Mon Sep 17 00:00:00 2001
From: Connor Anderson
Date: Fri, 12 Jul 2024 12:13:44 -0400
Subject: [PATCH 1/3] Add check for target_sizes is None in
 post_process_image_guided_detection

---
 .../models/owlv2/image_processing_owlv2.py   | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/owlv2/image_processing_owlv2.py b/src/transformers/models/owlv2/image_processing_owlv2.py
index 1e9a5163a1a6fd..3e5cad4b8f5545 100644
--- a/src/transformers/models/owlv2/image_processing_owlv2.py
+++ b/src/transformers/models/owlv2/image_processing_owlv2.py
@@ -565,10 +565,11 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
         """
         logits, target_boxes = outputs.logits, outputs.target_pred_boxes
 
-        if len(logits) != len(target_sizes):
-            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
-        if target_sizes.shape[1] != 2:
-            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+        if target_sizes is not None:
+            if len(logits) != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
 
         probs = torch.max(logits, dim=-1)
         scores = torch.sigmoid(probs.values)
@@ -588,9 +589,14 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
                     scores[idx][ious > nms_threshold] = 0.0
 
         # Convert from relative [0, 1] to absolute [0, height] coordinates
-        img_h, img_w = target_sizes.unbind(1)
-        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
-        target_boxes = target_boxes * scale_fct[:, None, :]
+        if target_sizes is not None:
+            if isinstance(target_size, List):
+                img_h = torch.tensor([i[0] for i in target_sizes])
+                img_w = torch.tensor([i[1] for i in target_sizes])
+            else:
+                img_h, img_w = target_sizes.unbind(1)
+            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
+            target_boxes = target_boxes * scale_fct[:, None, :]
 
         # Compute box display alphas based on prediction scores
         results = []

From 26af6fc050143e61e6fd406852430d7f2b6f548a Mon Sep 17 00:00:00 2001
From: Connor Anderson
Date: Mon, 15 Jul 2024 15:01:41 -0400
Subject: [PATCH 2/3] Make sure Owlvit and Owlv2 in sync

---
 .../models/owlv2/image_processing_owlv2.py   |  6 +++---
 .../models/owlvit/image_processing_owlvit.py | 16 +++++++++++-----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/owlv2/image_processing_owlv2.py b/src/transformers/models/owlv2/image_processing_owlv2.py
index 3e5cad4b8f5545..ff812cff9ff9b0 100644
--- a/src/transformers/models/owlv2/image_processing_owlv2.py
+++ b/src/transformers/models/owlv2/image_processing_owlv2.py
@@ -590,13 +590,13 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
 
         # Convert from relative [0, 1] to absolute [0, height] coordinates
         if target_sizes is not None:
-            if isinstance(target_size, List):
+            if isinstance(target_sizes, List):
                 img_h = torch.tensor([i[0] for i in target_sizes])
                 img_w = torch.tensor([i[1] for i in target_sizes])
             else:
                 img_h, img_w = target_sizes.unbind(1)
-            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
-            target_boxes = target_boxes * scale_fct[:, None, :]
+        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
+        target_boxes = target_boxes * scale_fct[:, None, :]
 
         # Compute box display alphas based on prediction scores
         results = []
diff --git a/src/transformers/models/owlvit/image_processing_owlvit.py b/src/transformers/models/owlvit/image_processing_owlvit.py
index 25ea5f2720d527..21f283084470cb 100644
--- a/src/transformers/models/owlvit/image_processing_owlvit.py
+++ b/src/transformers/models/owlvit/image_processing_owlvit.py
@@ -556,10 +556,11 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
         """
         logits, target_boxes = outputs.logits, outputs.target_pred_boxes
 
-        if len(logits) != len(target_sizes):
-            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
-        if target_sizes.shape[1] != 2:
-            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+        if target_sizes is not None:
+            if len(logits) != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
 
         probs = torch.max(logits, dim=-1)
         scores = torch.sigmoid(probs.values)
@@ -579,7 +580,12 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
                     scores[idx][ious > nms_threshold] = 0.0
 
         # Convert from relative [0, 1] to absolute [0, height] coordinates
-        img_h, img_w = target_sizes.unbind(1)
+        if target_sizes is not None:
+            if isinstance(target_sizes, List):
+                img_h = torch.tensor([i[0] for i in target_sizes])
+                img_w = torch.tensor([i[1] for i in target_sizes])
+            else:
+                img_h, img_w = target_sizes.unbind(1)
         scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
         target_boxes = target_boxes * scale_fct[:, None, :]
 

From 66e3f067af7dbd3063850c5c41a87ba444ceb6a3 Mon Sep 17 00:00:00 2001
From: Connor Anderson
Date: Mon, 22 Jul 2024 09:20:08 -0400
Subject: [PATCH 3/3] Fix incorrect indentation; add check for correct size of
 target_sizes

---
 .../models/owlv2/image_processing_owlv2.py   | 13 ++++++-------
 .../models/owlvit/image_processing_owlvit.py | 13 ++++++-------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/transformers/models/owlv2/image_processing_owlv2.py b/src/transformers/models/owlv2/image_processing_owlv2.py
index ff812cff9ff9b0..d3ef04238a8f80 100644
--- a/src/transformers/models/owlv2/image_processing_owlv2.py
+++ b/src/transformers/models/owlv2/image_processing_owlv2.py
@@ -565,11 +565,10 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
         """
         logits, target_boxes = outputs.logits, outputs.target_pred_boxes
 
-        if target_sizes is not None:
-            if len(logits) != len(target_sizes):
-                raise ValueError(
-                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
-                )
+        if target_sizes is not None and len(logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes is not None and target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
 
         probs = torch.max(logits, dim=-1)
         scores = torch.sigmoid(probs.values)
@@ -595,8 +594,8 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
                 img_w = torch.tensor([i[1] for i in target_sizes])
             else:
                 img_h, img_w = target_sizes.unbind(1)
-        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
-        target_boxes = target_boxes * scale_fct[:, None, :]
+            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
+            target_boxes = target_boxes * scale_fct[:, None, :]
 
         # Compute box display alphas based on prediction scores
         results = []
diff --git a/src/transformers/models/owlvit/image_processing_owlvit.py b/src/transformers/models/owlvit/image_processing_owlvit.py
index 21f283084470cb..5bc889ba85d501 100644
--- a/src/transformers/models/owlvit/image_processing_owlvit.py
+++ b/src/transformers/models/owlvit/image_processing_owlvit.py
@@ -556,11 +556,10 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
         """
         logits, target_boxes = outputs.logits, outputs.target_pred_boxes
 
-        if target_sizes is not None:
-            if len(logits) != len(target_sizes):
-                raise ValueError(
-                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
-                )
+        if target_sizes is not None and len(logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes is not None and target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
 
         probs = torch.max(logits, dim=-1)
         scores = torch.sigmoid(probs.values)
@@ -586,8 +585,8 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
                 img_w = torch.tensor([i[1] for i in target_sizes])
             else:
                 img_h, img_w = target_sizes.unbind(1)
-        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
-        target_boxes = target_boxes * scale_fct[:, None, :]
+            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
+            target_boxes = target_boxes * scale_fct[:, None, :]
 
         # Compute box display alphas based on prediction scores
         results = []
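
Note: a minimal usage sketch of the behavior these patches enable, assuming the public google/owlv2-base-patch16-ensemble checkpoint and purely illustrative placeholder images. With target_sizes omitted, post_process_image_guided_detection now returns boxes in relative [0, 1] coordinates instead of failing on a None target_sizes; passing target_sizes as either a tensor or a list of (height, width) pairs rescales boxes to pixel coordinates:

    # Sketch only: the checkpoint is the public OWLv2 checkpoint; the images are blank placeholders.
    import torch
    from PIL import Image
    from transformers import Owlv2ForObjectDetection, Owlv2Processor

    processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
    model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

    image = Image.new("RGB", (640, 480))        # placeholder target image
    query_image = Image.new("RGB", (320, 240))  # placeholder query image

    inputs = processor(images=image, query_images=query_image, return_tensors="pt")
    with torch.no_grad():
        outputs = model.image_guided_detection(**inputs)

    # With these patches, target_sizes may be omitted: boxes stay in relative [0, 1] coordinates.
    relative = processor.image_processor.post_process_image_guided_detection(outputs, threshold=0.0)

    # Passing target_sizes (a tensor or a list of (h, w) pairs) rescales boxes to absolute pixels.
    absolute = processor.image_processor.post_process_image_guided_detection(
        outputs, threshold=0.0, target_sizes=[(image.height, image.width)]
    )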