From e8c60437096229d6fd24958cc44a9cb8d5885b13 Mon Sep 17 00:00:00 2001
From: Connor Anderson
Date: Fri, 12 Jul 2024 12:13:44 -0400
Subject: [PATCH 1/3] Add check for target_sizes is None in
 post_process_image_guided_detection

---
 .../models/owlv2/image_processing_owlv2.py   | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/owlv2/image_processing_owlv2.py b/src/transformers/models/owlv2/image_processing_owlv2.py
index 1e9a5163a1a6fd..3e5cad4b8f5545 100644
--- a/src/transformers/models/owlv2/image_processing_owlv2.py
+++ b/src/transformers/models/owlv2/image_processing_owlv2.py
@@ -565,10 +565,11 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
         """
         logits, target_boxes = outputs.logits, outputs.target_pred_boxes
 
-        if len(logits) != len(target_sizes):
-            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
-        if target_sizes.shape[1] != 2:
-            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+        if target_sizes is not None:
+            if len(logits) != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
 
         probs = torch.max(logits, dim=-1)
         scores = torch.sigmoid(probs.values)
@@ -588,9 +589,14 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
                     scores[idx][ious > nms_threshold] = 0.0
 
         # Convert from relative [0, 1] to absolute [0, height] coordinates
-        img_h, img_w = target_sizes.unbind(1)
-        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
-        target_boxes = target_boxes * scale_fct[:, None, :]
+        if target_sizes is not None:
+            if isinstance(target_size, List):
+                img_h = torch.tensor([i[0] for i in target_sizes])
+                img_w = torch.tensor([i[1] for i in target_sizes])
+            else:
+                img_h, img_w = target_sizes.unbind(1)
+            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
+            target_boxes = target_boxes * scale_fct[:, None, :]
 
         # Compute box display alphas based on prediction scores
         results = []

From 26af6fc050143e61e6fd406852430d7f2b6f548a Mon Sep 17 00:00:00 2001
From: Connor Anderson
Date: Mon, 15 Jul 2024 15:01:41 -0400
Subject: [PATCH 2/3] Make sure Owlvit and Owlv2 in sync

---
 .../models/owlv2/image_processing_owlv2.py   |  6 +++---
 .../models/owlvit/image_processing_owlvit.py | 16 +++++++++++-----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/owlv2/image_processing_owlv2.py b/src/transformers/models/owlv2/image_processing_owlv2.py
index 3e5cad4b8f5545..ff812cff9ff9b0 100644
--- a/src/transformers/models/owlv2/image_processing_owlv2.py
+++ b/src/transformers/models/owlv2/image_processing_owlv2.py
@@ -590,13 +590,13 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
 
         # Convert from relative [0, 1] to absolute [0, height] coordinates
         if target_sizes is not None:
-            if isinstance(target_size, List):
+            if isinstance(target_sizes, List):
                 img_h = torch.tensor([i[0] for i in target_sizes])
                 img_w = torch.tensor([i[1] for i in target_sizes])
             else:
                 img_h, img_w = target_sizes.unbind(1)
-            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
-            target_boxes = target_boxes * scale_fct[:, None, :]
+        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
+        target_boxes = target_boxes * scale_fct[:, None, :]
 
         # Compute box display alphas based on prediction scores
         results = []
diff --git a/src/transformers/models/owlvit/image_processing_owlvit.py b/src/transformers/models/owlvit/image_processing_owlvit.py
index 25ea5f2720d527..21f283084470cb 100644
--- a/src/transformers/models/owlvit/image_processing_owlvit.py
+++ b/src/transformers/models/owlvit/image_processing_owlvit.py
@@ -556,10 +556,11 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
         """
         logits, target_boxes = outputs.logits, outputs.target_pred_boxes
 
-        if len(logits) != len(target_sizes):
-            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
-        if target_sizes.shape[1] != 2:
-            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+        if target_sizes is not None:
+            if len(logits) != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
 
         probs = torch.max(logits, dim=-1)
         scores = torch.sigmoid(probs.values)
@@ -579,7 +580,12 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
                     scores[idx][ious > nms_threshold] = 0.0
 
         # Convert from relative [0, 1] to absolute [0, height] coordinates
-        img_h, img_w = target_sizes.unbind(1)
+        if target_sizes is not None:
+            if isinstance(target_sizes, List):
+                img_h = torch.tensor([i[0] for i in target_sizes])
+                img_w = torch.tensor([i[1] for i in target_sizes])
+            else:
+                img_h, img_w = target_sizes.unbind(1)
         scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
         target_boxes = target_boxes * scale_fct[:, None, :]
 

From 66e3f067af7dbd3063850c5c41a87ba444ceb6a3 Mon Sep 17 00:00:00 2001
From: Connor Anderson
Date: Mon, 22 Jul 2024 09:20:08 -0400
Subject: [PATCH 3/3] Fix incorrect indentation; add check for correct size of
 target_sizes

---
 .../models/owlv2/image_processing_owlv2.py   | 13 ++++++-------
 .../models/owlvit/image_processing_owlvit.py | 13 ++++++-------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/transformers/models/owlv2/image_processing_owlv2.py b/src/transformers/models/owlv2/image_processing_owlv2.py
index ff812cff9ff9b0..d3ef04238a8f80 100644
--- a/src/transformers/models/owlv2/image_processing_owlv2.py
+++ b/src/transformers/models/owlv2/image_processing_owlv2.py
@@ -565,11 +565,10 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
         """
         logits, target_boxes = outputs.logits, outputs.target_pred_boxes
 
-        if target_sizes is not None:
-            if len(logits) != len(target_sizes):
-                raise ValueError(
-                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
-                )
+        if target_sizes is not None and len(logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes is not None and target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
 
         probs = torch.max(logits, dim=-1)
         scores = torch.sigmoid(probs.values)
@@ -595,8 +594,8 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
                 img_w = torch.tensor([i[1] for i in target_sizes])
             else:
                 img_h, img_w = target_sizes.unbind(1)
-        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
-        target_boxes = target_boxes * scale_fct[:, None, :]
+            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
+            target_boxes = target_boxes * scale_fct[:, None, :]
 
         # Compute box display alphas based on prediction scores
         results = []
diff --git a/src/transformers/models/owlvit/image_processing_owlvit.py b/src/transformers/models/owlvit/image_processing_owlvit.py
index 21f283084470cb..5bc889ba85d501 100644
--- a/src/transformers/models/owlvit/image_processing_owlvit.py
+++ b/src/transformers/models/owlvit/image_processing_owlvit.py
@@ -556,11 +556,10 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
         """
         logits, target_boxes = outputs.logits, outputs.target_pred_boxes
 
-        if target_sizes is not None:
-            if len(logits) != len(target_sizes):
-                raise ValueError(
-                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
-                )
+        if target_sizes is not None and len(logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes is not None and target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
 
         probs = torch.max(logits, dim=-1)
         scores = torch.sigmoid(probs.values)
@@ -586,8 +585,8 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh
                 img_w = torch.tensor([i[1] for i in target_sizes])
             else:
                 img_h, img_w = target_sizes.unbind(1)
-        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
-        target_boxes = target_boxes * scale_fct[:, None, :]
+            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
+            target_boxes = target_boxes * scale_fct[:, None, :]
 
         # Compute box display alphas based on prediction scores
         results = []
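
Note: a minimal usage sketch of the behavior these patches enable, assuming the public google/owlv2-base-patch16-ensemble checkpoint and purely illustrative placeholder images. With target_sizes omitted, post_process_image_guided_detection now returns boxes in relative [0, 1] coordinates instead of failing on a None target_sizes; passing target_sizes as either a tensor or a list of (height, width) pairs rescales boxes to pixel coordinates:

    # Sketch only: the checkpoint is the public OWLv2 checkpoint; the images are blank placeholders.
    import torch
    from PIL import Image
    from transformers import Owlv2ForObjectDetection, Owlv2Processor

    processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
    model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

    image = Image.new("RGB", (640, 480))        # placeholder target image
    query_image = Image.new("RGB", (320, 240))  # placeholder query image

    inputs = processor(images=image, query_images=query_image, return_tensors="pt")
    with torch.no_grad():
        outputs = model.image_guided_detection(**inputs)

    # With these patches, target_sizes may be omitted: boxes stay in relative [0, 1] coordinates.
    relative = processor.image_processor.post_process_image_guided_detection(outputs, threshold=0.0)

    # Passing target_sizes (a tensor or a list of (h, w) pairs) rescales boxes to absolute pixels.
    absolute = processor.image_processor.post_process_image_guided_detection(
        outputs, threshold=0.0, target_sizes=[(image.height, image.width)]
    )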