Skip to content

Commit

Permalink
Fix OWLv2 post_process_object_detection for multiple images (huggingface#31082)
Browse files Browse the repository at this point in the history

* Add test for multiple images

* [run slow] owlv2

* Fix box rescaling

* [run slow] owlv2
  • Loading branch information
qubvel authored May 28, 2024
1 parent c31473e commit 98e2d48
Showing 2 changed files with 35 additions and 18 deletions.
16 changes: 4 additions & 12 deletions src/transformers/models/owlv2/image_processing_owlv2.py
Original file line number Diff line number Diff line change
@@ -524,19 +524,11 @@ def post_process_object_detection(
else:
img_h, img_w = target_sizes.unbind(1)

# rescale coordinates
width_ratio = 1
height_ratio = 1
# Rescale coordinates, image is padded to square for inference,
# that is why we need to scale boxes to the max size
size = torch.max(img_h, img_w)
scale_fct = torch.stack([size, size, size, size], dim=1).to(boxes.device)

if img_w < img_h:
width_ratio = img_w / img_h
elif img_h < img_w:
height_ratio = img_h / img_w

img_w = img_w / width_ratio
img_h = img_h / height_ratio

scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
boxes = boxes * scale_fct[:, None, :]

results = []
37 changes: 31 additions & 6 deletions tests/models/owlv2/test_image_processor_owlv2.py
Original file line number Diff line number Diff line change
@@ -130,17 +130,42 @@ def test_image_processor_integration_test_resize(self):
model = Owlv2ForObjectDetection.from_pretrained(checkpoint)

image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
inputs = processor(text=["cat"], images=image, return_tensors="pt")
text = ["cat"]
target_size = image.size[::-1]
expected_boxes = torch.tensor(
[
[341.66656494140625, 23.38756561279297, 642.321044921875, 371.3482971191406],
[6.753320693969727, 51.96149826049805, 326.61810302734375, 473.12982177734375],
]
)

# single image
inputs = processor(text=[text], images=[image], return_tensors="pt")
with torch.no_grad():
outputs = model(**inputs)

target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, threshold=0.2, target_sizes=target_sizes)[0]
results = processor.post_process_object_detection(outputs, threshold=0.2, target_sizes=[target_size])[0]

boxes = results["boxes"]
self.assertTrue(
torch.allclose(boxes, expected_boxes, atol=1e-2),
f"Single image bounding boxes fail. Expected {expected_boxes}, got {boxes}",
)

# batch of images
inputs = processor(text=[text, text], images=[image, image], return_tensors="pt")
with torch.no_grad():
outputs = model(**inputs)
results = processor.post_process_object_detection(
outputs, threshold=0.2, target_sizes=[target_size, target_size]
)

boxes = results["boxes"].tolist()
self.assertEqual(boxes[0], [341.66656494140625, 23.38756561279297, 642.321044921875, 371.3482971191406])
self.assertEqual(boxes[1], [6.753320693969727, 51.96149826049805, 326.61810302734375, 473.12982177734375])
for result in results:
boxes = result["boxes"]
self.assertTrue(
torch.allclose(boxes, expected_boxes, atol=1e-2),
f"Batch image bounding boxes fail. Expected {expected_boxes}, got {boxes}",
)

@unittest.skip("OWLv2 doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
def test_call_numpy_4_channels(self):

0 comments on commit 98e2d48

Please sign in to comment.