@@ -1540,9 +1540,7 @@ def image_guided_detection(
>>> import requests
>>> from PIL import Image
>>> import torch
- >>> import numpy as np
>>> from transformers import AutoProcessor, Owlv2ForObjectDetection
- >>> from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD

>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
@@ -1557,20 +1555,7 @@ def image_guided_detection(
>>> with torch.no_grad():
...     outputs = model.image_guided_detection(**inputs)

- >>> # Note: boxes need to be visualized on the padded, unnormalized image
- >>> # hence we'll set the target image sizes (height, width) based on that
-
- >>> def get_preprocessed_image(pixel_values):
- ...     pixel_values = pixel_values.squeeze().numpy()
- ...     unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]
- ...     unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
- ...     unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
- ...     unnormalized_image = Image.fromarray(unnormalized_image)
- ...     return unnormalized_image
-
- >>> unnormalized_image = get_preprocessed_image(inputs.pixel_values)
-
- >>> target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
+ >>> target_sizes = torch.Tensor([image.size[::-1]])

>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> results = processor.post_process_image_guided_detection(
@@ -1581,19 +1566,19 @@ def image_guided_detection(
>>> for box, score in zip(boxes, scores):
...     box = [round(i, 2) for i in box.tolist()]
...     print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}")
- Detected similar object with confidence 0.938 at location [490.96, 109.89, 821.09, 536.11]
- Detected similar object with confidence 0.959 at location [8.67, 721.29, 928.68, 732.78]
- Detected similar object with confidence 0.902 at location [4.27, 720.02, 941.45, 761.59]
- Detected similar object with confidence 0.985 at location [265.46, -58.9, 1009.04, 365.66]
- Detected similar object with confidence 1.0 at location [9.79, 28.69, 937.31, 941.64]
- Detected similar object with confidence 0.998 at location [869.97, 58.28, 923.23, 978.1]
- Detected similar object with confidence 0.985 at location [309.23, 21.07, 371.61, 932.02]
- Detected similar object with confidence 0.947 at location [27.93, 859.45, 969.75, 915.44]
- Detected similar object with confidence 0.996 at location [785.82, 41.38, 880.26, 966.37]
- Detected similar object with confidence 0.998 at location [5.08, 721.17, 925.93, 998.41]
- Detected similar object with confidence 0.969 at location [6.7, 898.1, 921.75, 949.51]
- Detected similar object with confidence 0.966 at location [47.16, 927.29, 981.99, 942.14]
- Detected similar object with confidence 0.924 at location [46.4, 936.13, 953.02, 950.78]
+ Detected similar object with confidence 0.938 at location [327.31, 54.94, 547.39, 268.06]
+ Detected similar object with confidence 0.959 at location [5.78, 360.65, 619.12, 366.39]
+ Detected similar object with confidence 0.902 at location [2.85, 360.01, 627.63, 380.8]
+ Detected similar object with confidence 0.985 at location [176.98, -29.45, 672.69, 182.83]
+ Detected similar object with confidence 1.0 at location [6.53, 14.35, 624.87, 470.82]
+ Detected similar object with confidence 0.998 at location [579.98, 29.14, 615.49, 489.05]
+ Detected similar object with confidence 0.985 at location [206.15, 10.53, 247.74, 466.01]
+ Detected similar object with confidence 0.947 at location [18.62, 429.72, 646.5, 457.72]
+ Detected similar object with confidence 0.996 at location [523.88, 20.69, 586.84, 483.18]
+ Detected similar object with confidence 0.998 at location [3.39, 360.59, 617.29, 499.21]
+ Detected similar object with confidence 0.969 at location [4.47, 449.05, 614.5, 474.76]
+ Detected similar object with confidence 0.966 at location [31.44, 463.65, 654.66, 471.07]
+ Detected similar object with confidence 0.924 at location [30.93, 468.07, 635.35, 475.39]

```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1665,10 +1650,8 @@ def forward(
```python
>>> import requests
>>> from PIL import Image
- >>> import numpy as np
>>> import torch
>>> from transformers import AutoProcessor, Owlv2ForObjectDetection
- >>> from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD

>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
@@ -1682,20 +1665,7 @@ def forward(
>>> with torch.no_grad():
...     outputs = model(**inputs)

- >>> # Note: boxes need to be visualized on the padded, unnormalized image
- >>> # hence we'll set the target image sizes (height, width) based on that
-
- >>> def get_preprocessed_image(pixel_values):
- ...     pixel_values = pixel_values.squeeze().numpy()
- ...     unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]
- ...     unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
- ...     unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
- ...     unnormalized_image = Image.fromarray(unnormalized_image)
- ...     return unnormalized_image
-
- >>> unnormalized_image = get_preprocessed_image(inputs.pixel_values)
-
- >>> target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
+ >>> target_sizes = torch.Tensor([image.size[::-1]])
>>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
>>> results = processor.post_process_object_detection(
...     outputs=outputs, threshold=0.2, target_sizes=target_sizes
@@ -1708,8 +1678,8 @@ def forward(
>>> for box, score, label in zip(boxes, scores, labels):
...     box = [round(i, 2) for i in box.tolist()]
...     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
- Detected a photo of a cat with confidence 0.614 at location [512.5, 35.08, 963.48, 557.02]
- Detected a photo of a cat with confidence 0.665 at location [10.13, 77.94, 489.93, 709.69]
+ Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
+ Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]

```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (