Batched inference API and support for float16 inference #279

Status: Open · wants to merge 2 commits into base: main
Binary file added demo/assets/demo1.jpg
Binary file added demo/assets/demo2.jpg
Binary file added demo/assets/demo3.jpg
Binary file added demo/assets/demo4.jpg
Binary file added demo/assets/demo5.jpg
Binary file added demo/assets/demo6.jpg
Binary file added demo/assets/demo7.jpg
Binary file added demo/assets/demo8.jpg
Binary file added demo/assets/demo9.jpg
292 changes: 292 additions & 0 deletions demo/groundingDINO_batched_float16.ipynb
@@ -0,0 +1,292 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Grounding DINO - Batched Half Precision Inference"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prepare Environments"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from PIL import Image\n",
"import io\n",
"import os\n",
"import supervision as sv\n",
"import numpy as np\n",
"import requests\n",
"import cv2\n",
"\n",
"# Grounding DINO\n",
"from groundingdino.util.inference import BatchedModel\n",
"import torchvision.transforms.functional as F\n",
"from huggingface_hub import hf_hub_download\n",
"\n",
"# If you have multiple GPUs, you can set the GPU to use here.\n",
"# The default is to use the first GPU, which is usually GPU 0.\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load Grounding DINO model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load demo image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def download_image(url, image_file_path):\n",
" r = requests.get(url, timeout=4.0)\n",
" if r.status_code != requests.codes.ok:\n",
" assert False, 'Status code error: {}.'.format(r.status_code)\n",
"\n",
" with Image.open(io.BytesIO(r.content)) as im:\n",
" im.save(image_file_path)\n",
"\n",
" print('Image downloaded from url: {} and saved to: {}.'.format(url, image_file_path))\n",
"\n",
"def load_image(image_path):\n",
" image_source = Image.open(image_path).convert(\"RGB\")\n",
" image = np.asarray(image_source)\n",
" image_tensor = F.to_tensor(image)\n",
" return image, image_tensor\n",
"\n",
"local_image_path = \"assets/demo4.jpg\"\n",
"#download_image(image_url, local_image_path)\n",
"image_source, image_tensor = load_image(local_image_path)\n",
"Image.fromarray(image_source)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Run Grounding DINO for detection"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use this command for evaluate the Grounding DINO model\n",
"# Or you can download the model by yourself\n",
"ckpt_repo_id = \"ShilongLiu/GroundingDINO\"\n",
"ckpt_filename = \"groundingdino_swint_ogc.pth\"\n",
"ckpt_config_filename = \"GroundingDINO_SwinT_OGC.cfg.py\"\n",
"device = \"cuda\"\n",
"\n",
"cache_config_file = hf_hub_download(repo_id=ckpt_repo_id, filename=ckpt_config_filename)\n",
"cache_file = hf_hub_download(repo_id=ckpt_repo_id, filename=ckpt_filename)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Single Precision"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"batch = 2\n",
"box_threshold = 0.3\n",
"text_threshold = 0.25\n",
"iou_threshold = 0.5\n",
"\n",
"# Batch of prompts\n",
"text_prompt = [\n",
" [\"Black dog\", \"Beige dog\"],\n",
" [\"Dog\", \"Stick\"]\n",
"]\n",
"\n",
"dtype = \"float32\"\n",
"\n",
"# Repeat image BATCH number of times\n",
"image_tensor = image_tensor.to(device=device).to(dtype=getattr(torch, dtype))\n",
"image_tensor = image_tensor[None, ...].expand(batch, -1, -1, -1)\n",
"\n",
"# Building GroundingDINO inference model\n",
"grounding_dino_model = BatchedModel(\n",
" model_config_path=cache_config_file, \n",
" model_checkpoint_path=cache_file,\n",
" device=device,\n",
" dtype=dtype,\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%timeit -n 10\n",
"with torch.no_grad():\n",
" bbox_batch, conf_batch, class_id_batch = grounding_dino_model(\n",
" image_batch=image_tensor,\n",
" text_prompts=text_prompt,\n",
" box_threshold=box_threshold,\n",
" text_threshold=text_threshold,\n",
" nms_threshold=iou_threshold\n",
" )\n",
" bbox_batch = [bbox.cpu().numpy() for bbox in bbox_batch]\n",
" conf_batch = [conf.cpu().numpy() for conf in conf_batch]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Half Precision"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dtype = \"float16\"\n",
"\n",
"image_tensor = image_tensor.to(device=device).to(dtype=getattr(torch, dtype))\n",
"\n",
"# Building GroundingDINO inference model\n",
"grounding_dino_model = BatchedModel(\n",
" model_config_path=cache_config_file, \n",
" model_checkpoint_path=cache_file,\n",
" device=device,\n",
" dtype=dtype\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%timeit -n 10\n",
"with torch.no_grad():\n",
" bbox_batch, conf_batch, class_id_batch = grounding_dino_model(\n",
" image_batch=image_tensor,\n",
" text_prompts=text_prompt,\n",
" box_threshold=box_threshold,\n",
" text_threshold=text_threshold,\n",
" nms_threshold=iou_threshold\n",
" )\n",
" bbox_batch = [bbox.cpu().numpy() for bbox in bbox_batch]\n",
" conf_batch = [conf.cpu().numpy() for conf in conf_batch]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Display result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with torch.no_grad():\n",
" bbox_batch, conf_batch, class_id_batch = grounding_dino_model(\n",
" image_batch=image_tensor,\n",
" text_prompts=text_prompt,\n",
" box_threshold=box_threshold,\n",
" text_threshold=text_threshold,\n",
" nms_threshold=iou_threshold\n",
" )\n",
" bbox_batch = [bbox.cpu().numpy() for bbox in bbox_batch]\n",
" conf_batch = [conf.cpu().numpy() for conf in conf_batch]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import display\n",
"def annotate(image_source, boxes, logits, phrases) -> np.ndarray:\n",
" detections = sv.Detections(xyxy=boxes)\n",
" labels = [\n",
" f\"{phrase} {logit:.2f}\"\n",
" for phrase, logit\n",
" in zip(phrases, logits)\n",
" ]\n",
" box_annotator = sv.BoxAnnotator()\n",
" annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)\n",
" annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)\n",
" return annotated_frame[...,::-1]\n",
"\n",
"\n",
"for i, (bbox, conf, class_id, class_label) in enumerate(zip(bbox_batch, conf_batch, class_id_batch, text_prompt)):\n",
" annotated_frame = annotate(\n",
" image_source=image_source, \n",
" boxes=bbox,\n",
" logits=conf,\n",
" phrases=np.array(class_label)[class_id]\n",
" )\n",
"\n",
" display(Image.fromarray(annotated_frame))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -159,6 +159,7 @@ def forward(self, x, mask=None):
         attn = attn + relative_position_bias.unsqueeze(0)

         if mask is not None:
+            mask = mask.to(dtype=x.dtype)
             nW = mask.shape[0]
             attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
             attn = attn.view(-1, self.num_heads, N, N)
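A minimal sketch (not part of the PR, and assuming this hunk is the Swin backbone's window attention) of what the added cast guards against: the attention mask is built once in float32, and PyTorch's type promotion turns a float16 + float32 sum back into float32, so without the cast the attention map silently leaves half precision.

import torch

# Hypothetical shapes: 8 windows x 4 heads over 49 tokens; not taken from the PR.
attn = torch.randn(8, 4, 49, 49, dtype=torch.float16)  # fp16 attention logits
mask = torch.zeros(8, 49, 49)                           # fp32 window mask

promoted = attn + mask.unsqueeze(1)
print(promoted.dtype)  # torch.float32: silently promoted out of half precision

kept = attn + mask.unsqueeze(1).to(dtype=attn.dtype)    # what the added line does
print(kept.dtype)      # torch.float16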
2 changes: 1 addition & 1 deletion groundingdino/models/GroundingDINO/ms_deform_attn.py
@@ -100,7 +100,7 @@ def multi_scale_deformable_attn_pytorch(
     bs, _, num_heads, embed_dims = value.shape
     _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
     value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
-    sampling_grids = 2 * sampling_locations - 1
+    sampling_grids = 2 * sampling_locations.to(dtype=value.dtype) - 1
     sampling_value_list = []
     for level, (H_, W_) in enumerate(value_spatial_shapes):
         # bs, H_*W_, num_heads, embed_dims ->
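Hedged context for this one-line change (not from the PR): the sampling grid computed here is later passed to F.grid_sample, which requires its input and grid arguments to share a dtype, so a float32 grid cannot sample a float16 value tensor. A minimal sketch of the same cast, with the fp16 path reserved for GPU:

import torch
import torch.nn.functional as F

# Fall back to float32 on CPU-only machines; fp16 grid_sample is intended for GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

value = torch.randn(1, 3, 8, 8, dtype=dtype, device=device)  # feature map ("value")
grid = torch.rand(1, 4, 4, 2, device=device) * 2 - 1         # float32 sampling grid in [-1, 1]

# Mirrors the patched line: move the grid to the value's dtype before sampling.
out = F.grid_sample(value, grid.to(dtype=value.dtype), mode="bilinear", align_corners=False)
print(out.dtype)  # matches `dtype`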
7 changes: 4 additions & 3 deletions groundingdino/models/GroundingDINO/transformer.py
@@ -659,6 +659,7 @@ def forward(
         output = tgt

         intermediate = []
+        refpoints_unsigmoid = refpoints_unsigmoid.to(dtype=tgt.dtype)
         reference_points = refpoints_unsigmoid.sigmoid()
         ref_points = [reference_points]

@@ -667,14 +668,14 @@
             if reference_points.shape[-1] == 4:
                 reference_points_input = (
                     reference_points[:, :, None]
-                    * torch.cat([valid_ratios, valid_ratios], -1)[None, :]
+                    * torch.cat([valid_ratios, valid_ratios], -1)[None, :].to(dtype=tgt.dtype)
                 ) # nq, bs, nlevel, 4
             else:
                 assert reference_points.shape[-1] == 2
-                reference_points_input = reference_points[:, :, None] * valid_ratios[None, :]
+                reference_points_input = reference_points[:, :, None] * valid_ratios[None, :].to(dtype=tgt.dtype)
             query_sine_embed = gen_sineembed_for_position(
                 reference_points_input[:, :, 0, :]
-            ) # nq, bs, 256*2
+            ).to(dtype=tgt.dtype) # nq, bs, 256*2

             # conditional query
             raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256
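Hedged sketch (not part of the PR) of why the sine positional embedding also gets a cast back to the decoder dtype: the embedding is typically built from a float32 torch.arange divisor, so even float16 reference points come back as float32 after the multiply and divide, and the result has to be cast to tgt.dtype to keep the decoder in half precision. A toy version of that promotion:

import math
import torch

def toy_sineembed(pos: torch.Tensor, num_feats: int = 128) -> torch.Tensor:
    # The arange-based divisor is float32, so the output is promoted to float32
    # regardless of the dtype of `pos`.
    dim_t = 10000 ** (2 * (torch.arange(num_feats) // 2) / num_feats)
    scaled = pos[..., None] * (2 * math.pi) / dim_t
    return torch.cat([scaled.sin(), scaled.cos()], dim=-1)

ref = torch.rand(10, 2, 2, dtype=torch.float16)  # (nq, bs, 2) reference points, fp16
emb = toy_sineembed(ref)
print(emb.dtype)                       # torch.float32: promoted by the fp32 divisor
print(emb.to(dtype=ref.dtype).dtype)   # torch.float16: the cast added in this hunk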
2 changes: 1 addition & 1 deletion groundingdino/models/GroundingDINO/transformer_vanilla.py
@@ -96,7 +96,7 @@ def __init__(
         self.nhead = nhead

     def with_pos_embed(self, tensor, pos: Optional[Tensor]):
-        return tensor if pos is None else tensor + pos
+        return tensor if pos is None else tensor + pos.to(dtype=tensor.dtype)

     def forward(
         self,
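Similarly hedged (not from the PR): with_pos_embed adds a float32 positional embedding to features that are now float16, so without the cast the query/key would be promoted to float32 and no longer match the half-precision attention weights. A small sketch, with the fp16 path reserved for GPU:

import torch
import torch.nn as nn

# Fall back to float32 on CPU-only machines; fp16 attention is intended for GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

attn = nn.MultiheadAttention(embed_dim=256, num_heads=8, device=device, dtype=dtype)
tensor = torch.randn(100, 2, 256, device=device, dtype=dtype)  # (seq, batch, dim) features
pos = torch.randn(100, 2, 256, device=device)                  # float32 positional embedding

q = k = tensor + pos.to(dtype=tensor.dtype)  # what the patched with_pos_embed does
out, _ = attn(q, k, value=tensor)
print(out.dtype)  # matches `dtype`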