Batched inference API and support for float16 inference #279

Status: Open · wants to merge 2 commits into base: main
Binary file added demo/assets/demo1.jpg
Binary file added demo/assets/demo2.jpg
Binary file added demo/assets/demo3.jpg
Binary file added demo/assets/demo4.jpg
Binary file added demo/assets/demo5.jpg
Binary file added demo/assets/demo6.jpg
Binary file added demo/assets/demo7.jpg
Binary file added demo/assets/demo8.jpg
Binary file added demo/assets/demo9.jpg
292 changes: 292 additions & 0 deletions demo/groundingDINO_batched_float16.ipynb
@@ -0,0 +1,292 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Grounding DINO - Batched Half Precision Inference"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prepare Environments"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from PIL import Image\n",
"import io\n",
"import os\n",
"import supervision as sv\n",
"import numpy as np\n",
"import requests\n",
"import cv2\n",
"\n",
"# Grounding DINO\n",
"from groundingdino.util.inference import BatchedModel\n",
"import torchvision.transforms.functional as F\n",
"from huggingface_hub import hf_hub_download\n",
"\n",
"# If you have multiple GPUs, you can set the GPU to use here.\n",
"# The default is to use the first GPU, which is usually GPU 0.\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\""
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load Grounding DINO model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load demo image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def download_image(url, image_file_path):\n",
" r = requests.get(url, timeout=4.0)\n",
" if r.status_code != requests.codes.ok:\n",
" assert False, 'Status code error: {}.'.format(r.status_code)\n",
"\n",
" with Image.open(io.BytesIO(r.content)) as im:\n",
" im.save(image_file_path)\n",
"\n",
" print('Image downloaded from url: {} and saved to: {}.'.format(url, image_file_path))\n",
"\n",
"def load_image(image_path):\n",
" image_source = Image.open(image_path).convert(\"RGB\")\n",
" image = np.asarray(image_source)\n",
" image_tensor = F.to_tensor(image)\n",
" return image, image_tensor\n",
"\n",
"local_image_path = \"assets/demo4.jpg\"\n",
"#download_image(image_url, local_image_path)\n",
"image_source, image_tensor = load_image(local_image_path)\n",
"Image.fromarray(image_source)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Run Grounding DINO for detection"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use this command for evaluate the Grounding DINO model\n",
"# Or you can download the model by yourself\n",
"ckpt_repo_id = \"ShilongLiu/GroundingDINO\"\n",
"ckpt_filename = \"groundingdino_swint_ogc.pth\"\n",
"ckpt_config_filename = \"GroundingDINO_SwinT_OGC.cfg.py\"\n",
"device = \"cuda\"\n",
"\n",
"cache_config_file = hf_hub_download(repo_id=ckpt_repo_id, filename=ckpt_config_filename)\n",
"cache_file = hf_hub_download(repo_id=ckpt_repo_id, filename=ckpt_filename)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Single Precision"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"batch = 2\n",
"box_threshold = 0.3\n",
"text_threshold = 0.25\n",
"iou_threshold = 0.5\n",
"\n",
"# Batch of prompts\n",
"text_prompt = [\n",
" [\"Black dog\", \"Beige dog\"],\n",
" [\"Dog\", \"Stick\"]\n",
"]\n",
"\n",
"dtype = \"float32\"\n",
"\n",
"# Repeat image BATCH number of times\n",
"image_tensor = image_tensor.to(device=device).to(dtype=getattr(torch, dtype))\n",
"image_tensor = image_tensor[None, ...].expand(batch, -1, -1, -1)\n",
"\n",
"# Building GroundingDINO inference model\n",
"grounding_dino_model = BatchedModel(\n",
" model_config_path=cache_config_file, \n",
" model_checkpoint_path=cache_file,\n",
" device=device,\n",
" dtype=dtype,\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%timeit -n 10\n",
"with torch.no_grad():\n",
" bbox_batch, conf_batch, class_id_batch = grounding_dino_model(\n",
" image_batch=image_tensor,\n",
" text_prompts=text_prompt,\n",
" box_threshold=box_threshold,\n",
" text_threshold=text_threshold,\n",
" nms_threshold=iou_threshold\n",
" )\n",
" bbox_batch = [bbox.cpu().numpy() for bbox in bbox_batch]\n",
" conf_batch = [conf.cpu().numpy() for conf in conf_batch]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Half Precision"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dtype = \"float16\"\n",
"\n",
"image_tensor = image_tensor.to(device=device).to(dtype=getattr(torch, dtype))\n",
"\n",
"# Building GroundingDINO inference model\n",
"grounding_dino_model = BatchedModel(\n",
" model_config_path=cache_config_file, \n",
" model_checkpoint_path=cache_file,\n",
" device=device,\n",
" dtype=dtype\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%timeit -n 10\n",
"with torch.no_grad():\n",
" bbox_batch, conf_batch, class_id_batch = grounding_dino_model(\n",
" image_batch=image_tensor,\n",
" text_prompts=text_prompt,\n",
" box_threshold=box_threshold,\n",
" text_threshold=text_threshold,\n",
" nms_threshold=iou_threshold\n",
" )\n",
" bbox_batch = [bbox.cpu().numpy() for bbox in bbox_batch]\n",
" conf_batch = [conf.cpu().numpy() for conf in conf_batch]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Display result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with torch.no_grad():\n",
" bbox_batch, conf_batch, class_id_batch = grounding_dino_model(\n",
" image_batch=image_tensor,\n",
" text_prompts=text_prompt,\n",
" box_threshold=box_threshold,\n",
" text_threshold=text_threshold,\n",
" nms_threshold=iou_threshold\n",
" )\n",
" bbox_batch = [bbox.cpu().numpy() for bbox in bbox_batch]\n",
" conf_batch = [conf.cpu().numpy() for conf in conf_batch]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import display\n",
"def annotate(image_source, boxes, logits, phrases) -> np.ndarray:\n",
" detections = sv.Detections(xyxy=boxes)\n",
" labels = [\n",
" f\"{phrase} {logit:.2f}\"\n",
" for phrase, logit\n",
" in zip(phrases, logits)\n",
" ]\n",
" box_annotator = sv.BoxAnnotator()\n",
" annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)\n",
" annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)\n",
" return annotated_frame[...,::-1]\n",
"\n",
"\n",
"for i, (bbox, conf, class_id, class_label) in enumerate(zip(bbox_batch, conf_batch, class_id_batch, text_prompt)):\n",
" annotated_frame = annotate(\n",
" image_source=image_source, \n",
" boxes=bbox,\n",
" logits=conf,\n",
" phrases=np.array(class_label)[class_id]\n",
" )\n",
"\n",
" display(Image.fromarray(annotated_frame))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -159,6 +159,7 @@ def forward(self, x, mask=None):
         attn = attn + relative_position_bias.unsqueeze(0)

         if mask is not None:
+            mask = mask.to(dtype=x.dtype)
             nW = mask.shape[0]
             attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
             attn = attn.view(-1, self.num_heads, N, N)
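A minimal sketch (not part of the PR, and assuming this hunk is the Swin backbone's window attention) of what the added cast guards against: the attention mask is built once in float32, and PyTorch's type promotion turns a float16 + float32 sum back into float32, so without the cast the attention map silently leaves half precision.

import torch

# Hypothetical shapes: 8 windows x 4 heads over 49 tokens; not taken from the PR.
attn = torch.randn(8, 4, 49, 49, dtype=torch.float16)  # fp16 attention logits
mask = torch.zeros(8, 49, 49)                           # fp32 window mask

promoted = attn + mask.unsqueeze(1)
print(promoted.dtype)  # torch.float32: silently promoted out of half precision

kept = attn + mask.unsqueeze(1).to(dtype=attn.dtype)    # what the added line does
print(kept.dtype)      # torch.float16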
2 changes: 1 addition & 1 deletion groundingdino/models/GroundingDINO/ms_deform_attn.py
@@ -100,7 +100,7 @@ def multi_scale_deformable_attn_pytorch(
     bs, _, num_heads, embed_dims = value.shape
     _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
     value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
-    sampling_grids = 2 * sampling_locations - 1
+    sampling_grids = 2 * sampling_locations.to(dtype=value.dtype) - 1
     sampling_value_list = []
     for level, (H_, W_) in enumerate(value_spatial_shapes):
         # bs, H_*W_, num_heads, embed_dims ->
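Hedged context for this one-line change (not from the PR): the sampling grid computed here is later passed to F.grid_sample, which requires its input and grid arguments to share a dtype, so a float32 grid cannot sample a float16 value tensor. A minimal sketch of the same cast, with the fp16 path reserved for GPU:

import torch
import torch.nn.functional as F

# Fall back to float32 on CPU-only machines; fp16 grid_sample is intended for GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

value = torch.randn(1, 3, 8, 8, dtype=dtype, device=device)  # feature map ("value")
grid = torch.rand(1, 4, 4, 2, device=device) * 2 - 1         # float32 sampling grid in [-1, 1]

# Mirrors the patched line: move the grid to the value's dtype before sampling.
out = F.grid_sample(value, grid.to(dtype=value.dtype), mode="bilinear", align_corners=False)
print(out.dtype)  # matches `dtype`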
7 changes: 4 additions & 3 deletions groundingdino/models/GroundingDINO/transformer.py
@@ -659,6 +659,7 @@ def forward(
         output = tgt

         intermediate = []
+        refpoints_unsigmoid = refpoints_unsigmoid.to(dtype=tgt.dtype)
         reference_points = refpoints_unsigmoid.sigmoid()
         ref_points = [reference_points]

@@ -667,14 +668,14 @@
             if reference_points.shape[-1] == 4:
                 reference_points_input = (
                     reference_points[:, :, None]
-                    * torch.cat([valid_ratios, valid_ratios], -1)[None, :]
+                    * torch.cat([valid_ratios, valid_ratios], -1)[None, :].to(dtype=tgt.dtype)
                 ) # nq, bs, nlevel, 4
             else:
                 assert reference_points.shape[-1] == 2
-                reference_points_input = reference_points[:, :, None] * valid_ratios[None, :]
+                reference_points_input = reference_points[:, :, None] * valid_ratios[None, :].to(dtype=tgt.dtype)
             query_sine_embed = gen_sineembed_for_position(
                 reference_points_input[:, :, 0, :]
-            ) # nq, bs, 256*2
+            ).to(dtype=tgt.dtype) # nq, bs, 256*2

             # conditional query
             raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256
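Hedged sketch (not part of the PR) of why the sine positional embedding also gets a cast back to the decoder dtype: the embedding is typically built from a float32 torch.arange divisor, so even float16 reference points come back as float32 after the multiply and divide, and the result has to be cast to tgt.dtype to keep the decoder in half precision. A toy version of that promotion:

import math
import torch

def toy_sineembed(pos: torch.Tensor, num_feats: int = 128) -> torch.Tensor:
    # The arange-based divisor is float32, so the output is promoted to float32
    # regardless of the dtype of `pos`.
    dim_t = 10000 ** (2 * (torch.arange(num_feats) // 2) / num_feats)
    scaled = pos[..., None] * (2 * math.pi) / dim_t
    return torch.cat([scaled.sin(), scaled.cos()], dim=-1)

ref = torch.rand(10, 2, 2, dtype=torch.float16)  # (nq, bs, 2) reference points, fp16
emb = toy_sineembed(ref)
print(emb.dtype)                       # torch.float32: promoted by the fp32 divisor
print(emb.to(dtype=ref.dtype).dtype)   # torch.float16: the cast added in this hunk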
2 changes: 1 addition & 1 deletion groundingdino/models/GroundingDINO/transformer_vanilla.py
@@ -96,7 +96,7 @@ def __init__(
         self.nhead = nhead

     def with_pos_embed(self, tensor, pos: Optional[Tensor]):
-        return tensor if pos is None else tensor + pos
+        return tensor if pos is None else tensor + pos.to(dtype=tensor.dtype)

     def forward(
         self,
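Similarly hedged (not from the PR): with_pos_embed adds a float32 positional embedding to features that are now float16, so without the cast the query/key would be promoted to float32 and no longer match the half-precision attention weights. A small sketch, with the fp16 path reserved for GPU:

import torch
import torch.nn as nn

# Fall back to float32 on CPU-only machines; fp16 attention is intended for GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

attn = nn.MultiheadAttention(embed_dim=256, num_heads=8, device=device, dtype=dtype)
tensor = torch.randn(100, 2, 256, device=device, dtype=dtype)  # (seq, batch, dim) features
pos = torch.randn(100, 2, 256, device=device)                  # float32 positional embedding

q = k = tensor + pos.to(dtype=tensor.dtype)  # what the patched with_pos_embed does
out, _ = attn(q, k, value=tensor)
print(out.dtype)  # matches `dtype`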