
Commit 723d6e4: add prompt template and example ipython notebooks
1 parent 69337d3
10 files changed: +360 -6 lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,8 @@
+# Please refer to the ScanNet official website for the download link
+QSpatial_scannet/download-scannet.py
+QSpatial_scannet/images/
+QSpatial_scannet/scannet_dataset/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

QSpatial_scannet/download_and_render_scannet_images.py

Lines changed: 1 addition & 0 deletions
@@ -78,6 +78,7 @@ def export_color_images(self, output_dir, image_size=None, specified_frames=[]):
 
 scannet_id_and_frame = {
     'scene0015_00': ['0'],
+
     'scene0019_00': ['400'],
     'scene0025_00': ['500'],
     'scene0025_02': ['400'],

README.md

Lines changed: 5 additions & 6 deletions
@@ -3,7 +3,7 @@
 
 Q-Spatial Bench is a benchmark designed to measure the **quantitative spatial reasoning** 📏 in large vision-language models.
 
-🔥The paper associated with Q-Spatial Bench is accepted by EMNLP 2024 main track!
+🔥 The paper associated with Q-Spatial Bench is accepted by EMNLP 2024 main track!
 
 - Our paper: *Reasoning Paths with Reference Objects Elicit Quantitative Spatial Reasoning in Large Vision-Language Models* [[arXiv link](https://arxiv.org/abs/2409.09788)]
 - Project website: [[link]()]
@@ -45,15 +45,14 @@ cd <REPO_ROOT>/QSpatial_scannet
 python download_and_render_scannet_images.py
 ```
 
-## Example Prompt Templates
 
+## Iterate over the Dataset
 
-## Evaluation
-
-In our paper, we measure the performance in success rate by thresholding the maximum ratio between an estimation and a ground truth value. We provide a simple ipython notebook `evaluation_helper.ipynb` to compute the success rate.
-
+We provide an example ipython notebook under `examples/iterate_over_dataset.ipynb`
 
+## Evaluation
 
+We provide an example ipython notebook under `examples/evaluate_success_rate.ipynb`
 
 
 # Citation
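
For reference, the success criterion mentioned in the removed README paragraph (and implemented in the new evaluation notebook below) can be written as a single thresholding rule, with the estimate $\hat{y}$ and the ground truth $y$ converted to a common unit (centimeters):

$$\text{success} = \mathbb{1}\left[\max\left(\frac{\hat{y}}{y},\ \frac{y}{\hat{y}}\right) < \delta\right], \qquad \delta = 2.$$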

examples/evaluate_success_rate.ipynb

Lines changed: 164 additions & 0 deletions
(New notebook file; the cell sources and outputs are reproduced below, with the raw JSON scaffolding omitted.)

Cell 1 (code):

import re
import json
import numpy as np

from pathlib import Path
from datasets import load_dataset

Cell 1 output: environment warnings from Intel MKL (SSE4.2 deprecation notice) and tqdm (IProgress not found); omitted here.

Cell 2 (code):

"""
Simple evaluator that handles
1. Load benchmark data
2. Parse prediction string
3. Evaluate success based on `delta` parameter
"""

class QSpatialEvaluator:
    delta = 2

    def __init__(self, benchmark_split):
        assert benchmark_split in ["QSpatial_plus", "QSpatial_scannet"]
        self.dataset = load_dataset("andrewliao11/Q-Spatial-Bench", split=benchmark_split)

    def evaluate(self, data_ind, vlm_response):

        #### Parse ground truth
        value = self.dataset["answer_value"][data_ind]
        unit = self.dataset["answer_unit"][data_ind]
        ground_truth_value_in_cms = value * self._get_multiplier(unit)

        #### Parse prediction
        # Value
        pattern = r'scalar{([^}]*)}'
        str_inside_scalar_boxes = re.findall(pattern, vlm_response)[-1]
        scalar_list = re.findall(r'\d+\.?\d*', str_inside_scalar_boxes)
        parsed_scalar = np.array(scalar_list).astype(float).mean()

        # Unit
        pattern = r'distance_unit{([^}]*)}'
        str_inside_unit_boxes = re.findall(pattern, vlm_response)
        parsed_unit = str_inside_unit_boxes[-1]

        pred_value_in_cms = parsed_scalar * self._get_multiplier(parsed_unit)
        success = max(pred_value_in_cms / ground_truth_value_in_cms, ground_truth_value_in_cms / pred_value_in_cms) < self.delta

        return dict(
            ground_truth_value_in_cms = ground_truth_value_in_cms,
            pred_value_in_cms = pred_value_in_cms,
            success = success
        )

    def _get_multiplier(self, unit):

        unit = unit.lower()
        if unit in ["meters", "meter", "m", "metre", "metres"]:
            multiplier = 100
        elif unit in ["centimeters", "centimeter", "cm"]:
            multiplier = 1
        elif unit in ["feet", "foot", "ft"]:
            multiplier = 30.48
        elif unit in ["inch", "inches", "in"]:
            multiplier = 2.54
        elif unit in ["mm"]:
            multiplier = 0.1
        else:
            #raise ValueError(f"Unknown unit: {unit}")
            print(f"Unknown unit: {unit}")
            multiplier = 1

        return multiplier

Cell 3 (code):

evaluator = QSpatialEvaluator(benchmark_split="QSpatial_plus")

Cell 4 (code):

# Example VLM responses from GPT-4o
vlm_response = "To determine the minimum distance between the two speckled pattern stool chairs in the image, let's follow these steps:\\n\\n1. **Identify the Stools**: Locate the two speckled pattern stools in the image. They are positioned in front of the couches.\\n\\n2. **Reference Points**: Choose reference points on each stool to measure the distance. The closest points on the stools would be the edges facing each other.\\n\\n3. **Estimate the Distance**: Visually estimate the distance between these two closest points. Given the perspective and the relative size of the stools, we can approximate the distance.\\n\\nConsidering the size of the stools and the space between them, the minimum distance between the two speckled pattern stool chairs is approximately:\\n\\n\\\\scalar{1} \\\\distance_unit{meter}\n"

print(vlm_response)
print("Evaluation:", evaluator.evaluate(data_ind=41, vlm_response=vlm_response))

Cell 4 output: the printed VLM response, followed by

Evaluation: {'ground_truth_value_in_cms': 96.0, 'pred_value_in_cms': 100.0, 'success': True}

Cell 5 (code): empty.

Notebook metadata: kernel "multi_rounds_vlm" (python3), Python 3.10.14, nbformat 4.
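
The notebook stops at a single example. Below is a minimal sketch of how the per-example results could be aggregated into the benchmark's success-rate metric; the `responses` dict mapping dataset indices to raw VLM response strings is a hypothetical placeholder and not part of the notebook.

```python
# Hypothetical aggregation over model outputs; `responses` is a placeholder that
# you would fill with one raw VLM response string per benchmark question.
responses = {41: vlm_response}

results = [evaluator.evaluate(data_ind=i, vlm_response=resp) for i, resp in responses.items()]
success_rate = sum(res["success"] for res in results) / len(results)
print(f"Success rate (delta={QSpatialEvaluator.delta}): {success_rate:.2%}")
```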

examples/iterate_over_dataset.ipynb

Lines changed: 156 additions & 0 deletions
Large diffs are not rendered by default.
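
Since the notebook contents are not rendered in the diff, the following is only a minimal sketch of what iterating over a split might look like. The `question` and `image` column names are assumptions; `answer_value`, `answer_unit`, and the dataset path come from the evaluation notebook above.

```python
# Minimal sketch (assumed column names), not the notebook's actual contents.
from datasets import load_dataset

dataset = load_dataset("andrewliao11/Q-Spatial-Bench", split="QSpatial_plus")
for example in dataset:
    # "question" and "image" are assumed field names; answer_value / answer_unit
    # are the fields read by the evaluation notebook above.
    print(example["question"], example["answer_value"], example["answer_unit"])
```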
(prompt template file; name not rendered)

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+Question: {{ question }}
+Let's think step by step and start by finding good reference objects or object parts in the image.
(prompt template file; name not rendered)

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+Question: {{ question }}
+
+---
+
+Use the following 4 steps sequentially to answer the question:
+
+Step 1 **Analyze the question**
+
+Step 2 **Identify up to 10 reference scales in the image, ranging from large to small sizes, and list them in the specified format**
+- A reference scale must be typical in size.
+- A reference scale can be the dimensions of an object or an object part.
+- A reference scale must NOT be floor tiles or floor planks.
+- Formulate the reference scales using the format: """The [choose from front-to-back, side-to-side, left-to-right, diameter, height (top to bottom edge), or mounting height (bottom edge to floor)] of [object or object part] is approximately [dimension estimate]."""
+
+Step 3 **Propose a robust step-by-step plan to answer the question by using the reference scales in Step 2**
+- A robust step-by-step plan performs the estimation in a coarse-to-fine manner.
+- First, use a reliable and large-sized reference scale as the primary reference for estimation.
+- Then, gradually use a reliable and smaller-sized reference scale for adjustment.
+- Repeat until the estimation is precise enough.
+- When performing visual comparison, be aware of perspective distortion.
+- Do NOT rely on pixel measurements from the images.
+
+Step 4 **Focus on the image and follow the plan in Step 3 to answer the question**

prompt_templates/standard_prompt.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Question: {{ question }}

prompt_templates/system_prompt.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+You will be provided with a question and a 2D image. The question involves measuring the precise distance in 3D space through a 2D image. You will answer the question by providing a numeric answer consisting of a scalar and a distance unit in the format of """\scalar{scalar} \distance_unit{distance unit}""" at the end of your response.

prompt_templates/zero_shot_prompt.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+Question: {{ question }}
+Let's think step by step.
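
The `{{ question }}` placeholders in the new templates look like Jinja-style variables. Below is a minimal sketch of how a template might be rendered and paired with the system prompt; the use of `jinja2` and the chat-style message layout are assumptions, not something this commit shows.

```python
# Hypothetical rendering of the new prompt templates; jinja2 usage and the
# message layout are assumptions made for illustration only.
from pathlib import Path
from jinja2 import Template

system_prompt = Path("prompt_templates/system_prompt.txt").read_text()
template = Template(Path("prompt_templates/zero_shot_prompt.txt").read_text())
user_prompt = template.render(question="What is the height of the chair in the image?")

# A typical chat-style payload for a VLM; attach the image per your VLM API's convention.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]
```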
