From 4e82c7b98d32a1dd1d6446adc8f02721d7e9caa7 Mon Sep 17 00:00:00 2001
From: Fabian Grob <34524155+fabianandresgrob@users.noreply.github.com>
Date: Wed, 6 Mar 2024 15:27:10 +0100
Subject: [PATCH] Feat (notebook): add example for dynamic quantization to ONNX export (#877)

---
 notebooks/ONNX_export_tutorial.ipynb | 112 ++++++++++++++++++++++++---
 1 file changed, 102 insertions(+), 10 deletions(-)

diff --git a/notebooks/ONNX_export_tutorial.ipynb b/notebooks/ONNX_export_tutorial.ipynb
index cad9de3bd..1417946a3 100644
--- a/notebooks/ONNX_export_tutorial.ipynb
+++ b/notebooks/ONNX_export_tutorial.ipynb
@@ -213,7 +213,7 @@
       " "
      ],
      "text/plain": [
-      ""
+      ""
      ]
     },
     "execution_count": 4,
@@ -331,7 +331,7 @@
       " "
      ],
      "text/plain": [
-      ""
+      ""
      ]
     },
     "execution_count": 6,
@@ -460,7 +460,7 @@
       " "
      ],
      "text/plain": [
-      ""
+      ""
      ]
     },
     "execution_count": 8,
@@ -605,7 +605,7 @@
       " "
      ],
      "text/plain": [
-      ""
+      ""
      ]
     },
     "execution_count": 10,
@@ -704,7 +704,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "2024-03-01 03:24:07.215804006 [W:onnxruntime:, graph.cc:1283 Graph] Initializer linear.bias appears in graph inputs and will not be treated as constant value/weight. This may prevent some of the graph optimizations, like const folding. Move it out of graph inputs if there is no need to override it, by either re-generating the model with latest exporter/converter or with the tool onnxruntime/tools/python/remove_initializer_from_input.py.\n"
+     "2024-03-06 02:12:47.492497092 [W:onnxruntime:, graph.cc:1283 Graph] Initializer linear.bias appears in graph inputs and will not be treated as constant value/weight. This may prevent some of the graph optimizations, like const folding. Move it out of graph inputs if there is no need to override it, by either re-generating the model with latest exporter/converter or with the tool onnxruntime/tools/python/remove_initializer_from_input.py.\n"
     ]
    }
   ],
@@ -842,18 +842,18 @@
   },
   "outputs": [
    {
-    "name": "stdout",
+    "name": "stderr",
     "output_type": "stream",
     "text": [
-     "True\n"
+     "/scratch/fabian/brevitas/src/brevitas/export/onnx/standard/manager.py:26: UserWarning: ONNX opset version set to 13, override with opset_version=\n",
+     " warnings.warn(f\"ONNX opset version set to {DEFAULT_OPSET}, override with {ka}=\")\n"
     ]
    },
    {
-    "name": "stderr",
+    "name": "stdout",
     "output_type": "stream",
     "text": [
-     "/scratch/fabian/brevitas/src/brevitas/export/onnx/standard/manager.py:26: UserWarning: ONNX opset version set to 13, override with opset_version=\n",
-     " warnings.warn(f\"ONNX opset version set to {DEFAULT_OPSET}, override with {ka}=\")\n"
+     "True\n"
     ]
    }
   ],
@@ -912,6 +912,98 @@
    "\n",
    "Due to differences in how the computation is performed between Brevitas and ONNX Runtime, the two results might be slightly different (since Brevitas uses a style closer to QCDQ, rather than operating directly on integers), so we added a tolerance for off-by-1 errors."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Export Dynamically Quantized Models to ONNX"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can also export dynamically quantized models to ONNX, but there are some limitations.\n",
    "The ONNX DynamicQuantizeLinear operator requires the following settings:\n",
    "- Asymmetric quantization (and therefore *unsigned*)\n",
    "- Min-max scaling\n",
    "- Rounding to nearest\n",
    "- Per-tensor scaling\n",
    "- Bit width set to 8\n",
    "\n",
    "This is shown in the following example:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from brevitas_examples.common.generative.quantizers import ShiftedUint8DynamicActPerTensorFloat\n",
    "\n",
    "IN_CH = 3\n",
    "IMG_SIZE = 128\n",
    "OUT_CH = 128\n",
    "BATCH_SIZE = 1\n",
    "\n",
    "class Model(torch.nn.Module):\n",
    "    def __init__(self) -> None:\n",
    "        super().__init__()\n",
    "        self.linear = qnn.QuantLinear(IN_CH, OUT_CH, bias=True, weight_bit_width=8, input_quant=ShiftedUint8DynamicActPerTensorFloat)\n",
    "        self.act = qnn.QuantReLU(input_quant=ShiftedUint8DynamicActPerTensorFloat)\n",
    "\n",
    "    def forward(self, inp):\n",
    "        inp = self.linear(inp)\n",
    "        inp = self.act(inp)\n",
    "        return inp\n",
    "\n",
    "inp = torch.randn(BATCH_SIZE, IN_CH)\n",
    "model = Model()\n",
    "model.eval()\n",
    "path = 'dynamic_quant_model_qcdq.onnx'\n",
    "\n",
    "exported_model = export_onnx_qcdq(model, args=inp, export_path=path, opset_version=13)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Serving 'dynamic_quant_model_qcdq.onnx' at http://localhost:8086\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "show_netron(\"dynamic_quant_model_qcdq.onnx\", 8086)"
   ]
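  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a final sanity check, you can run the exported model with ONNX Runtime and compare the result against the Brevitas output, mirroring the static example earlier in this notebook. The cell below is a sketch rather than part of the original tutorial: it assumes `onnxruntime` is installed, it reuses `model`, `inp` and `path` from the cells above, and the comparison tolerance is an assumption, since with dynamic quantization the scales are recomputed from the input at runtime."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import onnxruntime as ort\n",
    "\n",
    "# Run the dynamically quantized QCDQ model through ONNX Runtime\n",
    "session = ort.InferenceSession(path, providers=['CPUExecutionProvider'])\n",
    "input_name = session.get_inputs()[0].name\n",
    "ort_output = session.run(None, {input_name: inp.numpy()})[0]\n",
    "\n",
    "# Compare against the Brevitas output; the tolerance is an assumption,\n",
    "# not a value from the original tutorial\n",
    "brevitas_output = model(inp).detach().numpy()\n",
    "print(np.allclose(brevitas_output, ort_output, atol=1e-1))"
   ]
  }
 ],
 "metadata": {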