From 4e82c7b98d32a1dd1d6446adc8f02721d7e9caa7 Mon Sep 17 00:00:00 2001
From: Fabian Grob <34524155+fabianandresgrob@users.noreply.github.com>
Date: Wed, 6 Mar 2024 15:27:10 +0100
Subject: [PATCH] Feat (notebook): add example for dynamic quantization to ONNX export (#877)

---
 notebooks/ONNX_export_tutorial.ipynb | 112 ++++++++++++++++++++++++---
 1 file changed, 102 insertions(+), 10 deletions(-)

diff --git a/notebooks/ONNX_export_tutorial.ipynb b/notebooks/ONNX_export_tutorial.ipynb
index cad9de3bd..1417946a3 100644
--- a/notebooks/ONNX_export_tutorial.ipynb
+++ b/notebooks/ONNX_export_tutorial.ipynb
@@ -213,7 +213,7 @@
       " "
      ],
      "text/plain": [
-      ""
+      ""
      ]
     },
     "execution_count": 4,
@@ -331,7 +331,7 @@
       " "
      ],
      "text/plain": [
-      ""
+      ""
      ]
     },
     "execution_count": 6,
@@ -460,7 +460,7 @@
       " "
      ],
      "text/plain": [
-      ""
+      ""
      ]
     },
     "execution_count": 8,
@@ -605,7 +605,7 @@
       " "
      ],
      "text/plain": [
-      ""
+      ""
      ]
     },
     "execution_count": 10,
@@ -704,7 +704,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "2024-03-01 03:24:07.215804006 [W:onnxruntime:, graph.cc:1283 Graph] Initializer linear.bias appears in graph inputs and will not be treated as constant value/weight. This may prevent some of the graph optimizations, like const folding. Move it out of graph inputs if there is no need to override it, by either re-generating the model with latest exporter/converter or with the tool onnxruntime/tools/python/remove_initializer_from_input.py.\n"
+     "2024-03-06 02:12:47.492497092 [W:onnxruntime:, graph.cc:1283 Graph] Initializer linear.bias appears in graph inputs and will not be treated as constant value/weight. This may prevent some of the graph optimizations, like const folding. Move it out of graph inputs if there is no need to override it, by either re-generating the model with latest exporter/converter or with the tool onnxruntime/tools/python/remove_initializer_from_input.py.\n"
     ]
    }
   ],
@@ -842,18 +842,18 @@
   },
   "outputs": [
    {
-    "name": "stdout",
+    "name": "stderr",
     "output_type": "stream",
     "text": [
-     "True\n"
+     "/scratch/fabian/brevitas/src/brevitas/export/onnx/standard/manager.py:26: UserWarning: ONNX opset version set to 13, override with opset_version=\n",
+     " warnings.warn(f\"ONNX opset version set to {DEFAULT_OPSET}, override with {ka}=\")\n"
     ]
    },
    {
-    "name": "stderr",
+    "name": "stdout",
     "output_type": "stream",
     "text": [
-     "/scratch/fabian/brevitas/src/brevitas/export/onnx/standard/manager.py:26: UserWarning: ONNX opset version set to 13, override with opset_version=\n",
-     " warnings.warn(f\"ONNX opset version set to {DEFAULT_OPSET}, override with {ka}=\")\n"
+     "True\n"
     ]
    }
   ],
@@ -912,6 +912,98 @@
    "\n",
    "Due to differences in how the computation is performed between Brevitas and ONNX Runtime, the two results might be slightly different (since Brevitas uses a style closer to QCDQ, rather than operating directly on integers), so we added a tolerance for off-by-1 errors."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Export Dynamically Quantized Models to ONNX"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can also export dynamically quantized models to ONNX, but there are some limitations.\n",
    "The ONNX DynamicQuantizeLinear operator requires the following settings:\n",
    "- Asymmetric quantization (and therefore *unsigned*)\n",
    "- Min-max scaling\n",
    "- Rounding to nearest\n",
    "- Per-tensor scaling\n",
    "- Bit width set to 8\n",
    "\n",
    "This is shown in the following example:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from brevitas_examples.common.generative.quantizers import ShiftedUint8DynamicActPerTensorFloat\n",
    "\n",
    "IN_CH = 3\n",
    "IMG_SIZE = 128\n",
    "OUT_CH = 128\n",
    "BATCH_SIZE = 1\n",
    "\n",
    "class Model(torch.nn.Module):\n",
    "    def __init__(self) -> None:\n",
    "        super().__init__()\n",
    "        self.linear = qnn.QuantLinear(IN_CH, OUT_CH, bias=True, weight_bit_width=8, input_quant=ShiftedUint8DynamicActPerTensorFloat)\n",
    "        self.act = qnn.QuantReLU(input_quant=ShiftedUint8DynamicActPerTensorFloat)\n",
    "\n",
    "    def forward(self, inp):\n",
    "        inp = self.linear(inp)\n",
    "        inp = self.act(inp)\n",
    "        return inp\n",
    "\n",
    "inp = torch.randn(BATCH_SIZE, IN_CH)\n",
    "model = Model()\n",
    "model.eval()\n",
    "path = 'dynamic_quant_model_qcdq.onnx'\n",
    "\n",
    "exported_model = export_onnx_qcdq(model, args=inp, export_path=path, opset_version=13)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Serving 'dynamic_quant_model_qcdq.onnx' at http://localhost:8086\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "show_netron(\"dynamic_quant_model_qcdq.onnx\", 8086)"
   ]
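  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a final sanity check, you can run the exported model with ONNX Runtime and compare the result against the Brevitas output, mirroring the static example earlier in this notebook. The cell below is a sketch rather than part of the original tutorial: it assumes `onnxruntime` is installed, it reuses `model`, `inp` and `path` from the cells above, and the comparison tolerance is an assumption, since with dynamic quantization the scales are recomputed from the input at runtime."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import onnxruntime as ort\n",
    "\n",
    "# Run the dynamically quantized QCDQ model through ONNX Runtime\n",
    "session = ort.InferenceSession(path, providers=['CPUExecutionProvider'])\n",
    "input_name = session.get_inputs()[0].name\n",
    "ort_output = session.run(None, {input_name: inp.numpy()})[0]\n",
    "\n",
    "# Compare against the Brevitas output; the tolerance is an assumption,\n",
    "# not a value from the original tutorial\n",
    "brevitas_output = model(inp).detach().numpy()\n",
    "print(np.allclose(brevitas_output, ort_output, atol=1e-1))"
   ]
  }
 ],
 "metadata": {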