Gopalv/refactor #1

Open · wants to merge 2 commits into base: yifyu/triton
@@ -28,7 +28,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import azureml.core\n",
@@ -46,7 +48,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
@@ -93,9 +97,7 @@
"\n",
"if not os.path.exists(target_file):\n",
" response = requests.get(model_url)\n",
" open(target_file, 'wb').write(response.content)\n",
"\n",
"config_file = os.path.join('models', 'triton', 'densenet_onnx', 'config.pbtxt')"
" open(target_file, 'wb').write(response.content)\n"
]
},
{
@@ -104,39 +106,7 @@
"source": [
"# Add Model Configuration file\n",
"\n",
"Each model needs a [Model Configuration](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/model_configuration.html) that provides required and optional information about the model.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"%%writefile $config_file\n",
"name: \"densenet_onnx\"\n",
"platform: \"onnxruntime_onnx\"\n",
"max_batch_size: 0\n",
"input [\n",
" {\n",
" name: \"data_0\"\n",
" data_type: TYPE_FP32\n",
" format: FORMAT_NCHW\n",
" dims: [ 3, 224, 224 ]\n",
" reshape { shape: [ 1, 3, 224, 224 ] }\n",
" }\n",
"]\n",
"output [\n",
" {\n",
" name: \"fc6_1\"\n",
" data_type: TYPE_FP32\n",
" dims: [ 1000 ]\n",
" reshape { shape: [ 1, 1000, 1, 1 ] }\n",
" label_filename: \"densenet_labels.txt\"\n",
" }\n",
"]"
"Each Triton model needs a [Model Configuration](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/model_configuration.html) that provides required and optional information about the model. We've provided one in this directory already.\n"
]
},
{
@@ -198,9 +168,11 @@
" print(\"Creating new gpu-cluster\")\n",
" \n",
" # Specify the configuration for the new cluster\n",
" compute_config = AksCompute.provisioning_configuration(cluster_purpose=AksCompute.ClusterPurpose.DEV_TEST,\n",
" agent_count=1,\n",
" vm_size=\"Standard_NC6s_v3\")\n",
" compute_config = AksCompute.provisioning_configuration(\n",
" cluster_purpose=AksCompute.ClusterPurpose.DEV_TEST,\n",
" agent_count=1,\n",
" vm_size=\"Standard_NC6\",\n",
" location=\"westus2\")\n",
" \n",
" # Create the cluster with the specified name and configuration\n",
" gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, compute_config)\n",
@@ -215,78 +187,11 @@
"source": [
"# Deploy the model as a web service to AKS\n",
"\n",
"First create a scoring script\n",
"First create a scoring script. You can see the one we created for you in the `scripts` directory.\n",
"\n",
"** Note: ** Triton server listens to a fixed local port. You may choose to use the Triton Python [client library](https://docs.nvidia.com/deeplearning/triton-inference-server/master-user-guide/docs/client_library.html) to talk to it, while keeping the flexibility of pre-/post- processing."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"%%writefile score.py\n",
"import numpy as np\n",
"from PIL import Image\n",
"import sys\n",
"from functools import partial\n",
"import os\n",
"import io\n",
"\n",
"import tritonhttpclient\n",
"from tritonclientutils import InferenceServerException\n",
"\n",
"from azureml.contrib.services.aml_request import AMLRequest, rawhttp\n",
"from azureml.contrib.services.aml_response import AMLResponse\n",
"\n",
"sys.path.append(os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'models'))\n",
"from utils import preprocess, postprocess\n",
"\n",
"\n",
"trition_client = None\n",
"_url = \"localhost:8000\"\n",
"_model = \"densenet_onnx\"\n",
"_scaling = \"INCEPTION\"\n",
"\n",
"def init():\n",
" global triton_client, max_batch_size, input_name, output_name, dtype\n",
"\n",
" triton_client = tritonhttpclient.InferenceServerClient(_url)\n",
"\n",
" max_batch_size = 0\n",
" input_name = \"data_0\"\n",
" output_name = \"fc6_1\"\n",
" dtype = \"FP32\"\n",
"\n",
"\n",
"@rawhttp\n",
"def run(request):\n",
" if request.method == 'POST':\n",
" \n",
" reqBody = request.get_data(False)\n",
" img = Image.open(io.BytesIO(reqBody))\n",
" \n",
" image_data = preprocess(img, _scaling, dtype)\n",
" \n",
" input = tritonhttpclient.InferInput(input_name, image_data.shape, dtype)\n",
" input.set_data_from_numpy(image_data, binary_data=True)\n",
" output = tritonhttpclient.InferRequestedOutput(output_name, binary_data=True, class_count=1)\n",
" \n",
" res = triton_client.infer(_model,\n",
" [input],\n",
" request_id=\"0\",\n",
" outputs=[output])\n",
"\n",
" result = postprocess(res, output_name, 1, max_batch_size > 0)\n",
"\n",
" return AMLResponse(result, 200)\n",
" else:\n",
" return AMLResponse(\"bad request\", 500)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -355,8 +260,8 @@
"# Optionally specify a worker count to leverage the capability of concurrency and server-side batching from Triton\n",
"# env.environment_variables = {\"WORKER_COUNT\":\"128\"}\n",
"\n",
"inference_config = InferenceConfig(entry_script=\"score.py\", environment=env)\n",
"aks_config = AksWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 4, gpu_cores = 1)\n",
"inference_config = InferenceConfig(entry_script=\"score.py\", source_directory=\"scripts\", environment=env)\n",
"aks_config = AksWebservice.deploy_configuration(cpu_cores=1, memory_gb=4, gpu_cores=1)\n",
"\n",
"# # Enable token auth and disable (key) auth on the webservice\n",
"# aks_config = AksWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 4, gpu_cores = 1, token_auth_enabled=True, auth_enabled=False)"
@@ -438,6 +343,17 @@
"model.delete()\n",
"gpu_cluster.delete()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"aks_service.wait_for_deployment(True)"
]
}
],
"metadata": {
@@ -447,9 +363,9 @@
}
],
"kernelspec": {
"display_name": "Python 3.6.9 64-bit",
"display_name": "Python 3.7.7 64-bit",
"language": "python",
"name": "python36964bit49f9d8f83b294f2eb4e5a3f7c26b67fb"
"name": "python_defaultSpec_1598590942828"
},
"language_info": {
"codemirror_mode": {
@@ -461,7 +377,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9-final"
"version": "3.7.7-final"
}
},
"nbformat": 4,
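For orientation before the new entry script below: it still accepts raw image bytes over POST, so once the notebook's deployment finishes, the service can be smoke-tested over plain HTTP. A minimal sketch, not part of this diff, assuming the `aks_service` object from the notebook, key-based auth, and a hypothetical local file `test_image.jpg`:

import requests

# Primary key for the webservice (key auth is the notebook's default).
key = aks_service.get_keys()[0]
headers = {
    "Content-Type": "application/octet-stream",
    "Authorization": "Bearer " + key,
}

# POST raw image bytes, as the run() function in score.py expects.
with open("test_image.jpg", "rb") as f:
    resp = requests.post(aks_service.scoring_uri, data=f.read(), headers=headers)

print(resp.status_code, resp.text)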
@@ -0,0 +1,25 @@
import io

from PIL import Image

from azureml.contrib.services.aml_request import AMLRequest, rawhttp
from azureml.contrib.services.aml_response import AMLResponse
from utils import preprocess, postprocess, triton_init, triton_infer


def init():
triton_init()

@rawhttp
def run(request):
if request.method == 'POST':
reqBody = request.get_data(False)
img = Image.open(io.BytesIO(reqBody))
result = triton_infer(model_name="densenet_onnx", img=img)

return AMLResponse(result, 200)
else:
return AMLResponse("bad request", 500)
@@ -1,6 +1,7 @@
import numpy as np
import tritonhttpclient
from PIL import Image
from tritonclientutils import triton_to_np_dtype
from tritonclientutils import InferenceServerException, triton_to_np_dtype


def preprocess(img, scaling, dtype):
@@ -71,4 +72,27 @@ def postprocess(results, output_name, batch_size, batching):
cls = result.split(':')
output += " {} ({}) = {}".format(cls[0], cls[1], cls[2])

    return output

def triton_init(url="localhost:8000"):
global triton_client
triton_client = tritonhttpclient.InferenceServerClient(url)
_url = "localhost:8000"
_model = "densenet_onnx"
_scaling = "INCEPTION"

# TODO: set input_name, output_name, dtype from model metadata, as in
# https://github.com/triton-inference-server/server/blob/master/src/clients/python/examples/image_client.py#L402
def triton_infer(model_name, img, input_name="data_0", output_name="fc6_1",
dtype="FP32"):
image_data = preprocess(img, scaling="INCEPTION", dtype=dtype)
input = tritonhttpclient.InferInput(input_name, image_data.shape, dtype)
input.set_data_from_numpy(image_data, binary_data=True)
output = tritonhttpclient.InferRequestedOutput(output_name, binary_data=True, class_count=1)

res = triton_client.infer(model_name,
[input],
request_id="0",
outputs=[output])

return postprocess(res, output_name=output_name, batch_size=1, batching=False)
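The TODO above could be resolved roughly as follows, in the spirit of the linked image_client.py example. This is a sketch, assuming the v2 HTTP metadata schema returned by tritonhttpclient and a single-input, single-output model; triton_io_from_metadata is a hypothetical helper name, not part of this PR:

def triton_io_from_metadata(model_name):
    # Ask the server for the model's metadata instead of hard-coding
    # "data_0" / "fc6_1" / "FP32"; assumes triton_init() was called first.
    meta = triton_client.get_model_metadata(model_name)
    input_meta = meta["inputs"][0]
    output_meta = meta["outputs"][0]
    return input_meta["name"], output_meta["name"], input_meta["datatype"]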