diff --git a/how-to-use-azureml/deployment/production-deploy-to-aks-gpu-with-triton/production-deploy-to-aks-gpu-with-triton.ipynb b/how-to-use-azureml/deployment/production-deploy-to-aks-gpu-with-triton/production-deploy-to-aks-gpu-with-triton.ipynb
index 129a0b22c..6df63378e 100644
--- a/how-to-use-azureml/deployment/production-deploy-to-aks-gpu-with-triton/production-deploy-to-aks-gpu-with-triton.ipynb
+++ b/how-to-use-azureml/deployment/production-deploy-to-aks-gpu-with-triton/production-deploy-to-aks-gpu-with-triton.ipynb
@@ -28,7 +28,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "import azureml.core\n",
@@ -46,7 +48,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from azureml.core.workspace import Workspace\n",
@@ -93,9 +97,7 @@
     "\n",
     "if not os.path.exists(target_file):\n",
     "    response = requests.get(model_url)\n",
-    "    open(target_file, 'wb').write(response.content)\n",
-    "\n",
-    "config_file = os.path.join('models', 'triton', 'densenet_onnx', 'config.pbtxt')"
+    "    open(target_file, 'wb').write(response.content)\n"
    ]
   },
   {
@@ -104,39 +106,7 @@
    "source": [
     "# Add Model Configuration file\n",
     "\n",
-    "Each model needs a [Model Configuration](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/model_configuration.html) that provides required and optional information about the model.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "%%writefile $config_file\n",
-    "name: \"densenet_onnx\"\n",
-    "platform: \"onnxruntime_onnx\"\n",
-    "max_batch_size: 0\n",
-    "input [\n",
-    "  {\n",
-    "    name: \"data_0\"\n",
-    "    data_type: TYPE_FP32\n",
-    "    format: FORMAT_NCHW\n",
-    "    dims: [ 3, 224, 224 ]\n",
-    "    reshape { shape: [ 1, 3, 224, 224 ] }\n",
-    "  }\n",
-    "]\n",
-    "output [\n",
-    "  {\n",
-    "    name: \"fc6_1\"\n",
-    "    data_type: TYPE_FP32\n",
-    "    dims: [ 1000 ]\n",
-    "    reshape { shape: [ 1, 1000, 1, 1 ] }\n",
-    "    label_filename: \"densenet_labels.txt\"\n",
-    "  }\n",
-    "]"
+    "Each Triton model needs a [Model Configuration](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/model_configuration.html) that provides required and optional information about the model. We've provided one in this directory already.\n"
    ]
   },
   {
@@ -198,9 +168,11 @@
     "    print(\"Creating new gpu-cluster\")\n",
     "    \n",
     "    # Specify the configuration for the new cluster\n",
-    "    compute_config = AksCompute.provisioning_configuration(cluster_purpose=AksCompute.ClusterPurpose.DEV_TEST,\n",
-    "                                                           agent_count=1,\n",
-    "                                                           vm_size=\"Standard_NC6s_v3\")\n",
+    "    compute_config = AksCompute.provisioning_configuration(\n",
+    "        cluster_purpose=AksCompute.ClusterPurpose.DEV_TEST,\n",
+    "        agent_count=1,\n",
+    "        vm_size=\"Standard_NC6\",\n",
+    "        location=\"westus2\")\n",
     "    \n",
     "    # Create the cluster with the specified name and configuration\n",
     "    gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, compute_config)\n",
@@ -215,78 +187,11 @@
    "source": [
     "# Deploy the model as a web service to AKS\n",
     "\n",
-    "First create a scoring script\n",
+    "First create a scoring script. You can see the one we created for you in the `scripts` directory.\n",
     "\n",
     "** Note: ** Triton server listens to a fixed local port. You may choose to use the Triton Python [client library](https://docs.nvidia.com/deeplearning/triton-inference-server/master-user-guide/docs/client_library.html) to talk to it, while keeping the flexibility of pre-/post- processing."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "%%writefile score.py\n",
-    "import numpy as np\n",
-    "from PIL import Image\n",
-    "import sys\n",
-    "from functools import partial\n",
-    "import os\n",
-    "import io\n",
-    "\n",
-    "import tritonhttpclient\n",
-    "from tritonclientutils import InferenceServerException\n",
-    "\n",
-    "from azureml.contrib.services.aml_request import AMLRequest, rawhttp\n",
-    "from azureml.contrib.services.aml_response import AMLResponse\n",
-    "\n",
-    "sys.path.append(os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'models'))\n",
-    "from utils import preprocess, postprocess\n",
-    "\n",
-    "\n",
-    "trition_client = None\n",
-    "_url = \"localhost:8000\"\n",
-    "_model = \"densenet_onnx\"\n",
-    "_scaling = \"INCEPTION\"\n",
-    "\n",
-    "def init():\n",
-    "    global triton_client, max_batch_size, input_name, output_name, dtype\n",
-    "\n",
-    "    triton_client = tritonhttpclient.InferenceServerClient(_url)\n",
-    "\n",
-    "    max_batch_size = 0\n",
-    "    input_name = \"data_0\"\n",
-    "    output_name = \"fc6_1\"\n",
-    "    dtype = \"FP32\"\n",
-    "\n",
-    "\n",
-    "@rawhttp\n",
-    "def run(request):\n",
-    "    if request.method == 'POST':\n",
-    "        \n",
-    "        reqBody = request.get_data(False)\n",
-    "        img = Image.open(io.BytesIO(reqBody))\n",
-    "        \n",
-    "        image_data = preprocess(img, _scaling, dtype)\n",
-    "        \n",
-    "        input = tritonhttpclient.InferInput(input_name, image_data.shape, dtype)\n",
-    "        input.set_data_from_numpy(image_data, binary_data=True)\n",
-    "        output = tritonhttpclient.InferRequestedOutput(output_name, binary_data=True, class_count=1)\n",
-    "        \n",
-    "        res = triton_client.infer(_model,\n",
-    "                                  [input],\n",
-    "                                  request_id=\"0\",\n",
-    "                                  outputs=[output])\n",
-    "\n",
-    "        result = postprocess(res, output_name, 1, max_batch_size > 0)\n",
-    "\n",
-    "        return AMLResponse(result, 200)\n",
-    "    else:\n",
-    "        return AMLResponse(\"bad request\", 500)\n"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -355,8 +260,8 @@
     "# Optionally specify a worker count to leverage the capability of concurrency and server-side batching from Triton\n",
     "# env.environment_variables = {\"WORKER_COUNT\":\"128\"}\n",
     "\n",
-    "inference_config = InferenceConfig(entry_script=\"score.py\", environment=env)\n",
-    "aks_config = AksWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 4, gpu_cores = 1)\n",
+    "inference_config = InferenceConfig(entry_script=\"score.py\", source_directory=\"scripts\", environment=env)\n",
+    "aks_config = AksWebservice.deploy_configuration(cpu_cores=1, memory_gb=4, gpu_cores=1)\n",
     "\n",
     "# # Enable token auth and disable (key) auth on the webservice\n",
     "# aks_config = AksWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 4, gpu_cores = 1, token_auth_enabled=True, auth_enabled=False)"
@@ -438,6 +343,17 @@
     "model.delete()\n",
     "gpu_cluster.delete()\n"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "aks_service.wait_for_deployment(True)"
+   ]
   }
  ],
  "metadata": {
@@ -447,9 +363,9 @@
    }
   ],
   "kernelspec": {
-   "display_name": "Python 3.6.9 64-bit",
+   "display_name": "Python 3.7.7 64-bit",
    "language": "python",
-   "name": "python36964bit49f9d8f83b294f2eb4e5a3f7c26b67fb"
+   "name": "python_defaultSpec_1598590942828"
"python_defaultSpec_1598590942828" }, "language_info": { "codemirror_mode": { @@ -461,7 +377,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9-final" + "version": "3.7.7-final" } }, "nbformat": 4, diff --git a/how-to-use-azureml/deployment/production-deploy-to-aks-gpu-with-triton/scripts/score.py b/how-to-use-azureml/deployment/production-deploy-to-aks-gpu-with-triton/scripts/score.py new file mode 100644 index 000000000..088c97af6 --- /dev/null +++ b/how-to-use-azureml/deployment/production-deploy-to-aks-gpu-with-triton/scripts/score.py @@ -0,0 +1,25 @@ +import numpy as np +from PIL import Image +import sys +from functools import partial +import os +import io + +from azureml.contrib.services.aml_request import AMLRequest, rawhttp +from azureml.contrib.services.aml_response import AMLResponse +from utils import preprocess, postprocess, triton_init, triton_infer + + +def init(): + triton_init() + +@rawhttp +def run(request): + if request.method == 'POST': + reqBody = request.get_data(False) + img = Image.open(io.BytesIO(reqBody)) + result = triton_infer(model_name="densenet_onnx", img=img) + + return AMLResponse(result, 200) + else: + return AMLResponse("bad request", 500) \ No newline at end of file diff --git a/how-to-use-azureml/deployment/production-deploy-to-aks-gpu-with-triton/models/utils.py b/how-to-use-azureml/deployment/production-deploy-to-aks-gpu-with-triton/scripts/utils.py similarity index 63% rename from how-to-use-azureml/deployment/production-deploy-to-aks-gpu-with-triton/models/utils.py rename to how-to-use-azureml/deployment/production-deploy-to-aks-gpu-with-triton/scripts/utils.py index 8f118e61f..735e7f875 100644 --- a/how-to-use-azureml/deployment/production-deploy-to-aks-gpu-with-triton/models/utils.py +++ b/how-to-use-azureml/deployment/production-deploy-to-aks-gpu-with-triton/scripts/utils.py @@ -1,6 +1,7 @@ import numpy as np +import tritonhttpclient from PIL import Image -from tritonclientutils import triton_to_np_dtype +from tritonclientutils import InferenceServerException, triton_to_np_dtype def preprocess(img, scaling, dtype): @@ -71,4 +72,27 @@ def postprocess(results, output_name, batch_size, batching): cls = result.split(':') output += " {} ({}) = {}".format(cls[0], cls[1], cls[2]) - return output \ No newline at end of file + return output + +def triton_init(url="localhost:8000"): + global triton_client + triton_client = tritonhttpclient.InferenceServerClient(url) + _url = "localhost:8000" + _model = "densenet_onnx" + _scaling = "INCEPTION" + +# TODO: set input_name, output_name, dtype from model metadata, as in +# https://github.com/triton-inference-server/server/blob/master/src/clients/python/examples/image_client.py#L402 +def triton_infer(model_name, img, input_name="data_0", output_name="fc6_1", + dtype="FP32"): + image_data = preprocess(img, scaling="INCEPTION", dtype=dtype) + input = tritonhttpclient.InferInput(input_name, image_data.shape, dtype) + input.set_data_from_numpy(image_data, binary_data=True) + output = tritonhttpclient.InferRequestedOutput(output_name, binary_data=True, class_count=1) + + res = triton_client.infer(model_name, + [input], + request_id="0", + outputs=[output]) + + return postprocess(res, output_name=output_name, batch_size=1, batching=False) \ No newline at end of file