From 7a8482fe4c164713bc00b54b7bc413b8ecc680d3 Mon Sep 17 00:00:00 2001
From: jagadeesh <jagadeeshj@ideas2it.com>
Date: Mon, 13 Mar 2023 22:51:27 +0530
Subject: [PATCH 1/5] fix: kserve fastapi migration issues

Signed-off-by: jagadeesh <jagadeeshj@ideas2it.com>
---
 .../v2/mnist/mnist_v2_bytes.json              |  8 +-
 .../kf_request_json/v2/mnist/tobytes.py       | 17 +++-
 .../kserve/kserve_wrapper/TorchserveModel.py  | 81 ++-----------------
 kubernetes/kserve/kserve_wrapper/__main__.py  |  4 +-
 ts/torch_handler/request_envelope/kservev2.py | 35 +++++---
 5 files changed, 48 insertions(+), 97 deletions(-)

diff --git a/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json b/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json
index 0c07866dba..683ada7b73 100644
--- a/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json
+++ b/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json
@@ -1,10 +1,10 @@
 {
     "inputs": [
         {
-            "data": "iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAAA10lEQVR4nGNgGFhgy6xVdrCszBaLFN/mr28+/QOCr69DMCSnA8WvHti0acu/fx/10OS0X/975CDDw8DA1PDn/1pBVEmLf3+zocy2X/+8USXt/82Ds+/+m4sqeehfOpw97d9VFDmlO++t4JwQNMm6f6sZcEpee2+DR/I4A05J7tt4JJP+IUsu+ncRp6TxO9RAQJY0XvrvMAuypNNHuCTz8n+PzVEcy3DtqgiY1ptx6t8/ewY0yX9ntoDA63//Xs3hQpMMPPsPAv68qmDAAFKXwHIzMzCl6AoAxXp0QujtP+8AAAAASUVORK5CYII=",
+            "data": ["iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAAA10lEQVR4nGNgGFhgy6xVdrCszBaLFN/mr28+/QOCr69DMCSnA8WvHti0acu/fx/10OS0X/975CDDw8DA1PDn/1pBVEmLf3+zocy2X/+8USXt/82Ds+/+m4sqeehfOpw97d9VFDmlO++t4JwQNMm6f6sZcEpee2+DR/I4A05J7tt4JJP+IUsu+ncRp6TxO9RAQJY0XvrvMAuypNNHuCTz8n+PzVEcy3DtqgiY1ptx6t8/ewY0yX9ntoDA63//Xs3hQpMMPPsPAv68qmDAAFKXwHIzMzCl6AoAxXp0QujtP+8AAAAASUVORK5CYII="],
             "datatype": "BYTES",
-            "name": "312a4eb0-0ca7-4803-a101-a6d2c18486fe",
-            "shape": -1
+            "name": "e8d5afed-0a56-4deb-ac9c-352663f51b93",
+            "shape": [-1]
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/kubernetes/kserve/kf_request_json/v2/mnist/tobytes.py b/kubernetes/kserve/kf_request_json/v2/mnist/tobytes.py
index f065acd31f..71ef7d3b62 100644
--- a/kubernetes/kserve/kf_request_json/v2/mnist/tobytes.py
+++ b/kubernetes/kserve/kf_request_json/v2/mnist/tobytes.py
@@ -1,6 +1,6 @@
+import argparse
 import base64
 import json
-import argparse
 import uuid
 
 parser = argparse.ArgumentParser()
@@ -10,11 +10,20 @@
 image = open(args.filename, "rb")  # open binary file in read mode
 image_read = image.read()
 image_64_encode = base64.b64encode(image_read)
-bytes_array = image_64_encode.decode("utf-8")
+bytes_array = [image_64_encode.decode("utf-8")]  # single-element list: the whole base64 string is data[0]
 request = {
-    "inputs": [{"name": str(uuid.uuid4()), "shape": -1, "datatype": "BYTES", "data": bytes_array}]
+    "inputs": [
+        {
+            "name": str(uuid.uuid4()),
+            "shape": [-1],
+            "datatype": "BYTES",
+            "data": bytes_array,
+        }
+    ]
 }
 
-result_file = "{filename}.{ext}".format(filename=str(args.filename).split(".")[0], ext="json")
+result_file = "{filename}.{ext}".format(
+    filename=str(args.filename).split(".")[0], ext="json"
+)
 with open(result_file, "w") as outfile:
     json.dump(request, outfile, indent=4, sort_keys=True)
diff --git a/kubernetes/kserve/kserve_wrapper/TorchserveModel.py b/kubernetes/kserve/kserve_wrapper/TorchserveModel.py
index cf0aec512d..95a9c7b071 100644
--- a/kubernetes/kserve/kserve_wrapper/TorchserveModel.py
+++ b/kubernetes/kserve/kserve_wrapper/TorchserveModel.py
@@ -1,23 +1,19 @@
 """ The torchserve side inference end-points request are handled to
     return a KServe side response """
-import json
 import logging
 import pathlib
-from typing import Dict
 
 import kserve
-import tornado.web
+from kserve.errors import ModelMissingError
 from kserve.model import Model as Model
-from kserve.model import ModelMissingError
 
 logging.basicConfig(level=kserve.constants.KSERVE_LOGLEVEL)
 
+PREDICTOR_URL_FORMAT = PREDICTOR_V2_URL_FORMAT = "http://{0}/predictions/{1}"
+EXPLAINER_URL_FORMAT = EXPLAINER_V2_URL_FORMAT = "http://{0}/explanations/{1}"
 REGISTER_URL_FORMAT = "{0}/models?initial_workers=1&url={1}"
 UNREGISTER_URL_FORMAT = "{0}/models/{1}"
 
-PREDICTOR_URL_FORMAT = "http://{0}/v1/models/{1}:predict"
-EXPLAINER_URL_FORMAT = "http://{0}/v1/models/{1}:explain"
-
 
 class TorchserveModel(Model):
     """The torchserve side inference and explain end-points requests are handled to
@@ -49,76 +45,9 @@ def __init__(self, name, inference_address, management_address, model_dir):
         self.management_address = management_address
         self.model_dir = model_dir
 
-        logging.info("kfmodel Predict URL set to %s", self.predictor_host)
+        logging.info("Predict URL set to %s", self.predictor_host)
         self.explainer_host = self.predictor_host
-        logging.info("kfmodel Explain URL set to %s", self.explainer_host)
-
-    async def predict(self, request: Dict) -> Dict:
-        """The predict method is called when we hit the inference endpoint and handles
-        the inference request and response from the Torchserve side and passes it on
-        to the KServe side.
-
-        Args:
-            request (Dict): Input request from the http client side.
-
-        Raises:
-            NotImplementedError: If the predictor host on the KServe side is not
-                                 available.
-
-            tornado.web.HTTPError: If there is a bad response from the http client.
-
-        Returns:
-            Dict: The Response from the input from the inference endpoint.
-        """
-        if not self.predictor_host:
-            raise NotImplementedError
-        logging.debug("kfmodel predict request is %s", json.dumps(request))
-        logging.info("PREDICTOR_HOST : %s", self.predictor_host)
-        headers = {"Content-Type": "application/json; charset=UTF-8"}
-        response = await self._http_client.fetch(
-            PREDICTOR_URL_FORMAT.format(self.predictor_host, self.name),
-            method="POST",
-            request_timeout=self.timeout,
-            headers=headers,
-            body=json.dumps(request),
-        )
-
-        if response.code != 200:
-            raise tornado.web.HTTPError(status_code=response.code, reason=response.body)
-        return json.loads(response.body)
-
-    async def explain(self, request: Dict) -> Dict:
-        """The predict method is called when we hit the explain endpoint and handles the
-        explain request and response from the Torchserve side and passes it on to the
-        KServe side.
-
-        Args:
-            request (Dict): Input request from the http client side.
-
-        Raises:
-            NotImplementedError: If the predictor host on the KServe side is not
-                                 available.
-
-            tornado.web.HTTPError: If there is a bad response from the http client.
-
-        Returns:
-            Dict: The Response from the input from the explain endpoint.
-        """
-        if self.explainer_host is None:
-            raise NotImplementedError
-        logging.info("kfmodel explain request is %s", json.dumps(request))
-        logging.info("EXPLAINER_HOST : %s", self.explainer_host)
-        headers = {"Content-Type": "application/json; charset=UTF-8"}
-        response = await self._http_client.fetch(
-            EXPLAINER_URL_FORMAT.format(self.explainer_host, self.name),
-            method="POST",
-            request_timeout=self.timeout,
-            headers=headers,
-            body=json.dumps(request),
-        )
-        if response.code != 200:
-            raise tornado.web.HTTPError(status_code=response.code, reason=response.body)
-        return json.loads(response.body)
+        logging.info("Explain URL set to %s", self.explainer_host)
 
     def load(self) -> bool:
         """This method validates model availabilty in the model directory
diff --git a/kubernetes/kserve/kserve_wrapper/__main__.py b/kubernetes/kserve/kserve_wrapper/__main__.py
index e8063426fe..0273e44751 100644
--- a/kubernetes/kserve/kserve_wrapper/__main__.py
+++ b/kubernetes/kserve/kserve_wrapper/__main__.py
@@ -12,7 +12,7 @@
 DEFAULT_MODEL_NAME = "model"
 DEFAULT_INFERENCE_ADDRESS = "http://127.0.0.1:8085"
 INFERENCE_PORT = "8085"
-DEFAULT_MANAGEMENT_ADDRESS = "http://127.0.0.1:8081"
+DEFAULT_MANAGEMENT_ADDRESS = "http://127.0.0.1:8085"
 
 DEFAULT_MODEL_STORE = "/mnt/models/model-store"
 CONFIG_PATH = "/mnt/models/config/config.properties"
@@ -100,5 +100,5 @@ def parse_config():
     ModelServer(
         registered_models=registeredModels,
         http_port=8080,
-        grpc_port=7070,
+        grpc_port=8081,
     ).start(models)
diff --git a/ts/torch_handler/request_envelope/kservev2.py b/ts/torch_handler/request_envelope/kservev2.py
index 33e573cfb9..5a88e9497d 100644
--- a/ts/torch_handler/request_envelope/kservev2.py
+++ b/ts/torch_handler/request_envelope/kservev2.py
@@ -4,7 +4,9 @@
 """
 import json
 import logging
+
 import numpy as np
+
 from .base import BaseEnvelope
 
 logger = logging.getLogger(__name__)
@@ -87,7 +89,9 @@ def _batch_from_json(self, rows):
         Joins the instances of a batch of JSON objects
         """
         logger.debug("Parse input data %s", rows)
-        body_list = [body_list.get("data") or body_list.get("body") for body_list in rows]
+        body_list = [
+            body_list.get("data") or body_list.get("body") for body_list in rows
+        ]
         data_list = self._from_json(body_list)
         return data_list
 
@@ -99,7 +103,15 @@ def _from_json(self, body_list):
         if isinstance(body_list[0], (bytes, bytearray)):
             body_list = [json.loads(body.decode()) for body in body_list]
             logger.debug("Bytes array is %s", body_list)
-        if "id" in body_list[0]:
+
+        input_names = []
+        for index, input in enumerate(body_list[0]["inputs"]):
+            if input["datatype"] == "BYTES":
+                body_list[0]["inputs"][index]["data"] = input["data"][0]
+            input_names.append(input["name"])
+        setattr(self.context, "input_names", input_names)
+        logger.debug("Bytes array is %s", body_list)
+        if body_list[0].get("id") is not None:
             setattr(self.context, "input_request_id", body_list[0]["id"])
         data_list = [inputs_list.get("inputs") for inputs_list in body_list][0]
         return data_list
@@ -116,7 +128,7 @@ def format_output(self, data):
           "model_name": "bert",
           "model_version": "1",
           "outputs": [{
-            "name": "predict",
+            "name": "input-0",
             "shape": [1],
             "datatype": "INT64",
             "data": [2]
@@ -131,10 +143,10 @@ def format_output(self, data):
             delattr(self.context, "input_request_id")
         else:
             response["id"] = self.context.get_request_id(0)
-        response["model_name"] = self.context.manifest.get("model").get(
-            "modelName")
+        response["model_name"] = self.context.manifest.get("model").get("modelName")
         response["model_version"] = self.context.manifest.get("model").get(
-            "modelVersion")
+            "modelVersion"
+        )
         response["outputs"] = self._batch_to_json(data)
         return [response]
 
@@ -143,18 +155,19 @@ def _batch_to_json(self, data):
         Splits batch output to json objects
         """
         output = []
-        for item in data:
-            output.append(self._to_json(item))
+        input_names = getattr(self.context, "input_names")
+        delattr(self.context, "input_names")
+        for index, item in enumerate(data):
+            output.append(self._to_json(item, input_names[index]))
         return output
 
-    def _to_json(self, data):
+    def _to_json(self, data, input_name):
         """
         Constructs JSON object from data
         """
         output_data = {}
         data_ndarray = np.array(data)
-        output_data["name"] = ("explain" if self.context.get_request_header(
-            0, "explain") == "True" else "predict")
+        output_data["name"] = input_name
         output_data["shape"] = list(data_ndarray.shape)
         output_data["datatype"] = _to_datatype(data_ndarray.dtype)
         output_data["data"] = data_ndarray.flatten().tolist()

From f01868f941a8eff0e175a849884b5cd6ef2a454e Mon Sep 17 00:00:00 2001
From: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com>
Date: Mon, 15 May 2023 15:05:33 -0400
Subject: [PATCH 2/5] Updating index.md to fix 8 broken links (#2329)

* Update index.md

Update to fix a broken link in index.md where the trailing .md is cut off from the management_api.md. Added an anchor link to force the .md to show up.

* Update to index.md

Update to index.md to fix several links ending in .md that sphinx is breaking. Added anchor links to each link and a corresponding anchor in the affected doc. Tested locally and seems to be working.

* Update inference_api.md

* Updated typos

Fixed typos and updated wordlist.txt

* Update wordlist.txt

---------

Co-authored-by: sekyonda <7411+sekyonda@users.noreply.ghe.oculus-rep.com>
Co-authored-by: lxning <23464292+lxning@users.noreply.github.com>
---
 docs/index.md                           | 16 ++++++++--------
 docs/inference_api.md                   |  4 ++--
 docs/management_api.md                  | 20 ++++++++++----------
 docs/metrics.md                         |  2 +-
 docs/performance_guide.md               |  4 ++--
 examples/README.md                      |  2 +-
 examples/Workflows/README.md            |  4 ++--
 ts_scripts/spellcheck_conf/wordlist.txt |  2 ++
 8 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/docs/index.md b/docs/index.md
index 57b890b1b1..b5e7c8fc05 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -4,19 +4,19 @@ TorchServe is a performant, flexible and easy to use tool for serving PyTorch mo
 
 
 ## ⚡ Why TorchServe
-* [Model Management API](https://github.com/pytorch/serve/blob/master/docs/management_api.md): multi model management with optimized worker to model allocation
-* [Inference API](https://github.com/pytorch/serve/blob/master/docs/inference_api.md): REST and gRPC support for batched inference
-* [TorchServe Workflows](https://github.com/pytorch/serve/blob/master/examples/Workflows/README.md): deploy complex DAGs with multiple interdependent models
+* [Model Management API](https://github.com/pytorch/serve/blob/master/docs/management_api.md#management-api): multi model management with optimized worker to model allocation
+* [Inference API](https://github.com/pytorch/serve/blob/master/docs/inference_api.md#inference-api): REST and gRPC support for batched inference
+* [TorchServe Workflows](https://github.com/pytorch/serve/blob/master/examples/Workflows/README.md#workflow-examples): deploy complex DAGs with multiple interdependent models
 * Default way to serve PyTorch models in
   * [Kubeflow](https://v0-5.kubeflow.org/docs/components/pytorchserving/)
   * [MLflow](https://github.com/mlflow/mlflow-torchserve)
   * [Sagemaker](https://aws.amazon.com/blogs/machine-learning/serving-pytorch-models-in-production-with-the-amazon-sagemaker-native-torchserve-integration/)
   * [Kserve](https://kserve.github.io/website/0.8/modelserving/v1beta1/torchserve/): Supports both v1 and v2 API
   * [Vertex AI](https://cloud.google.com/blog/topics/developers-practitioners/pytorch-google-cloud-how-deploy-pytorch-models-vertex-ai)
-* Export your model for optimized inference. Torchscript out of the box, [ORT and ONNX](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [IPEX](https://github.com/pytorch/serve/tree/master/examples/intel_extension_for_pytorch), [TensorRT](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [FasterTransformer](https://github.com/pytorch/serve/tree/master/examples/FasterTransformer_HuggingFace_Bert)
-* [Performance Guide](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md): builtin support to optimize, benchmark and profile PyTorch and TorchServe performance
-* [Expressive handlers](https://github.com/pytorch/serve/blob/master/CONTRIBUTING.md): An expressive handler architecture that makes it trivial to support inferencing for your usecase with [many supported out of the box](https://github.com/pytorch/serve/tree/master/ts/torch_handler)
-* [Metrics API](https://github.com/pytorch/serve/blob/master/docs/metrics.md): out of box support for system level metrics with [Prometheus exports](https://github.com/pytorch/serve/tree/master/examples/custom_metrics), custom metrics and PyTorch profiler support
+* Export your model for optimized inference. Torchscript out of the box, [ORT and ONNX](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md#performance-guide), [IPEX](https://github.com/pytorch/serve/tree/master/examples/intel_extension_for_pytorch), [TensorRT](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md#performance-guide), [FasterTransformer](https://github.com/pytorch/serve/tree/master/examples/FasterTransformer_HuggingFace_Bert)
+* [Performance Guide](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md#performance-guide): builtin support to optimize, benchmark and profile PyTorch and TorchServe performance
+* [Expressive handlers](https://github.com/pytorch/serve/blob/master/CONTRIBUTING.md#contributing-to-torchserve): An expressive handler architecture that makes it trivial to support inferencing for your usecase with [many supported out of the box](https://github.com/pytorch/serve/tree/master/ts/torch_handler)
+* [Metrics API](https://github.com/pytorch/serve/blob/master/docs/metrics.md#torchserve-metrics): out of box support for system level metrics with [Prometheus exports](https://github.com/pytorch/serve/tree/master/examples/custom_metrics), custom metrics and PyTorch profiler support
 
 ## 🤔 How does TorchServe work
 
@@ -56,7 +56,7 @@ TorchServe is a performant, flexible and easy to use tool for serving PyTorch mo
 * [TorchServe UseCases](https://github.com/pytorch/serve/blob/master/examples/README.md#usecases)
 * [Model Zoo](https://github.com/pytorch/serve/blob/master/docs/model_zoo.md) - List of pre-trained model archives ready to be served for inference with TorchServe.
 
-For [more examples](https://github.com/pytorch/serve/blob/master/examples/README.md)
+For [more examples](https://github.com/pytorch/serve/blob/master/examples/README.md#torchserve-internals)
 
 
 ## Advanced Features
diff --git a/docs/inference_api.md b/docs/inference_api.md
index b8c1012dbb..988aabd7a4 100644
--- a/docs/inference_api.md
+++ b/docs/inference_api.md
@@ -1,4 +1,4 @@
-# Inference API
+# [Inference API](#inference-api)
 
 Inference API is listening on port 8080 and only accessible from localhost by default. To change the default setting, see [TorchServe Configuration](configuration.md).
 
@@ -41,7 +41,7 @@ If the server is running, the response is:
 }
 ```
 
-"maxRetryTimeoutInSec" (default: 5MIN) can be defined in a model's config yaml file(eg. model-config.yaml). It is the maximum time window of recovering a dead backend worker. A healthy worker can be in the state: WORKER_STARTED, WORKER_MODEL_LOADED, or WORKER_STOPPED within maxRetryTimeoutInSec window. "Ping" endpont"
+"maxRetryTimeoutInSec" (default: 5MIN) can be defined in a model's config yaml file (e.g. model-config.yaml). It is the maximum time window of recovering a dead backend worker. A healthy worker can be in the state: WORKER_STARTED, WORKER_MODEL_LOADED, or WORKER_STOPPED within maxRetryTimeoutInSec window. "Ping" endpoint:
 * return 200 + json message "healthy": for any model, the number of active workers is equal or larger than the configured minWorkers.
 * return 500 + json message "unhealthy": for any model, the number of active workers is less than the configured minWorkers.
 
diff --git a/docs/management_api.md b/docs/management_api.md
index c7e7af5d9f..991746fe52 100644
--- a/docs/management_api.md
+++ b/docs/management_api.md
@@ -1,4 +1,4 @@
-# Management API
+# [Management API](#management-api)
 
 TorchServe provides the following APIs that allows you to manage models at runtime:
 
@@ -41,13 +41,13 @@ curl -X POST  "http://localhost:8081/models?url=https://torchserve.pytorch.org/m
 }
 ```
 
-### Encrypted model serving 
+### Encrypted model serving
 If you'd like to serve an encrypted model then you need to setup [S3 SSE-KMS](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html) with the following environment variables:
 * AWS_ACCESS_KEY_ID
 * AWS_SECRET_ACCESS_KEY
 * AWS_DEFAULT_REGION
 
-And set "s3_sse_kms=true" in HTTP request. 
+And set "s3_sse_kms=true" in HTTP request.
 
 For example: model squeezenet1_1 is [encrypted on S3 under your own private account](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html). The model http url on S3 is `https://torchserve.pytorch.org/sse-test/squeezenet1_1.mar`.
 - if torchserve will run on EC2 instance (e.g. OS: ubuntu)
@@ -86,7 +86,7 @@ curl -v -X POST "http://localhost:8081/models?initial_workers=1&synchronous=fals
 < x-request-id: 4dc54158-c6de-42aa-b5dd-ebcb5f721043
 < content-length: 47
 < connection: keep-alive
-< 
+<
 {
   "status": "Processing worker updates..."
 }
@@ -102,7 +102,7 @@ curl -v -X POST "http://localhost:8081/models?initial_workers=1&synchronous=true
 < x-request-id: ecd2e502-382f-4c3b-b425-519fbf6d3b85
 < content-length: 89
 < connection: keep-alive
-< 
+<
 {
   "status": "Model \"squeezenet1_1\" Version: 1.0 registered with 1 initial workers"
 }
@@ -118,7 +118,7 @@ This API follows the [ManagementAPIsService.ScaleWorker](https://github.com/pyto
 * `min_worker` - (optional) the minimum number of worker processes. TorchServe will try to maintain this minimum for specified model. The default value is `1`.
 * `max_worker` - (optional) the maximum number of worker processes. TorchServe will make no more that this number of workers for the specified model. The default is the same as the setting for `min_worker`.
 * `synchronous` - whether or not the call is synchronous. The default value is `false`.
-* `timeout` - the specified wait time for a worker to complete all pending requests. If exceeded, the work process will be terminated. Use `0` to terminate the backend worker process immediately. Use `-1` to wait infinitely. The default value is `-1`. 
+* `timeout` - the specified wait time for a worker to complete all pending requests. If exceeded, the work process will be terminated. Use `0` to terminate the backend worker process immediately. Use `-1` to wait infinitely. The default value is `-1`.
 
 Use the Scale Worker API to dynamically adjust the number of workers for any version of a model to better serve different inference request loads.
 
@@ -134,7 +134,7 @@ curl -v -X PUT "http://localhost:8081/models/noop?min_worker=3"
 < x-request-id: 42adc58e-6956-4198-ad07-db6c620c4c1e
 < content-length: 47
 < connection: keep-alive
-< 
+<
 {
   "status": "Processing worker updates..."
 }
@@ -150,7 +150,7 @@ curl -v -X PUT "http://localhost:8081/models/noop?min_worker=3&synchronous=true"
 < x-request-id: b72b1ea0-81c6-4cce-92c4-530d3cfe5d4a
 < content-length: 63
 < connection: keep-alive
-< 
+<
 {
   "status": "Workers scaled to 3 for model: noop"
 }
@@ -169,7 +169,7 @@ curl -v -X PUT "http://localhost:8081/models/noop/2.0?min_worker=3&synchronous=t
 < x-request-id: 3997ccd4-ae44-4570-b249-e361b08d3d47
 < content-length: 77
 < connection: keep-alive
-< 
+<
 {
   "status": "Workers scaled to 3 for model: noop, version: 2.0"
 }
@@ -290,7 +290,7 @@ curl http://localhost:8081/models/noop/all
 ```
 
 `GET /models/{model_name}/{model_version}?customized=true`
-or 
+or
 `GET /models/{model_name}?customized=true`
 
 Use the Describe Model API to get detail runtime status and customized metadata of a version of a model:
diff --git a/docs/metrics.md b/docs/metrics.md
index fc578c097a..9993948683 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -1,4 +1,4 @@
-# TorchServe Metrics
+# [TorchServe Metrics](#torchserve-metrics)
 
 ## Contents of this document
 
diff --git a/docs/performance_guide.md b/docs/performance_guide.md
index aa0451b156..b22be3f7e2 100644
--- a/docs/performance_guide.md
+++ b/docs/performance_guide.md
@@ -1,4 +1,4 @@
-# Performance Guide
+# [Performance Guide](#performance-guide)
 In case you're interested in optimizing the memory usage, latency or throughput of a PyTorch model served with TorchServe, this is the guide for you.
 ## Optimizing PyTorch
 There are many tricks to optimize PyTorch models for production including but not limited to distillation, quantization, fusion, pruning, setting environment variables and we encourage you to benchmark and see what works best for you. An experimental tool that may make this process easier is https://pypi.org/project/torchprep.
@@ -9,7 +9,7 @@ In general it's hard to optimize models and the easiest approach can be exportin
 
 `pip install torchserve[onnx]`
 
-In particular TorchServe has native support for ONNX models which can be loaded via ORT for both accelerated CPU and GPU inference. ONNX operates a bit differentyl from a regular PyTorch model in that when you're running the conversion you need to explicity set and name your input and output dimensions. See https://github.com/pytorch/serve/blob/master/test/pytest/test_onnx.py for an example. So at a high level what TorchServe allows you to do is
+In particular TorchServe has native support for ONNX models which can be loaded via ORT for both accelerated CPU and GPU inference. ONNX operates a bit differently from a regular PyTorch model in that when you're running the conversion you need to explicitly set and name your input and output dimensions. See https://github.com/pytorch/serve/blob/master/test/pytest/test_onnx.py for an example. So at a high level what TorchServe allows you to do is
 1. Package serialized ONNX weights `torch-model-archiver --serialized-file model.onnx ...`
 2. Load those weights from `base_handler.py` using `ort_session = ort.InferenceSession(self.model_pt_path, providers=providers, sess_options=sess_options)` which supports reasonable defaults for both CPU and GPU inference
 3. Allow you define custom pre and post processing functions to pass in data in the format your onnx model expects with a custom handler
diff --git a/examples/README.md b/examples/README.md
index 6117d25c45..9dec94c386 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,4 +1,4 @@
-# Examples showcasing TorchServe Features and Integrations
+# [Examples showcasing TorchServe Features and Integrations](#torchserve-internals)
 
 ## TorchServe Internals
 
diff --git a/examples/Workflows/README.md b/examples/Workflows/README.md
index ea07a487ed..b08ab3ecb9 100644
--- a/examples/Workflows/README.md
+++ b/examples/Workflows/README.md
@@ -1,4 +1,4 @@
-# Workflow examples
+# [Workflow examples](#workflow-examples)
 
 Workflows can be used to compose an ensemble of Pytorch models and Python functions and package them in a `war` file. A workflow is executed as a DAG where the nodes can be either Pytorch models packaged as `mar` files or function nodes specified in the workflow handler file. The DAG can be used to define both sequential or parallel pipelines.
 
@@ -8,7 +8,7 @@ As an example a sequential pipeline may look something like
 input -> function1 -> model1 -> model2 -> function2 -> output
 ```
 
-And a parallel pipeline may look something like 
+And a parallel pipeline may look something like
 
 ```
                           model1
diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt
index e454862be7..f2a0141b1f 100644
--- a/ts_scripts/spellcheck_conf/wordlist.txt
+++ b/ts_scripts/spellcheck_conf/wordlist.txt
@@ -1049,3 +1049,5 @@ torchrun
 nproc
 largemodels
 torchpippy
+InferenceSession
+maxRetryTimeoutInSec

From 25f3700c40a7e5b05a919157ea3fad55f216bf23 Mon Sep 17 00:00:00 2001
From: Naman Nandan <namankt55@gmail.com>
Date: Tue, 16 May 2023 09:55:42 -0700
Subject: [PATCH 3/5] BERT nightly benchmark on Inferentia2 (#2283)

* Inf2 nightly benchmark

* fix linter spellcheck error

---------

Co-authored-by: Naman Nandan <namannan@amazon.com>
---
 .github/workflows/benchmark_nightly.yml       |  7 +-
 benchmarks/auto_benchmark.py                  |  2 +-
 benchmarks/benchmark_config_neuronx.yaml      | 45 ++++++++++++
 benchmarks/models_config/bert_neuronx.yaml    | 68 +++++++++++++++++++
 .../Download_Transformer_models.py            | 17 +++++
 examples/Huggingface_Transformers/README.md   |  4 +-
 ts_scripts/spellcheck_conf/wordlist.txt       |  1 +
 7 files changed, 140 insertions(+), 4 deletions(-)
 create mode 100644 benchmarks/benchmark_config_neuronx.yaml
 create mode 100644 benchmarks/models_config/bert_neuronx.yaml

diff --git a/.github/workflows/benchmark_nightly.yml b/.github/workflows/benchmark_nightly.yml
index 4348435ad6..0bb33739f9 100644
--- a/.github/workflows/benchmark_nightly.yml
+++ b/.github/workflows/benchmark_nightly.yml
@@ -10,7 +10,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        hardware: [cpu, gpu, inf1]
+        hardware: [cpu, gpu, inf1, inf2]
     runs-on:
       - self-hosted
       - ${{ matrix.hardware }}
@@ -52,6 +52,11 @@ jobs:
         env:
           NEURON_RT_NUM_CORES: 4
         run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_neuron.yaml --skip false
+      - name: Benchmark inf2 nightly
+        if: ${{ matrix.hardware == 'inf2' }}
+        env:
+          NEURON_RT_NUM_CORES: 1
+        run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_neuronx.yaml --skip false
       - name: Save benchmark artifacts
         uses: actions/upload-artifact@v2
         with:
diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py
index e642d126e3..d7bf07f062 100644
--- a/benchmarks/auto_benchmark.py
+++ b/benchmarks/auto_benchmark.py
@@ -97,7 +97,7 @@ def load_config(self):
 
         self.bm_config["model_config_path"] = (
             "{}/{}".format(MODEL_JSON_CONFIG_PATH, self.bm_config["hardware"])
-            if self.bm_config["hardware"] in ["cpu", "gpu", "neuron"]
+            if self.bm_config["hardware"] in ["cpu", "gpu", "neuron", "neuronx"]
             else "{}/cpu".format(MODEL_JSON_CONFIG_PATH)
         )
 
diff --git a/benchmarks/benchmark_config_neuronx.yaml b/benchmarks/benchmark_config_neuronx.yaml
new file mode 100644
index 0000000000..b8cb3ecf68
--- /dev/null
+++ b/benchmarks/benchmark_config_neuronx.yaml
@@ -0,0 +1,45 @@
+# TorchServe version to be installed. It can be one of the following options:
+#  - branch : "master"
+#  - nightly: "2022.3.16"
+#  - release: "0.5.3"
+# Nightly build will be installed if "ts_version" is not specified
+#ts_version:
+#    branch: &ts_version "master"
+
+# a list of model configure yaml files defined in benchmarks/models_config
+# or a list of model configure yaml files with full path
+models:
+  - "bert_neuronx.yaml"
+
+# benchmark on "cpu", "gpu", "neuron" or "neuronx".
+# "cpu" is set if "hardware" is not specified
+hardware: &hardware "neuronx"
+
+# Upload the Prometheus metrics report to remote storage or a different local path if "metrics_cmd" is set.
+# "metrics_cmd" is the command line used to upload the Prometheus metrics report to the remote system.
+# Here is an example using the AWS CloudWatch command:
+# Note:
+#    - keep the values in the same order as in the command definition.
+#    - set up the command before enabling `metrics_cmd`.
+#      For example, aws client and AWS credentials need to be setup before trying this example.
+metrics_cmd:
+  - "cmd": "aws cloudwatch put-metric-data"
+  - "--namespace": ["torchserve_benchmark_nightly_", *hardware]
+  - "--region": "us-east-2"
+  - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'
+
+# Upload the report to remote storage or a different local path if "report_cmd" is set.
+# "report_cmd" is the command line used to upload the report to remote storage.
+# Here is an example using the AWS S3 copy command:
+# Note:
+#    - keep the values in the same order as in the command.
+#    - set up the command before enabling `report_cmd`.
+#      For example, aws client, AWS credentials and S3 bucket
+#      need to be setup before trying this example.
+#    - "today()" is a keyword to apply current date in the path
+#      For example, the dest path in the following example is
+#      s3://torchserve-benchmark/nightly/2022-03-18/neuronx
+report_cmd:
+  - "cmd": "aws s3 cp --recursive"
+  - "source": '/tmp/ts_benchmark/'
+  - "dest": ['s3://torchserve-benchmark/nightly', "today()", *hardware]
diff --git a/benchmarks/models_config/bert_neuronx.yaml b/benchmarks/models_config/bert_neuronx.yaml
new file mode 100644
index 0000000000..b7e4ba46f8
--- /dev/null
+++ b/benchmarks/models_config/bert_neuronx.yaml
@@ -0,0 +1,68 @@
+---
+bert_neuronx_batch_1:
+    scripted_mode:
+        benchmark_engine: "ab"
+        url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_1.mar
+        workers:
+            - 2
+        batch_delay: 100
+        batch_size:
+            - 1
+        input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
+        requests: 10000
+        concurrency: 100
+        backend_profiling: False
+        exec_env: "local"
+        processors:
+            - "neuronx"
+
+bert_neuronx_batch_2:
+    scripted_mode:
+        benchmark_engine: "ab"
+        url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_2.mar
+        workers:
+            - 2
+        batch_delay: 100
+        batch_size:
+            - 2
+        input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
+        requests: 10000
+        concurrency: 100
+        backend_profiling: False
+        exec_env: "local"
+        processors:
+            - "neuronx"
+
+bert_neuronx_batch_4:
+    scripted_mode:
+        benchmark_engine: "ab"
+        url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_4.mar
+        workers:
+            - 2
+        batch_delay: 100
+        batch_size:
+            - 4
+        input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
+        requests: 10000
+        concurrency: 100
+        backend_profiling: False
+        exec_env: "local"
+        processors:
+            - "neuronx"
+
+bert_neuronx_batch_8:
+    scripted_mode:
+        benchmark_engine: "ab"
+        url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_8.mar
+        workers:
+            - 2
+        batch_delay: 100
+        batch_size:
+            - 8
+        input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
+        requests: 10000
+        concurrency: 100
+        backend_profiling: False
+        exec_env: "local"
+        processors:
+            - "neuronx"
diff --git a/examples/Huggingface_Transformers/Download_Transformer_models.py b/examples/Huggingface_Transformers/Download_Transformer_models.py
index ff5af3d5c5..1ae3c6fd55 100644
--- a/examples/Huggingface_Transformers/Download_Transformer_models.py
+++ b/examples/Huggingface_Transformers/Download_Transformer_models.py
@@ -121,6 +121,23 @@ def transformers_model_dowloader(
                     "traced_{}_model_neuron_batch_{}.pt".format(model_name, batch_size),
                 ),
             )
+        elif hardware == "neuronx":
+            import torch_neuronx
+
+            input_ids = torch.cat([inputs["input_ids"]] * batch_size, 0).to(device)
+            attention_mask = torch.cat([inputs["attention_mask"]] * batch_size, 0).to(
+                device
+            )
+            traced_model = torch_neuronx.trace(model, (input_ids, attention_mask))
+            torch.jit.save(
+                traced_model,
+                os.path.join(
+                    NEW_DIR,
+                    "traced_{}_model_neuronx_batch_{}.pt".format(
+                        model_name, batch_size
+                    ),
+                ),
+            )
         else:
             input_ids = inputs["input_ids"].to(device)
             attention_mask = inputs["attention_mask"].to(device)
diff --git a/examples/Huggingface_Transformers/README.md b/examples/Huggingface_Transformers/README.md
index c278973f55..0c0679d62c 100644
--- a/examples/Huggingface_Transformers/README.md
+++ b/examples/Huggingface_Transformers/README.md
@@ -51,9 +51,9 @@ In the setup_config.json :
 
 *embedding_name* : The name of embedding layer in the chosen model, this could be `bert` for `bert-base-uncased`, `roberta` for `roberta-base` or `roberta` for `xlm-roberta-large`, or `gpt2` for `gpt2` model
 
-*hardware* : The target platform to trace the model for. Specify as `neuron` for [Inferentia1](https://aws.amazon.com/ec2/instance-types/inf1/).
+*hardware* : The target platform to trace the model for. Specify as `neuron` for [Inferentia1](https://aws.amazon.com/ec2/instance-types/inf1/) and `neuronx` for [Inferentia2](https://aws.amazon.com/ec2/instance-types/inf2/).
 
-*batch_size* : Input batch size when tracing the model for `neuron` as target hardware.
+*batch_size* : Input batch size when tracing the model for `neuron` or `neuronx` as target hardware.
 
 Once, `setup_config.json` has been set properly, the next step is to run
 
diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt
index f2a0141b1f..c1cab7dfa8 100644
--- a/ts_scripts/spellcheck_conf/wordlist.txt
+++ b/ts_scripts/spellcheck_conf/wordlist.txt
@@ -1051,3 +1051,4 @@ largemodels
 torchpippy
 InferenceSession
 maxRetryTimeoutInSec
+neuronx

From f385cfbfec57cd98e088157e5ff68a365e88a6d0 Mon Sep 17 00:00:00 2001
From: jagadeesh <jagadeeshj@ideas2it.com>
Date: Mon, 13 Mar 2023 22:51:27 +0530
Subject: [PATCH 4/5] fix: kserve fastapi migration issues

Signed-off-by: jagadeesh <jagadeeshj@ideas2it.com>
---
 .../v2/mnist/mnist_v2_bytes.json              |  8 +-
 .../kf_request_json/v2/mnist/tobytes.py       | 17 +++-
 .../kserve/kserve_wrapper/TorchserveModel.py  | 81 ++-----------------
 kubernetes/kserve/kserve_wrapper/__main__.py  |  4 +-
 ts/torch_handler/request_envelope/kservev2.py | 35 +++++---
 5 files changed, 48 insertions(+), 97 deletions(-)

diff --git a/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json b/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json
index 0c07866dba..683ada7b73 100644
--- a/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json
+++ b/kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_bytes.json
@@ -1,10 +1,10 @@
 {
     "inputs": [
         {
-            "data": "iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAAA10lEQVR4nGNgGFhgy6xVdrCszBaLFN/mr28+/QOCr69DMCSnA8WvHti0acu/fx/10OS0X/975CDDw8DA1PDn/1pBVEmLf3+zocy2X/+8USXt/82Ds+/+m4sqeehfOpw97d9VFDmlO++t4JwQNMm6f6sZcEpee2+DR/I4A05J7tt4JJP+IUsu+ncRp6TxO9RAQJY0XvrvMAuypNNHuCTz8n+PzVEcy3DtqgiY1ptx6t8/ewY0yX9ntoDA63//Xs3hQpMMPPsPAv68qmDAAFKXwHIzMzCl6AoAxXp0QujtP+8AAAAASUVORK5CYII=",
+            "data": ["iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAAA10lEQVR4nGNgGFhgy6xVdrCszBaLFN/mr28+/QOCr69DMCSnA8WvHti0acu/fx/10OS0X/975CDDw8DA1PDn/1pBVEmLf3+zocy2X/+8USXt/82Ds+/+m4sqeehfOpw97d9VFDmlO++t4JwQNMm6f6sZcEpee2+DR/I4A05J7tt4JJP+IUsu+ncRp6TxO9RAQJY0XvrvMAuypNNHuCTz8n+PzVEcy3DtqgiY1ptx6t8/ewY0yX9ntoDA63//Xs3hQpMMPPsPAv68qmDAAFKXwHIzMzCl6AoAxXp0QujtP+8AAAAASUVORK5CYII="],
             "datatype": "BYTES",
-            "name": "312a4eb0-0ca7-4803-a101-a6d2c18486fe",
-            "shape": -1
+            "name": "e8d5afed-0a56-4deb-ac9c-352663f51b93",
+            "shape": [-1]
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/kubernetes/kserve/kf_request_json/v2/mnist/tobytes.py b/kubernetes/kserve/kf_request_json/v2/mnist/tobytes.py
index f065acd31f..71ef7d3b62 100644
--- a/kubernetes/kserve/kf_request_json/v2/mnist/tobytes.py
+++ b/kubernetes/kserve/kf_request_json/v2/mnist/tobytes.py
@@ -1,6 +1,6 @@
+import argparse
 import base64
 import json
-import argparse
 import uuid
 
 parser = argparse.ArgumentParser()
@@ -10,11 +10,20 @@
 image = open(args.filename, "rb")  # open binary file in read mode
 image_read = image.read()
 image_64_encode = base64.b64encode(image_read)
-bytes_array = image_64_encode.decode("utf-8")
+bytes_array = [image_64_encode.decode("utf-8")]  # one BYTES element, not a list of chars
 request = {
-    "inputs": [{"name": str(uuid.uuid4()), "shape": -1, "datatype": "BYTES", "data": bytes_array}]
+    "inputs": [
+        {
+            "name": str(uuid.uuid4()),
+            "shape": [-1],
+            "datatype": "BYTES",
+            "data": bytes_array,
+        }
+    ]
 }
 
-result_file = "{filename}.{ext}".format(filename=str(args.filename).split(".")[0], ext="json")
+result_file = "{filename}.{ext}".format(
+    filename=str(args.filename).split(".")[0], ext="json"
+)
 with open(result_file, "w") as outfile:
     json.dump(request, outfile, indent=4, sort_keys=True)
diff --git a/kubernetes/kserve/kserve_wrapper/TorchserveModel.py b/kubernetes/kserve/kserve_wrapper/TorchserveModel.py
index abf47959ed..aa28a50aa7 100644
--- a/kubernetes/kserve/kserve_wrapper/TorchserveModel.py
+++ b/kubernetes/kserve/kserve_wrapper/TorchserveModel.py
@@ -1,23 +1,19 @@
 """ The torchserve side inference end-points request are handled to
     return a KServe side response """
-import json
 import logging
 import pathlib
-from typing import Dict
 
 import kserve
-import tornado.web
+from kserve.errors import ModelMissingError
 from kserve.model import Model as Model
-from kserve.model import ModelMissingError
 
 logging.basicConfig(level=kserve.constants.KSERVE_LOGLEVEL)
 
+PREDICTOR_URL_FORMAT = PREDICTOR_V2_URL_FORMAT = "http://{0}/predictions/{1}"
+EXPLAINER_URL_FORMAT = EXPLAINER_V2_URL_FORMAT = "http://{0}/explanations/{1}"
 REGISTER_URL_FORMAT = "{0}/models?initial_workers=1&url={1}"
 UNREGISTER_URL_FORMAT = "{0}/models/{1}"
 
-PREDICTOR_URL_FORMAT = "http://{0}/v1/models/{1}:predict"
-EXPLAINER_URL_FORMAT = "http://{0}/v1/models/{1}:explain"
-
 
 class TorchserveModel(Model):
     """The torchserve side inference and explain end-points requests are handled to
@@ -49,76 +45,9 @@ def __init__(self, name, inference_address, management_address, model_dir):
         self.management_address = management_address
         self.model_dir = model_dir
 
-        logging.info("kfmodel Predict URL set to %s", self.predictor_host)
+        logging.info("Predict URL set to %s", self.predictor_host)
         self.explainer_host = self.predictor_host
-        logging.info("kfmodel Explain URL set to %s", self.explainer_host)
-
-    async def predict(self, request: Dict) -> Dict:
-        """The predict method is called when we hit the inference endpoint and handles
-        the inference request and response from the Torchserve side and passes it on
-        to the KServe side.
-
-        Args:
-            request (Dict): Input request from the http client side.
-
-        Raises:
-            NotImplementedError: If the predictor host on the KServe side is not
-                                 available.
-
-            tornado.web.HTTPError: If there is a bad response from the http client.
-
-        Returns:
-            Dict: The Response from the input from the inference endpoint.
-        """
-        if not self.predictor_host:
-            raise NotImplementedError
-        logging.debug("kfmodel predict request is %s", json.dumps(request))
-        logging.info("PREDICTOR_HOST : %s", self.predictor_host)
-        headers = {"Content-Type": "application/json; charset=UTF-8"}
-        response = await self._http_client.fetch(
-            PREDICTOR_URL_FORMAT.format(self.predictor_host, self.name),
-            method="POST",
-            request_timeout=self.timeout,
-            headers=headers,
-            body=json.dumps(request),
-        )
-
-        if response.code != 200:
-            raise tornado.web.HTTPError(status_code=response.code, reason=response.body)
-        return json.loads(response.body)
-
-    async def explain(self, request: Dict) -> Dict:
-        """The predict method is called when we hit the explain endpoint and handles the
-        explain request and response from the Torchserve side and passes it on to the
-        KServe side.
-
-        Args:
-            request (Dict): Input request from the http client side.
-
-        Raises:
-            NotImplementedError: If the predictor host on the KServe side is not
-                                 available.
-
-            tornado.web.HTTPError: If there is a bad response from the http client.
-
-        Returns:
-            Dict: The Response from the input from the explain endpoint.
-        """
-        if self.explainer_host is None:
-            raise NotImplementedError
-        logging.info("kfmodel explain request is %s", json.dumps(request))
-        logging.info("EXPLAINER_HOST : %s", self.explainer_host)
-        headers = {"Content-Type": "application/json; charset=UTF-8"}
-        response = await self._http_client.fetch(
-            EXPLAINER_URL_FORMAT.format(self.explainer_host, self.name),
-            method="POST",
-            request_timeout=self.timeout,
-            headers=headers,
-            body=json.dumps(request),
-        )
-        if response.code != 200:
-            raise tornado.web.HTTPError(status_code=response.code, reason=response.body)
-        return json.loads(response.body)
+        logging.info("Explain URL set to %s", self.explainer_host)
 
     def load(self) -> bool:
         """This method validates model availabilty in the model directory
diff --git a/kubernetes/kserve/kserve_wrapper/__main__.py b/kubernetes/kserve/kserve_wrapper/__main__.py
index e8063426fe..0273e44751 100644
--- a/kubernetes/kserve/kserve_wrapper/__main__.py
+++ b/kubernetes/kserve/kserve_wrapper/__main__.py
@@ -12,7 +12,7 @@
 DEFAULT_MODEL_NAME = "model"
 DEFAULT_INFERENCE_ADDRESS = "http://127.0.0.1:8085"
 INFERENCE_PORT = "8085"
-DEFAULT_MANAGEMENT_ADDRESS = "http://127.0.0.1:8081"
+DEFAULT_MANAGEMENT_ADDRESS = "http://127.0.0.1:8085"
 
 DEFAULT_MODEL_STORE = "/mnt/models/model-store"
 CONFIG_PATH = "/mnt/models/config/config.properties"
@@ -100,5 +100,5 @@ def parse_config():
     ModelServer(
         registered_models=registeredModels,
         http_port=8080,
-        grpc_port=7070,
+        grpc_port=8081,
     ).start(models)
diff --git a/ts/torch_handler/request_envelope/kservev2.py b/ts/torch_handler/request_envelope/kservev2.py
index 33e573cfb9..5a88e9497d 100644
--- a/ts/torch_handler/request_envelope/kservev2.py
+++ b/ts/torch_handler/request_envelope/kservev2.py
@@ -4,7 +4,9 @@
 """
 import json
 import logging
+
 import numpy as np
+
 from .base import BaseEnvelope
 
 logger = logging.getLogger(__name__)
@@ -87,7 +89,9 @@ def _batch_from_json(self, rows):
         Joins the instances of a batch of JSON objects
         """
         logger.debug("Parse input data %s", rows)
-        body_list = [body_list.get("data") or body_list.get("body") for body_list in rows]
+        body_list = [
+            body_list.get("data") or body_list.get("body") for body_list in rows
+        ]
         data_list = self._from_json(body_list)
         return data_list
 
@@ -99,7 +103,15 @@ def _from_json(self, body_list):
         if isinstance(body_list[0], (bytes, bytearray)):
             body_list = [json.loads(body.decode()) for body in body_list]
             logger.debug("Bytes array is %s", body_list)
-        if "id" in body_list[0]:
+
+        input_names = []
+        for request_input in body_list[0]["inputs"]:
+            if request_input["datatype"] == "BYTES":
+                request_input["data"] = request_input["data"][0]  # unwrap the list
+            input_names.append(request_input["name"])
+        setattr(self.context, "input_names", input_names)
+        logger.debug("Bytes array is %s", body_list)
+        if body_list[0].get("id") is not None:
             setattr(self.context, "input_request_id", body_list[0]["id"])
         data_list = [inputs_list.get("inputs") for inputs_list in body_list][0]
         return data_list
@@ -116,7 +128,7 @@ def format_output(self, data):
           "model_name": "bert",
           "model_version": "1",
           "outputs": [{
-            "name": "predict",
+            "name": "input-0",
             "shape": [1],
             "datatype": "INT64",
             "data": [2]
@@ -131,10 +143,10 @@ def format_output(self, data):
             delattr(self.context, "input_request_id")
         else:
             response["id"] = self.context.get_request_id(0)
-        response["model_name"] = self.context.manifest.get("model").get(
-            "modelName")
+        response["model_name"] = self.context.manifest.get("model").get("modelName")
         response["model_version"] = self.context.manifest.get("model").get(
-            "modelVersion")
+            "modelVersion"
+        )
         response["outputs"] = self._batch_to_json(data)
         return [response]
 
@@ -143,18 +155,19 @@ def _batch_to_json(self, data):
         Splits batch output to json objects
         """
         output = []
-        for item in data:
-            output.append(self._to_json(item))
+        input_names = getattr(self.context, "input_names")
+        delattr(self.context, "input_names")
+        for index, item in enumerate(data):
+            output.append(self._to_json(item, input_names[index]))
         return output
 
-    def _to_json(self, data):
+    def _to_json(self, data, input_name):
         """
         Constructs JSON object from data
         """
         output_data = {}
         data_ndarray = np.array(data)
-        output_data["name"] = ("explain" if self.context.get_request_header(
-            0, "explain") == "True" else "predict")
+        output_data["name"] = input_name
         output_data["shape"] = list(data_ndarray.shape)
         output_data["datatype"] = _to_datatype(data_ndarray.dtype)
         output_data["data"] = data_ndarray.flatten().tolist()

From 8de70b778a7526b565d0e1c976915ed2aa4d9b2f Mon Sep 17 00:00:00 2001
From: jagadeesh <jagadeeshj@ideas2it.com>
Date: Thu, 18 May 2023 17:21:40 +0530
Subject: [PATCH 5/5] fix docs

Signed-off-by: jagadeesh <jagadeeshj@ideas2it.com>
---
 .../kserve/kf_request_json/v2/mnist/README.md | 25 ++++++++++---------
 kubernetes/kserve/kserve_wrapper/__main__.py  |  4 ---
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/kubernetes/kserve/kf_request_json/v2/mnist/README.md b/kubernetes/kserve/kf_request_json/v2/mnist/README.md
index f8d41eb552..dcfcd1bd2b 100644
--- a/kubernetes/kserve/kf_request_json/v2/mnist/README.md
+++ b/kubernetes/kserve/kf_request_json/v2/mnist/README.md
@@ -19,13 +19,13 @@ The command will create `mnist.mar` file in current directory
 
 Move the mar file to model-store
 
-```
+```bash
 sudo mv mnist.mar /mnt/models/model-store
 ```
 
 and use the following config properties (`/mnt/models/config`)
 
-```
+```conf
 inference_address=http://0.0.0.0:8085
 management_address=http://0.0.0.0:8085
 metrics_address=http://0.0.0.0:8082
@@ -51,13 +51,13 @@ Move to `kubernetes/kserve/kf_request_json/v2/mnist`
 
 For bytes input, use [tobytes](tobytes.py) utility.
 
-```
+```bash
 python tobytes.py 0.png
 ```
 
 For tensor input, use [totensor](totensor.py) utility
 
-```
+```bash
 python totensor.py 0.png
 ```
 
@@ -66,7 +66,7 @@ python totensor.py 0.png
 
 Start TorchServe
 
-```
+```bash
 torchserve --start --ts-config /mnt/models/config/config.properties --ncs
 ```
 
@@ -74,7 +74,7 @@ To test locally, clone TorchServe and move to the following folder `kubernetes/k
 
 Start Kserve
 
-```
+```bash
 python __main__.py
 ```
 
@@ -85,12 +85,12 @@ Navigate to `kubernetes/kserve/kf_request_json/v2/mnist`
 Run the following command
 
 ```bash
-curl -v -H "ContentType: application/json" http://localhost:8080/v2/models/mnist/infer -d @./mnist_v2_bytes.json
+curl -v -H "Content-Type: application/json" http://localhost:8080/v2/models/mnist/infer -d @./mnist_v2_bytes.json
 ```
 
 Expected Output
 
-```bash
+```json
 {"id": "d3b15cad-50a2-4eaf-80ce-8b0a428bd298", "model_name": "mnist", "model_version": "1.0", "outputs": [{"name": "predict", "shape": [1], "datatype": "INT64", "data": [0]}]}
 ```
 
@@ -100,8 +100,8 @@ Expected Output
 
 Run the following command
 
-```
-curl -v -H "ContentType: application/json" http://localhost:8080/v2/models/mnist/infer -d @./mnist_v2_tensor.json
+```bash
+curl -v -H "Content-Type: application/json" http://localhost:8080/v2/models/mnist/infer -d @./mnist_v2_tensor.json
 ```
 
 Expected output
@@ -115,10 +115,11 @@ Expected output
 Run the following command
 
 ```bash
-curl -v -H "ContentType: application/json" http://localhost:8080/v2/models/mnist/explain -d @./mnist_v2_bytes.json
+curl -v -H "Content-Type: application/json" http://localhost:8080/v2/models/mnist/explain -d @./mnist_v2_bytes.json
 ```
 
 Expected output
-```bash
+
+```json
 {"id": "d3b15cad-50a2-4eaf-80ce-8b0a428bd298", "model_name": "mnist", "model_version": "1.0", "outputs": [{"name": "explain", "shape": [1, 28, 28], "datatype": "FP64", "data": [-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, 0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0040547529196303285, -0.000226128774499257, -0.00012734138382422276, 0.005648369544853077, 0.0089047843954152, 0.002638536593970295, 0.002680245911942565, -0.0026578015819202173, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00024465771891337887, 0.0008218450954311162, 0.01528591767842519, 0.007512832335428859, 0.00709498458333515, 0.0034056686436576803, -0.002091925041823873, -0.0007800293875604465, 0.02299587827540853, 0.019004329367380418, -0.0012529559050418735, -0.0014666116646934577, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005298396405518712, -0.0007901605729004231, 0.0039060659926479398, 0.023174082126728335, 0.01723791770922474, 0.010867034167828598, 0.003001563229273835, 0.00622421771715703, 0.006120712207087491, 0.01673632965122119, 0.005674718948781803, 0.004344134599735745, -0.0012328422311881568, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 0.0006867353833785289, 0.009772899792600862, -0.0038754932221901437, 0.001798693579973005, 0.001307544047675232, 
-0.0024510981010352315, -0.0008806773488194292, -0.0, -0.0, -0.00014277890760828639, -0.009322313235257151, 0.020608317727589167, 0.004351394518148479, -0.0007875566214137449, -0.0009075897508410689, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00022247238084657642, -0.0007829029819622099, 0.0026663695200516055, 0.0009733366691924418, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 0.0004323207980879993, 0.023657171939959983, 0.01069484496100618, -0.0023759529165659743, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.002074797197335781, -0.002320101263777886, -0.001289920656543141, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.007629679763806616, 0.01044862710854819, 0.00025032875474040415, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0003770836745884539, -0.005156369309364184, 0.0012477582083019567, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -4.442513564501309e-05, 0.010248046436803096, 0.0009971133914441863, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, 0.0, 0.0, -0.0, 0.0004501048922351147, -0.00196305355861066, -0.0006664792277975681, 0.0020157403871024866, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.002214456978582924, 0.008361583668963536, 0.0031401942747203444, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0028943545250037983, -0.0031301382844878753, 0.002113252994616467, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0010321050071136991, 0.008905753948020954, 0.0028464383724280478, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0053052889804602885, -0.0019271100770928186, 0.0012090042664300153, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0011945155805738324, 0.005654442809865844, 0.0020132075147173286, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0014689358119857122, 0.0010743412654248086, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, 
-0.0017047980433136346, 0.0029066051664685937, -0.0007805868937027288, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, 5.541726090138969e-05, 0.0014516115182299915, 0.0002827700518397855, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.001440140782635336, 0.002381249982038837, 0.002146825452068144, -0.0, -0.0, 0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.001150052970321427, 0.0002865015237050364, 0.0029798150346815985, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.001775029606380323, 0.000833985914685474, -0.003770739075457816, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0006093176893524411, -0.00046905781658387527, 0.0034053217440919658, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0007450012183962096, 0.001298767353118675, -0.008499247802184222, -6.145165255574976e-05, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, 0.0, 0.0011809726462884672, -0.0018384763902449712, 0.005411106715800028, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.0021392341817010304, 0.0003259163122540385, -0.005276118905978749, -0.0019509840184772497, -9.545685077687876e-07, 0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0007772404694664217, -0.0001517954537059768, 0.006481484678129392, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 8.098064554131295e-05, -0.0024904264199929506, -0.0020718618328775897, -5.3411287747038166e-05, -0.0004556472202791715, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0022750984867578, 0.001716405971437602, 0.0003221344811922982, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0015560282437342534, 9.107229584202956e-05, 0.0008772841867241755, 0.0006502979194500701, -0.004128780661881036, 0.0006030386196211547, 0.0, -0.0, 0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0013959959731925453, 0.0026791526421029673, 0.002399500793142178, -0.00044960969955281656, 0.003101832495190209, 0.007494535809079955, 
0.002864118744003058, -0.003052590549800204, 0.003420222341277871, 0.0014924017873988514, -0.0009357389226494119, 0.0007856229438140384, -0.001843397373255761, 1.6031851430693252e-05, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, -0.000699901824825285, 0.0043822508549258565, -0.003541931476855951, -0.0028896746311921715, -0.0004873454583246359, -0.006087345141728267, 0.000388224886755815, 0.002533641621974457, -0.004352836429303485, -0.0006079421449756437, -0.003810133409713042, -0.0008284413779488711, 0.0, -0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0010901530854686326, -0.013135007707490608, 0.0004734520308098294, 0.0020504232707536456, -0.006609452262924153, 0.0023647861306777536, 0.004678920703192049, -0.0018122526857900652, 0.0021375383049022263, 0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}]}
 ```
diff --git a/kubernetes/kserve/kserve_wrapper/__main__.py b/kubernetes/kserve/kserve_wrapper/__main__.py
index 0273e44751..b31e3df375 100644
--- a/kubernetes/kserve/kserve_wrapper/__main__.py
+++ b/kubernetes/kserve/kserve_wrapper/__main__.py
@@ -31,10 +31,8 @@ def parse_config():
     keys = {}
 
     with open(CONFIG_PATH) as f:
-
         for line in f:
             if separator in line:
-
                 # Find the name and value by splitting the string
                 name, value = line.split(separator, 1)
 
@@ -79,13 +77,11 @@ def parse_config():
 
 
 if __name__ == "__main__":
-
     model_names, inference_address, management_address, model_dir = parse_config()
 
     models = []
 
     for model_name in model_names:
-
         model = TorchserveModel(
             model_name, inference_address, management_address, model_dir
         )