fix: kserve fastapi migration issues #2175

Merged: 10 commits, May 19, 2023
7 changes: 6 additions & 1 deletion .github/workflows/benchmark_nightly.yml
@@ -10,7 +10,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
        hardware: [cpu, gpu, inf1]
        hardware: [cpu, gpu, inf1, inf2]
    runs-on:
      - self-hosted
      - ${{ matrix.hardware }}
@@ -52,6 +52,11 @@ jobs:
        env:
          NEURON_RT_NUM_CORES: 4
        run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_neuron.yaml --skip false
      - name: Benchmark inf2 nightly
        if: ${{ matrix.hardware == 'inf2' }}
        env:
          NEURON_RT_NUM_CORES: 1
        run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_neuronx.yaml --skip false
      - name: Save benchmark artifacts
        uses: actions/upload-artifact@v2
        with:
2 changes: 1 addition & 1 deletion benchmarks/auto_benchmark.py
@@ -97,7 +97,7 @@ def load_config(self):

        self.bm_config["model_config_path"] = (
            "{}/{}".format(MODEL_JSON_CONFIG_PATH, self.bm_config["hardware"])
            if self.bm_config["hardware"] in ["cpu", "gpu", "neuron"]
            if self.bm_config["hardware"] in ["cpu", "gpu", "neuron", "neuronx"]
            else "{}/cpu".format(MODEL_JSON_CONFIG_PATH)
        )
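For reference, a minimal sketch of how this selection behaves once the change lands; the `MODEL_JSON_CONFIG_PATH` value below is a stand-in for the example, not the constant defined in `auto_benchmark.py`:

```python
# Illustrative only: mirrors the hardware -> model-config-directory mapping above.
MODEL_JSON_CONFIG_PATH = "/tmp/benchmark/model_json_config"  # assumed path for the example

def resolve_model_config_path(hardware: str) -> str:
    # "neuronx" now gets its own directory; anything unrecognized falls back to cpu
    if hardware in ["cpu", "gpu", "neuron", "neuronx"]:
        return "{}/{}".format(MODEL_JSON_CONFIG_PATH, hardware)
    return "{}/cpu".format(MODEL_JSON_CONFIG_PATH)

print(resolve_model_config_path("neuronx"))  # /tmp/benchmark/model_json_config/neuronx
print(resolve_model_config_path("inf2"))     # /tmp/benchmark/model_json_config/cpu (fallback)
```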

45 changes: 45 additions & 0 deletions benchmarks/benchmark_config_neuronx.yaml
@@ -0,0 +1,45 @@
# Torchserve version is to be installed. It can be one of the options
# - branch : "master"
# - nightly: "2022.3.16"
# - release: "0.5.3"
# Nightly build will be installed if "ts_version" is not specified
#ts_version:
# branch: &ts_version "master"

# a list of model configure yaml files defined in benchmarks/models_config
# or a list of model configure yaml files with full path
models:
  - "bert_neuronx.yaml"

# benchmark on "cpu", "gpu", "neuron" or "neuronx".
# "cpu" is set if "hardware" is not specified
hardware: &hardware "neuronx"

# load prometheus metrics report to remote storage or local different path if "metrics_cmd" is set.
# the command line to load prometheus metrics report to remote system.
# Here is an example of AWS cloudwatch command:
# Note:
# - keep the values order as the same as the command definition.
# - set up the command before enabling `metrics_cmd`.
# For example, aws client and AWS credentials need to be setup before trying this example.
metrics_cmd:
  - "cmd": "aws cloudwatch put-metric-data"
  - "--namespace": ["torchserve_benchmark_nightly_", *hardware]
  - "--region": "us-east-2"
  - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'

# load report to remote storage or local different path if "report_cmd" is set.
# the command line to load report to remote storage.
# Here is an example of AWS cloudwatch command:
# Note:
# - keep the values order as the same as the command.
# - set up the command before enabling `report_cmd`.
# For example, aws client, AWS credentials and S3 bucket
# need to be setup before trying this example.
# - "today()" is a keyword to apply current date in the path
# For example, the dest path in the following example is
# s3://torchserve-model-serving/benchmark/2022-03-18/gpu
report_cmd:
  - "cmd": "aws s3 cp --recursive"
  - "source": '/tmp/ts_benchmark/'
  - "dest": ['s3://torchserve-benchmark/nightly', "today()", *hardware]
68 changes: 68 additions & 0 deletions benchmarks/models_config/bert_neuronx.yaml
@@ -0,0 +1,68 @@
---
bert_neuronx_batch_1:
  scripted_mode:
    benchmark_engine: "ab"
    url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_1.mar
    workers:
      - 2
    batch_delay: 100
    batch_size:
      - 1
    input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
    requests: 10000
    concurrency: 100
    backend_profiling: False
    exec_env: "local"
    processors:
      - "neuronx"

bert_neuronx_batch_2:
  scripted_mode:
    benchmark_engine: "ab"
    url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_2.mar
    workers:
      - 2
    batch_delay: 100
    batch_size:
      - 2
    input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
    requests: 10000
    concurrency: 100
    backend_profiling: False
    exec_env: "local"
    processors:
      - "neuronx"

bert_neuronx_batch_4:
  scripted_mode:
    benchmark_engine: "ab"
    url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_4.mar
    workers:
      - 2
    batch_delay: 100
    batch_size:
      - 4
    input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
    requests: 10000
    concurrency: 100
    backend_profiling: False
    exec_env: "local"
    processors:
      - "neuronx"

bert_neuronx_batch_8:
  scripted_mode:
    benchmark_engine: "ab"
    url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification_torchscript_neuronx_batch_8.mar
    workers:
      - 2
    batch_delay: 100
    batch_size:
      - 8
    input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text.txt"
    requests: 10000
    concurrency: 100
    backend_profiling: False
    exec_env: "local"
    processors:
      - "neuronx"
16 changes: 8 additions & 8 deletions docs/index.md
@@ -4,19 +4,19 @@ TorchServe is a performant, flexible and easy to use tool for serving PyTorch mo


## ⚡ Why TorchServe
* [Model Management API](https://github.com/pytorch/serve/blob/master/docs/management_api.md): multi model management with optimized worker to model allocation
* [Inference API](https://github.com/pytorch/serve/blob/master/docs/inference_api.md): REST and gRPC support for batched inference
* [TorchServe Workflows](https://github.com/pytorch/serve/blob/master/examples/Workflows/README.md): deploy complex DAGs with multiple interdependent models
* [Model Management API](https://github.com/pytorch/serve/blob/master/docs/management_api.md#management-api): multi model management with optimized worker to model allocation
* [Inference API](https://github.com/pytorch/serve/blob/master/docs/inference_api.md#inference-api): REST and gRPC support for batched inference
* [TorchServe Workflows](https://github.com/pytorch/serve/blob/master/examples/Workflows/README.md#workflow-examples): deploy complex DAGs with multiple interdependent models
* Default way to serve PyTorch models in
  * [Kubeflow](https://v0-5.kubeflow.org/docs/components/pytorchserving/)
  * [MLflow](https://github.com/mlflow/mlflow-torchserve)
  * [Sagemaker](https://aws.amazon.com/blogs/machine-learning/serving-pytorch-models-in-production-with-the-amazon-sagemaker-native-torchserve-integration/)
  * [Kserve](https://kserve.github.io/website/0.8/modelserving/v1beta1/torchserve/): Supports both v1 and v2 API
  * [Vertex AI](https://cloud.google.com/blog/topics/developers-practitioners/pytorch-google-cloud-how-deploy-pytorch-models-vertex-ai)
* Export your model for optimized inference. Torchscript out of the box, [ORT and ONNX](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [IPEX](https://github.com/pytorch/serve/tree/master/examples/intel_extension_for_pytorch), [TensorRT](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [FasterTransformer](https://github.com/pytorch/serve/tree/master/examples/FasterTransformer_HuggingFace_Bert)
* [Performance Guide](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md): builtin support to optimize, benchmark and profile PyTorch and TorchServe performance
* [Expressive handlers](https://github.com/pytorch/serve/blob/master/CONTRIBUTING.md): An expressive handler architecture that makes it trivial to support inferencing for your usecase with [many supported out of the box](https://github.com/pytorch/serve/tree/master/ts/torch_handler)
* [Metrics API](https://github.com/pytorch/serve/blob/master/docs/metrics.md): out of box support for system level metrics with [Prometheus exports](https://github.com/pytorch/serve/tree/master/examples/custom_metrics), custom metrics and PyTorch profiler support
* Export your model for optimized inference. Torchscript out of the box, [ORT and ONNX](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md#performance-guide), [IPEX](https://github.com/pytorch/serve/tree/master/examples/intel_extension_for_pytorch), [TensorRT](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md#performance-guide), [FasterTransformer](https://github.com/pytorch/serve/tree/master/examples/FasterTransformer_HuggingFace_Bert)
* [Performance Guide](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md#performance-guide): builtin support to optimize, benchmark and profile PyTorch and TorchServe performance
* [Expressive handlers](https://github.com/pytorch/serve/blob/master/CONTRIBUTING.md#contributing-to-torchServe): An expressive handler architecture that makes it trivial to support inferencing for your usecase with [many supported out of the box](https://github.com/pytorch/serve/tree/master/ts/torch_handler)
* [Metrics API](https://github.com/pytorch/serve/blob/master/docs/metrics.md#torchserve-metrics): out of box support for system level metrics with [Prometheus exports](https://github.com/pytorch/serve/tree/master/examples/custom_metrics), custom metrics and PyTorch profiler support

## 🤔 How does TorchServe work

@@ -56,7 +56,7 @@ TorchServe is a performant, flexible and easy to use tool for serving PyTorch mo
* [TorchServe UseCases](https://github.com/pytorch/serve/blob/master/examples/README.md#usecases)
* [Model Zoo](https://github.com/pytorch/serve/blob/master/docs/model_zoo.md) - List of pre-trained model archives ready to be served for inference with TorchServe.

For [more examples](https://github.com/pytorch/serve/blob/master/examples/README.md)
For [more examples](https://github.com/pytorch/serve/blob/master/examples/README.md#torchserve-internals)


## Advanced Features
4 changes: 2 additions & 2 deletions docs/inference_api.md
@@ -1,4 +1,4 @@
# Inference API
# [Inference API](#inference-api)

Inference API is listening on port 8080 and only accessible from localhost by default. To change the default setting, see [TorchServe Configuration](configuration.md).

@@ -41,7 +41,7 @@ If the server is running, the response is:
}
```

"maxRetryTimeoutInSec" (default: 5MIN) can be defined in a model's config yaml file(eg. model-config.yaml). It is the maximum time window of recovering a dead backend worker. A healthy worker can be in the state: WORKER_STARTED, WORKER_MODEL_LOADED, or WORKER_STOPPED within maxRetryTimeoutInSec window. "Ping" endpont"
"maxRetryTimeoutInSec" (default: 5MIN) can be defined in a model's config yaml file(e.g model-config.yaml). It is the maximum time window of recovering a dead backend worker. A healthy worker can be in the state: WORKER_STARTED, WORKER_MODEL_LOADED, or WORKER_STOPPED within maxRetryTimeoutInSec window. "Ping" endpoint"
* return 200 + json message "healthy": for any model, the number of active workers is equal to or larger than the configured minWorkers.
* return 500 + json message "unhealthy": for any model, the number of active workers is less than the configured minWorkers.
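For illustration, a minimal health-check client matching the behavior described above (uses the `requests` package; not part of the docs):

```python
# Hedged example: poll the ping endpoint and interpret the status code as documented above.
import requests

resp = requests.get("http://localhost:8080/ping", timeout=5)
if resp.status_code == 200:
    print("healthy:", resp.json())    # every model has at least the configured minWorkers active
elif resp.status_code == 500:
    print("unhealthy:", resp.json())  # some model has fewer active workers than minWorkers
else:
    print("unexpected response:", resp.status_code, resp.text)
```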

20 changes: 10 additions & 10 deletions docs/management_api.md
@@ -1,4 +1,4 @@
# Management API
# [Management API](#management-api)

TorchServe provides the following APIs that allow you to manage models at runtime:

@@ -41,13 +41,13 @@ curl -X POST "http://localhost:8081/models?url=https://torchserve.pytorch.org/m
}
```

### Encrypted model serving
### Encrypted model serving
If you'd like to serve an encrypted model then you need to setup [S3 SSE-KMS](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html) with the following environment variables:
* AWS_ACCESS_KEY_ID
* AWS_SECRET_ACCESS_KEY
* AWS_DEFAULT_REGION

And set "s3_sse_kms=true" in HTTP request.
And set "s3_sse_kms=true" in HTTP request.
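As a hedged sketch (parameters other than `url` and `s3_sse_kms` are illustrative), the registration request could be issued from Python like this:

```python
# Illustrative only: register an SSE-KMS encrypted model with s3_sse_kms=true.
# Assumes the AWS credentials listed above are already set in TorchServe's environment.
import requests

params = {
    "url": "https://torchserve.pytorch.org/sse-test/squeezenet1_1.mar",
    "s3_sse_kms": "true",
    "initial_workers": 1,  # illustrative; other register-model parameters can be combined
}
resp = requests.post("http://localhost:8081/models", params=params)
print(resp.status_code, resp.text)
```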

For example: model squeezenet1_1 is [encrypted on S3 under your own private account](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html). The model http url on S3 is `https://torchserve.pytorch.org/sse-test/squeezenet1_1.mar`.
- if torchserve will run on EC2 instance (e.g. OS: ubuntu)
@@ -86,7 +86,7 @@ curl -v -X POST "http://localhost:8081/models?initial_workers=1&synchronous=fals
< x-request-id: 4dc54158-c6de-42aa-b5dd-ebcb5f721043
< content-length: 47
< connection: keep-alive
<
<
{
"status": "Processing worker updates..."
}
@@ -102,7 +102,7 @@ curl -v -X POST "http://localhost:8081/models?initial_workers=1&synchronous=true
< x-request-id: ecd2e502-382f-4c3b-b425-519fbf6d3b85
< content-length: 89
< connection: keep-alive
<
<
{
"status": "Model \"squeezenet1_1\" Version: 1.0 registered with 1 initial workers"
}
@@ -118,7 +118,7 @@ This API follows the [ManagementAPIsService.ScaleWorker](https://github.com/pyto
* `min_worker` - (optional) the minimum number of worker processes. TorchServe will try to maintain this minimum for specified model. The default value is `1`.
* `max_worker` - (optional) the maximum number of worker processes. TorchServe will make no more than this number of workers for the specified model. The default is the same as the setting for `min_worker`.
* `synchronous` - whether or not the call is synchronous. The default value is `false`.
* `timeout` - the specified wait time for a worker to complete all pending requests. If exceeded, the work process will be terminated. Use `0` to terminate the backend worker process immediately. Use `-1` to wait infinitely. The default value is `-1`.
* `timeout` - the specified wait time for a worker to complete all pending requests. If exceeded, the work process will be terminated. Use `0` to terminate the backend worker process immediately. Use `-1` to wait infinitely. The default value is `-1`.

Use the Scale Worker API to dynamically adjust the number of workers for any version of a model to better serve different inference request loads.

@@ -134,7 +134,7 @@ curl -v -X PUT "http://localhost:8081/models/noop?min_worker=3"
< x-request-id: 42adc58e-6956-4198-ad07-db6c620c4c1e
< content-length: 47
< connection: keep-alive
<
<
{
"status": "Processing worker updates..."
}
@@ -150,7 +150,7 @@ curl -v -X PUT "http://localhost:8081/models/noop?min_worker=3&synchronous=true"
< x-request-id: b72b1ea0-81c6-4cce-92c4-530d3cfe5d4a
< content-length: 63
< connection: keep-alive
<
<
{
"status": "Workers scaled to 3 for model: noop"
}
@@ -169,7 +169,7 @@ curl -v -X PUT "http://localhost:8081/models/noop/2.0?min_worker=3&synchronous=t
< x-request-id: 3997ccd4-ae44-4570-b249-e361b08d3d47
< content-length: 77
< connection: keep-alive
<
<
{
"status": "Workers scaled to 3 for model: noop, version: 2.0"
}
@@ -290,7 +290,7 @@ curl http://localhost:8081/models/noop/all
```

`GET /models/{model_name}/{model_version}?customized=true`
or
or
`GET /models/{model_name}?customized=true`

Use the Describe Model API to get detail runtime status and customized metadata of a version of a model:
2 changes: 1 addition & 1 deletion docs/metrics.md
@@ -1,4 +1,4 @@
# TorchServe Metrics
# [TorchServe Metrics](#torchserve-metrics)

## Contents of this document

4 changes: 2 additions & 2 deletions docs/performance_guide.md
@@ -1,4 +1,4 @@
# Performance Guide
# [Performance Guide](#performance-guide)
In case you're interested in optimizing the memory usage, latency or throughput of a PyTorch model served with TorchServe, this is the guide for you.
## Optimizing PyTorch
There are many tricks to optimize PyTorch models for production, including but not limited to distillation, quantization, fusion, pruning, and setting environment variables, and we encourage you to benchmark and see what works best for you. An experimental tool that may make this process easier is https://pypi.org/project/torchprep.
@@ -9,7 +9,7 @@ In general it's hard to optimize models and the easiest approach can be exportin

`pip install torchserve[onnx]`

In particular TorchServe has native support for ONNX models which can be loaded via ORT for both accelerated CPU and GPU inference. ONNX operates a bit differentyl from a regular PyTorch model in that when you're running the conversion you need to explicity set and name your input and output dimensions. See https://github.com/pytorch/serve/blob/master/test/pytest/test_onnx.py for an example. So at a high level what TorchServe allows you to do is
In particular TorchServe has native support for ONNX models which can be loaded via ORT for both accelerated CPU and GPU inference. ONNX operates a bit differently from a regular PyTorch model in that when you're running the conversion you need to explicitly set and name your input and output dimensions. See https://github.com/pytorch/serve/blob/master/test/pytest/test_onnx.py for an example. So at a high level what TorchServe allows you to do is
1. Package serialized ONNX weights `torch-model-archiver --serialized-file model.onnx ...`
2. Load those weights from `base_handler.py` using `ort_session = ort.InferenceSession(self.model_pt_path, providers=providers, sess_options=sess_options)` which supports reasonable defaults for both CPU and GPU inference
3. Allow you to define custom pre- and post-processing functions to pass in data in the format your ONNX model expects with a custom handler
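Putting steps 1-3 together, a hedged handler sketch might look like the following; the class, the file name `model.onnx`, and the input name `input` are illustrative, not TorchServe source:

```python
# Illustrative custom handler: load packaged ONNX weights with onnxruntime and add
# custom pre/post-processing. Follows the base_handler pattern but is only a sketch.
import numpy as np
import onnxruntime as ort
from ts.torch_handler.base_handler import BaseHandler


class OnnxHandler(BaseHandler):
    def initialize(self, context):
        model_dir = context.system_properties.get("model_dir")
        # model.onnx is the file passed to --serialized-file in step 1
        self.ort_session = ort.InferenceSession(f"{model_dir}/model.onnx")
        self.initialized = True

    def preprocess(self, data):
        # Shape the request payload to the input name/dims fixed when exporting to ONNX.
        body = data[0].get("body") or data[0].get("data")
        return {"input": np.asarray(body, dtype=np.float32)}

    def inference(self, inputs):
        return self.ort_session.run(None, inputs)

    def postprocess(self, outputs):
        return [outputs[0].tolist()]
```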
17 changes: 17 additions & 0 deletions examples/Huggingface_Transformers/Download_Transformer_models.py
@@ -121,6 +121,23 @@ def transformers_model_dowloader(
"traced_{}_model_neuron_batch_{}.pt".format(model_name, batch_size),
),
)
elif hardware == "neuronx":
import torch_neuronx

input_ids = torch.cat([inputs["input_ids"]] * batch_size, 0).to(device)
attention_mask = torch.cat([inputs["attention_mask"]] * batch_size, 0).to(
device
)
traced_model = torch_neuronx.trace(model, (input_ids, attention_mask))
torch.jit.save(
traced_model,
os.path.join(
NEW_DIR,
"traced_{}_model_neuronx_batch_{}.pt".format(
model_name, batch_size
),
),
)
else:
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)
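A hedged sketch of consuming the artifact produced above on an Inferentia2 host (the model name, sequence length, and batch size are illustrative):

```python
# Illustrative only: torch_neuronx must be imported so the Neuron backend is registered
# before torch.jit.load deserializes the traced module.
import torch
import torch_neuronx  # noqa: F401

batch_size = 4  # must match the batch size used when tracing
traced = torch.jit.load(f"traced_bert-base-uncased_model_neuronx_batch_{batch_size}.pt")

# Placeholder tokenized inputs with an assumed sequence length of 128.
input_ids = torch.zeros((batch_size, 128), dtype=torch.long)
attention_mask = torch.ones((batch_size, 128), dtype=torch.long)
outputs = traced(input_ids, attention_mask)
print(outputs)
```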
4 changes: 2 additions & 2 deletions examples/Huggingface_Transformers/README.md
@@ -51,9 +51,9 @@ In the setup_config.json :

*embedding_name* : The name of the embedding layer in the chosen model; this could be `bert` for `bert-base-uncased`, `roberta` for `roberta-base` or `roberta` for `xlm-roberta-large`, or `gpt2` for the `gpt2` model

*hardware* : The target platform to trace the model for. Specify as `neuron` for [Inferentia1](https://aws.amazon.com/ec2/instance-types/inf1/).
*hardware* : The target platform to trace the model for. Specify as `neuron` for [Inferentia1](https://aws.amazon.com/ec2/instance-types/inf1/) and `neuronx` for [Inferentia2](https://aws.amazon.com/ec2/instance-types/inf2/).

*batch_size* : Input batch size when tracing the model for `neuron` as target hardware.
*batch_size* : Input batch size when tracing the model for `neuron` or `neuronx` as target hardware.
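A hedged example of these fields written out as `setup_config.json` (only a few keys are shown and the values are illustrative; the real file has additional options):

```python
# Illustrative only: write a minimal setup_config.json targeting Inferentia2.
import json

setup_config = {
    "model_name": "bert-base-uncased",
    "mode": "sequence_classification",
    "embedding_name": "bert",
    "hardware": "neuronx",  # trace for Inferentia2; use "neuron" for Inferentia1
    "batch_size": "4",      # batch size to trace with
}

with open("setup_config.json", "w") as f:
    json.dump(setup_config, f, indent=2)
```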

Once `setup_config.json` has been set properly, the next step is to run

2 changes: 1 addition & 1 deletion examples/README.md
@@ -1,4 +1,4 @@
# Examples showcasing TorchServe Features and Integrations
# [Examples showcasing TorchServe Features and Integrations](#torchserve-internals)

## TorchServe Internals

4 changes: 2 additions & 2 deletions examples/Workflows/README.md
@@ -1,4 +1,4 @@
# Workflow examples
# [Workflow examples](#workflow-examples)

Workflows can be used to compose an ensemble of PyTorch models and Python functions and package them in a `war` file. A workflow is executed as a DAG where the nodes can be either PyTorch models packaged as `mar` files or function nodes specified in the workflow handler file. The DAG can be used to define both sequential and parallel pipelines.

@@ -8,7 +8,7 @@ As an example a sequential pipeline may look something like
input -> function1 -> model1 -> model2 -> function2 -> output
```

And a parallel pipeline may look something like
And a parallel pipeline may look something like

```
model1