Merge branch 'master' into llama2-streaming

chauhang authored Oct 29, 2023
2 parents f1dda59 + 7f4419f commit 0e88030
Showing 32 changed files with 811 additions and 51 deletions.
13 changes: 13 additions & 0 deletions .github/pytorch-probot.yml
@@ -0,0 +1,13 @@
tracking_issue: 2724

# List of workflows that will be re-run in case of failures
# https://github.com/pytorch/test-infra/blob/main/torchci/lib/bot/retryBot.ts
retryable_workflows:
- Run Regression Tests on Docker
- Run Regression Tests for CPU nightly binaries
- Push torchserve nightly
- Push Docker Nightly
- Docker CI
- CI CPU
- CI GPU
- Benchmark torchserve nightly
40 changes: 40 additions & 0 deletions .github/workflows/kserve_cpu_tests.yml
@@ -0,0 +1,40 @@
name: KServe CPU Nightly Tests

on:
  workflow_dispatch:
  # runs every day at 5:15am
  schedule:
    - cron: '15 5 * * *'

jobs:
  kserve-cpu-tests:
    runs-on: [self-hosted, regression-test-gpu]
    steps:
      - name: Clean up previous run
        run: |
          echo "Cleaning up previous run"
          ls -la ./
          sudo rm -rf ./* || true
          sudo rm -rf ./.??* || true
          ls -la ./
      - name: Install minikube and kubectl
        run: |
          curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
          sudo install minikube-linux-amd64 /usr/local/bin/minikube
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
          echo "/usr/local/bin" >> $GITHUB_PATH
      - name: Setup Python 3.8
        uses: actions/setup-python@v4
        with:
          python-version: 3.8
          architecture: x64
      - name: Checkout TorchServe
        uses: actions/checkout@v3
      - name: Checkout kserve repo
        uses: actions/checkout@v4
        with:
          repository: kserve/kserve
          path: kserve
      - name: Validate torchserve-kfs
        run: ./kubernetes/kserve/tests/scripts/test_mnist.sh
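The validation script `./kubernetes/kserve/tests/scripts/test_mnist.sh` is referenced but not included in this diff. As a rough, hypothetical sketch of the kind of check it performs, a KServe v1 prediction request against a deployed torchserve-kfs MNIST InferenceService could look like this (the endpoint host, model name, and payload shape are assumptions, not taken from the script):

```python
# Hypothetical sketch only: a KServe v1 inference call such as the MNIST test might issue.
# The host header, service URL, model name, and payload layout are assumed for illustration.
import base64

import requests

with open("mnist_digit.png", "rb") as f:
    payload = {"instances": [{"data": base64.b64encode(f.read()).decode("utf-8")}]}

response = requests.post(
    "http://localhost:8080/v1/models/mnist:predict",  # KServe v1 predict route
    json=payload,
    headers={"Host": "torchserve.default.example.com"},  # Knative-style host header (assumed)
)
print(response.status_code, response.json())
```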
2 changes: 1 addition & 1 deletion .github/workflows/regression_tests_cpu_binaries.yml
@@ -39,6 +39,6 @@ jobs:
- name: Install dependencies
run: |
python ts_scripts/install_dependencies.py --environment=dev
- name: Torchserve Regression Tests
- name: Validate Torchserve CPU Regression
run: |
python test/regression_tests.py --binaries --${{ matrix.binaries }} --nightly
6 changes: 3 additions & 3 deletions .github/workflows/regression_tests_docker.yml
@@ -26,7 +26,7 @@ jobs:
sudo rm -rf ./* || true
sudo rm -rf ./.??* || true
ls -la ./
docker system prune -f
docker system prune --all --volumes -f
- name: Checkout TorchServe
uses: actions/checkout@v3
- name: Branch name
@@ -42,11 +42,11 @@
run: |
cd docker
./build_image.sh -g -cv cu121 -bt ci -n -b $GITHUB_REF_NAME -t pytorch/torchserve:ci
- name: Torchserve GPU Regression Tests
- name: Validate Torchserve GPU Regression
if: false == contains(matrix.hardware, 'ubuntu')
run: |
docker run --gpus all -v $GITHUB_WORKSPACE:/home/serve pytorch/torchserve:ci
- name: Torchserve CPU Regression Tests
- name: Validate Torchserve CPU Regression
if: contains(matrix.hardware, 'ubuntu')
run: |
docker run -v $GITHUB_WORKSPACE:/home/serve pytorch/torchserve:ci
27 changes: 22 additions & 5 deletions README.md
@@ -55,19 +55,29 @@ docker pull pytorch/torchserve-nightly
Refer to [torchserve docker](docker/README.md) for details.

## ⚡ Why TorchServe
* Write once, run anywhere, on-prem, on-cloud, supports inference on CPUs, GPUs, AWS Inf1/Inf2/Trn1, Google Cloud TPUs, [Nvidia MPS](master/docs/nvidia_mps.md)
* [Model Management API](docs/management_api.md): multi model management with optimized worker to model allocation
* [Inference API](docs/inference_api.md): REST and gRPC support for batched inference
* [TorchServe Workflows](examples/Workflows/README.md): deploy complex DAGs with multiple interdependent models
* Default way to serve PyTorch models in
* [Kubeflow](https://v0-5.kubeflow.org/docs/components/pytorchserving/)
* [MLflow](https://github.com/mlflow/mlflow-torchserve)
* [Sagemaker](https://aws.amazon.com/blogs/machine-learning/serving-pytorch-models-in-production-with-the-amazon-sagemaker-native-torchserve-integration/)
* [Kserve](https://kserve.github.io/website/0.8/modelserving/v1beta1/torchserve/): Supports both v1 and v2 API
* [Vertex AI](https://cloud.google.com/blog/topics/developers-practitioners/pytorch-google-cloud-how-deploy-pytorch-models-vertex-ai)
* Export your model for optimized inference. Torchscript out of the box, [ORT and ONNX](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [IPEX](https://github.com/pytorch/serve/tree/master/examples/intel_extension_for_pytorch), [TensorRT](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [FasterTransformer](https://github.com/pytorch/serve/tree/master/examples/FasterTransformer_HuggingFace_Bert)
* [Kubernetes](master/kubernetes) with support for [autoscaling](kubernetes#session-affinity-with-multiple-torchserve-pods), session-affinity, monitoring using Grafana works on-prem, AWS EKS, Google GKE, Azure AKS
* [Kserve](https://kserve.github.io/website/0.8/modelserving/v1beta1/torchserve/): Supports both v1 and v2 API, [autoscaling and canary deployments](kubernetes/kserve/README.md#autoscaling) for A/B testing
* [Kubeflow](https://v0-5.kubeflow.org/docs/components/pytorchserving/)
* [MLflow](https://github.com/mlflow/mlflow-torchserve)
* Export your model for optimized inference. Torchscript out of the box, [PyTorch Compiler](examples/pt2/README.md) preview, [ORT and ONNX](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [IPEX](https://github.com/pytorch/serve/tree/master/examples/intel_extension_for_pytorch), [TensorRT](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [FasterTransformer](https://github.com/pytorch/serve/tree/master/examples/FasterTransformer_HuggingFace_Bert), FlashAttention (Better Transformers)
* [Performance Guide](docs/performance_guide.md): builtin support to optimize, benchmark and profile PyTorch and TorchServe performance
* [Expressive handlers](CONTRIBUTING.md): An expressive handler architecture that makes it trivial to support inferencing for your usecase with [many supported out of the box](https://github.com/pytorch/serve/tree/master/ts/torch_handler)
* [Metrics API](docs/metrics.md): out of box support for system level metrics with [Prometheus exports](https://github.com/pytorch/serve/tree/master/examples/custom_metrics), custom metrics and PyTorch profiler support
* [Metrics API](docs/metrics.md): out of box support for system level metrics with [Prometheus exports](https://github.com/pytorch/serve/tree/master/examples/custom_metrics), custom metrics,
* [Large Model Inference Guide](docs/large_model_inference.md): With support for GenAI, LLMs including
* Fast Kernels with FlashAttention v2, continuous batching and streaming response
* PyTorch [Tensor Parallel](examples/large_models/tp_llama) preview, [Pipeline Parallel](examples/large_models/Huggingface_pippy)
* Microsoft [DeepSpeed](examples/large_models/deepspeed), [DeepSpeed-Mii](examples/large_models/deepspeed_mii)
* Hugging Face [Accelerate](large_models/Huggingface_accelerate), [Diffusers](examples/diffusers)
* Running large models on AWS [Sagemaker](https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-tutorials-torchserve.html) and [Inferentia2](https://pytorch.org/blog/high-performance-llama/)
* Running [Llama 2 Chatbot locally on Mac](examples/LLM/llama2)
* Monitoring using Grafana and [Datadog](https://www.datadoghq.com/blog/ai-integrations/#model-serving-and-deployment-vertex-ai-amazon-sagemaker-torchserve)


## 🤔 How does TorchServe work
@@ -80,6 +90,7 @@ Refer to [torchserve docker](docker/README.md) for details.
* [Serving Llama 2 with TorchServe](examples/LLM/llama2/README.md)
* [Chatbot with Llama 2 on Mac 🦙💬](examples/LLM/llama2/chat_app)
* [🤗 HuggingFace Transformers](examples/Huggingface_Transformers) with a [Better Transformer Integration/ Flash Attention & Xformer Memory Efficient ](examples/Huggingface_Transformers#Speed-up-inference-with-Better-Transformer)
* [Stable Diffusion](examples/diffusers)
* [Model parallel inference](examples/Huggingface_Transformers#model-parallelism)
* [MultiModal models with MMF](https://github.com/pytorch/serve/tree/master/examples/MMF-activity-recognition) combining text, audio and video
* [Dual Neural Machine Translation](examples/Workflows/nmt_transformers_pipeline) for a complex workflow DAG
@@ -100,6 +111,12 @@ We welcome all contributions!
To learn more about how to contribute, see the contributor guide [here](https://github.com/pytorch/serve/blob/master/CONTRIBUTING.md).

## 📰 News
* [High performance Llama 2 deployments with AWS Inferentia2 using TorchServe](https://pytorch.org/blog/high-performance-llama/)
* [Naver Case Study: Transition From High-Cost GPUs to Intel CPUs and oneAPI powered Software with performance](https://pytorch.org/blog/ml-model-server-resource-saving/)
* [Run multiple generative AI models on GPU using Amazon SageMaker multi-model endpoints with TorchServe and save up to 75% in inference costs](https://aws.amazon.com/blogs/machine-learning/run-multiple-generative-ai-models-on-gpu-using-amazon-sagemaker-multi-model-endpoints-with-torchserve-and-save-up-to-75-in-inference-costs/)
* [Deploying your Generative AI model in only four steps with Vertex AI and PyTorch](https://cloud.google.com/blog/products/ai-machine-learning/get-your-genai-model-going-in-four-easy-steps)
* [PyTorch Model Serving on Google Cloud TPU v5](https://cloud.google.com/tpu/docs/v5e-inference#pytorch-model-inference-and-serving)
* [Monitoring using Datadog](https://www.datadoghq.com/blog/ai-integrations/#model-serving-and-deployment-vertex-ai-amazon-sagemaker-torchserve)
* [Torchserve Performance Tuning, Animated Drawings Case-Study](https://pytorch.org/blog/torchserve-performance-tuning/)
* [Walmart Search: Serving Models at a Scale on TorchServe](https://medium.com/walmartglobaltech/search-model-serving-using-pytorch-and-torchserve-6caf9d1c5f4d)
* [🎥 Scaling inference on CPU with TorchServe](https://www.youtube.com/watch?v=066_Jd6cwZg)
4 changes: 2 additions & 2 deletions SECURITY.md
@@ -3,8 +3,8 @@
## Supported Versions

| Version | Supported |
| ------- | ------------------ |
| 0.8.2 | :white_check_mark: |
|---------| ------------------ |
| 0.9.0 | :white_check_mark: |


## How we do security
4 changes: 2 additions & 2 deletions benchmarks/config.properties
@@ -1,5 +1,5 @@
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
inference_address=http://127.0.0.1:8080
management_address=http://127.0.0.1:8081

number_of_netty_threads=32
job_queue_size=1000
4 changes: 2 additions & 2 deletions benchmarks/config_template.properties
@@ -1,2 +1,2 @@
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
inference_address=http://127.0.0.1:8080
management_address=http://127.0.0.1:8081
2 changes: 1 addition & 1 deletion docker/build_upload_release.py
@@ -56,7 +56,7 @@
f"{organization}/torchserve:{check_ts_version()}-cpu",
f"{organization}/torchserve:{check_ts_version()}-gpu",
]:
os.system(f"docker push {image}")
try_and_handle(f"docker push {image}", dry_run)

# Cleanup built images
if args.cleanup:
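The change above swaps a bare `os.system` call for `try_and_handle(cmd, dry_run)` so that image pushes can be skipped during dry runs. The helper itself is defined elsewhere in the repository and is not part of this diff; a minimal sketch of what such a helper could look like, assuming only the call shape visible here, is:

```python
# Hypothetical sketch of a try_and_handle(cmd, dry_run) helper; the real implementation
# lives elsewhere in the repository and may differ.
import subprocess
import sys


def try_and_handle(cmd: str, dry_run: bool = False) -> None:
    """Run a shell command, or only print it when dry_run is set."""
    if dry_run:
        print(f"Dry run: {cmd}")
        return
    try:
        subprocess.run(cmd, shell=True, check=True)
    except subprocess.CalledProcessError as exc:
        print(f"Command failed: {cmd}", file=sys.stderr)
        sys.exit(exc.returncode)
```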
30 changes: 15 additions & 15 deletions docs/batch_inference_with_ts.md
@@ -166,11 +166,11 @@ curl http://localhost:8081/models/resnet-152-batch_v2
```text
$ curl http://localhost:8080/predictions/resnet-152-batch_v2 -T kitten.jpg
{
"tiger_cat": 0.5848360657691956,
"tabby": 0.3782736361026764,
"Egyptian_cat": 0.03441936895251274,
"lynx": 0.0005633446853607893,
"quilt": 0.0002698268508538604
"tiger_cat": 0.5798614621162415,
"tabby": 0.38344162702560425,
"Egyptian_cat": 0.0342114195227623,
"lynx": 0.0005819813231937587,
"quilt": 0.000273319921689108
}
```
### Batch inference of Resnet-152 configured through config.properties
@@ -249,11 +249,11 @@ curl http://localhost:8081/models/resnet-152-batch_v2
```text
$ curl http://localhost:8080/predictions/resnet-152-batch_v2 -T kitten.jpg
{
"tiger_cat": 0.5848360657691956,
"tabby": 0.3782736361026764,
"Egyptian_cat": 0.03441936895251274,
"lynx": 0.0005633446853607893,
"quilt": 0.0002698268508538604
"tiger_cat": 0.5798614621162415,
"tabby": 0.38344162702560425,
"Egyptian_cat": 0.0342114195227623,
"lynx": 0.0005819813231937587,
"quilt": 0.000273319921689108
}
```
## Demo to configure TorchServe ResNet-152 model with batch-supported model using Docker
@@ -339,10 +339,10 @@ curl http://localhost:8081/models/resnet-152-batch_v2
```text
$ curl http://localhost:8080/predictions/resnet-152-batch_v2 -T kitten.jpg
{
"tiger_cat": 0.5848360657691956,
"tabby": 0.3782736361026764,
"Egyptian_cat": 0.03441936895251274,
"lynx": 0.0005633446853607893,
"quilt": 0.0002698268508538604
"tiger_cat": 0.5798614621162415,
"tabby": 0.38344162702560425,
"Egyptian_cat": 0.0342114195227623,
"lynx": 0.0005819813231937587,
"quilt": 0.000273319921689108
}
```
6 changes: 6 additions & 0 deletions docs/index.rst
@@ -9,6 +9,12 @@ TorchServe is a performant, flexible and easy to use tool for serving PyTorch mo

What's going on in TorchServe?

* `High performance Llama 2 deployments with AWS Inferentia2 using TorchServe <https://pytorch.org/blog/high-performance-llama/>`__
* `Naver Case Study: Transition From High-Cost GPUs to Intel CPUs and oneAPI powered Software with performance <https://pytorch.org/blog/ml-model-server-resource-saving/>`__
* `Run multiple generative AI models on GPU using Amazon SageMaker multi-model endpoints with TorchServe and save up to 75% in inference costs <https://aws.amazon.com/blogs/machine-learning/run-multiple-generative-ai-models-on-gpu-using-amazon-sagemaker-multi-model-endpoints-with-torchserve-and-save-up-to-75-in-inference-costs/>`__
* `Deploying your Generative AI model in only four steps with Vertex AI and PyTorch <https://cloud.google.com/blog/products/ai-machine-learning/get-your-genai-model-going-in-four-easy-steps>`__
* `PyTorch Model Serving on Google Cloud TPUv5 <https://cloud.google.com/tpu/docs/v5e-inference#pytorch-model-inference-and-serving>`__
* `Monitoring using Datadog <https://www.datadoghq.com/blog/ai-integrations/#model-serving-and-deployment-vertex-ai-amazon-sagemaker-torchserve>`__
* `Torchserve Performance Tuning, Animated Drawings Case-Study <https://pytorch.org/blog/torchserve-performance-tuning/>`__
* `Walmart Search: Serving Models at a Scale on TorchServe <https://medium.com/walmartglobaltech/search-model-serving-using-pytorch-and-torchserve-6caf9d1c5f4d>`__
* `Scaling inference on CPU with TorchServe <https://www.youtube.com/watch?v=066_Jd6cwZg>`__
File renamed without changes.
11 changes: 11 additions & 0 deletions examples/pt2/README.md
@@ -46,6 +46,17 @@ opt_mod = torch.compile(mod)

torchserve takes care of 4 and 5 for you while the remaining steps are your responsibility. You can do the exact same thing on the vast majority of TIMM or HuggingFace models.

### Note

`torch.compile()` is a JIT compiler, and JIT compilers generally have a startup cost. If that's an issue for you, set these two environment variables to improve your warm starts.

```
import os
os.environ["TORCHINDUCTOR_CACHE_DIR"] = "1"
os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "/path/to/directory" # replace with your desired path
```

## torch.export.export

Export your model from a training script; keep in mind that an exported model cannot have graph breaks.
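As a minimal sketch of that constraint (assuming PyTorch 2.1+ and not taken from this repository's examples), `torch.export.export` captures a single whole-graph `ExportedProgram`, and a forward pass that would introduce a graph break fails to export:

```python
# Minimal sketch, assuming PyTorch >= 2.1: torch.export captures one whole graph.
# TinyModel and its shapes are illustrative placeholders, not from this repo.
import torch


class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 4)

    def forward(self, x):
        return torch.relu(self.linear(x))


exported = torch.export.export(TinyModel(), (torch.randn(2, 8),))
print(exported.graph_module.code)  # the captured graph, with no graph breaks allowed
```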
@@ -76,7 +76,7 @@ private void encodeRequest(RequestInput req, ByteBuf out) {
out.writeInt(buf.length);
out.writeBytes(buf);

if (req.isCached()) {
if (req.isCachedInBackend()) {
out.writeInt(-1); // End of List
out.writeInt(-1); // End of List
return;
@@ -92,7 +92,6 @@ private void encodeRequest(RequestInput req, ByteBuf out) {
encodeParameter(input, out);
}
out.writeInt(-1); // End of List
req.setCached(true);
}

private void encodeParameter(InputParameter parameter, ByteBuf out) {
@@ -23,4 +23,10 @@ public void setRequestBatch(List<RequestInput> requestBatch) {
public void addRequest(RequestInput req) {
batch.add(req);
}

public void setCachedInBackend(boolean cached) {
for (RequestInput input : batch) {
input.setCachedInBackend(cached);
}
}
}
@@ -73,11 +73,11 @@ public void setClientExpireTS(long clientTimeoutInMills) {
}
}

public boolean isCached() {
public boolean isCachedInBackend() {
return cached;
}

public void setCached(boolean cached) {
public void setCachedInBackend(boolean cached) {
this.cached = cached;
}
}
@@ -37,6 +37,7 @@
import org.pytorch.serve.util.codec.ModelResponseDecoder;
import org.pytorch.serve.util.messages.BaseModelRequest;
import org.pytorch.serve.util.messages.InputParameter;
import org.pytorch.serve.util.messages.ModelInferenceRequest;
import org.pytorch.serve.util.messages.ModelWorkerResponse;
import org.pytorch.serve.util.messages.RequestInput;
import org.pytorch.serve.util.messages.WorkerCommands;
@@ -208,6 +209,9 @@ public void run() {
for (int i = 0; backendChannel.size() > 0 && i < repeats; i++) {
backendChannel.get(i).writeAndFlush(req).sync();
}
if (req instanceof ModelInferenceRequest) {
((ModelInferenceRequest) req).setCachedInBackend(true);
}

ModelWorkerResponse reply = null;

@@ -313,6 +317,7 @@ public void run() {
i++) {
backendChannel.get(i).disconnect();
}
backendChannel.clear();
currentThread.set(null);
Integer exitValue = lifeCycle.getExitValue();

@@ -462,6 +467,7 @@ public void shutdown() {
backendChannel.get(i).close();
}
}
backendChannel.clear();
lifeCycle.terminateIOStreams();
Thread thread = currentThread.getAndSet(null);
if (thread != null) {
4 changes: 2 additions & 2 deletions kubernetes/EKS/README.md
@@ -506,8 +506,8 @@


```yaml
inference_address=http://127.0.0.1:8080
management_address=http://127.0.0.1:8081
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
NUM_WORKERS=1
number_of_gpu=1
number_of_netty_threads=32
6 changes: 3 additions & 3 deletions kubernetes/examples/FasterTransformer_HuggingFace_Bert.md
@@ -33,9 +33,9 @@ docker cp <container-id>:/workspace/serve/examples/FasterTransformer_HuggingFace
## Create config.properties

```bash
inference_address=http://127.0.0.1:8080
management_address=http://127.0.0.1:8081
metrics_address=http://127.0.0.1:8082
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
metrics_address=http://0.0.0.0:8082
NUM_WORKERS=1
number_of_gpu=1
install_py_dep_per_model=true
6 changes: 3 additions & 3 deletions kubernetes/kserve/README.md
@@ -109,9 +109,9 @@ torch-model-archiver --model-name mnist_kf --version 1.0 --model-file examples/i
- Step - 2 : Create a config.properties file and place the contents like below:

```bash
inference_address=http://127.0.0.1:8085
management_address=http://127.0.0.1:8081
metrics_address=http://127.0.0.1:8082
inference_address=http://0.0.0.0:8085
management_address=http://0.0.0.0:8081
metrics_address=http://0.0.0.0:8082
grpc_inference_port=7070
grpc_management_port=7071
enable_envvars_config=true
2 changes: 1 addition & 1 deletion kubernetes/kserve/build_upload_release.py
@@ -43,7 +43,7 @@
f"{organization}/torchserve-kfs:{check_ts_version()}",
f"{organization}/torchserve-kfs:{check_ts_version()}-gpu",
]:
os.system(f"docker push {image}")
try_and_handle(f"docker push {image}", dry_run)

# Cleanup built images
if args.cleanup:
6 changes: 3 additions & 3 deletions kubernetes/kserve/config.properties
@@ -1,7 +1,7 @@
#Sample config.properties. In production config.properties at /mnt/models/config/config.properties will be used
inference_address=http://127.0.0.1:8085
management_address=http://127.0.0.1:8085
metrics_address=http://127.0.0.1:8082
inference_address=http://0.0.0.0:8085
management_address=http://0.0.0.0:8085
metrics_address=http://0.0.0.0:8082
grpc_inference_port=7070
grpc_management_port=7071
enable_envvars_config=true