diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
new file mode 100644
index 0000000000..89df7c5c23
--- /dev/null
+++ b/.github/pytorch-probot.yml
@@ -0,0 +1,13 @@
+tracking_issue: 2724
+
+# List of workflows that will be re-run in case of failures
+# https://github.com/pytorch/test-infra/blob/main/torchci/lib/bot/retryBot.ts
+retryable_workflows:
+- Run Regression Tests on Docker
+- Run Regression Tests for CPU nightly binaries
+- Push torchserve nightly
+- Push Docker Nightly
+- Docker CI
+- CI CPU
+- CI GPU
+- Benchmark torchserve nightly
diff --git a/.github/workflows/kserve_cpu_tests.yml b/.github/workflows/kserve_cpu_tests.yml
new file mode 100644
index 0000000000..beb91945e2
--- /dev/null
+++ b/.github/workflows/kserve_cpu_tests.yml
@@ -0,0 +1,40 @@
+name: KServe CPU Nightly Tests
+
+on:
+ workflow_dispatch:
+ # runs every day at 5:15am
+ schedule:
+ - cron: '15 5 * * *'
+
+jobs:
+ kserve-cpu-tests:
+ runs-on: [self-hosted, regression-test-gpu]
+ steps:
+ - name: Clean up previous run
+ run: |
+ echo "Cleaning up previous run"
+ ls -la ./
+ sudo rm -rf ./* || true
+ sudo rm -rf ./.??* || true
+ ls -la ./
+ - name: Install minikube and kubectl
+ run: |
+ curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
+ sudo install minikube-linux-amd64 /usr/local/bin/minikube
+ curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
+ sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
+ echo "/usr/local/bin" >> $GITHUB_PATH
+ - name: Setup Python 3.8
+ uses: actions/setup-python@v4
+ with:
+ python-version: 3.8
+ architecture: x64
+ - name: Checkout TorchServe
+ uses: actions/checkout@v3
+ - name: Checkout kserve repo
+ uses: actions/checkout@v4
+ with:
+ repository: kserve/kserve
+ path: kserve
+ - name: Validate torchserve-kfs
+ run: ./kubernetes/kserve/tests/scripts/test_mnist.sh
diff --git a/.github/workflows/regression_tests_cpu_binaries.yml b/.github/workflows/regression_tests_cpu_binaries.yml
index d5ad0878a2..858cdeaff3 100644
--- a/.github/workflows/regression_tests_cpu_binaries.yml
+++ b/.github/workflows/regression_tests_cpu_binaries.yml
@@ -39,6 +39,6 @@ jobs:
- name: Install dependencies
run: |
python ts_scripts/install_dependencies.py --environment=dev
- - name: Torchserve Regression Tests
+ - name: Validate Torchserve CPU Regression
run: |
python test/regression_tests.py --binaries --${{ matrix.binaries }} --nightly
diff --git a/.github/workflows/regression_tests_docker.yml b/.github/workflows/regression_tests_docker.yml
index b861fadecd..97b1fd7320 100644
--- a/.github/workflows/regression_tests_docker.yml
+++ b/.github/workflows/regression_tests_docker.yml
@@ -26,7 +26,7 @@ jobs:
sudo rm -rf ./* || true
sudo rm -rf ./.??* || true
ls -la ./
- docker system prune -f
+ docker system prune --all --volumes -f
- name: Checkout TorchServe
uses: actions/checkout@v3
- name: Branch name
@@ -42,11 +42,11 @@ jobs:
run: |
cd docker
./build_image.sh -g -cv cu121 -bt ci -n -b $GITHUB_REF_NAME -t pytorch/torchserve:ci
- - name: Torchserve GPU Regression Tests
+ - name: Validate Torchserve GPU Regression
if: false == contains(matrix.hardware, 'ubuntu')
run: |
docker run --gpus all -v $GITHUB_WORKSPACE:/home/serve pytorch/torchserve:ci
- - name: Torchserve CPU Regression Tests
+ - name: Validate Torchserve CPU Regression
if: contains(matrix.hardware, 'ubuntu')
run: |
docker run -v $GITHUB_WORKSPACE:/home/serve pytorch/torchserve:ci
diff --git a/README.md b/README.md
index 76cd0100ee..c72b1a4320 100644
--- a/README.md
+++ b/README.md
@@ -55,19 +55,29 @@ docker pull pytorch/torchserve-nightly
Refer to [torchserve docker](docker/README.md) for details.
## ⚡ Why TorchServe
+* Write once, run anywhere: on-prem or on-cloud, with inference on CPUs, GPUs, AWS Inf1/Inf2/Trn1, Google Cloud TPUs and [Nvidia MPS](docs/nvidia_mps.md)
* [Model Management API](docs/management_api.md): multi model management with optimized worker to model allocation
* [Inference API](docs/inference_api.md): REST and gRPC support for batched inference
* [TorchServe Workflows](examples/Workflows/README.md): deploy complex DAGs with multiple interdependent models
* Default way to serve PyTorch models in
- * [Kubeflow](https://v0-5.kubeflow.org/docs/components/pytorchserving/)
- * [MLflow](https://github.com/mlflow/mlflow-torchserve)
* [Sagemaker](https://aws.amazon.com/blogs/machine-learning/serving-pytorch-models-in-production-with-the-amazon-sagemaker-native-torchserve-integration/)
- * [Kserve](https://kserve.github.io/website/0.8/modelserving/v1beta1/torchserve/): Supports both v1 and v2 API
* [Vertex AI](https://cloud.google.com/blog/topics/developers-practitioners/pytorch-google-cloud-how-deploy-pytorch-models-vertex-ai)
-* Export your model for optimized inference. Torchscript out of the box, [ORT and ONNX](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [IPEX](https://github.com/pytorch/serve/tree/master/examples/intel_extension_for_pytorch), [TensorRT](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [FasterTransformer](https://github.com/pytorch/serve/tree/master/examples/FasterTransformer_HuggingFace_Bert)
+ * [Kubernetes](kubernetes) with support for [autoscaling](kubernetes#session-affinity-with-multiple-torchserve-pods), session affinity and monitoring using Grafana; works on-prem and on AWS EKS, Google GKE and Azure AKS
+ * [Kserve](https://kserve.github.io/website/0.8/modelserving/v1beta1/torchserve/): Supports both v1 and v2 API, [autoscaling and canary deployments](kubernetes/kserve/README.md#autoscaling) for A/B testing
+ * [Kubeflow](https://v0-5.kubeflow.org/docs/components/pytorchserving/)
+ * [MLflow](https://github.com/mlflow/mlflow-torchserve)
+* Export your model for optimized inference. Torchscript out of the box, [PyTorch Compiler](examples/pt2/README.md) preview, [ORT and ONNX](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [IPEX](https://github.com/pytorch/serve/tree/master/examples/intel_extension_for_pytorch), [TensorRT](https://github.com/pytorch/serve/blob/master/docs/performance_guide.md), [FasterTransformer](https://github.com/pytorch/serve/tree/master/examples/FasterTransformer_HuggingFace_Bert), FlashAttention (Better Transformers)
* [Performance Guide](docs/performance_guide.md): builtin support to optimize, benchmark and profile PyTorch and TorchServe performance
* [Expressive handlers](CONTRIBUTING.md): An expressive handler architecture that makes it trivial to support inferencing for your usecase with [many supported out of the box](https://github.com/pytorch/serve/tree/master/ts/torch_handler)
-* [Metrics API](docs/metrics.md): out of box support for system level metrics with [Prometheus exports](https://github.com/pytorch/serve/tree/master/examples/custom_metrics), custom metrics and PyTorch profiler support
+* [Metrics API](docs/metrics.md): out of box support for system level metrics with [Prometheus exports](https://github.com/pytorch/serve/tree/master/examples/custom_metrics), custom metrics
+* [Large Model Inference Guide](docs/large_model_inference.md): with support for GenAI and LLMs, including
+ * Fast kernels with FlashAttention v2, continuous batching and streaming responses
+ * PyTorch [Tensor Parallel](examples/large_models/tp_llama) preview, [Pipeline Parallel](examples/large_models/Huggingface_pippy)
+ * Microsoft [DeepSpeed](examples/large_models/deepspeed), [DeepSpeed-Mii](examples/large_models/deepspeed_mii)
+ * Hugging Face [Accelerate](examples/large_models/Huggingface_accelerate), [Diffusers](examples/diffusers)
+ * Running large models on AWS [Sagemaker](https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-tutorials-torchserve.html) and [Inferentia2](https://pytorch.org/blog/high-performance-llama/)
+ * Running [Llama 2 Chatbot locally on Mac](examples/LLM/llama2)
+* Monitoring using Grafana and [Datadog](https://www.datadoghq.com/blog/ai-integrations/#model-serving-and-deployment-vertex-ai-amazon-sagemaker-torchserve)
## 🤔 How does TorchServe work
@@ -80,6 +90,7 @@ Refer to [torchserve docker](docker/README.md) for details.
* [Serving Llama 2 with TorchServe](examples/LLM/llama2/README.md)
* [Chatbot with Llama 2 on Mac 🦙💬](examples/LLM/llama2/chat_app)
* [🤗 HuggingFace Transformers](examples/Huggingface_Transformers) with a [Better Transformer Integration/ Flash Attention & Xformer Memory Efficient ](examples/Huggingface_Transformers#Speed-up-inference-with-Better-Transformer)
+* [Stable Diffusion](examples/diffusers)
* [Model parallel inference](examples/Huggingface_Transformers#model-parallelism)
* [MultiModal models with MMF](https://github.com/pytorch/serve/tree/master/examples/MMF-activity-recognition) combining text, audio and video
* [Dual Neural Machine Translation](examples/Workflows/nmt_transformers_pipeline) for a complex workflow DAG
@@ -100,6 +111,12 @@ We welcome all contributions!
To learn more about how to contribute, see the contributor guide [here](https://github.com/pytorch/serve/blob/master/CONTRIBUTING.md).
## 📰 News
+* [High performance Llama 2 deployments with AWS Inferentia2 using TorchServe](https://pytorch.org/blog/high-performance-llama/)
+* [Naver Case Study: Transition From High-Cost GPUs to Intel CPUs and oneAPI powered Software with performance](https://pytorch.org/blog/ml-model-server-resource-saving/)
+* [Run multiple generative AI models on GPU using Amazon SageMaker multi-model endpoints with TorchServe and save up to 75% in inference costs](https://aws.amazon.com/blogs/machine-learning/run-multiple-generative-ai-models-on-gpu-using-amazon-sagemaker-multi-model-endpoints-with-torchserve-and-save-up-to-75-in-inference-costs/)
+* [Deploying your Generative AI model in only four steps with Vertex AI and PyTorch](https://cloud.google.com/blog/products/ai-machine-learning/get-your-genai-model-going-in-four-easy-steps)
+* [PyTorch Model Serving on Google Cloud TPU v5](https://cloud.google.com/tpu/docs/v5e-inference#pytorch-model-inference-and-serving)
+* [Monitoring using Datadog](https://www.datadoghq.com/blog/ai-integrations/#model-serving-and-deployment-vertex-ai-amazon-sagemaker-torchserve)
* [Torchserve Performance Tuning, Animated Drawings Case-Study](https://pytorch.org/blog/torchserve-performance-tuning/)
* [Walmart Search: Serving Models at a Scale on TorchServe](https://medium.com/walmartglobaltech/search-model-serving-using-pytorch-and-torchserve-6caf9d1c5f4d)
* [🎥 Scaling inference on CPU with TorchServe](https://www.youtube.com/watch?v=066_Jd6cwZg)
diff --git a/SECURITY.md b/SECURITY.md
index 38d22373c6..1f424bcfa3 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -3,8 +3,8 @@
## Supported Versions
| Version | Supported |
-| ------- | ------------------ |
-| 0.8.2 | :white_check_mark: |
+|---------| ------------------ |
+| 0.9.0 | :white_check_mark: |
## How we do security
diff --git a/benchmarks/config.properties b/benchmarks/config.properties
index 5d819a29c6..a1b672d2c2 100644
--- a/benchmarks/config.properties
+++ b/benchmarks/config.properties
@@ -1,5 +1,5 @@
-inference_address=http://0.0.0.0:8080
-management_address=http://0.0.0.0:8081
+inference_address=http://127.0.0.1:8080
+management_address=http://127.0.0.1:8081
number_of_netty_threads=32
job_queue_size=1000
diff --git a/benchmarks/config_template.properties b/benchmarks/config_template.properties
index 1b1e9772dd..c2be608e54 100644
--- a/benchmarks/config_template.properties
+++ b/benchmarks/config_template.properties
@@ -1,2 +1,2 @@
-inference_address=http://0.0.0.0:8080
-management_address=http://0.0.0.0:8081
+inference_address=http://127.0.0.1:8080
+management_address=http://127.0.0.1:8081
diff --git a/docker/build_upload_release.py b/docker/build_upload_release.py
index 8def7bb217..44c3812297 100644
--- a/docker/build_upload_release.py
+++ b/docker/build_upload_release.py
@@ -56,7 +56,7 @@
f"{organization}/torchserve:{check_ts_version()}-cpu",
f"{organization}/torchserve:{check_ts_version()}-gpu",
]:
- os.system(f"docker push {image}")
+ try_and_handle(f"docker push {image}", dry_run)
# Cleanup built images
if args.cleanup:
diff --git a/docs/batch_inference_with_ts.md b/docs/batch_inference_with_ts.md
index b4f339d5a1..3ff04be63b 100644
--- a/docs/batch_inference_with_ts.md
+++ b/docs/batch_inference_with_ts.md
@@ -166,11 +166,11 @@ curl http://localhost:8081/models/resnet-152-batch_v2
```text
$ curl http://localhost:8080/predictions/resnet-152-batch_v2 -T kitten.jpg
{
- "tiger_cat": 0.5848360657691956,
- "tabby": 0.3782736361026764,
- "Egyptian_cat": 0.03441936895251274,
- "lynx": 0.0005633446853607893,
- "quilt": 0.0002698268508538604
+ "tiger_cat": 0.5798614621162415,
+ "tabby": 0.38344162702560425,
+ "Egyptian_cat": 0.0342114195227623,
+ "lynx": 0.0005819813231937587,
+ "quilt": 0.000273319921689108
}
```
### Batch inference of Resnet-152 configured through config.properties
@@ -249,11 +249,11 @@ curl http://localhost:8081/models/resnet-152-batch_v2
```text
$ curl http://localhost:8080/predictions/resnet-152-batch_v2 -T kitten.jpg
{
- "tiger_cat": 0.5848360657691956,
- "tabby": 0.3782736361026764,
- "Egyptian_cat": 0.03441936895251274,
- "lynx": 0.0005633446853607893,
- "quilt": 0.0002698268508538604
+ "tiger_cat": 0.5798614621162415,
+ "tabby": 0.38344162702560425,
+ "Egyptian_cat": 0.0342114195227623,
+ "lynx": 0.0005819813231937587,
+ "quilt": 0.000273319921689108
}
```
## Demo to configure TorchServe ResNet-152 model with batch-supported model using Docker
@@ -339,10 +339,10 @@ curl http://localhost:8081/models/resnet-152-batch_v2
```text
$ curl http://localhost:8080/predictions/resnet-152-batch_v2 -T kitten.jpg
{
- "tiger_cat": 0.5848360657691956,
- "tabby": 0.3782736361026764,
- "Egyptian_cat": 0.03441936895251274,
- "lynx": 0.0005633446853607893,
- "quilt": 0.0002698268508538604
+ "tiger_cat": 0.5798614621162415,
+ "tabby": 0.38344162702560425,
+ "Egyptian_cat": 0.0342114195227623,
+ "lynx": 0.0005819813231937587,
+ "quilt": 0.000273319921689108
}
```
diff --git a/docs/index.rst b/docs/index.rst
index f16037417e..06a36018fc 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -9,6 +9,12 @@ TorchServe is a performant, flexible and easy to use tool for serving PyTorch mo
What's going on in TorchServe?
+* `High performance Llama 2 deployments with AWS Inferentia2 using TorchServe <https://pytorch.org/blog/high-performance-llama/>`__
+* `Naver Case Study: Transition From High-Cost GPUs to Intel CPUs and oneAPI powered Software with performance <https://pytorch.org/blog/ml-model-server-resource-saving/>`__
+* `Run multiple generative AI models on GPU using Amazon SageMaker multi-model endpoints with TorchServe and save up to 75% in inference costs <https://aws.amazon.com/blogs/machine-learning/run-multiple-generative-ai-models-on-gpu-using-amazon-sagemaker-multi-model-endpoints-with-torchserve-and-save-up-to-75-in-inference-costs/>`__
+* `Deploying your Generative AI model in only four steps with Vertex AI and PyTorch <https://cloud.google.com/blog/products/ai-machine-learning/get-your-genai-model-going-in-four-easy-steps>`__
+* `PyTorch Model Serving on Google Cloud TPUv5 <https://cloud.google.com/tpu/docs/v5e-inference#pytorch-model-inference-and-serving>`__
+* `Monitoring using Datadog <https://www.datadoghq.com/blog/ai-integrations/#model-serving-and-deployment-vertex-ai-amazon-sagemaker-torchserve>`__
* `Torchserve Performance Tuning, Animated Drawings Case-Study <https://pytorch.org/blog/torchserve-performance-tuning/>`__
* `Walmart Search: Serving Models at a Scale on TorchServe <https://medium.com/walmartglobaltech/search-model-serving-using-pytorch-and-torchserve-6caf9d1c5f4d>`__
* `Scaling inference on CPU with TorchServe <https://www.youtube.com/watch?v=066_Jd6cwZg>`__
diff --git a/examples/large_models/tp_llama/REAME.md b/examples/large_models/tp_llama/README.md
similarity index 100%
rename from examples/large_models/tp_llama/REAME.md
rename to examples/large_models/tp_llama/README.md
diff --git a/examples/pt2/README.md b/examples/pt2/README.md
index dbffc749ec..0758b089af 100644
--- a/examples/pt2/README.md
+++ b/examples/pt2/README.md
@@ -46,6 +46,17 @@ opt_mod = torch.compile(mod)
torchserve takes care of 4 and 5 for you while the remaining steps are your responsibility. You can do the exact same thing on the vast majority of TIMM or HuggingFace models.
+### Note
+
+`torch.compile()` is a JIT compiler and JIT compilers generally have a startup cost. If that's an issue for you, set these two environment variables to improve your warm starts.
+
+```
+import os
+
+# Cache compiled artifacts on disk and enable the FX graph cache so they can be reused across restarts
+os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/path/to/directory"  # replace with your desired path
+os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
+```
+
## torch.export.export
Export your model from a training script, keep in mind that an exported model cannot have graph breaks.
diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/codec/ModelRequestEncoder.java b/frontend/server/src/main/java/org/pytorch/serve/util/codec/ModelRequestEncoder.java
index 57348de638..1f89f4a48a 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/util/codec/ModelRequestEncoder.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/util/codec/ModelRequestEncoder.java
@@ -76,7 +76,7 @@ private void encodeRequest(RequestInput req, ByteBuf out) {
out.writeInt(buf.length);
out.writeBytes(buf);
- if (req.isCached()) {
+ if (req.isCachedInBackend()) {
out.writeInt(-1); // End of List
out.writeInt(-1); // End of List
return;
@@ -92,7 +92,6 @@ private void encodeRequest(RequestInput req, ByteBuf out) {
encodeParameter(input, out);
}
out.writeInt(-1); // End of List
- req.setCached(true);
}
private void encodeParameter(InputParameter parameter, ByteBuf out) {
diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/messages/ModelInferenceRequest.java b/frontend/server/src/main/java/org/pytorch/serve/util/messages/ModelInferenceRequest.java
index 9a4c73af76..e83b6d95eb 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/util/messages/ModelInferenceRequest.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/util/messages/ModelInferenceRequest.java
@@ -23,4 +23,10 @@ public void setRequestBatch(List<RequestInput> requestBatch) {
public void addRequest(RequestInput req) {
batch.add(req);
}
+
+ public void setCachedInBackend(boolean cached) {
+ for (RequestInput input : batch) {
+ input.setCachedInBackend(cached);
+ }
+ }
}
diff --git a/frontend/server/src/main/java/org/pytorch/serve/util/messages/RequestInput.java b/frontend/server/src/main/java/org/pytorch/serve/util/messages/RequestInput.java
index 5717908f0f..0db8e84064 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/util/messages/RequestInput.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/util/messages/RequestInput.java
@@ -73,11 +73,11 @@ public void setClientExpireTS(long clientTimeoutInMills) {
}
}
- public boolean isCached() {
+ public boolean isCachedInBackend() {
return cached;
}
- public void setCached(boolean cached) {
+ public void setCachedInBackend(boolean cached) {
this.cached = cached;
}
}
diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerThread.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerThread.java
index 90f294e5cf..178bbb91a2 100644
--- a/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerThread.java
+++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/WorkerThread.java
@@ -37,6 +37,7 @@
import org.pytorch.serve.util.codec.ModelResponseDecoder;
import org.pytorch.serve.util.messages.BaseModelRequest;
import org.pytorch.serve.util.messages.InputParameter;
+import org.pytorch.serve.util.messages.ModelInferenceRequest;
import org.pytorch.serve.util.messages.ModelWorkerResponse;
import org.pytorch.serve.util.messages.RequestInput;
import org.pytorch.serve.util.messages.WorkerCommands;
@@ -208,6 +209,9 @@ public void run() {
for (int i = 0; backendChannel.size() > 0 && i < repeats; i++) {
backendChannel.get(i).writeAndFlush(req).sync();
}
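+ // Mark the request inputs as cached in the backend only after the request has been flushed to every backend channel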
+ if (req instanceof ModelInferenceRequest) {
+ ((ModelInferenceRequest) req).setCachedInBackend(true);
+ }
ModelWorkerResponse reply = null;
@@ -313,6 +317,7 @@ public void run() {
i++) {
backendChannel.get(i).disconnect();
}
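+ // Remove the now-disconnected channels so a restarted worker starts with an empty channel list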
+ backendChannel.clear();
currentThread.set(null);
Integer exitValue = lifeCycle.getExitValue();
@@ -462,6 +467,7 @@ public void shutdown() {
backendChannel.get(i).close();
}
}
+ backendChannel.clear();
lifeCycle.terminateIOStreams();
Thread thread = currentThread.getAndSet(null);
if (thread != null) {
diff --git a/kubernetes/EKS/README.md b/kubernetes/EKS/README.md
index 14a7b656fe..c932f5e914 100644
--- a/kubernetes/EKS/README.md
+++ b/kubernetes/EKS/README.md
@@ -506,8 +506,8 @@
```yaml
- inference_address=http://127.0.0.1:8080
- management_address=http://127.0.0.1:8081
+ inference_address=http://0.0.0.0:8080
+ management_address=http://0.0.0.0:8081
NUM_WORKERS=1
number_of_gpu=1
number_of_netty_threads=32
diff --git a/kubernetes/examples/FasterTransformer_HuggingFace_Bert.md b/kubernetes/examples/FasterTransformer_HuggingFace_Bert.md
index 53f9c49827..7d1b696e0b 100644
--- a/kubernetes/examples/FasterTransformer_HuggingFace_Bert.md
+++ b/kubernetes/examples/FasterTransformer_HuggingFace_Bert.md
@@ -33,9 +33,9 @@ docker cp :/workspace/serve/examples/FasterTransformer_HuggingFace
## Create config.properties
```bash
-inference_address=http://127.0.0.1:8080
-management_address=http://127.0.0.1:8081
-metrics_address=http://127.0.0.1:8082
+inference_address=http://0.0.0.0:8080
+management_address=http://0.0.0.0:8081
+metrics_address=http://0.0.0.0:8082
NUM_WORKERS=1
number_of_gpu=1
install_py_dep_per_model=true
diff --git a/kubernetes/kserve/README.md b/kubernetes/kserve/README.md
index f439bd7ce7..cf54a6ce73 100644
--- a/kubernetes/kserve/README.md
+++ b/kubernetes/kserve/README.md
@@ -109,9 +109,9 @@ torch-model-archiver --model-name mnist_kf --version 1.0 --model-file examples/i
- Step - 2 : Create a config.properties file and place the contents like below:
```bash
-inference_address=http://127.0.0.1:8085
-management_address=http://127.0.0.1:8081
-metrics_address=http://127.0.0.1:8082
+inference_address=http://0.0.0.0:8085
+management_address=http://0.0.0.0:8081
+metrics_address=http://0.0.0.0:8082
grpc_inference_port=7070
grpc_management_port=7071
enable_envvars_config=true
diff --git a/kubernetes/kserve/build_upload_release.py b/kubernetes/kserve/build_upload_release.py
index d10ae8533f..55183c7a03 100644
--- a/kubernetes/kserve/build_upload_release.py
+++ b/kubernetes/kserve/build_upload_release.py
@@ -43,7 +43,7 @@
f"{organization}/torchserve-kfs:{check_ts_version()}",
f"{organization}/torchserve-kfs:{check_ts_version()}-gpu",
]:
- os.system(f"docker push {image}")
+ try_and_handle(f"docker push {image}", dry_run)
# Cleanup built images
if args.cleanup:
diff --git a/kubernetes/kserve/config.properties b/kubernetes/kserve/config.properties
index 91fbb7483b..422e53d138 100644
--- a/kubernetes/kserve/config.properties
+++ b/kubernetes/kserve/config.properties
@@ -1,7 +1,7 @@
#Sample config.properties. In production config.properties at /mnt/models/config/config.properties will be used
-inference_address=http://127.0.0.1:8085
-management_address=http://127.0.0.1:8085
-metrics_address=http://127.0.0.1:8082
+inference_address=http://0.0.0.0:8085
+management_address=http://0.0.0.0:8085
+metrics_address=http://0.0.0.0:8082
grpc_inference_port=7070
grpc_management_port=7071
enable_envvars_config=true
diff --git a/kubernetes/kserve/examples/mnist/MNIST.md b/kubernetes/kserve/examples/mnist/MNIST.md
new file mode 100644
index 0000000000..24efd2a2bc
--- /dev/null
+++ b/kubernetes/kserve/examples/mnist/MNIST.md
@@ -0,0 +1,164 @@
+# Digit recognition model with MNIST dataset using a Kubernetes cluster
+
+In this example, we show how to use a pre-trained custom MNIST model to perform real-time digit recognition with TorchServe.
+We will be serving the model using KServe deployed with [minikube](https://minikube.sigs.k8s.io/docs/start/).
+
+The inference service returns the digit that the model infers from the input image.
+
+
+## Install kserve
+
+Start minikube cluster
+
+```
+minikube start
+```
+
+For this example, we need to git clone [kserve](https://github.com/kserve/kserve).
+Run the commands in the following steps from the root of the cloned repository. For example, if you cloned the repository into /home/my_path/kserve, run the steps from /home/my_path/kserve.
+
+Run the following for quick install of kserve
+```
+./hack/quick_install.sh
+```
+
+Make sure kserve is installed on the minikube cluster using
+
+```
+kubectl get pods -n kserve
+```
+
+This should result in
+```
+NAME READY STATUS RESTARTS AGE
+kserve-controller-manager-57574b4878-rnsjn 2/2 Running 0 17s
+```
+
+TorchServe supports the KServe V1 and V2 protocols. We show how to deploy MNIST with both.
+
+## KServe V1 protocol
+
+Deploy `InferenceService` with Kserve V1 protocol
+
+```
+kubectl apply -f docs/samples/v1beta1/torchserve/v1/torchserve.yaml
+```
+
+results in
+
+```
+inferenceservice.serving.kserve.io/torchserve created
+```
+
+We need to wait till the pod is up
+
+```
+kubectl get pods
+NAME READY STATUS RESTARTS AGE
+torchserve-predictor-00001-deployment-8d66f9c-dkdhr 2/2 Running 0 8m19s
+```
+
+We need to set the following
+
+```
+MODEL_NAME=mnist
+SERVICE_HOSTNAME=$(kubectl get inferenceservice torchserve -o jsonpath='{.status.url}' | cut -d "/" -f 3)
+```
+
+```
+export INGRESS_HOST=localhost
+export INGRESS_PORT=8080
+```
+
+```
+INGRESS_GATEWAY_SERVICE=$(kubectl get svc --namespace istio-system --selector="app=istio-ingressgateway" --output jsonpath='{.items[0].metadata.name}')
+kubectl port-forward --namespace istio-system svc/${INGRESS_GATEWAY_SERVICE} 8080:80 &
+```
+
+Make an inference request
+
+```
+curl -H "Content-Type: application/json" -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict -d @./docs/samples/v1beta1/torchserve/v1/mnist.json
+```
+
+Expected output is
+
+```
+{"predictions":[2]}
+```
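+
+The same request can also be made from Python. Below is a minimal sketch using the `requests` library; it assumes `SERVICE_HOSTNAME` has been exported alongside the ingress variables above and that it is run from the kserve repository root:
+
+```
+import json
+import os
+
+import requests
+
+url = f"http://{os.environ['INGRESS_HOST']}:{os.environ['INGRESS_PORT']}/v1/models/mnist:predict"
+
+with open("docs/samples/v1beta1/torchserve/v1/mnist.json") as f:
+    payload = json.load(f)
+
+# The Host header lets the Istio ingress gateway route the request to the InferenceService
+response = requests.post(url, json=payload, headers={"Host": os.environ["SERVICE_HOSTNAME"]})
+print(response.json())  # expected: {'predictions': [2]}
+```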
+
+## KServe V2 protocol
+
+Deploy `InferenceService` with Kserve V2 protocol
+
+```
+kubectl apply -f docs/samples/v1beta1/torchserve/v2/mnist.yaml
+```
+
+results in
+
+```
+inferenceservice.serving.kserve.io/torchserve-mnist-v2 created
+```
+
+We need to check that the pod is running with
+
+```
+kubectl get pods
+NAME READY STATUS RESTARTS AGE
+torchserve-mnist-v2-predictor-00001-deployment-6c8c684dcb-4mfmr 2/2 Running 0 2m37s
+```
+
+Inspect the logs of the pod to check the TorchServe version
+
+```
+kubectl logs torchserve-mnist-v2-predictor-00001-deployment-6c8c684dcb-4mfmr
+Defaulted container "kserve-container" out of: kserve-container, queue-proxy, storage-initializer (init)
+WARNING: sun.reflect.Reflection.getCallerClass is not supported. This will impact performance.
+2023-10-12T20:50:39,466 [WARN ] main org.pytorch.serve.util.ConfigManager - Your torchserve instance can access any URL to load models. When deploying to production, make sure to limit the set of allowed_urls in config.properties
+2023-10-12T20:50:39,468 [INFO ] main org.pytorch.serve.servingsdk.impl.PluginsManager - Initializing plugins manager...
+2023-10-12T20:50:39,659 [INFO ] main org.pytorch.serve.metrics.configuration.MetricConfiguration - Successfully loaded metrics configuration from /home/venv/lib/python3.9/site-packages/ts/configs/metrics.yaml
+2023-10-12T20:50:39,779 [INFO ] main org.pytorch.serve.ModelServer -
+Torchserve version: 0.8.2
+TS Home: /home/venv/lib/python3.9/site-packages
+Current directory: /home/model-server
+Temp directory: /home/model-server/tmp
+Metrics config path: /home/venv/lib/python3.9/site-packages/ts/configs/metrics.yaml
+
+```
+
+We need to set the following
+
+```
+MODEL_NAME=mnist
+SERVICE_HOSTNAME=$(kubectl get inferenceservice torchserve-mnist-v2 -o jsonpath='{.status.url}' | cut -d "/" -f 3)
+```
+
+```
+export INGRESS_HOST=localhost
+export INGRESS_PORT=8080
+```
+
+```
+INGRESS_GATEWAY_SERVICE=$(kubectl get svc --namespace istio-system --selector="app=istio-ingressgateway" --output jsonpath='{.items[0].metadata.name}')
+kubectl port-forward --namespace istio-system svc/${INGRESS_GATEWAY_SERVICE} 8080:80 &
+```
+
+Make an inference request with tensor input
+
+```
+curl -v -H "Content-Type: application/json" -H "Host: ${SERVICE_HOSTNAME}" http://${INGRESS_HOST}:${INGRESS_PORT}/v2/models/${MODEL_NAME}/infer -d @./docs/samples/v1beta1/torchserve/v2/tensor_conv/mnist_v2.json
+```
+
+Expected output is
+
+```
+{"model_name":"mnist","model_version":null,"id":"d3b15cad-50a2-4eaf-80ce-8b0a428bd298","parameters":null,"outputs":[{"name":"input-0","shape":[1],"datatype":"INT64","parameters":null,"data":[1]}]}
+```
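+
+As with V1, the request can be made from Python; the V2 response nests the prediction under `outputs`. A minimal sketch under the same assumptions as above:
+
+```
+import json
+import os
+
+import requests
+
+url = f"http://{os.environ['INGRESS_HOST']}:{os.environ['INGRESS_PORT']}/v2/models/mnist/infer"
+
+with open("docs/samples/v1beta1/torchserve/v2/tensor_conv/mnist_v2.json") as f:
+    payload = json.load(f)
+
+response = requests.post(url, json=payload, headers={"Host": os.environ["SERVICE_HOSTNAME"]})
+# Extract the predicted digit from the first output tensor
+print(response.json()["outputs"][0]["data"])  # expected: [1]
+```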
+
+## Stop and Delete the cluster
+
+```
+minikube stop
+minikube delete
+```
diff --git a/kubernetes/kserve/image_transformer/transformer.Dockerfile b/kubernetes/kserve/image_transformer/transformer.Dockerfile
index 5399b88413..88e4aed9c5 100644
--- a/kubernetes/kserve/image_transformer/transformer.Dockerfile
+++ b/kubernetes/kserve/image_transformer/transformer.Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.7-slim
+FROM python:3.8.18-slim
ARG BRANCH_NAME_KF=master
RUN apt-get update \
diff --git a/kubernetes/kserve/tests/configs/mnist_v1_cpu.yaml b/kubernetes/kserve/tests/configs/mnist_v1_cpu.yaml
new file mode 100644
index 0000000000..8c6b044244
--- /dev/null
+++ b/kubernetes/kserve/tests/configs/mnist_v1_cpu.yaml
@@ -0,0 +1,9 @@
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+ name: "torchserve"
+spec:
+ predictor:
+ pytorch:
+ storageUri: gs://kfserving-examples/models/torchserve/image_classifier/v1
+ image: pytorch/torchserve-kfs-nightly:latest-cpu
diff --git a/kubernetes/kserve/tests/configs/mnist_v2_cpu.yaml b/kubernetes/kserve/tests/configs/mnist_v2_cpu.yaml
new file mode 100644
index 0000000000..f60efc14e0
--- /dev/null
+++ b/kubernetes/kserve/tests/configs/mnist_v2_cpu.yaml
@@ -0,0 +1,10 @@
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+ name: "torchserve-mnist-v2"
+spec:
+ predictor:
+ pytorch:
+ protocolVersion: v2
+ storageUri: gs://kfserving-examples/models/torchserve/image_classifier/v2
+ image: pytorch/torchserve-kfs-nightly:latest-cpu
diff --git a/kubernetes/kserve/tests/scripts/test_mnist.sh b/kubernetes/kserve/tests/scripts/test_mnist.sh
new file mode 100755
index 0000000000..e9b012a757
--- /dev/null
+++ b/kubernetes/kserve/tests/scripts/test_mnist.sh
@@ -0,0 +1,141 @@
+#!/usr/bin/env bash
+
+set -o errexit -o nounset -o pipefail
+
+function start_minikube_cluster() {
+ echo "Removing any previous Kubernetes cluster"
+ minikube delete
+ echo "Starting Kubernetes cluster"
+ minikube start
+}
+
+function install_kserve() {
+ echo "Install Kserve"
+ cd $GITHUB_WORKSPACE/kserve
+ ./hack/quick_install.sh
+ echo "Waiting for Kserve pod to come up ..."
+ wait_for_kserve_pod 300 5
+}
+
+function deploy_cluster() {
+ echo "Deploying the cluster"
+ cd $GITHUB_WORKSPACE
+ kubectl apply -f "$1"
+ echo "Waiting for pod to come up..."
+ wait_for_pod_running "$2" 120
+ echo "Check status of the pod"
+ kubectl get pods
+ kubectl describe pod "$2"
+}
+
+function make_cluster_accessible() {
+ SERVICE_NAME="$1"
+ URL="$2"
+ wait_for_inference_service 300 5 "$1"
+ SERVICE_HOSTNAME=$(kubectl get inferenceservice ${SERVICE_NAME} -o jsonpath='{.status.url}' | cut -d "/" -f 3)
+ wait_for_port_forwarding 5
+ echo "Make inference request"
+ PREDICTION=$(curl -H "Content-Type: application/json" -H "Host: ${SERVICE_HOSTNAME}" ${URL} -d @"$3")
+ EXPECTED="$4"
+ if [ "${PREDICTION}" = "${EXPECTED}" ]; then
+ echo "✓ SUCCESS"
+ else
+ echo "✘ Test failed: Prediction: ${PREDICTION}, expected ${EXPECTED}."
+ delete_minikube_cluster
+ exit 1
+ fi
+}
+
+function delete_minikube_cluster() {
+ echo "Delete cluster"
+ minikube delete
+}
+
+function wait_for_inference_service() {
+ echo "Wait for inference service to be ready"
+ max_wait_time="$1"
+ interval="$2"
+ SERVICE_NAME="$3"
+ start_time=$(date +%s)
+ while true; do
+ service_status=$(kubectl get inferenceservice ${SERVICE_NAME} -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')
+ if [[ "$service_status" == "True" ]]; then
+ break
+ fi
+ current_time=$(date +%s)
+ if (( current_time - start_time >= max_wait_time )); then
+ echo "Timeout waiting for inference service to come up."
+ delete_minikube_cluster
+ exit 1
+ fi
+ sleep "$interval"
+ done
+}
+function wait_for_kserve_pod() {
+ max_wait_time="$1"
+ interval="$2"
+ start_time=$(date +%s)
+ while true; do
+ kserve_pod_status=$(kubectl get pods -n kserve --no-headers -o custom-columns=":status.phase")
+ if [[ "$kserve_pod_status" == "Running" ]]; then
+ break
+ fi
+ current_time=$(date +%s)
+ if (( current_time - start_time >= max_wait_time )); then
+ echo "Timeout waiting for Kserve pod to come up."
+ delete_minikube_cluster
+ exit 1
+ fi
+ sleep "$interval"
+ done
+}
+
+function wait_for_pod_running() {
+ pod_name="$1"
+ max_wait_time="$2"
+ interval=5
+ start_time=$(date +%s)
+ while true; do
+ sleep "$interval"
+ pod_description=$(kubectl describe pod "$pod_name")
+ status_line=$(echo "$pod_description" | grep -E "Status:")
+ pod_status=$(echo "$status_line" | awk '{print $2}')
+ if [[ "$pod_status" == "Running" ]]; then
+ break
+ fi
+ current_time=$(date +%s)
+ if (( current_time - start_time >= max_wait_time )); then
+ echo "Timeout waiting for pod $pod_name to become Running."
+ delete_minikube_cluster
+ exit 1
+ fi
+ done
+}
+
+function wait_for_port_forwarding() {
+ echo "Wait for ports to be in forwarding"
+ interval="$1"
+ start_time=$(date +%s)
+ INGRESS_GATEWAY_SERVICE=$(kubectl get svc --namespace istio-system --selector="app=istio-ingressgateway" --output jsonpath='{.items[0].metadata.name}')
+ kubectl port-forward --namespace istio-system svc/${INGRESS_GATEWAY_SERVICE} 8080:80 &
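+ # give the background port-forward a few seconds to establish before requests are sent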
+ sleep "$interval"
+}
+
+export INGRESS_HOST=localhost
+export INGRESS_PORT=8080
+export MODEL_NAME=mnist
+
+start_minikube_cluster
+install_kserve
+
+echo "MNIST KServe V2 test begin"
+deploy_cluster "kubernetes/kserve/tests/configs/mnist_v2_cpu.yaml" "torchserve-mnist-v2-predictor"
+URL="http://${INGRESS_HOST}:${INGRESS_PORT}/v2/models/${MODEL_NAME}/infer"
+make_cluster_accessible "torchserve-mnist-v2" ${URL} "./kubernetes/kserve/kf_request_json/v2/mnist/mnist_v2_tensor.json" '{"model_name":"mnist","model_version":null,"id":"d3b15cad-50a2-4eaf-80ce-8b0a428bd298","parameters":null,"outputs":[{"name":"input-0","shape":[1],"datatype":"INT64","parameters":null,"data":[1]}]}'
+
+echo "MNIST KServe V1 test begin"
+deploy_cluster "kubernetes/kserve/tests/configs/mnist_v1_cpu.yaml" "torchserve-predictor"
+URL="http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict"
+make_cluster_accessible "torchserve" ${URL} "./kubernetes/kserve/kf_request_json/v1/mnist.json" '{"predictions":[2]}'
+
+delete_minikube_cluster
diff --git a/requirements/developer.txt b/requirements/developer.txt
index bf09ab8b69..a087314a8a 100644
--- a/requirements/developer.txt
+++ b/requirements/developer.txt
@@ -7,7 +7,7 @@ pytest-cov==4.1.0
grpcio==1.54.2
protobuf==4.23.1
grpcio-tools==1.54.2
-transformers==4.30.0
+transformers>=4.34.0
pyspelling==2.8.2
pygit2==1.13.1
pre-commit==3.3.2
diff --git a/test/pytest/test_auto_recover.py b/test/pytest/test_auto_recover.py
new file mode 100644
index 0000000000..87bca76c4a
--- /dev/null
+++ b/test/pytest/test_auto_recover.py
@@ -0,0 +1,180 @@
+import json
+import platform
+import shutil
+from argparse import Namespace
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+import requests
+import test_utils
+
+CURR_FILE_PATH = Path(__file__).parent
+REPO_ROOT_DIR = CURR_FILE_PATH.parent.parent
+
+MODEL_PY = """
+import torch
+import torch.nn as nn
+
+class Foo(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, x):
+ return x
+"""
+
+HANDLER_PY = """
+import time
+
+from typing import List, Dict, Any, Tuple
+from ts.context import Context
+
+
+class FailingModel(object):
+ def __init__(self) -> None:
+ pass
+
+ def initialize(self, context: Context) -> None:
+ print(f"[xxx] Model initialization ... !!")
+ self.initialized = True
+ print(f"[xxx] Model initialization ... DONE !!")
+
+ def handle(self, data: List[Dict[str, Any]], context: Context):
+ self.context = context
+
+ output = list()
+ for idx, row in enumerate(data):
+ # run
+ print(f"[xxx] run ... !!")
+ time.sleep(5)
+ print(f"[xxx] run ... DONE !!")
+ output.append(f"sample output {idx}")
+ return output
+"""
+
+CONFIG_PROPERTIES = """
+default_response_timeout=2
+"""
+
+
+@pytest.fixture(scope="module")
+def model_name():
+ yield "tp_model"
+
+
+@pytest.fixture(scope="module")
+def work_dir(tmp_path_factory, model_name):
+ return Path(tmp_path_factory.mktemp(model_name))
+
+
+@pytest.fixture(scope="module")
+def torchserve(model_store, work_dir):
+ test_utils.torchserve_cleanup()
+
+ config_properties_file = work_dir / "config.properties"
+ config_properties_file.write_text(CONFIG_PROPERTIES)
+
+ pipe = test_utils.start_torchserve(
+ model_store=model_store,
+ no_config_snapshots=True,
+ gen_mar=False,
+ snapshot_file=config_properties_file.as_posix(),
+ )
+
+ yield pipe
+
+ test_utils.torchserve_cleanup()
+
+
+@pytest.fixture(scope="module", name="mar_file_path")
+def create_mar_file(work_dir, model_archiver, model_name):
+ mar_file_path = work_dir.joinpath(model_name + ".mar")
+
+ model_py_file = work_dir / "model.py"
+ model_py_file.write_text(MODEL_PY)
+
+ handler_py_file = work_dir / "handler.py"
+ handler_py_file.write_text(HANDLER_PY)
+
+ args = Namespace(
+ model_name=model_name,
+ version="1.0",
+ serialized_file=None,
+ model_file=model_py_file.as_posix(),
+ handler=handler_py_file.as_posix(),
+ extra_files=None,
+ export_path=work_dir,
+ requirements_file=None,
+ runtime="python",
+ force=False,
+ archive_format="default",
+ config_file=None,
+ )
+
+ mock = MagicMock()
+ mock.parse_args = MagicMock(return_value=args)
+ with patch("archiver.ArgParser.export_model_args_parser", return_value=mock):
+ model_archiver.generate_model_archive()
+
+ assert mar_file_path.exists()
+
+ yield mar_file_path.as_posix()
+
+ # Clean up files
+ mar_file_path.unlink(missing_ok=True)
+
+
+@pytest.fixture(scope="module", name="model_name")
+def register_model(mar_file_path, model_store, torchserve):
+ """
+ Register the model in torchserve
+ """
+ shutil.copy(mar_file_path, model_store)
+
+ file_name = Path(mar_file_path).name
+
+ model_name = Path(file_name).stem
+
+ params = (
+ ("model_name", model_name),
+ ("url", file_name),
+ ("initial_workers", "1"),
+ ("synchronous", "true"),
+ ("batch_size", "1"),
+ )
+
+ test_utils.reg_resp = test_utils.register_model_with_params(params)
+
+ yield model_name, torchserve
+
+ test_utils.unregister_model(model_name)
+
+
+@pytest.mark.skipif(
+ platform.system() != "Linux", reason="Skipping test on non-Linux system"
+)
+def test_auto_recover(model_name):
+ """
+ Test that the frontend auto recovers the worker after a response timeout
+ """
+
+ model_name, pipe = model_name
+
+ response = requests.post(
+ url=f"http://localhost:8080/predictions/{model_name}", data=json.dumps(42)
+ )
+ assert response.status_code == 500
+
+ logs = []
+ for _ in range(100):
+ logs.append(pipe.get())
+ if "Auto recovery succeeded, reset recoveryStartTS" in logs[-1]:
+ break
+
+ assert any("Model initialization ... DONE" in l for l in logs)
+ assert any("Number or consecutive unsuccessful inference 1" in l for l in logs)
+ assert any("Worker disconnected" in l for l in logs)
+ assert any("Retry worker" in l for l in logs)
+ assert any("Auto recovery start timestamp" in l for l in logs)
+ assert not any("Auto recovery failed again" in l for l in logs)
diff --git a/test/pytest/test_parallelism.py b/test/pytest/test_parallelism.py
new file mode 100644
index 0000000000..04183ec01f
--- /dev/null
+++ b/test/pytest/test_parallelism.py
@@ -0,0 +1,148 @@
+import json
+import platform
+import shutil
+from argparse import Namespace
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+import requests
+import test_utils
+
+CURR_FILE_PATH = Path(__file__).parent
+REPO_ROOT_DIR = CURR_FILE_PATH.parent.parent
+
+MODEL_PY = """
+import torch
+import torch.nn as nn
+
+class Foo(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, x):
+ torch.distributed.all_reduce(x)
+ return x
+"""
+
+HANDLER_PY = """
+import os
+import torch
+from ts.torch_handler.base_handler import BaseHandler
+
+class FooHandler(BaseHandler):
+ def initialize(self, ctx):
+ if not torch.distributed.is_initialized():
+ torch.distributed.init_process_group("gloo")
+ torch.set_default_device("cpu")
+ super().initialize(ctx)
+
+ def preprocess(self, data):
+ return torch.as_tensor(int(data[0].get('body').decode('utf-8')), device=self.device)
+
+ def postprocess(self, x):
+ return [x.item()]
+"""
+
+MODEL_CONFIG_YAML = f"""
+#frontend settings
+parallelType: "tp"
+deviceType: "cpu"
+
+torchrun:
+ nproc-per-node: 4
+"""
+
+
+@pytest.fixture(scope="module")
+def model_name():
+ yield "tp_model"
+
+
+@pytest.fixture(scope="module")
+def work_dir(tmp_path_factory, model_name):
+ return Path(tmp_path_factory.mktemp(model_name))
+
+
+@pytest.fixture(scope="module", name="mar_file_path")
+def create_mar_file(work_dir, model_archiver, model_name):
+ mar_file_path = work_dir.joinpath(model_name + ".mar")
+
+ model_config_yaml_file = work_dir / "model_config.yaml"
+ model_config_yaml_file.write_text(MODEL_CONFIG_YAML)
+
+ model_py_file = work_dir / "model.py"
+ model_py_file.write_text(MODEL_PY)
+
+ handler_py_file = work_dir / "handler.py"
+ handler_py_file.write_text(HANDLER_PY)
+
+ args = Namespace(
+ model_name=model_name,
+ version="1.0",
+ serialized_file=None,
+ model_file=model_py_file.as_posix(),
+ handler=handler_py_file.as_posix(),
+ extra_files=None,
+ export_path=work_dir,
+ requirements_file=None,
+ runtime="python",
+ force=False,
+ archive_format="default",
+ config_file=model_config_yaml_file.as_posix(),
+ )
+
+ mock = MagicMock()
+ mock.parse_args = MagicMock(return_value=args)
+ with patch("archiver.ArgParser.export_model_args_parser", return_value=mock):
+ model_archiver.generate_model_archive()
+
+ assert mar_file_path.exists()
+
+ yield mar_file_path.as_posix()
+
+ # Clean up files
+ mar_file_path.unlink(missing_ok=True)
+
+
+@pytest.fixture(scope="module", name="model_name")
+def register_model(mar_file_path, model_store, torchserve):
+ """
+ Register the model in torchserve
+ """
+ shutil.copy(mar_file_path, model_store)
+
+ file_name = Path(mar_file_path).name
+
+ model_name = Path(file_name).stem
+
+ params = (
+ ("model_name", model_name),
+ ("url", file_name),
+ ("initial_workers", "1"),
+ ("synchronous", "true"),
+ ("batch_size", "1"),
+ )
+
+ test_utils.reg_resp = test_utils.register_model_with_params(params)
+
+ yield model_name
+
+ test_utils.unregister_model(model_name)
+
+
+@pytest.mark.skipif(
+ platform.system() != "Linux", reason="Skipping test on non-Linux system"
+)
+def test_tp_inference(model_name):
+ """
+ Full circle test with torchserve
+ """
+
+ response = requests.post(
+ url=f"http://localhost:8080/predictions/{model_name}", data=json.dumps(42)
+ )
+
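+ # all_reduce sums the scalar across the 4 torchrun workers, so 42 becomes 4 * 42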
+ assert int(response.text) == 4 * 42
+
+ assert response.status_code == 200
diff --git a/ts_scripts/api_utils.py b/ts_scripts/api_utils.py
index 99398ef17c..cdfaccac9a 100755
--- a/ts_scripts/api_utils.py
+++ b/ts_scripts/api_utils.py
@@ -367,7 +367,8 @@ def trigger_all():
exit_code9 = trigger_https_tests_kfv2()
exit_code10 = trigger_explanation_tests()
exit_code11 = trigger_workflow_tests()
- exit_code12 = trigger_workflow_inference_tests()
+ # Skipping as this test is flaky
+ # exit_code12 = trigger_workflow_inference_tests()
return (
1
if any(
@@ -384,7 +385,6 @@ def trigger_all():
exit_code9,
exit_code10,
exit_code11,
- exit_code12,
]
)
else 0
diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt
index f8fe15e126..b4fb8bc4a6 100644
--- a/ts_scripts/spellcheck_conf/wordlist.txt
+++ b/ts_scripts/spellcheck_conf/wordlist.txt
@@ -162,7 +162,10 @@ CN
CORS
EventLoopGroup
EventLoops
+CPUs
GPUs
+TPU
+TPUs
JVM
MaxDirectMemorySize
OU
@@ -1118,3 +1121,10 @@ quantized
Chatbot
LLM
bitsandbytes
+Datadog
+Trn
+oneAPI
+Naver
+FlashAttention
+GenAI
+prem