Merge branch 'master' into feature/grpc_streaming

pytorch · Mar 27, 2023 · 8132f23 · 8132f23
2 parents 451b9a5 + 41a3af3
commit 8132f23
Show file tree

Hide file tree

Showing 25 changed files with 234 additions and 50 deletions.
diff --git a/.github/workflows/regression_tests_cpu.yml b/.github/workflows/regression_tests_cpu.yml
@@ -1,6 +1,9 @@
 name: Run Regression Tests on CPU
 
-on: workflow_dispatch
+on:
+  # runs every Sunday at 11:15am UTC
+  schedule:
+    - cron:  '15 11 * * 7'
 
 jobs:
   regression-cpu:

diff --git a/.github/workflows/regression_tests_gpu.yml b/.github/workflows/regression_tests_gpu.yml
@@ -1,6 +1,9 @@
 name: Run Regression Tests on GPU
 
-on: workflow_dispatch
+on:
+  # runs every Sunday at 11:15am UTC
+  schedule:
+    - cron:  '15 11 * * 7'
 
 jobs:
   regression-gpu:
@@ -9,7 +12,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        cuda: ["cu116", "cu117"]
+        cuda: ["cu117", "cu118"]
     steps:
       - name: Clean up previous run
         run: |

diff --git a/docker/README.md b/docker/README.md
@@ -36,7 +36,7 @@ Use `build_image.sh` script to build the docker images. The script builds the `p
 |-g, --gpu|Build image with GPU based ubuntu base image|
 |-bt, --buildtype|Which type of docker image to build. Can be one of : production, dev, codebuild|
 |-t, --tag|Tag name for image. If not specified, script uses torchserve default tag names.|
-|-cv, --cudaversion| Specify to cuda version to use. Supported values `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`. Default `cu117`|
+|-cv, --cudaversion| Specify the cuda version to use. Supported values `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118`. Default `cu117`|
 |-ipex, --build-with-ipex| Specify to build with intel_extension_for_pytorch. If not specified, script builds without intel_extension_for_pytorch.|
 |--codebuild| Set if you need [AWS CodeBuild](https://aws.amazon.com/codebuild/)|
 |-py, --pythonversion| Specify the python version to use. Supported values `3.8`, `3.9`, `3.10`. Default `3.9`|

diff --git a/docker/build_image.sh b/docker/build_image.sh
@@ -80,7 +80,10 @@ do
         # With default ubuntu version 20.04
         -cv|--cudaversion)
           CUDA_VERSION="$2"
-          if [ $CUDA_VERSION == "cu117" ];
+          if [ $CUDA_VERSION == "cu118" ];
+          then
+            BASE_IMAGE="nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04"
+          elif [ $CUDA_VERSION == "cu117" ];
           then
             BASE_IMAGE="nvidia/cuda:11.7.0-cudnn8-runtime-ubuntu20.04"
           elif [ $CUDA_VERSION == "cu116" ];

diff --git a/docs/README.md b/docs/README.md
@@ -52,3 +52,4 @@ TorchServe is a performant, flexible and easy to use tool for serving PyTorch ea
 * [TorchServe on Kubernetes](https://github.com/pytorch/serve/blob/master/kubernetes/README.md#torchserve-on-kubernetes) -  Demonstrates a Torchserve deployment in Kubernetes using Helm Chart supported in both Azure Kubernetes Service and Google Kubernetes service
 * [mlflow-torchserve](https://github.com/mlflow/mlflow-torchserve) - Deploy mlflow pipeline models into TorchServe
 * [Kubeflow pipelines](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/pytorch-samples) - Kubeflow pipelines and Google Vertex AI Managed pipelines
+* [NVIDIA MPS](mps.md) - Use NVIDIA MPS to optimize multi-worker deployment on a single GPU
diff --git a/docs/code_coverage.md b/docs/code_coverage.md
@@ -12,7 +12,7 @@
    ```bash
    python ts_scripts/install_dependencies.py --environment=dev --cuda=cu102
    ```
-   > Supported cuda versions as cu117, cu116, cu113, cu111, cu102, cu101, cu92
+   > Supported cuda versions are cu118, cu117, cu116, cu113, cu111, cu102, cu101, cu92
 
  - Execute sanity suite
    ```bash

diff --git a/docs/configuration.md b/docs/configuration.md
@@ -288,7 +288,7 @@ the backend workers convert "Bytearray to utf-8 string" when the Content-Type of
 * `max_request_size` : The maximum allowable request size that the Torchserve accepts, in bytes. Default: 6553500
 * `max_response_size` : The maximum allowable response size that the Torchserve sends, in bytes. Default: 6553500
 * `limit_max_image_pixels` : Default value is true (Use default [PIL.Image.MAX_IMAGE_PIXELS](https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.MAX_IMAGE_PIXELS)). If this is set to "false", set PIL.Image.MAX_IMAGE_PIXELS = None in backend default vision handler for large image payload.
-* `allowed_urls` : Comma separated regex of allowed source URL(s) from where models can be registered. Default: "file://.*|http(s)?://.*" (all URLs and local file system)
+* `allowed_urls` : Comma separated regex of allowed source URL(s) from where models can be registered. Default: `file://.*|http(s)?://.*` (all URLs and local file system)
 e.g. : To allow base URLs `https://s3.amazonaws.com/` and `https://torchserve.pytorch.org/` use the following regex string `allowed_urls=https://s3.amazonaws.com/.*,https://torchserve.pytorch.org/.*`
 * `workflow_store` : Path of workflow store directory. Defaults to model store directory.
 * `disable_system_metrics` : Disable collection of system metrics when set to "true". Default value is "false".

diff --git a/docs/getting_started.md b/docs/getting_started.md
@@ -14,7 +14,7 @@
         python ./ts_scripts/install_dependencies.py
         ```
 
-     - For GPU with Cuda 10.2. Options are `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`
+     - For GPU with Cuda 10.2. Options are `cu92`, `cu101`, `cu102`, `cu111`, `cu113`, `cu116`, `cu117`, `cu118`
 
        ```bash
        python ./ts_scripts/install_dependencies.py --cuda=cu102

diff --git a/docs/images/mps_g4_single.png b/docs/images/mps_g4_single.png
diff --git a/docs/images/mps_g4_two_worker.png b/docs/images/mps_g4_two_worker.png
diff --git a/docs/images/mps_p3_single.png b/docs/images/mps_p3_single.png
diff --git a/docs/images/mps_p3_two_worker.png b/docs/images/mps_p3_two_worker.png
diff --git a/docs/mps.md b/docs/mps.md
@@ -0,0 +1,91 @@
+# Enabling NVIDIA MPS in TorchServe
+In order to deploy ML models, TorchServe spins up each worker in a separate process, thus isolating each worker from the others.
+Each process creates its own CUDA context to execute its kernels and access the allocated memory.
+
+While NVIDIA GPUs in their default setting allow multiple processes to run CUDA kernels on a single device, this involves the following drawbacks:
+* The execution of the kernels is generally serialized
+* Each process creates its own CUDA context which occupies additional GPU memory
+
+For these scenarios NVIDIA offers the Multi-Process Service (MPS) which:
+* Allows multiple processes to share the same CUDA context on the same GPU
+* Lets them run their kernels in parallel
+
+This can result in:
+* Increased performance when using multiple workers on the same GPU
+* Decreased GPU memory utilization due to the shared context
+
+
+To leverage the benefits of NVIDIA MPS we need to start the MPS daemon with the following commands before starting up TorchServe itself.
+```
+sudo nvidia-smi -c 3
+nvidia-cuda-mps-control -d
+```
+The first command enables the exclusive processing mode for the GPU allowing only one process (the MPS daemon) to utilize it.
+The second command starts the MPS daemon itself.
+To shutdown the daemon we can execute:
+```
+echo quit | nvidia-cuda-mps-control
+```
+For more details on MPS please refer to [NVIDIA's MPS documentation](https://docs.nvidia.com/deploy/mps/index.html).
+It should be noted that MPS only allows 48 processes (for Volta GPUs) to connect to the daemon due to limited hardware resources.
+Adding more clients/workers (to the same GPU) will lead to a failure.
+
+## Benchmarks
+To show the performance of TorchServe with MPS activated and to help you decide whether or not to enable MPS for your deployment, we will perform some benchmarks with representative workloads.
+
+Primarily, we want to investigate how the throughput of a worker evolves with activated MPS for different operation points.
+As an example workload for our benchmark we select the [HuggingFace Transformers Sequence Classification example](https://github.com/pytorch/serve/tree/master/examples/Huggingface_Transformers#sequence-classification).
+We perform the benchmark on a g4dn.4xlarge as well as a p3.2xlarge instance on AWS.
+Both instance types provide one GPU per instance which will result in multiple workers to be scheduled on the same GPU.
+For the benchmark we concentrate on the model throughput as measured by the [benchmark-ab.py](https://github.com/pytorch/serve/tree/master/benchmarks/benchmark-ab.py) tool.
+
+First, we measure the throughput of a single worker for different batch sizes as it will show us at which point the compute resources of the GPU are fully occupied.
+Second, we measure the throughput with two deployed workers for the batch sizes where we expect the GPUs to still have some resources left over to share.
+For each benchmark we perform five runs and take the median over the runs.
+
+We use the following config.json for the benchmark, only overwriting the number of workers and the batch size accordingly.
+
+```
+{
+    "url":"/home/ubuntu/serve/examples/Huggingface_Transformers/model_store/BERTSeqClassification",
+    "requests": 10000,
+    "concurrency": 600,
+    "input": "/home/ubuntu/serve/examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text_captum_input.txt",
+    "workers": "1"
+}
+```
+Please note that we set the concurrency level to 600 which will make sure that the batch aggregation inside TorchServe fills up the batches to the maximum batch size. At the same time, this will skew the latency measurements as many requests will be waiting in the queue to be processed. We will therefore neglect the latency measurements in the following.
+
+### G4 Instance
+We first perform the single worker benchmark for the G4 instance.
+In the figure below we see a steady increase of the throughput over the batch size up to a batch size of four.
+
+![G4 benchmark, single worker](images/mps_g4_single.png)
+
+Next, we increase the number of workers to two in order to compare the throughput with and without MPS running.
+To enable MPS for the second set of runs we first set the exclusive processing mode for the GPU and then start the MPS daemon as shown above.
+
+We select the batch size between one and eight according to our previous findings.
+In the figure we can see that the performance in terms of throughput can be better in case of batch size 1 and 8 (up to +18%) while it can be worse for others (-11%).
+An interpretation of this result could be that the G4 instance does not have many resources to share when we run a BERT model in one of the workers.
+
+![G4 benchmark, two workers](images/mps_g4_two_worker.png)
+
+### P3 instance
+Next, we will run the same experiment with the bigger p3.2xlarge instance.
+With a single worker we get the following throughput values:
+
+![P3 benchmark, single worker](images/mps_p3_single.png)
+
+We can see that the throughput steadily increases but for a batch size over eight we see diminishing returns.
+Finally, we deploy two workers on the P3 instance and compare running them with and without MPS.
+We can see that for batch size between 1 and 32 the throughput is consistently higher (up to +25%) for MPS enabled with the exception of batch size 16.
+
+![P3 benchmark, two workers](images/mps_p3_two_worker.png)
+
+## Summary
+In the previous section we saw that by enabling MPS for two workers running the same model we receive mixed results.
+For the smaller G4 instance we only saw benefits in certain operation points while we saw more consistent improvements for the bigger P3 instance.
+This suggests that the benefit in terms of throughput for running a deployment with MPS is highly workload and environment dependent and needs to be determined for specific situations using appropriate benchmarks and tools.
+It should be noted that the previous benchmark solely focused on throughput and neglected latency and memory footprint.
+As using MPS will only create a single CUDA context, more workers can be packed onto the same GPU, which needs to be considered as well in the respective scenarios.
diff --git a/model-archiver/model_archiver/tests/integ_tests/test_integration_model_archiver.py b/model-archiver/model_archiver/tests/integ_tests/test_integration_model_archiver.py
@@ -1,16 +1,23 @@
-import platform
-import time
-from datetime import datetime
 import errno
 import json
 import os
+import platform
 import shutil
-import tempfile
 import subprocess
+import tempfile
+import time
+from datetime import datetime
+from pathlib import Path
+
 import model_archiver
 
 DEFAULT_RUNTIME = "python"
 MANIFEST_FILE = "MAR-INF/MANIFEST.json"
+INTEG_TEST_CONFIG_FILE = "integ_tests/configuration.json"
+DEFAULT_HANDLER_CONFIG_FILE = "integ_tests/default_handler_configuration.json"
+
+TEST_ROOT_DIR = Path(__file__).parents[1]
+MODEL_ARCHIVER_ROOT_DIR = Path(__file__).parents[3]
 
 
 def create_file_path(path):
@@ -49,11 +56,17 @@ def run_test(test, cmd):
 def validate_archive_exists(test):
     fmt = test.get("archive-format")
     if fmt == "tgz":
-        assert os.path.isfile(os.path.join(test.get("export-path"), test.get("model-name")+".tar.gz"))
+        assert os.path.isfile(
+            os.path.join(test.get("export-path"), test.get("model-name") + ".tar.gz")
+        )
     elif fmt == "no-archive":
-        assert os.path.isdir(os.path.join(test.get("export-path"), test.get("model-name")))
+        assert os.path.isdir(
+            os.path.join(test.get("export-path"), test.get("model-name"))
+        )
     else:
-        assert os.path.isfile(os.path.join(test.get("export-path"), test.get("model-name")+".mar"))
+        assert os.path.isfile(
+            os.path.join(test.get("export-path"), test.get("model-name") + ".mar")
+        )
 
 
 def validate_manifest_file(manifest, test, default_handler=None):
@@ -67,7 +80,9 @@ def validate_manifest_file(manifest, test, default_handler=None):
     assert manifest.get("runtime") == test.get("runtime")
     assert manifest.get("model").get("modelName") == test.get("model-name")
     if not default_handler:
-        assert manifest.get("model").get("handler") == test.get("handler").split("/")[-1]
+        assert (
+            manifest.get("model").get("handler") == test.get("handler").split("/")[-1]
+        )
     else:
         assert manifest.get("model").get("handler") == test.get("handler")
     assert manifest.get("archiverVersion") == model_archiver.__version__
@@ -87,21 +102,29 @@ def validate_files(file_list, prefix, default_handler=None):
 
 def validate_tar_archive(test_cfg):
     import tarfile
-    file_name = os.path.join(test_cfg.get("export-path"), test_cfg.get("model-name") + ".tar.gz")
+
+    file_name = os.path.join(
+        test_cfg.get("export-path"), test_cfg.get("model-name") + ".tar.gz"
+    )
     f = tarfile.open(file_name, "r:gz")
-    manifest = json.loads(f.extractfile(os.path.join(test_cfg.get("model-name"), MANIFEST_FILE)).read())
+    manifest = json.loads(
+        f.extractfile(os.path.join(test_cfg.get("model-name"), MANIFEST_FILE)).read()
+    )
     validate_manifest_file(manifest, test_cfg)
     validate_files(f.getnames(), test_cfg.get("model-name"))
 
 
 def validate_noarchive_archive(test):
-    file_name = os.path.join(test.get("export-path"), test.get("model-name"), MANIFEST_FILE)
+    file_name = os.path.join(
+        test.get("export-path"), test.get("model-name"), MANIFEST_FILE
+    )
     manifest = json.loads(open(file_name).read())
     validate_manifest_file(manifest, test)
 
 
 def validate_mar_archive(test):
     import zipfile
+
     file_name = os.path.join(test.get("export-path"), test.get("model-name") + ".mar")
     zf = zipfile.ZipFile(file_name, "r")
     manifest = json.loads(zf.open(MANIFEST_FILE).read())
@@ -124,8 +147,17 @@ def validate(test):
 
 
 def build_cmd(test):
-    args = ['model-name', 'model-file', 'serialized-file', 'handler', 'extra-files', 'archive-format',
-            'version', 'export-path', 'runtime']
+    args = [
+        "model-name",
+        "model-file",
+        "serialized-file",
+        "handler",
+        "extra-files",
+        "archive-format",
+        "version",
+        "export-path",
+        "runtime",
+    ]
 
     cmd = ["torch-model-archiver"]
 
@@ -136,19 +168,42 @@ def build_cmd(test):
     return " ".join(cmd)
 
 
+def make_paths_absolute(test, keys):
+    def make_absolute(paths):
+        if "," in paths:
+            return ",".join([make_absolute(p) for p in paths.split(",")])
+        return MODEL_ARCHIVER_ROOT_DIR.joinpath(paths).as_posix()
+
+    for k in keys:
+        test[k] = make_absolute(test[k])
+
+    return test
+
+
 def test_model_archiver():
-    with open("model_archiver/tests/integ_tests/configuration.json", "r") as f:
+    with open(TEST_ROOT_DIR.joinpath(INTEG_TEST_CONFIG_FILE), "r") as f:
         tests = json.loads(f.read())
+        keys = (
+            "model-file",
+            "serialized-file",
+            "handler",
+            "extra-files",
+        )
+        tests = [make_paths_absolute(t, keys) for t in tests]
         for test in tests:
             # tar.gz format problem on windows hence ignore
-            if platform.system() == "Windows" and test['archive-format'] == 'tgz':
+            if platform.system() == "Windows" and test["archive-format"] == "tgz":
                 continue
             try:
-                test["export-path"] = os.path.join(tempfile.gettempdir(), test["export-path"])
+                test["export-path"] = os.path.join(
+                    tempfile.gettempdir(), test["export-path"]
+                )
                 delete_file_path(test.get("export-path"))
                 create_file_path(test.get("export-path"))
                 test["runtime"] = test.get("runtime", DEFAULT_RUNTIME)
-                test["model-name"] = test["model-name"] + '_' + str(int(time.time()*1000.0))
+                test["model-name"] = (
+                    test["model-name"] + "_" + str(int(time.time() * 1000.0))
+                )
                 cmd = build_cmd(test)
                 if test.get("force"):
                     cmd += " -f"
@@ -160,8 +215,14 @@ def test_model_archiver():
 
 
 def test_default_handlers():
-    with open("model_archiver/tests/integ_tests/default_handler_configuration.json", "r") as f:
+    with open(TEST_ROOT_DIR.joinpath(DEFAULT_HANDLER_CONFIG_FILE), "r") as f:
         tests = json.loads(f.read())
+        keys = (
+            "model-file",
+            "serialized-file",
+            "extra-files",
+        )
+        tests = [make_paths_absolute(t, keys) for t in tests]
         for test in tests:
             cmd = build_cmd(test)
             try:

diff --git a/model-archiver/model_archiver/tests/unit_tests/test_version.py b/model-archiver/model_archiver/tests/unit_tests/test_version.py
@@ -1,15 +1,16 @@
+from pathlib import Path
 
-
-import os
 import model_archiver
 
+MODEL_ARCHIVER_ROOT_DIR = Path(__file__).parent.parent.parent
+
 
 def test_model_export_tool_version():
     """
     Test the model archive version
     :return:
     """
-    with open(os.path.join('model_archiver', 'version.txt')) as f:
+    with open(MODEL_ARCHIVER_ROOT_DIR.joinpath("version.txt")) as f:
         __version__ = f.readline().strip()
 
     assert __version__ == str(model_archiver.__version__), "Versions do not match"
diff --git a/requirements/torch_cu117_linux.txt b/requirements/torch_cu117_linux.txt
@@ -3,7 +3,7 @@
 cython
 wheel
 pillow==9.3.0
-torch==1.13.1+cu117; sys_platform == 'linux'
-torchvision==0.14.1+cu117; sys_platform == 'linux'
-torchtext==0.14.1; sys_platform == 'linux'
-torchaudio==0.13.1+cu117; sys_platform == 'linux'
+torch==2.0.0+cu117; sys_platform == 'linux'
+torchvision==0.15.1+cu117; sys_platform == 'linux'
+torchtext==0.15.1; sys_platform == 'linux'
+torchaudio==2.0.1+cu117; sys_platform == 'linux'
diff --git a/requirements/torch_cu117_windows.txt b/requirements/torch_cu117_windows.txt
@@ -1,6 +1,6 @@
 #pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117
 --extra-index-url https://download.pytorch.org/whl/cu117
-torch==1.13.1+cu117; sys_platform == 'win32'
-torchvision==0.14.1+cu117; sys_platform == 'win32'
-torchtext==0.14.1; sys_platform == 'win32'
-torchaudio==0.13.1+cu117; sys_platform == 'win32'
+torch==2.0.0+cu117; sys_platform == 'win32'
+torchvision==0.15.1+cu117; sys_platform == 'win32'
+torchtext==0.15.1; sys_platform == 'win32'
+torchaudio==2.0.1+cu117; sys_platform == 'win32'