From c4f968a122c5d80741b4888ac4f0a190e32c29ea Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Fri, 14 Jun 2024 20:10:48 +0000 Subject: [PATCH 01/30] move start_torchserve from test_utils into ts.launcher --- test/pytest/test_utils.py | 92 ++++----------------------------------- ts/launcher.py | 88 +++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 83 deletions(-) create mode 100644 ts/launcher.py diff --git a/test/pytest/test_utils.py b/test/pytest/test_utils.py index 500c8508b2..6005472dca 100644 --- a/test/pytest/test_utils.py +++ b/test/pytest/test_utils.py @@ -5,12 +5,8 @@ import subprocess import sys import tempfile -import threading -from io import TextIOWrapper from os import path from pathlib import Path -from queue import Queue -from subprocess import PIPE, STDOUT, Popen import orjson import requests @@ -18,90 +14,14 @@ # To help discover margen modules REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../") sys.path.append(REPO_ROOT) -from ts_scripts import marsgen as mg +from ts.launcher import start +from ts.launcher import stop as stop_torchserve ROOT_DIR = os.path.join(tempfile.gettempdir(), "workspace") MODEL_STORE = path.join(ROOT_DIR, "model_store/") CODEBUILD_WD = path.abspath(path.join(__file__, "../../..")) -class PrintTillTheEnd(threading.Thread): - def __init__(self, queue): - super().__init__() - self._queue = queue - - def run(self): - while True: - line = self._queue.get() - if not line: - break - print(line.strip()) - - -class Tee(threading.Thread): - def __init__(self, reader): - super().__init__() - self.reader = reader - self.queue1 = Queue() - self.queue2 = Queue() - - def run(self): - for line in self.reader: - self.queue1.put(line) - self.queue2.put(line) - self.queue1.put(None) - self.queue2.put(None) - - -def start_torchserve( - model_store=None, - snapshot_file=None, - no_config_snapshots=False, - gen_mar=True, - plugin_folder=None, - disable_token=True, - models=None, - model_api_enabled=True, -): - stop_torchserve() - crate_mar_file_table() - cmd = ["torchserve", "--start"] - model_store = model_store if model_store else MODEL_STORE - if gen_mar: - mg.gen_mar(model_store) - cmd.extend(["--model-store", model_store]) - if plugin_folder: - cmd.extend(["--plugins-path", plugin_folder]) - if snapshot_file: - cmd.extend(["--ts-config", snapshot_file]) - if no_config_snapshots: - cmd.extend(["--no-config-snapshots"]) - if disable_token: - cmd.append("--disable-token") - if models: - cmd.extend(["--models", models]) - if model_api_enabled: - cmd.extend(["--model-api-enabled"]) - print(cmd) - - p = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=STDOUT) - for line in p.stdout: - print(line.decode("utf8").strip()) - if "Model server started" in str(line).strip(): - break - - splitter = Tee(TextIOWrapper(p.stdout)) - splitter.start() - print_thread = PrintTillTheEnd(splitter.queue1) - print_thread.start() - - return splitter.queue2 - - -def stop_torchserve(): - subprocess.run(["torchserve", "--stop", "--foreground"]) - - def delete_all_snapshots(): for f in glob.glob("logs/config/*"): os.remove(f) @@ -115,6 +35,12 @@ def delete_model_store(model_store=None): os.remove(f) +def start_torchserve(*args, **kwargs): + create_mar_file_table() + kwargs.update({"model_store": kwargs.get("model_store", MODEL_STORE)}) + return start(*args, **kwargs) + + def torchserve_cleanup(): stop_torchserve() delete_model_store() @@ -163,7 +89,7 @@ def delete_mar_file_from_model_store(model_store=None, 
model_mar=None): mar_file_table = {} -def crate_mar_file_table(): +def create_mar_file_table(): if not mar_file_table: with open( os.path.join(os.path.dirname(__file__), *environment_json.split("/")), "rb" diff --git a/ts/launcher.py b/ts/launcher.py new file mode 100644 index 0000000000..2ca55829a6 --- /dev/null +++ b/ts/launcher.py @@ -0,0 +1,88 @@ +import subprocess +import threading +from io import TextIOWrapper +from queue import Full, Queue +from subprocess import PIPE, STDOUT, Popen + +from ts_scripts import marsgen as mg + + +def stop(): + subprocess.run(["torchserve", "--stop", "--foreground"]) + + +class Tee(threading.Thread): + def __init__(self, reader): + super().__init__() + self.reader = reader + self.queue1 = Queue(maxsize=1000) + self.queue2 = Queue(maxsize=1000) + + def run(self): + for line in self.reader: + try: + self.queue1.put_nowait(line) + except Full: + pass + try: + self.queue2.put_nowait(line) + except Full: + pass + + self.queue1.put_nowait(None) + self.queue2.put_nowait(None) + + +class PrintTillTheEnd(threading.Thread): + def __init__(self, queue): + super().__init__() + self._queue = queue + + def run(self): + while True: + line = self._queue.get() + if not line: + break + print(line.strip()) + + +def start( + model_store=None, + snapshot_file=None, + no_config_snapshots=False, + gen_mar=True, + plugin_folder=None, + disable_token=True, + models=None, + model_api_enabled=True, +): + stop() + cmd = ["torchserve", "--start"] + if gen_mar: + mg.gen_mar(model_store) + cmd.extend(["--model-store", model_store]) + if plugin_folder: + cmd.extend(["--plugins-path", plugin_folder]) + if snapshot_file: + cmd.extend(["--ts-config", snapshot_file]) + if no_config_snapshots: + cmd.extend(["--no-config-snapshots"]) + if disable_token: + cmd.append("--disable-token") + if models: + cmd.extend(["--models", models]) + if model_api_enabled: + cmd.extend(["--model-api-enabled"]) + + p = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=STDOUT) + for line in p.stdout: + print(line.decode("utf8").strip()) + if "Model server started" in str(line).strip(): + break + + splitter = Tee(TextIOWrapper(p.stdout)) + splitter.start() + print_thread = PrintTillTheEnd(splitter.queue1) + print_thread.start() + + return splitter.queue2 From 08377fef01d67a61e8aea07aa857935eb9d7758a Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Fri, 14 Jun 2024 20:17:58 +0000 Subject: [PATCH 02/30] Move register model into launcher --- test/pytest/test_utils.py | 15 --------------- ts/launcher.py | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/test/pytest/test_utils.py b/test/pytest/test_utils.py index 6005472dca..21e5c7bf4a 100644 --- a/test/pytest/test_utils.py +++ b/test/pytest/test_utils.py @@ -47,21 +47,6 @@ def torchserve_cleanup(): delete_all_snapshots() -def register_model(model_name, url): - params = ( - ("model_name", model_name), - ("url", url), - ("initial_workers", "1"), - ("synchronous", "true"), - ) - return register_model_with_params(params) - - -def register_model_with_params(params): - response = requests.post("http://localhost:8081/models", params=params) - return response - - def unregister_model(model_name): response = requests.delete("http://localhost:8081/models/{}".format(model_name)) return response diff --git a/ts/launcher.py b/ts/launcher.py index 2ca55829a6..3e3109df42 100644 --- a/ts/launcher.py +++ b/ts/launcher.py @@ -4,6 +4,8 @@ from queue import Full, Queue from subprocess import PIPE, STDOUT, Popen 
+import requests + from ts_scripts import marsgen as mg @@ -86,3 +88,18 @@ def start( print_thread.start() return splitter.queue2 + + +def register_model_with_params(params): + response = requests.post("http://localhost:8081/models", params=params) + return response + + +def register_model(model_name, url): + params = ( + ("model_name", model_name), + ("url", url), + ("initial_workers", "1"), + ("synchronous", "true"), + ) + return register_model_with_params(params) From fbb1f5d780cbb3e5c8ec8c8afb5731dbcc759376 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Fri, 21 Jun 2024 00:11:46 +0000 Subject: [PATCH 03/30] Readd imports to register_model in test_util --- test/pytest/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/pytest/test_utils.py b/test/pytest/test_utils.py index 21e5c7bf4a..0dad7fa65b 100644 --- a/test/pytest/test_utils.py +++ b/test/pytest/test_utils.py @@ -14,7 +14,7 @@ # To help discover margen modules REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../") sys.path.append(REPO_ROOT) -from ts.launcher import start +from ts.launcher import register_model, register_model_with_params, start # noqa from ts.launcher import stop as stop_torchserve ROOT_DIR = os.path.join(tempfile.gettempdir(), "workspace") From 449603d56815d8f6bbf782e34d6f06f30ab05173 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Fri, 21 Jun 2024 00:42:31 +0000 Subject: [PATCH 04/30] Move vllm_handler into ts/torch_handler and add vllm to dependencies --- examples/large_models/vllm/config.properties | 1 - examples/large_models/vllm/llama3/Readme.md | 2 +- examples/large_models/vllm/lora/Readme.md | 2 +- examples/large_models/vllm/mistral/Readme.md | 2 +- examples/large_models/vllm/requirements.txt | 1 - requirements/torch_linux.txt | 1 + .../base_vllm_handler.py => ts/torch_handler/vllm_handler.py | 2 +- 7 files changed, 5 insertions(+), 6 deletions(-) delete mode 100644 examples/large_models/vllm/requirements.txt rename examples/large_models/vllm/base_vllm_handler.py => ts/torch_handler/vllm_handler.py (99%) diff --git a/examples/large_models/vllm/config.properties b/examples/large_models/vllm/config.properties index 67f62d182f..0afd95f2e9 100644 --- a/examples/large_models/vllm/config.properties +++ b/examples/large_models/vllm/config.properties @@ -2,4 +2,3 @@ inference_address=http://127.0.0.1:8080 management_address=http://127.0.0.1:8081 metrics_address=http://127.0.0.1:8082 enable_envvars_config=true -install_py_dep_per_model=true diff --git a/examples/large_models/vllm/llama3/Readme.md b/examples/large_models/vllm/llama3/Readme.md index ba51bf252c..a182b1cdff 100644 --- a/examples/large_models/vllm/llama3/Readme.md +++ b/examples/large_models/vllm/llama3/Readme.md @@ -21,7 +21,7 @@ python ../../utils/Download_model.py --model_path model --model_name meta-llama/ Add the downloaded path to "model_path:" in `model-config.yaml` and run the following. 
```bash -torch-model-archiver --model-name llama3-8b --version 1.0 --handler ../base_vllm_handler.py --config-file model-config.yaml -r ../requirements.txt --archive-format no-archive +torch-model-archiver --model-name llama3-8b --version 1.0 --handler vllm_handler --config-file model-config.yaml --archive-format no-archive mv model llama3-8b ``` diff --git a/examples/large_models/vllm/lora/Readme.md b/examples/large_models/vllm/lora/Readme.md index 6f3e2cc40c..bea8cd7c8b 100644 --- a/examples/large_models/vllm/lora/Readme.md +++ b/examples/large_models/vllm/lora/Readme.md @@ -24,7 +24,7 @@ cd .. Add the downloaded path to "model_path:" and "adapter_1:" in `model-config.yaml` and run the following. ```bash -torch-model-archiver --model-name llama-7b-lora --version 1.0 --handler ../base_vllm_handler.py --config-file model-config.yaml -r ../requirements.txt --archive-format no-archive +torch-model-archiver --model-name llama-7b-lora --version 1.0 --handler vllm_handler --config-file model-config.yaml --archive-format no-archive mv model llama-7b-lora mv adapters llama-7b-lora ``` diff --git a/examples/large_models/vllm/mistral/Readme.md b/examples/large_models/vllm/mistral/Readme.md index eb0fe96999..78e8f91d71 100644 --- a/examples/large_models/vllm/mistral/Readme.md +++ b/examples/large_models/vllm/mistral/Readme.md @@ -21,7 +21,7 @@ python ../../utils/Download_model.py --model_path model --model_name mistralai/M Add the downloaded path to "model_path:" in `model-config.yaml` and run the following. ```bash -torch-model-archiver --model-name mistral --version 1.0 --handler ../base_vllm_handler.py --config-file model-config.yaml -r ../requirements.txt --archive-format no-archive +torch-model-archiver --model-name mistral --version 1.0 --handler vllm_handler --config-file model-config.yaml --archive-format no-archive mv model mistral ``` diff --git a/examples/large_models/vllm/requirements.txt b/examples/large_models/vllm/requirements.txt deleted file mode 100644 index e7a6c7781d..0000000000 --- a/examples/large_models/vllm/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -vllm diff --git a/requirements/torch_linux.txt b/requirements/torch_linux.txt index df10c8a2cb..adc3278a63 100644 --- a/requirements/torch_linux.txt +++ b/requirements/torch_linux.txt @@ -5,3 +5,4 @@ torch==2.3.0+cpu; sys_platform == 'linux' torchvision==0.18.0+cpu; sys_platform == 'linux' torchtext==0.18.0; sys_platform == 'linux' torchaudio==2.3.0+cpu; sys_platform == 'linux' +vllm==0.5.0; sys_platform == 'linux' diff --git a/examples/large_models/vllm/base_vllm_handler.py b/ts/torch_handler/vllm_handler.py similarity index 99% rename from examples/large_models/vllm/base_vllm_handler.py rename to ts/torch_handler/vllm_handler.py index 88cd4cfba5..6589cbacd9 100644 --- a/examples/large_models/vllm/base_vllm_handler.py +++ b/ts/torch_handler/vllm_handler.py @@ -12,7 +12,7 @@ logger = logging.getLogger(__name__) -class BaseVLLMHandler(BaseHandler): +class VLLMHandler(BaseHandler): def __init__(self): super().__init__() From bdb3f80a64378769108ecd923378ff2f6cd7bd69 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Sat, 22 Jun 2024 04:59:59 +0000 Subject: [PATCH 05/30] Register vllm_handler in model_archiver --- model-archiver/model_archiver/model_packaging_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/model-archiver/model_archiver/model_packaging_utils.py b/model-archiver/model_archiver/model_packaging_utils.py index 29ac5029c9..561579607a 100644 --- 
a/model-archiver/model_archiver/model_packaging_utils.py +++ b/model-archiver/model_archiver/model_packaging_utils.py @@ -34,6 +34,7 @@ "object_detector": "vision", "image_segmenter": "vision", "dali_image_classifier": "vision", + "vllm_handler": "text", } MODEL_SERVER_VERSION = "1.0" From 5b8801da158afb529be5756a6eae84811dd5c643 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Sat, 22 Jun 2024 05:00:48 +0000 Subject: [PATCH 06/30] Remove gen_mars from launcher --- test/pytest/test_utils.py | 4 ++++ ts/launcher.py | 6 ------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/test/pytest/test_utils.py b/test/pytest/test_utils.py index 0dad7fa65b..dfad4290ba 100644 --- a/test/pytest/test_utils.py +++ b/test/pytest/test_utils.py @@ -11,6 +11,8 @@ import orjson import requests +from ts_scripts import marsgen as mg + # To help discover margen modules REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../") sys.path.append(REPO_ROOT) @@ -38,6 +40,8 @@ def delete_model_store(model_store=None): def start_torchserve(*args, **kwargs): create_mar_file_table() kwargs.update({"model_store": kwargs.get("model_store", MODEL_STORE)}) + if kwargs.get("gen_mar", True): + mg.gen_mar(kwargs.get("model_store")) return start(*args, **kwargs) diff --git a/ts/launcher.py b/ts/launcher.py index 3e3109df42..2fddd1a8b0 100644 --- a/ts/launcher.py +++ b/ts/launcher.py @@ -6,8 +6,6 @@ import requests -from ts_scripts import marsgen as mg - def stop(): subprocess.run(["torchserve", "--stop", "--foreground"]) @@ -52,7 +50,6 @@ def start( model_store=None, snapshot_file=None, no_config_snapshots=False, - gen_mar=True, plugin_folder=None, disable_token=True, models=None, @@ -60,8 +57,6 @@ def start( ): stop() cmd = ["torchserve", "--start"] - if gen_mar: - mg.gen_mar(model_store) cmd.extend(["--model-store", model_store]) if plugin_folder: cmd.extend(["--plugins-path", plugin_folder]) @@ -75,7 +70,6 @@ def start( cmd.extend(["--models", models]) if model_api_enabled: cmd.extend(["--model-api-enabled"]) - p = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=STDOUT) for line in p.stdout: print(line.decode("utf8").strip()) From 6781012b085e23410b5b48beb82b6c44a4856310 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Sat, 22 Jun 2024 05:01:29 +0000 Subject: [PATCH 07/30] Add llm_launcher script + llm docker --- docker/Dockerfile.llm | 8 +++ ts_scripts/llm_launcher.py | 121 +++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 docker/Dockerfile.llm create mode 100644 ts_scripts/llm_launcher.py diff --git a/docker/Dockerfile.llm b/docker/Dockerfile.llm new file mode 100644 index 0000000000..2b487fd1fc --- /dev/null +++ b/docker/Dockerfile.llm @@ -0,0 +1,8 @@ +FROM pytorch/torchserve-nightly:latest-gpu as server +ARG HUGGINGFACE_TOKEN + +USER root + +RUN pip install vllm + +ENTRYPOINT [ "/bin/bash" ] diff --git a/ts_scripts/llm_launcher.py b/ts_scripts/llm_launcher.py new file mode 100644 index 0000000000..cc39459bc2 --- /dev/null +++ b/ts_scripts/llm_launcher.py @@ -0,0 +1,121 @@ +import contextlib +import importlib +import os +import shutil +import sys +import time +from pathlib import Path + +import yaml +from model_archiver import ModelArchiverConfig + +from ts.launcher import start, stop + +model_name = "SOME_MODEL" +model_store = "/home/ubuntu/serve/model_store" +work_dir = "/home/ubuntu/serve/data" + +model_config = { + "minWorkers": 1, + "maxWorkers": 1, + 
"maxBatchDelay": 100, + "responseTimeout": 1200, + "deviceType": "gpu", + "asyncCommunication": True, + "handler": { + "model_path": "model/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590/", + "vllm_engine_config": { + "enable_lora": True, + "max_loras": 4, + "max_cpu_loras": 4, + "max_num_seqs": 16, + "max_model_len": 250, + }, + "adapters": { + "adapter_1": "adapters/model/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", + }, + }, +} + + +@contextlib.contextmanager +def model_archiver(): + loader = importlib.machinery.SourceFileLoader( + "archiver", + os.path.join( + "/home/ubuntu/serve/", + "model-archiver", + "model_archiver", + "model_packaging.py", + ), + ) + spec = importlib.util.spec_from_loader("archiver", loader) + archiver = importlib.util.module_from_spec(spec) + + sys.modules["archiver"] = archiver + + loader.exec_module(archiver) + + yield archiver + + del sys.modules["archiver"] + + +@contextlib.contextmanager +def create_mar_file(): + mar_file_path = Path(model_store).joinpath(model_name) + + model_config_yaml = Path(model_store) / "model-config.yaml" + with model_config_yaml.open("w") as f: + yaml.dump(model_config, f) + + config = ModelArchiverConfig( + model_name=model_name, + version="1.0", + handler="vllm_handler", + serialized_file=None, + export_path=model_store, + requirements_file=None, + runtime="python", + force=False, + config_file=model_config_yaml.as_posix(), + archive_format="no-archive", + ) + + with model_archiver() as ma: + ma.generate_model_archive(config) + + model_config_yaml.unlink() + + assert mar_file_path.exists() + + yield mar_file_path.as_posix() + + shutil.rmtree(mar_file_path) + + +def main(): + """ + Register the model in torchserve + """ + + params = ( + ("model_name", model_name), + ("url", Path(model_store) / model_name), + ("initial_workers", "1"), + ("synchronous", "true"), + ("batch_size", "1"), + ) + + try: + with create_mar_file(): + start(model_store=model_store, no_config_snapshots=True, models=model_name) + + time.sleep(10) + + finally: + stop() + + +if __name__ == "__main__": + main() From 0da0b47e0484b4f00da9c471cd88f5732717f331 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Mon, 24 Jun 2024 19:53:28 +0000 Subject: [PATCH 08/30] Use model_path as mode id if path does not exist --- ts/torch_handler/vllm_handler.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ts/torch_handler/vllm_handler.py b/ts/torch_handler/vllm_handler.py index 6589cbacd9..35695d69ef 100644 --- a/ts/torch_handler/vllm_handler.py +++ b/ts/torch_handler/vllm_handler.py @@ -92,7 +92,12 @@ def _get_vllm_engine_config(self, handler_config: dict): assert ( len(model_path) > 0 ), "please define model in vllm_engine_config or model_path in handler" - model = str(pathlib.Path(self.model_dir).joinpath(model_path)) + model = pathlib.Path(self.model_dir).joinpath(model_path) + if not model.exists(): + logger.debug( + f"Model path ({model}) does not exist locally. Trying to give without model_dir as prefix." 
+ ) + model = model_path logger.debug(f"EngineArgs model: {model}") vllm_engine_config = AsyncEngineArgs(model=model) self._set_attr_value(vllm_engine_config, vllm_engine_params) From 202d137dcddeedcdf1a5497a56bb4da1e78dafdf Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Mon, 24 Jun 2024 19:53:52 +0000 Subject: [PATCH 09/30] Add arguments to llm_launcher --- ts_scripts/llm_launcher.py | 136 ++++++++++++++++++++++++------------- 1 file changed, 88 insertions(+), 48 deletions(-) diff --git a/ts_scripts/llm_launcher.py b/ts_scripts/llm_launcher.py index cc39459bc2..66aea65222 100644 --- a/ts_scripts/llm_launcher.py +++ b/ts_scripts/llm_launcher.py @@ -1,42 +1,18 @@ +import argparse import contextlib import importlib import os import shutil import sys -import time from pathlib import Path +from signal import pause +import torch import yaml from model_archiver import ModelArchiverConfig from ts.launcher import start, stop -model_name = "SOME_MODEL" -model_store = "/home/ubuntu/serve/model_store" -work_dir = "/home/ubuntu/serve/data" - -model_config = { - "minWorkers": 1, - "maxWorkers": 1, - "maxBatchDelay": 100, - "responseTimeout": 1200, - "deviceType": "gpu", - "asyncCommunication": True, - "handler": { - "model_path": "model/models--meta-llama--Llama-2-7b-chat-hf/snapshots/f5db02db724555f92da89c216ac04704f23d4590/", - "vllm_engine_config": { - "enable_lora": True, - "max_loras": 4, - "max_cpu_loras": 4, - "max_num_seqs": 16, - "max_model_len": 250, - }, - "adapters": { - "adapter_1": "adapters/model/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", - }, - }, -} - @contextlib.contextmanager def model_archiver(): @@ -61,20 +37,48 @@ def model_archiver(): del sys.modules["archiver"] +def get_model_config(args): + model_config = { + "minWorkers": 1, + "maxWorkers": 1, + "batchSize": 1, + "maxBatchDelay": 100, + "responseTimeout": 1200, + "deviceType": "gpu", + "asyncCommunication": True, + "parallelLevel": torch.cuda.device_count() if torch.cuda.is_available else 1, + "handler": { + "model_path": args.model_id, + "vllm_engine_config": { + "enable_lora": True, + "max_loras": 4, + "max_cpu_loras": 4, + "max_num_seqs": getattr(args, "vllm_engine.max_num_seqs"), + "max_model_len": getattr(args, "vllm_engine.max_model_len"), + } + # , + # "adapters": { + # "adapter_1": "adapters/model/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", + # }, + }, + } + return model_config + + @contextlib.contextmanager -def create_mar_file(): - mar_file_path = Path(model_store).joinpath(model_name) +def create_mar_file(args): + mar_file_path = Path(args.model_store).joinpath(args.model_name) - model_config_yaml = Path(model_store) / "model-config.yaml" + model_config_yaml = Path(args.model_store) / "model-config.yaml" with model_config_yaml.open("w") as f: - yaml.dump(model_config, f) + yaml.dump(get_model_config(args), f) config = ModelArchiverConfig( - model_name=model_name, + model_name=args.model_name, version="1.0", handler="vllm_handler", serialized_file=None, - export_path=model_store, + export_path=args.model_store, requirements_file=None, runtime="python", force=False, @@ -94,28 +98,64 @@ def create_mar_file(): shutil.rmtree(mar_file_path) -def main(): +def main(args): """ Register the model in torchserve """ - params = ( - ("model_name", model_name), - ("url", Path(model_store) / model_name), - ("initial_workers", "1"), - ("synchronous", "true"), - ("batch_size", "1"), + with 
create_mar_file(args): + try: + start( + model_store=args.model_store, + no_config_snapshots=True, + models=args.model_name, + ) + + pause() + + except KeyboardInterrupt: + pass + finally: + stop() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", + type=str, + default="model", + help="Model name", + ) + + parser.add_argument( + "--model_store", + type=str, + default="model_store", + help="Model store", ) - try: - with create_mar_file(): - start(model_store=model_store, no_config_snapshots=True, models=model_name) + parser.add_argument( + "--model_id", + type=str, + default="meta-llama/Meta-Llama-3-8B-Instruct", + help="Model id", + ) - time.sleep(10) + parser.add_argument( + "--vllm_engine.max_num_seqs", + type=int, + default=16, + help="Max sequences in vllm engine", + ) - finally: - stop() + parser.add_argument( + "--vllm_engine.max_model_len", + type=int, + default=None, + help="Model context length", + ) + args = parser.parse_args() -if __name__ == "__main__": - main() + main(args) From 4161c9d24522e8cee95eff67527fd4b55a4f40ea Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Mon, 24 Jun 2024 21:39:10 +0000 Subject: [PATCH 10/30] Wait for load command to finish --- .../java/org/pytorch/serve/wlm/AsyncWorkerThread.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java index ca4c374f62..f6a9c3f68d 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java +++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java @@ -35,6 +35,7 @@ public class AsyncWorkerThread extends WorkerThread { protected static final Logger logger = LoggerFactory.getLogger(AsyncWorkerThread.class); protected boolean loadingFinished; + protected CountDownLatch latch; public AsyncWorkerThread( ConfigManager configManager, @@ -75,6 +76,14 @@ public void run() { try { backendChannel.get(0).writeAndFlush(req).sync(); logger.debug("Successfully flushed req"); + + if (loadingFinished == false) { + latch = new CountDownLatch(1); + if (!latch.await(2, TimeUnit.MINUTES)) { + throw new WorkerInitializationException("Worker did not load the model within" + WORKER_TIMEOUT + " mins"); + } + } + } catch (InterruptedException e) { logger.error("Failed to send request to backend", e); } @@ -240,6 +249,7 @@ public void channelRead0(ChannelHandlerContext ctx, ModelWorkerResponse msg) { setState(WorkerState.WORKER_MODEL_LOADED, HttpURLConnection.HTTP_OK); backoffIdx = 0; loadingFinished = true; + latch.countDown(); } else { setState(WorkerState.WORKER_ERROR, msg.getCode()); } From 18016d5defa6a1f99ce858d2625ac646215b5d2b Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Mon, 24 Jun 2024 21:39:41 +0000 Subject: [PATCH 11/30] Optionally skip waiting in launcher.stop --- ts/launcher.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ts/launcher.py b/ts/launcher.py index 2fddd1a8b0..0cd6846af6 100644 --- a/ts/launcher.py +++ b/ts/launcher.py @@ -7,8 +7,11 @@ import requests -def stop(): - subprocess.run(["torchserve", "--stop", "--foreground"]) +def stop(wait=True): + cmd = ["torchserve", "--stop"] + if wait: + cmd += ["--foreground"] + subprocess.run(cmd) class Tee(threading.Thread): From 8ab85b8a470e008f45b789acdc262d118f799e61 Mon Sep 17 00:00:00 2001 
From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Mon, 24 Jun 2024 21:40:25 +0000 Subject: [PATCH 12/30] remove custom loading of model archiver --- ts_scripts/llm_launcher.py | 37 +++++++------------------------------ 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/ts_scripts/llm_launcher.py b/ts_scripts/llm_launcher.py index 66aea65222..54cb0e7a66 100644 --- a/ts_scripts/llm_launcher.py +++ b/ts_scripts/llm_launcher.py @@ -1,42 +1,17 @@ import argparse import contextlib -import importlib -import os import shutil -import sys from pathlib import Path from signal import pause import torch import yaml from model_archiver import ModelArchiverConfig +from model_archiver.model_packaging import generate_model_archive from ts.launcher import start, stop -@contextlib.contextmanager -def model_archiver(): - loader = importlib.machinery.SourceFileLoader( - "archiver", - os.path.join( - "/home/ubuntu/serve/", - "model-archiver", - "model_archiver", - "model_packaging.py", - ), - ) - spec = importlib.util.spec_from_loader("archiver", loader) - archiver = importlib.util.module_from_spec(spec) - - sys.modules["archiver"] = archiver - - loader.exec_module(archiver) - - yield archiver - - del sys.modules["archiver"] - - def get_model_config(args): model_config = { "minWorkers": 1, @@ -67,7 +42,10 @@ def get_model_config(args): @contextlib.contextmanager def create_mar_file(args): - mar_file_path = Path(args.model_store).joinpath(args.model_name) + model_store_path = Path(args.model_store) + model_store_path.mkdir(parents=True, exist_ok=True) + + mar_file_path = model_store_path / args.model_name model_config_yaml = Path(args.model_store) / "model-config.yaml" with model_config_yaml.open("w") as f: @@ -86,8 +64,7 @@ def create_mar_file(args): archive_format="no-archive", ) - with model_archiver() as ma: - ma.generate_model_archive(config) + generate_model_archive(config) model_config_yaml.unlink() @@ -116,7 +93,7 @@ def main(args): except KeyboardInterrupt: pass finally: - stop() + stop(wait=False) if __name__ == "__main__": From 5bc4914dd82689630e3d1a0ab628e70dce757486 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Mon, 24 Jun 2024 21:44:24 +0000 Subject: [PATCH 13/30] Move llm_launcher to ts --- {ts_scripts => ts}/llm_launcher.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {ts_scripts => ts}/llm_launcher.py (100%) diff --git a/ts_scripts/llm_launcher.py b/ts/llm_launcher.py similarity index 100% rename from ts_scripts/llm_launcher.py rename to ts/llm_launcher.py From 0009390ff133a2ef926d4b3740cca8b22a507a02 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Mon, 24 Jun 2024 23:01:35 +0000 Subject: [PATCH 14/30] Set model load timeout to 10 min --- .../main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java index f6a9c3f68d..92b6cdb895 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java +++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java @@ -33,6 +33,7 @@ public class AsyncWorkerThread extends WorkerThread { // protected ConcurrentHashMap requestsInBackend; protected static final Logger logger = LoggerFactory.getLogger(AsyncWorkerThread.class); + protected static final long 
MODEL_LOAD_TIMEOUT = 10L; protected boolean loadingFinished; protected CountDownLatch latch; @@ -79,8 +80,8 @@ public void run() { if (loadingFinished == false) { latch = new CountDownLatch(1); - if (!latch.await(2, TimeUnit.MINUTES)) { - throw new WorkerInitializationException("Worker did not load the model within" + WORKER_TIMEOUT + " mins"); + if (!latch.await(MODEL_LOAD_TIMEOUT, TimeUnit.MINUTES)) { + throw new WorkerInitializationException("Worker did not load the model within" + MODEL_LOAD_TIMEOUT + " mins"); } } From c5476fcc488e4e87d0033efb8747c56b5fbd521c Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Mon, 24 Jun 2024 23:12:04 +0000 Subject: [PATCH 15/30] Finalize dockerfile.llm --- docker/Dockerfile.llm | 7 ++++--- ts/llm_launcher.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.llm b/docker/Dockerfile.llm index 2b487fd1fc..a426abd2cf 100644 --- a/docker/Dockerfile.llm +++ b/docker/Dockerfile.llm @@ -1,8 +1,9 @@ FROM pytorch/torchserve-nightly:latest-gpu as server -ARG HUGGINGFACE_TOKEN USER root -RUN pip install vllm +RUN mkdir /data && chown -R model-server /data -ENTRYPOINT [ "/bin/bash" ] +USER model-server + +ENTRYPOINT [ "python", "-m", "ts.llm_launcher", "--vllm_engine.download_dir", "/data" ] diff --git a/ts/llm_launcher.py b/ts/llm_launcher.py index 54cb0e7a66..b61d942aa2 100644 --- a/ts/llm_launcher.py +++ b/ts/llm_launcher.py @@ -13,6 +13,11 @@ def get_model_config(args): + download_dir = getattr(args, "vllm_engine.download_dir") + download_dir = ( + Path(download_dir).resolve().as_posix() if download_dir else download_dir + ) + model_config = { "minWorkers": 1, "maxWorkers": 1, @@ -30,6 +35,7 @@ def get_model_config(args): "max_cpu_loras": 4, "max_num_seqs": getattr(args, "vllm_engine.max_num_seqs"), "max_model_len": getattr(args, "vllm_engine.max_model_len"), + "download_dir": download_dir, } # , # "adapters": { @@ -133,6 +139,13 @@ def main(args): help="Model context length", ) + parser.add_argument( + "--vllm_engine.download_dir", + type=str, + default="/data", + help="Cache dir", + ) + args = parser.parse_args() main(args) From e9de8197108986ebacda564484390f7124d53e8a Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 26 Jun 2024 01:01:08 +0000 Subject: [PATCH 16/30] Adjust default value of ts launcher for token auth and model api --- test/pytest/test_utils.py | 2 ++ ts/launcher.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/test/pytest/test_utils.py b/test/pytest/test_utils.py index dfad4290ba..225ff5d5a8 100644 --- a/test/pytest/test_utils.py +++ b/test/pytest/test_utils.py @@ -42,6 +42,8 @@ def start_torchserve(*args, **kwargs): kwargs.update({"model_store": kwargs.get("model_store", MODEL_STORE)}) if kwargs.get("gen_mar", True): mg.gen_mar(kwargs.get("model_store")) + kwargs.update({"disable_token": kwargs.get("disable_token", True)}) + kwargs.update({"model_api_enabled": kwargs.get("model_api_enabled", True)}) return start(*args, **kwargs) diff --git a/ts/launcher.py b/ts/launcher.py index 0cd6846af6..db2a9d2093 100644 --- a/ts/launcher.py +++ b/ts/launcher.py @@ -54,9 +54,9 @@ def start( snapshot_file=None, no_config_snapshots=False, plugin_folder=None, - disable_token=True, + disable_token=False, models=None, - model_api_enabled=True, + model_api_enabled=False, ): stop() cmd = ["torchserve", "--start"] From a69c79c15fb0b7df02bfcc0e38dd4dc703935c8b Mon Sep 17 00:00:00 2001 From: Matthias 
Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 26 Jun 2024 04:55:36 +0000 Subject: [PATCH 17/30] updated llm_launcher.py --- ts/llm_launcher.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/ts/llm_launcher.py b/ts/llm_launcher.py index b61d942aa2..83f0ba4a8f 100644 --- a/ts/llm_launcher.py +++ b/ts/llm_launcher.py @@ -30,19 +30,23 @@ def get_model_config(args): "handler": { "model_path": args.model_id, "vllm_engine_config": { - "enable_lora": True, - "max_loras": 4, - "max_cpu_loras": 4, "max_num_seqs": getattr(args, "vllm_engine.max_num_seqs"), "max_model_len": getattr(args, "vllm_engine.max_model_len"), "download_dir": download_dir, - } - # , - # "adapters": { - # "adapter_1": "adapters/model/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", - # }, + }, }, } + + if hasattr(args, "lora_adapter_ids"): + raise NotImplementedError("Lora setting needs to be implemented") + lora_adapter_ids = args.lora_adapter_ids.split(";") + + model_config["handler"]["vllm_engine_config"].update( + { + "enable_lora": True, + } + ) + return model_config @@ -125,6 +129,13 @@ def main(args): help="Model id", ) + parser.add_argument( + "--diable_token", + type=str, + default="meta-llama/Meta-Llama-3-8B-Instruct", + help="Model id", + ) + parser.add_argument( "--vllm_engine.max_num_seqs", type=int, @@ -142,7 +153,7 @@ def main(args): parser.add_argument( "--vllm_engine.download_dir", type=str, - default="/data", + default=None, help="Cache dir", ) From 5eb640f57a97f921994ce8cc80717608570a8a52 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 26 Jun 2024 04:57:26 +0000 Subject: [PATCH 18/30] Add llm deployment to readme.md --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 225079f466..12f4ed6bc9 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,19 @@ docker pull pytorch/torchserve-nightly Refer to [torchserve docker](docker/README.md) for details. +### 🤖 Quick Start LLM Deployment + +```bash +#export token= +docker build . -f docker/Dockerfile.llm -t ts/llm + +docker run --rm -ti --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct + +curl -X POST -d '{"prompt":"Hello, my name is", "max_new_tokens": 50}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model" +``` + +Refer to [llm deployment][docs/llm_deployment.md] for details and other methods. + ## ⚡ Why TorchServe * Write once, run anywhere, on-prem, on-cloud, supports inference on CPUs, GPUs, AWS Inf1/Inf2/Trn1, Google Cloud TPUs, [Nvidia MPS](docs/nvidia_mps.md) * [Model Management API](docs/management_api.md): multi model management with optimized worker to model allocation From 3122616b62683cf0bb9a1e35b1c204c081cd085c Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 26 Jun 2024 20:51:14 +0000 Subject: [PATCH 19/30] Added documentation for llm launcher --- README.md | 2 +- docs/README.md | 1 + docs/llm_deployment.md | 60 ++++++++++++++++++++++++++++++++++++++++++ ts/llm_launcher.py | 8 +++--- 4 files changed, 66 insertions(+), 5 deletions(-) create mode 100644 docs/llm_deployment.md diff --git a/README.md b/README.md index 12f4ed6bc9..92e8d5ec46 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ Refer to [torchserve docker](docker/README.md) for details. #export token= docker build . 
-f docker/Dockerfile.llm -t ts/llm -docker run --rm -ti --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct +docker run --rm -ti --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token curl -X POST -d '{"prompt":"Hello, my name is", "max_new_tokens": 50}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model" ``` diff --git a/docs/README.md b/docs/README.md index 155dee671d..e03410b0dd 100644 --- a/docs/README.md +++ b/docs/README.md @@ -32,6 +32,7 @@ TorchServe is a performant, flexible and easy to use tool for serving PyTorch ea ## Examples +* [Deploying LLMs](./llm_deployment.md) - How to easily deploy LLMs using TorchServe * [HuggingFace Language Model](https://github.com/pytorch/serve/blob/master/examples/Huggingface_Transformers/Transformer_handler_generalized.py) - This handler takes an input sentence and can return sequence classifications, token classifications or Q&A answers * [Multi Modal Framework](https://github.com/pytorch/serve/blob/master/examples/MMF-activity-recognition/handler.py) - Build and deploy a classifier that combines text, audio and video input data * [Dual Translation Workflow](https://github.com/pytorch/serve/tree/master/examples/Workflows/nmt_transformers_pipeline) - diff --git a/docs/llm_deployment.md b/docs/llm_deployment.md new file mode 100644 index 0000000000..0b95d55fde --- /dev/null +++ b/docs/llm_deployment.md @@ -0,0 +1,60 @@ +# LLM Deployment with TorchServe + +This document describes how to easily serve large language models (LLM) like Meta-Llama3 with TorchServe. +Besides a quick start guide using our VLLM integration we also provide a list of examples which describe other methods to deploy LLMs with TorchServe. + +## Quickstart LLM Deployment + +TorchServe offers easy LLM deployment through its VLLM integration. +Through the integration of our [llm launcher script](https://github.com/pytorch/serve/blob/7a9b145204b4d7cfbb114fe737cf980221e6181e/ts/llm_launcher.py) users are able to deploy any model supported by VLLM with a single command. +The launcher can either be used standalone or in combination with our provided TorchServe GPU docker image. + +To launch the docker we first need to build it: +```bash +docker build . -f docker/Dockerfile.llm -t ts/llm +``` + +Models are usually loaded from the HuggingFace hub and are cached in a [docker volume](https://docs.docker.com/storage/volumes/) for faster reload. +If you want to access gated models like the Meta-Llama3 model you need to provide a HuggingFace hub token: +```bash +export token= +``` + +You can then go ahead and launch a TorchServe instance serving your selected model: +```bash +docker run --rm -ti --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token +``` + +To change the model you just need to exchange the identifier fo the `--model_id` parameter. +You can test the model with: +```bash +curl -X POST -d '{"prompt":"Hello, my name is", "max_new_tokens": 50}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model" +``` + +You can change any of the sampling argument for the request by using the [VLLM SamplingParams keywords](https://docs.vllm.ai/en/stable/dev/sampling_params.html#vllm.SamplingParams). +E.g. 
for setting the sampling temperatur to 0 we can do: +```bash +curl -X POST -d '{"prompt":"Hello, my name is", "max_new_tokens": 50, "temperature": 0}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model" +``` + +TorchServe's llm launcher scripts offers some customization options as well. +To rename the model endpoint from `predictions/model` to something else you can add `--model_name ` to the `docker run` command. + +The launcher script can also be used outside a docker container by calling this after installing TorchServe following the [installation instruction](https://github.com/pytorch/serve/blob/feature/single_cmd_llm_deployment/README.md#-quick-start-with-torchserve). +```bash +python -m ts.llm_launcher --disable_token +``` + +Please note that the launcher script as well as the docker command will automatically run on all available GPUs so make sure to restrict the visible number of device by setting CUDA_VISIBLE_DEVICES. + +## Other ways to deploy LLMs with TorchServe + +TorchServe offers a variety of example on how to deploy large models. +Here is a list of the current examples: + +* [Llama 2/3 chatbot](https://github.com/pytorch/serve/tree/master/examples/LLM/llama) +* [GPT-fast](https://github.com/pytorch/serve/tree/master/examples/large_models/gpt_fast) +* [Infertia2](https://github.com/pytorch/serve/tree/master/examples/large_models/inferentia2) +* [IPEX optimized](https://github.com/pytorch/serve/tree/master/examples/large_models/ipex_llm_int8) +* [Tensor Parallel Llama](https://github.com/pytorch/serve/tree/master/examples/large_models/tp_llama) +* [VLLM Integration](https://github.com/pytorch/serve/tree/master/examples/large_models/vllm) diff --git a/ts/llm_launcher.py b/ts/llm_launcher.py index 83f0ba4a8f..b8824ce9db 100644 --- a/ts/llm_launcher.py +++ b/ts/llm_launcher.py @@ -96,6 +96,7 @@ def main(args): model_store=args.model_store, no_config_snapshots=True, models=args.model_name, + disable_token=args.disable_token, ) pause() @@ -130,10 +131,9 @@ def main(args): ) parser.add_argument( - "--diable_token", - type=str, - default="meta-llama/Meta-Llama-3-8B-Instruct", - help="Model id", + "--disable_token", + action="store_true", + help="Disable token authentication", ) parser.add_argument( From 2003bd075e24277e7c07156c7b0cd5acf0f949c0 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 26 Jun 2024 21:32:48 +0000 Subject: [PATCH 20/30] Added section on supported models --- docs/llm_deployment.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/llm_deployment.md b/docs/llm_deployment.md index 0b95d55fde..1d159f0d79 100644 --- a/docs/llm_deployment.md +++ b/docs/llm_deployment.md @@ -25,6 +25,8 @@ You can then go ahead and launch a TorchServe instance serving your selected mod docker run --rm -ti --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token ``` + + To change the model you just need to exchange the identifier fo the `--model_id` parameter. You can test the model with: ```bash @@ -47,6 +49,17 @@ python -m ts.llm_launcher --disable_token Please note that the launcher script as well as the docker command will automatically run on all available GPUs so make sure to restrict the visible number of device by setting CUDA_VISIBLE_DEVICES. 
+## Supported models +The quickstart launcher should allow to launch any model which is [supported by VLLM](https://docs.vllm.ai/en/latest/models/supported_models.html). +Here is a list of model identifiers tested by the TorchServe team: + +* meta-llama/Meta-Llama-3-8B +* meta-llama/Meta-Llama-3-8B-Instruct +* meta-llama/Llama-2-7b-hf +* meta-llama/Llama-2-7b-chat-hf +* mistralai/Mistral-7B-v0.1 +* mistralai/Mistral-7B-Instruct-v0.1 + ## Other ways to deploy LLMs with TorchServe TorchServe offers a variety of example on how to deploy large models. From de572a09f62affd74a87b1c77494c87da964fd02 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 26 Jun 2024 22:44:08 +0000 Subject: [PATCH 21/30] Enable tensor parallelism in llm launcher --- ts/llm_launcher.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ts/llm_launcher.py b/ts/llm_launcher.py index b8824ce9db..1e43888827 100644 --- a/ts/llm_launcher.py +++ b/ts/llm_launcher.py @@ -33,6 +33,9 @@ def get_model_config(args): "max_num_seqs": getattr(args, "vllm_engine.max_num_seqs"), "max_model_len": getattr(args, "vllm_engine.max_model_len"), "download_dir": download_dir, + "tensor_parallel_size": torch.cuda.device_count() + if torch.cuda.is_available + else 1, }, }, } From 332fb43abdbf75f3225abf5077c6cd4a0636402f Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 26 Jun 2024 22:45:42 +0000 Subject: [PATCH 22/30] Add reference to go beyond quickstart --- docs/llm_deployment.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/llm_deployment.md b/docs/llm_deployment.md index 1d159f0d79..a91f95667d 100644 --- a/docs/llm_deployment.md +++ b/docs/llm_deployment.md @@ -49,6 +49,8 @@ python -m ts.llm_launcher --disable_token Please note that the launcher script as well as the docker command will automatically run on all available GPUs so make sure to restrict the visible number of device by setting CUDA_VISIBLE_DEVICES. +For further customization of the handler and add 3rd party dependencies you can have a look at out [VLLM example](https://github.com/pytorch/serve/tree/master/examples/large_models/vllm). + ## Supported models The quickstart launcher should allow to launch any model which is [supported by VLLM](https://docs.vllm.ai/en/latest/models/supported_models.html). Here is a list of model identifiers tested by the TorchServe team: From f3508dd47eddeab309427fa99f30084e0ce4ae48 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 26 Jun 2024 22:51:20 +0000 Subject: [PATCH 23/30] fix spellcheck lint --- README.md | 2 +- docs/llm_deployment.md | 14 ++++++-------- ts_scripts/spellcheck_conf/wordlist.txt | 3 +++ 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 92e8d5ec46..0d86bd4747 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ docker run --rm -ti --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v curl -X POST -d '{"prompt":"Hello, my name is", "max_new_tokens": 50}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model" ``` -Refer to [llm deployment][docs/llm_deployment.md] for details and other methods. +Refer to [LLM deployment][docs/llm_deployment.md] for details and other methods. 
## ⚡ Why TorchServe * Write once, run anywhere, on-prem, on-cloud, supports inference on CPUs, GPUs, AWS Inf1/Inf2/Trn1, Google Cloud TPUs, [Nvidia MPS](docs/nvidia_mps.md) diff --git a/docs/llm_deployment.md b/docs/llm_deployment.md index a91f95667d..1501f08bc6 100644 --- a/docs/llm_deployment.md +++ b/docs/llm_deployment.md @@ -6,7 +6,7 @@ Besides a quick start guide using our VLLM integration we also provide a list of ## Quickstart LLM Deployment TorchServe offers easy LLM deployment through its VLLM integration. -Through the integration of our [llm launcher script](https://github.com/pytorch/serve/blob/7a9b145204b4d7cfbb114fe737cf980221e6181e/ts/llm_launcher.py) users are able to deploy any model supported by VLLM with a single command. +Through the integration of our [LLM launcher script](https://github.com/pytorch/serve/blob/7a9b145204b4d7cfbb114fe737cf980221e6181e/ts/llm_launcher.py) users are able to deploy any model supported by VLLM with a single command. The launcher can either be used standalone or in combination with our provided TorchServe GPU docker image. To launch the docker we first need to build it: @@ -25,21 +25,19 @@ You can then go ahead and launch a TorchServe instance serving your selected mod docker run --rm -ti --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token ``` - - -To change the model you just need to exchange the identifier fo the `--model_id` parameter. +To change the model you just need to exchange the identifier given to the `--model_id` parameter. You can test the model with: ```bash curl -X POST -d '{"prompt":"Hello, my name is", "max_new_tokens": 50}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model" ``` You can change any of the sampling argument for the request by using the [VLLM SamplingParams keywords](https://docs.vllm.ai/en/stable/dev/sampling_params.html#vllm.SamplingParams). -E.g. for setting the sampling temperatur to 0 we can do: +E.g. for setting the sampling temperature to 0 we can do: ```bash curl -X POST -d '{"prompt":"Hello, my name is", "max_new_tokens": 50, "temperature": 0}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model" ``` -TorchServe's llm launcher scripts offers some customization options as well. +TorchServe's LLM launcher scripts offers some customization options as well. To rename the model endpoint from `predictions/model` to something else you can add `--model_name ` to the `docker run` command. The launcher script can also be used outside a docker container by calling this after installing TorchServe following the [installation instruction](https://github.com/pytorch/serve/blob/feature/single_cmd_llm_deployment/README.md#-quick-start-with-torchserve). @@ -67,9 +65,9 @@ Here is a list of model identifiers tested by the TorchServe team: TorchServe offers a variety of example on how to deploy large models. 
Here is a list of the current examples: -* [Llama 2/3 chatbot](https://github.com/pytorch/serve/tree/master/examples/LLM/llama) +* [Llama 2/3 chat bot](https://github.com/pytorch/serve/tree/master/examples/LLM/llama) * [GPT-fast](https://github.com/pytorch/serve/tree/master/examples/large_models/gpt_fast) -* [Infertia2](https://github.com/pytorch/serve/tree/master/examples/large_models/inferentia2) +* [Inferentia2](https://github.com/pytorch/serve/tree/master/examples/large_models/inferentia2) * [IPEX optimized](https://github.com/pytorch/serve/tree/master/examples/large_models/ipex_llm_int8) * [Tensor Parallel Llama](https://github.com/pytorch/serve/tree/master/examples/large_models/tp_llama) * [VLLM Integration](https://github.com/pytorch/serve/tree/master/examples/large_models/vllm) diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index 43446f168f..8beb86f298 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1255,3 +1255,6 @@ parallelType parallelization pptp torchcompile +HPC +hpc +llm From 61b8820639700e1cac092808cb050773d17345ec Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 26 Jun 2024 22:53:09 +0000 Subject: [PATCH 24/30] HPC->HPU --- ts_scripts/spellcheck_conf/wordlist.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index 8beb86f298..284fd5be50 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1255,6 +1255,6 @@ parallelType parallelization pptp torchcompile -HPC -hpc +HPU +hpu llm From 286e034c953dd18d8b7f077413847d9cc89f362c Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 26 Jun 2024 22:54:11 +0000 Subject: [PATCH 25/30] doc --- docs/llm_deployment.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/llm_deployment.md b/docs/llm_deployment.md index 1501f08bc6..b413d02061 100644 --- a/docs/llm_deployment.md +++ b/docs/llm_deployment.md @@ -47,7 +47,7 @@ python -m ts.llm_launcher --disable_token Please note that the launcher script as well as the docker command will automatically run on all available GPUs so make sure to restrict the visible number of device by setting CUDA_VISIBLE_DEVICES. -For further customization of the handler and add 3rd party dependencies you can have a look at out [VLLM example](https://github.com/pytorch/serve/tree/master/examples/large_models/vllm). +For further customization of the handler and adding 3rd party dependencies you can have a look at out [VLLM example](https://github.com/pytorch/serve/tree/master/examples/large_models/vllm). ## Supported models The quickstart launcher should allow to launch any model which is [supported by VLLM](https://docs.vllm.ai/en/latest/models/supported_models.html). 
From 65b6480057cafba5190dc703acfee28076d8097d Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 26 Jun 2024 23:12:53 +0000 Subject: [PATCH 26/30] Move margen import below path changes --- test/pytest/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/pytest/test_utils.py b/test/pytest/test_utils.py index 225ff5d5a8..ee4cf1e201 100644 --- a/test/pytest/test_utils.py +++ b/test/pytest/test_utils.py @@ -11,13 +11,13 @@ import orjson import requests -from ts_scripts import marsgen as mg - # To help discover margen modules REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../") sys.path.append(REPO_ROOT) + from ts.launcher import register_model, register_model_with_params, start # noqa from ts.launcher import stop as stop_torchserve +from ts_scripts import marsgen as mg ROOT_DIR = os.path.join(tempfile.gettempdir(), "workspace") MODEL_STORE = path.join(ROOT_DIR, "model_store/") From c7fdbf4592e15b61cd6f877aba7019fbd042c5c0 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Thu, 27 Jun 2024 00:26:07 +0000 Subject: [PATCH 27/30] Fix java formatting --- .../main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java b/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java index 92b6cdb895..2dd1272b01 100644 --- a/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java +++ b/frontend/server/src/main/java/org/pytorch/serve/wlm/AsyncWorkerThread.java @@ -81,7 +81,10 @@ public void run() { if (loadingFinished == false) { latch = new CountDownLatch(1); if (!latch.await(MODEL_LOAD_TIMEOUT, TimeUnit.MINUTES)) { - throw new WorkerInitializationException("Worker did not load the model within" + MODEL_LOAD_TIMEOUT + " mins"); + throw new WorkerInitializationException( + "Worker did not load the model within" + + MODEL_LOAD_TIMEOUT + + " mins"); } } From 815539883a50e192e252b3dd2d14ffc24fec6199 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Thu, 27 Jun 2024 01:02:52 +0000 Subject: [PATCH 28/30] Remove gen_mar kw --- test/pytest/test_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/pytest/test_utils.py b/test/pytest/test_utils.py index ee4cf1e201..035d67aa7c 100644 --- a/test/pytest/test_utils.py +++ b/test/pytest/test_utils.py @@ -42,6 +42,8 @@ def start_torchserve(*args, **kwargs): kwargs.update({"model_store": kwargs.get("model_store", MODEL_STORE)}) if kwargs.get("gen_mar", True): mg.gen_mar(kwargs.get("model_store")) + if "gen_mar" in kwargs: + del kwargs["gen_mar"] kwargs.update({"disable_token": kwargs.get("disable_token", True)}) kwargs.update({"model_api_enabled": kwargs.get("model_api_enabled", True)}) return start(*args, **kwargs) From 474494ae1855599abd95430d6a030b00136ef8ed Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Thu, 27 Jun 2024 04:33:51 +0000 Subject: [PATCH 29/30] Fix error if model_store is used as positional argument --- test/pytest/test_utils.py | 4 +++- ts/launcher.py | 19 ++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/test/pytest/test_utils.py b/test/pytest/test_utils.py index 035d67aa7c..3429f247a2 100644 --- a/test/pytest/test_utils.py +++ b/test/pytest/test_utils.py @@ -39,7 +39,9 @@ def 
delete_model_store(model_store=None): def start_torchserve(*args, **kwargs): create_mar_file_table() - kwargs.update({"model_store": kwargs.get("model_store", MODEL_STORE)}) + # In case someone uses model_store as positional argument + if len(args) == 0: + kwargs.update({"model_store": kwargs.get("model_store", MODEL_STORE)}) if kwargs.get("gen_mar", True): mg.gen_mar(kwargs.get("model_store")) if "gen_mar" in kwargs: diff --git a/ts/launcher.py b/ts/launcher.py index db2a9d2093..ea9479db1d 100644 --- a/ts/launcher.py +++ b/ts/launcher.py @@ -1,7 +1,7 @@ import subprocess import threading from io import TextIOWrapper -from queue import Full, Queue +from queue import Empty, Full, Queue from subprocess import PIPE, STDOUT, Popen import requests @@ -32,6 +32,23 @@ def run(self): except Full: pass + # If queues are full, clear them out and send None + # This is probably not necessary as the runner consuming the queue will have presumably died + # But we want to avoid a confusing additional exception if there is any error + try: + if self.queue1.full(): + while not self.queue1.empty(): + self.queue1.queue.get(False) + except Empty: + pass + + try: + if self.queue2.full(): + while not self.queue2.empty(): + self.queue2.queue.get(False) + except Empty: + pass + self.queue1.put_nowait(None) self.queue2.put_nowait(None) From 57de73ee41acbeed60d89c9afd343f8cdbc13049 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Thu, 27 Jun 2024 16:36:30 +0000 Subject: [PATCH 30/30] Remove .queue --- ts/launcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ts/launcher.py b/ts/launcher.py index ea9479db1d..349332dcd9 100644 --- a/ts/launcher.py +++ b/ts/launcher.py @@ -38,14 +38,14 @@ def run(self): try: if self.queue1.full(): while not self.queue1.empty(): - self.queue1.queue.get(False) + self.queue1.get(False) except Empty: pass try: if self.queue2.full(): while not self.queue2.empty(): - self.queue2.queue.get(False) + self.queue2.get(False) except Empty: pass
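
For orientation, a minimal sketch of driving the `ts.launcher` API introduced in this series directly. The model name and archive path below are placeholders, and a pre-built `.mar` is assumed to already sit in the model store; the flags mirror the defaults the patched `test_utils.start_torchserve` wrapper applies (token auth disabled, model API enabled).

```python
# Sketch only: exercises start/stop/register_model from ts/launcher.py as added above.
# "my_model" / "my_model.mar" are placeholders; the archive is assumed to already
# exist inside model_store (e.g. built with torch-model-archiver).
from ts.launcher import register_model, start, stop

log_queue = start(
    model_store="model_store",
    disable_token=True,        # skip token auth so the curl/requests calls need no key
    model_api_enabled=True,    # allow model registration through the management API
)  # returns a queue of server log lines (Tee.queue2)
try:
    response = register_model("my_model", "my_model.mar")  # POST to the management API on :8081
    assert response.status_code == 200
finally:
    stop()  # runs "torchserve --stop --foreground"
```

This is the same flow that `ts.llm_launcher` automates end to end; per the docs added in this series it can also be invoked as `python -m ts.llm_launcher --disable_token --model_id <model>`.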