From 67c5914f31a2de0eb8c369e9dc3f5627cf7be383 Mon Sep 17 00:00:00 2001
From: Aghilan Nathan
Date: Wed, 20 Aug 2025 12:10:41 -0700
Subject: [PATCH 1/4] fix(truss): explicitly pass in the chat_template flag and file

---
 truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py b/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
index 2e6093977..c1477f624 100644
--- a/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
+++ b/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
@@ -19,6 +19,7 @@
     + " --port 8000"
     + " --tensor-parallel-size {{ specify_tensor_parallelism }}"
     + " --dtype bfloat16"
+    + " --chat-template {{ model_path }}/chat_template.jinja"
     + '"'
 )

From a4527e21f37789bf557625db14603270ea7958eb Mon Sep 17 00:00:00 2001
From: Aghilan Nathan
Date: Thu, 21 Aug 2025 13:08:38 -0700
Subject: [PATCH 2/4] fix(truss): make deployments more robust

---
 .../deploy_full_checkpoints.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py b/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
index c1477f624..6759a4d99 100644
--- a/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
+++ b/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
@@ -14,13 +14,18 @@
     setup_environment_variables_and_secrets,
 )
 
+# NB(aghilan): Transformers recently changed to saving the chat template as a standalone chat_template.jinja file instead of embedding it in tokenizer_config.json.
+# Older models will not have this file, so we check for it and use it only if it exists.
+# vLLM will not automatically resolve chat_template.jinja, so we need to pass it explicitly to the start command.
 VLLM_FULL_START_COMMAND = Template(
-    'sh -c "{%if envvars %}{{ envvars }} {% endif %}vllm serve {{ model_path }}'
-    + " --port 8000"
-    + " --tensor-parallel-size {{ specify_tensor_parallelism }}"
-    + " --dtype bfloat16"
-    + " --chat-template {{ model_path }}/chat_template.jinja"
-    + '"'
+    "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
+    'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
+    "if [ -f {{ model_path }}/chat_template.jinja ]; then "
+    "vllm serve {{ model_path }} --chat-template {{ model_path }}/chat_template.jinja "
+    "--port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
+    "else "
+    "vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
+    "fi'"
 )

From df83df0b0d6a7e85ff3b52a3e1cec1c770935b3e Mon Sep 17 00:00:00 2001
From: Aghilan Nathan
Date: Thu, 21 Aug 2025 14:28:09 -0700
Subject: [PATCH 3/4] fix(truss): add unit test for vllm serve command

---
 truss/tests/cli/train/test_deploy_checkpoints.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/truss/tests/cli/train/test_deploy_checkpoints.py b/truss/tests/cli/train/test_deploy_checkpoints.py
index e049fd636..336b112e6 100644
--- a/truss/tests/cli/train/test_deploy_checkpoints.py
+++ b/truss/tests/cli/train/test_deploy_checkpoints.py
@@ -505,8 +505,16 @@ def test_render_vllm_full_truss_config():
     )
 
     result = render_vllm_full_truss_config(deploy_config)
-
-    expected_vllm_command = 'sh -c "HF_TOKEN=$(cat /secrets/hf_token) vllm serve /tmp/training_checkpoints/job123/rank-0/checkpoint-1 --port 8000 --tensor-parallel-size 2 --dtype bfloat16"'
+    expected_vllm_command = (
+        "sh -c 'HF_TOKEN=$(cat /secrets/hf_token) "
+        'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
+        "if [ -f /tmp/training_checkpoints/job123/rank-0/checkpoint-1/chat_template.jinja ]; then "
+        "vllm serve /tmp/training_checkpoints/job123/rank-0/checkpoint-1 "
+        "--chat-template /tmp/training_checkpoints/job123/rank-0/checkpoint-1/chat_template.jinja "
+        "--port 8000 --tensor-parallel-size 2 --dtype bfloat16; else "
+        "vllm serve /tmp/training_checkpoints/job123/rank-0/checkpoint-1 "
+        "--port 8000 --tensor-parallel-size 2 --dtype bfloat16; fi'"
+    )
 
     assert isinstance(result, truss_config.TrussConfig)
     assert result.model_name == "test-full-model"

From 562433b83f0d7eb332fa76499c9742f98d30a589 Mon Sep 17 00:00:00 2001
From: Aghilan Nathan
Date: Thu, 21 Aug 2025 16:20:00 -0700
Subject: [PATCH 4/4] fix(truss): add comment

---
 truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py b/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
index 6759a4d99..9b19866b5 100644
--- a/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
+++ b/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
@@ -17,6 +17,7 @@
 # NB(aghilan): Transformers recently changed to saving the chat template as a standalone chat_template.jinja file instead of embedding it in tokenizer_config.json.
 # Older models will not have this file, so we check for it and use it only if it exists.
 # vLLM will not automatically resolve chat_template.jinja, so we need to pass it explicitly to the start command.
+# This logic is needed for any model trained with Transformers v4.51.3 or later.
 VLLM_FULL_START_COMMAND = Template(
     "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
     'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
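
---

Reviewer note (not part of the patch series): a minimal sketch of how the
updated template behaves, assuming Template here is jinja2.Template, as the
{% if %} / {{ }} syntax suggests. Rendering with the same inputs as the unit
test in PATCH 3/4 should reproduce its expected_vllm_command string; note that
Jinja leaves the "$$" escape untouched, so it is presumably unescaped to "$"
further down the deploy pipeline.

    # Standalone sketch; assumes jinja2 semantics and mirrors the unit test inputs.
    from jinja2 import Template

    VLLM_FULL_START_COMMAND = Template(
        "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
        'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
        "if [ -f {{ model_path }}/chat_template.jinja ]; then "
        "vllm serve {{ model_path }} --chat-template {{ model_path }}/chat_template.jinja "
        "--port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
        "else "
        "vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
        "fi'"
    )

    # Render with the same values the unit test uses; the printed command should
    # match expected_vllm_command from test_render_vllm_full_truss_config.
    rendered = VLLM_FULL_START_COMMAND.render(
        envvars="HF_TOKEN=$(cat /secrets/hf_token)",
        model_path="/tmp/training_checkpoints/job123/rank-0/checkpoint-1",
        specify_tensor_parallelism=2,
    )
    print(rendered)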