From 67c5914f31a2de0eb8c369e9dc3f5627cf7be383 Mon Sep 17 00:00:00 2001
From: Aghilan Nathan
Date: Wed, 20 Aug 2025 12:10:41 -0700
Subject: [PATCH 1/4] fix(truss): explicitly pass in the chat_template flag and file

---
 truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py b/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
index 2e6093977..c1477f624 100644
--- a/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
+++ b/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
@@ -19,6 +19,7 @@
     + " --port 8000"
     + " --tensor-parallel-size {{ specify_tensor_parallelism }}"
     + " --dtype bfloat16"
+    + " --chat-template {{ model_path }}/chat_template.jinja"
     + '"'
 )

From a4527e21f37789bf557625db14603270ea7958eb Mon Sep 17 00:00:00 2001
From: Aghilan Nathan
Date: Thu, 21 Aug 2025 13:08:38 -0700
Subject: [PATCH 2/4] fix(truss): make deployments more robust

---
 .../deploy_full_checkpoints.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py b/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
index c1477f624..6759a4d99 100644
--- a/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
+++ b/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
@@ -14,13 +14,18 @@
     setup_environment_variables_and_secrets,
 )
 
+# NB(aghilan): Transformers recently changed to saving the chat template as a standalone chat_template.jinja file instead of embedding it in tokenizer_config.json.
+# Older models will not have this file, so we check for it and use it only if it exists.
+# vLLM will not automatically resolve chat_template.jinja, so we need to pass it explicitly to the start command.
 VLLM_FULL_START_COMMAND = Template(
-    'sh -c "{%if envvars %}{{ envvars }} {% endif %}vllm serve {{ model_path }}'
-    + " --port 8000"
-    + " --tensor-parallel-size {{ specify_tensor_parallelism }}"
-    + " --dtype bfloat16"
-    + " --chat-template {{ model_path }}/chat_template.jinja"
-    + '"'
+    "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
+    'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
+    "if [ -f {{ model_path }}/chat_template.jinja ]; then "
+    "vllm serve {{ model_path }} --chat-template {{ model_path }}/chat_template.jinja "
+    "--port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
+    "else "
+    "vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
+    "fi'"
 )

From df83df0b0d6a7e85ff3b52a3e1cec1c770935b3e Mon Sep 17 00:00:00 2001
From: Aghilan Nathan
Date: Thu, 21 Aug 2025 14:28:09 -0700
Subject: [PATCH 3/4] fix(truss): add unit test for vllm serve command

---
 truss/tests/cli/train/test_deploy_checkpoints.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/truss/tests/cli/train/test_deploy_checkpoints.py b/truss/tests/cli/train/test_deploy_checkpoints.py
index e049fd636..336b112e6 100644
--- a/truss/tests/cli/train/test_deploy_checkpoints.py
+++ b/truss/tests/cli/train/test_deploy_checkpoints.py
@@ -505,8 +505,16 @@ def test_render_vllm_full_truss_config():
     )
 
     result = render_vllm_full_truss_config(deploy_config)
-
-    expected_vllm_command = 'sh -c "HF_TOKEN=$(cat /secrets/hf_token) vllm serve /tmp/training_checkpoints/job123/rank-0/checkpoint-1 --port 8000 --tensor-parallel-size 2 --dtype bfloat16"'
+    expected_vllm_command = (
+        "sh -c 'HF_TOKEN=$(cat /secrets/hf_token) "
+        'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
+        "if [ -f /tmp/training_checkpoints/job123/rank-0/checkpoint-1/chat_template.jinja ]; then "
+        "vllm serve /tmp/training_checkpoints/job123/rank-0/checkpoint-1 "
+        "--chat-template /tmp/training_checkpoints/job123/rank-0/checkpoint-1/chat_template.jinja "
+        "--port 8000 --tensor-parallel-size 2 --dtype bfloat16; else "
+        "vllm serve /tmp/training_checkpoints/job123/rank-0/checkpoint-1 "
+        "--port 8000 --tensor-parallel-size 2 --dtype bfloat16; fi'"
+    )
 
     assert isinstance(result, truss_config.TrussConfig)
     assert result.model_name == "test-full-model"

From 562433b83f0d7eb332fa76499c9742f98d30a589 Mon Sep 17 00:00:00 2001
From: Aghilan Nathan
Date: Thu, 21 Aug 2025 16:20:00 -0700
Subject: [PATCH 4/4] fix(truss): add comment

---
 truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py b/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
index 6759a4d99..9b19866b5 100644
--- a/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
+++ b/truss/cli/train/deploy_checkpoints/deploy_full_checkpoints.py
@@ -17,6 +17,7 @@
 # NB(aghilan): Transformers recently changed to saving the chat template as a standalone chat_template.jinja file instead of embedding it in tokenizer_config.json.
 # Older models will not have this file, so we check for it and use it only if it exists.
 # vLLM will not automatically resolve chat_template.jinja, so we need to pass it explicitly to the start command.
+# This logic is needed for any model trained with Transformers v4.51.3 or later.
 VLLM_FULL_START_COMMAND = Template(
     "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
     'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
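
---

Reviewer note (not part of the patch series): a minimal sketch of how the
updated template behaves, assuming Template here is jinja2.Template, as the
{% if %} / {{ }} syntax suggests. Rendering with the same inputs as the unit
test in PATCH 3/4 should reproduce its expected_vllm_command string; note that
Jinja leaves the "$$" escape untouched, so it is presumably unescaped to "$"
further down the deploy pipeline.

    # Standalone sketch; assumes jinja2 semantics and mirrors the unit test inputs.
    from jinja2 import Template

    VLLM_FULL_START_COMMAND = Template(
        "sh -c '{% if envvars %}{{ envvars }} {% endif %}"
        'HF_TOKEN="$$(cat /secrets/hf_access_token)" && export HF_TOKEN && '
        "if [ -f {{ model_path }}/chat_template.jinja ]; then "
        "vllm serve {{ model_path }} --chat-template {{ model_path }}/chat_template.jinja "
        "--port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
        "else "
        "vllm serve {{ model_path }} --port 8000 --tensor-parallel-size {{ specify_tensor_parallelism }} --dtype bfloat16; "
        "fi'"
    )

    # Render with the same values the unit test uses; the printed command should
    # match expected_vllm_command from test_render_vllm_full_truss_config.
    rendered = VLLM_FULL_START_COMMAND.render(
        envvars="HF_TOKEN=$(cat /secrets/hf_token)",
        model_path="/tmp/training_checkpoints/job123/rank-0/checkpoint-1",
        specify_tensor_parallelism=2,
    )
    print(rendered)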