Additional customisation options for Azimuth app
Adds the ability to customise the vLLM version and the model context length via the Azimuth UI form.
sd109 committed Jul 12, 2024
1 parent bb254a0 commit 5a9e883
Showing 4 changed files with 38 additions and 1 deletion.
chart/azimuth-ui.schema.yaml (7 additions, 0 deletions)
@@ -16,4 +16,11 @@ sortOrder:
- /huggingface/token
- /ui/appSettings/hf_model_instruction
- /ui/appSettings/page_title
- /api/image/version
- /ui/appSettings/llm_temperature
- /ui/appSettings/llm_max_tokens
- /ui/appSettings/llm_frequency_penalty
- /ui/appSettings/llm_presence_penalty
- /ui/appSettings/llm_top_p
- /ui/appSettings/llm_top_k
- /api/modelMaxContextLength
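Each sortOrder entry appears to be a JSON-pointer-style path into the chart's values, so the new entries control where the corresponding settings appear in the generated Azimuth form. As a sketch of the assumed mapping, the /ui/appSettings/* paths point at values keys nested like this (the numbers are placeholders, not defaults), while /api/modelMaxContextLength points at the new api.modelMaxContextLength value introduced further down:

ui:
  appSettings:
    llm_temperature: 0.7   # referenced by /ui/appSettings/llm_temperature
    llm_top_k: 40          # referenced by /ui/appSettings/llm_top_k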
chart/templates/api/deployment.yml (4 additions, 0 deletions)
@@ -29,6 +29,10 @@ spec:
- --model
- {{ .Values.huggingface.model }}
{{- include "azimuth-llm.chatTemplate" . | nindent 10 }}
{{- if .Values.api.modelMaxContextLength -}}
- --max-model-len
- {{ .Values.api.modelMaxContextLength }}
{{- end -}}
{{- if .Values.api.extraArgs -}}
{{- .Values.api.extraArgs | toYaml | nindent 10 }}
{{- end -}}
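As an illustration of the new conditional, if api.modelMaxContextLength were set to, say, 4096, the rendered container args would pick up the extra flag roughly as below (the model name is a placeholder and the chat-template arguments are omitted):

args:
  - --model
  - mistralai/Mistral-7B-Instruct-v0.2   # placeholder model
  - --max-model-len
  - "4096"   # quoted here for readability; the template interpolates the raw value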
chart/values.schema.json (19 additions, 0 deletions)
@@ -92,6 +92,25 @@
"required": ["hf_model_name", "hf_model_instruction"]
}
}
},
"api": {
"properties": {
"modelMaxContextLength": {
"type": "integer",
"title": "Model Context Length",
"description": "An override for the maximum context length to use if the model's default is not suitable."
},
"image": {
"properties": {
"version": {
"type": "string",
"title": "vLLM version override",
"description": "The vLLM version to use as a backend. Must be a version tag from [this list](https://github.com/vllm-project/vllm/tags)",
"default": "v0.4.3"
}
}
}
}
}
}
}
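A minimal values fragment satisfying the two new schema entries might look like the following (the tag and the context length are illustrative, not recommendations):

api:
  image:
    version: v0.4.3            # must be a published vLLM tag
  modelMaxContextLength: 8192  # only set when the model's default is not suitable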
chart/values.yaml (8 additions, 1 deletion)
@@ -51,11 +51,13 @@ api:
iconUrl: https://raw.githubusercontent.com/vllm-project/vllm/v0.2.7/docs/source/assets/logos/vllm-logo-only-light.png
description: |
The raw inference API endpoints for the deployed LLM.
# Config for huggingface model cache volume
# This is mounted at /root/.cache/huggingface in the api deployment
cacheVolume:
hostPath:
path: /tmp/llm/huggingface-cache

# Number of GPUs to request for each api pod instance
# NOTE: This must be in the range 1 <= value <= N, where
# 'N' is the number of GPUs available in a single
@@ -71,8 +73,13 @@
# to perform a rolling zero-downtime update
updateStrategy:
type: Recreate

# The value of the vLLM backend's max_model_len argument (if the model's default is not suitable)
# https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html#command-line-arguments-for-the-server
modelMaxContextLength:

# Extra args to supply to the vLLM backend, see
# https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py
# https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html#command-line-arguments-for-the-server
extraArgs: []

# Configuration for the frontend web interface
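For completeness, the dedicated setting is roughly interchangeable with passing the same flag through the pre-existing extraArgs passthrough shown above; two sketches of equivalent overrides (4096 is an illustrative figure):

# Using the new dedicated value
api:
  modelMaxContextLength: 4096
---
# Roughly equivalent via the generic extraArgs passthrough
api:
  extraArgs:
    - --max-model-len
    - "4096"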
