diff --git a/Tiltfile b/Tiltfile
index aa94500..6881fa8 100644
--- a/Tiltfile
+++ b/Tiltfile
@@ -1,6 +1,6 @@
 # Toggles whether UI should be run locally using gradio hot-reloading
 # or should be included in the remote Helm install
-run_ui_locally = True
+run_ui_locally = os.getenv("AZIMUTH_LLM_TILT_LOCAL_UI", True)
 
 # Allow non-local contexts
 allow_k8s_contexts(k8s_context())
diff --git a/chart/azimuth-ui.schema.yaml b/chart/azimuth-ui.schema.yaml
index 929e7ef..de283f1 100644
--- a/chart/azimuth-ui.schema.yaml
+++ b/chart/azimuth-ui.schema.yaml
@@ -10,10 +10,25 @@ controls:
     type: MirrorControl
     path: /huggingface/model
     visuallyHidden: true
+  # Azimuth UI doesn't handle json type ["integer","null"]
+  # properly so we allow any type in JSON schema then
+  # constrain to (optional) integer here.
+  /api/modelMaxContextLength:
+    type: IntegerControl
+    minimum: 100
+    step: 100
+    required: false
 
 sortOrder:
   - /huggingface/model
   - /huggingface/token
   - /ui/appSettings/hf_model_instruction
   - /ui/appSettings/page_title
+  - /api/image/version
   - /ui/appSettings/llm_temperature
+  - /ui/appSettings/llm_max_tokens
+  - /ui/appSettings/llm_frequency_penalty
+  - /ui/appSettings/llm_presence_penalty
+  - /ui/appSettings/llm_top_p
+  - /ui/appSettings/llm_top_k
+  - /api/modelMaxContextLength
diff --git a/chart/templates/api/deployment.yml b/chart/templates/api/deployment.yml
index 77d2c2c..6a3e9a7 100644
--- a/chart/templates/api/deployment.yml
+++ b/chart/templates/api/deployment.yml
@@ -29,6 +29,10 @@ spec:
           - --model
           - {{ .Values.huggingface.model }}
           {{- include "azimuth-llm.chatTemplate" . | nindent 10 }}
+          {{- if .Values.api.modelMaxContextLength }}
+          - --max-model-len
+          - {{ .Values.api.modelMaxContextLength | quote }}
+          {{- end -}}
           {{- if .Values.api.extraArgs -}}
           {{- .Values.api.extraArgs | toYaml | nindent 10 }}
           {{- end -}}
diff --git a/chart/values.schema.json b/chart/values.schema.json
index 309a9fa..8b56376 100644
--- a/chart/values.schema.json
+++ b/chart/values.schema.json
@@ -92,6 +92,26 @@
                     "required": ["hf_model_name", "hf_model_instruction"]
                 }
             }
+        },
+        "api": {
+            "type": "object",
+            "properties": {
+                "modelMaxContextLength": {
+                    "title": "Model Context Length",
+                    "description": "An override for the maximum context length to allow, if the model's default is not suitable."
+                },
+                "image": {
+                    "type": "object",
+                    "properties": {
+                        "version": {
+                            "type": "string",
+                            "title": "Backend vLLM version",
+                            "description": "The vLLM version to use as a backend. Must be a version tag from [this list](https://github.com/vllm-project/vllm/tags)",
+                            "default": "v0.4.3"
+                        }
+                    }
+                }
+            }
         }
     }
 }
diff --git a/chart/values.yaml b/chart/values.yaml
index a6d0fd2..a26685d 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -51,11 +51,13 @@ api:
     iconUrl: https://raw.githubusercontent.com/vllm-project/vllm/v0.2.7/docs/source/assets/logos/vllm-logo-only-light.png
     description: |
       The raw inference API endpoints for the deployed LLM.
+
   # Config for huggingface model cache volume
   # This is mounted at /root/.cache/huggingface in the api deployment
   cacheVolume:
     hostPath:
       path: /tmp/llm/huggingface-cache
+
   # Number of gpus to requests for each api pod instance
   # NOTE: This must be in the range 1 <= value <= N, where
   # 'N' is the number of GPUs available in a single
@@ -71,8 +73,13 @@ api:
   # to preform a rolling zero-downtime update
   updateStrategy:
     type: Recreate
+
+  # The value of the vLLM backend's max_model_len argument (if the model's default is not suitable)
+  # https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html#command-line-arguments-for-the-server
+  modelMaxContextLength:
+
   # Extra args to supply to the vLLM backend, see
-  # https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py
+  # https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html#command-line-arguments-for-the-server
   extraArgs: []
 
 # Configuration for the frontend web interface
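
Taken together, the chart changes above surface two new `api` values: `modelMaxContextLength` (forwarded to vLLM as `--max-model-len`) and `image.version` (the vLLM backend tag). A rough sketch of a values override exercising them; the file name and model ID are illustrative only and not part of this change:

```yaml
# values-override.yml -- hypothetical example, not included in this change
huggingface:
  # Any Hugging Face model ID; this one is purely illustrative
  model: mistralai/Mistral-7B-Instruct-v0.2
api:
  # Rendered into the vLLM args as: --max-model-len "8192"
  # Leave unset to keep the model's default context length
  modelMaxContextLength: 8192
  image:
    # Must be a version tag from https://github.com/vllm-project/vllm/tags
    version: v0.4.3
```

This could then be applied with something like `helm upgrade --install azimuth-llm ./chart -f values-override.yml`.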
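With the template change in `chart/templates/api/deployment.yml`, the vLLM container args rendered when `api.modelMaxContextLength` is set would look roughly like the following sketch (assuming the override above; the `azimuth-llm.chatTemplate` include and any `extraArgs` may contribute further entries):

```yaml
# Sketch of the rendered vLLM server args, not actual chart output
args:
  - --model
  - mistralai/Mistral-7B-Instruct-v0.2
  # ...entries emitted by the azimuth-llm.chatTemplate include, if any...
  - --max-model-len
  - "8192"
```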