diff --git a/Tiltfile b/Tiltfile
index aa94500..6881fa8 100644
--- a/Tiltfile
+++ b/Tiltfile
@@ -1,6 +1,6 @@
 # Toggles whether UI should be run locally using gradio hot-reloading
 # or should be included in the remote Helm install
-run_ui_locally = True
+run_ui_locally = os.getenv("AZIMUTH_LLM_TILT_LOCAL_UI", True)
 
 # Allow non-local contexts
 allow_k8s_contexts(k8s_context())
diff --git a/chart/azimuth-ui.schema.yaml b/chart/azimuth-ui.schema.yaml
index 929e7ef..de283f1 100644
--- a/chart/azimuth-ui.schema.yaml
+++ b/chart/azimuth-ui.schema.yaml
@@ -10,10 +10,25 @@ controls:
     type: MirrorControl
     path: /huggingface/model
     visuallyHidden: true
+  # Azimuth UI doesn't handle json type ["integer","null"]
+  # properly so we allow any type in JSON schema then
+  # constrain to (optional) integer here.
+  /api/modelMaxContextLength:
+    type: IntegerControl
+    minimum: 100
+    step: 100
+    required: false
 
 sortOrder:
   - /huggingface/model
   - /huggingface/token
   - /ui/appSettings/hf_model_instruction
   - /ui/appSettings/page_title
+  - /api/image/version
   - /ui/appSettings/llm_temperature
+  - /ui/appSettings/llm_max_tokens
+  - /ui/appSettings/llm_frequency_penalty
+  - /ui/appSettings/llm_presence_penalty
+  - /ui/appSettings/llm_top_p
+  - /ui/appSettings/llm_top_k
+  - /api/modelMaxContextLength
diff --git a/chart/templates/api/deployment.yml b/chart/templates/api/deployment.yml
index 77d2c2c..6a3e9a7 100644
--- a/chart/templates/api/deployment.yml
+++ b/chart/templates/api/deployment.yml
@@ -29,6 +29,10 @@ spec:
           - --model
           - {{ .Values.huggingface.model }}
           {{- include "azimuth-llm.chatTemplate" . | nindent 10 }}
+          {{- if .Values.api.modelMaxContextLength }}
+          - --max-model-len
+          - {{ .Values.api.modelMaxContextLength | quote }}
+          {{- end -}}
           {{- if .Values.api.extraArgs -}}
           {{- .Values.api.extraArgs | toYaml | nindent 10 }}
           {{- end -}}
diff --git a/chart/values.schema.json b/chart/values.schema.json
index 309a9fa..8b56376 100644
--- a/chart/values.schema.json
+++ b/chart/values.schema.json
@@ -92,6 +92,26 @@
                     "required": ["hf_model_name", "hf_model_instruction"]
                 }
             }
+        },
+        "api": {
+            "type": "object",
+            "properties": {
+                "modelMaxContextLength": {
+                    "title": "Model Context Length",
+                    "description": "An override for the maximum context length to allow, if the model's default is not suitable."
+                },
+                "image": {
+                    "type": "object",
+                    "properties": {
+                        "version": {
+                            "type": "string",
+                            "title": "Backend vLLM version",
+                            "description": "The vLLM version to use as a backend. Must be a version tag from [this list](https://github.com/vllm-project/vllm/tags)",
+                            "default": "v0.4.3"
+                        }
+                    }
+                }
+            }
         }
     }
 }
diff --git a/chart/values.yaml b/chart/values.yaml
index a6d0fd2..a26685d 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -51,11 +51,13 @@ api:
     iconUrl: https://raw.githubusercontent.com/vllm-project/vllm/v0.2.7/docs/source/assets/logos/vllm-logo-only-light.png
     description: |
       The raw inference API endpoints for the deployed LLM.
+
   # Config for huggingface model cache volume
   # This is mounted at /root/.cache/huggingface in the api deployment
   cacheVolume:
     hostPath:
       path: /tmp/llm/huggingface-cache
+
   # Number of gpus to requests for each api pod instance
   # NOTE: This must be in the range 1 <= value <= N, where
   # 'N' is the number of GPUs available in a single
@@ -71,8 +73,13 @@ api:
   # to preform a rolling zero-downtime update
   updateStrategy:
     type: Recreate
+
+  # The value of the vLLM backend's max_model_len argument (if the model's default is not suitable)
+  # https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html#command-line-arguments-for-the-server
+  modelMaxContextLength:
+
   # Extra args to supply to the vLLM backend, see
-  # https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py
+  # https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html#command-line-arguments-for-the-server
   extraArgs: []
 
 # Configuration for the frontend web interface
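
Taken together, the chart changes above surface two new `api` values: `modelMaxContextLength` (forwarded to vLLM as `--max-model-len`) and `image.version` (the vLLM backend tag). A rough sketch of a values override exercising them; the file name and model ID are illustrative only and not part of this change:

```yaml
# values-override.yml -- hypothetical example, not included in this change
huggingface:
  # Any Hugging Face model ID; this one is purely illustrative
  model: mistralai/Mistral-7B-Instruct-v0.2
api:
  # Rendered into the vLLM args as: --max-model-len "8192"
  # Leave unset to keep the model's default context length
  modelMaxContextLength: 8192
  image:
    # Must be a version tag from https://github.com/vllm-project/vllm/tags
    version: v0.4.3
```

This could then be applied with something like `helm upgrade --install azimuth-llm ./chart -f values-override.yml`.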
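With the template change in `chart/templates/api/deployment.yml`, the vLLM container args rendered when `api.modelMaxContextLength` is set would look roughly like the following sketch (assuming the override above; the `azimuth-llm.chatTemplate` include and any `extraArgs` may contribute further entries):

```yaml
# Sketch of the rendered vLLM server args, not actual chart output
args:
  - --model
  - mistralai/Mistral-7B-Instruct-v0.2
  # ...entries emitted by the azimuth-llm.chatTemplate include, if any...
  - --max-model-len
  - "8192"
```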