Commit a51e074
SLO Aware Routing Sidecar + Plugin EPP Integration and Helm Deployment (#1839)

* Add latency predictor plugins, deployment, and runner.go integration
* Update Dockerfile; fix issues with SLO context not being set when prediction is off
* Remove outdated inferencepool-resources deployment
* Fix streamed request handler being called one final time after the request completes; add a predictor check to the beginning of each requestcontrol hook
* Add guide, update Helm charts and README, minor scorer changes
* Make small guide update
* Add Helm values and polish README and SLO routing guide
* Clean up errors from rebase, add running-requests metric to datasource, add predictor to the new two-phase configuration parser
* Fix EPP image and add placeholder Docker repos for latency sidecars
* Update guide, README, and values.yaml
* Move predictor setup logic into the plugin
* Move predictor startup logic completely out of the manager and into the plugin, running its routines there; move the predictor Helm section into a new tpl file; rename the slo-aware-routing guide and names in docs
* Remove max-score-picker from the list of plugin types in the Helm chart
* Fix formatting
* Revert go.mod to main
* Fix typo in config, remove deprecated runtime flag
* Rename latency prediction plugins, change docs accordingly, make sidecars not fail immediately during EPP spin-up
* Update docs with new total running requests metric
* Small plugin bugfix
1 parent ecf1139 commit a51e074

File tree

28 files changed: +818 −60 lines changed

Dockerfile

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ COPY internal ./internal
 COPY apix ./apix
 COPY api ./api
 COPY version ./version
+COPY sidecars ./sidecars
 WORKDIR /src/cmd/epp
 RUN go build -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/version.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/version.BuildRef=${BUILD_REF}" -o /epp

cmd/epp/runner/runner.go

Lines changed: 4 additions & 0 deletions
@@ -69,6 +69,7 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/scorer"
@@ -430,6 +431,9 @@ func (r *Runner) registerInTreePlugins() {
 	plugins.Register(scorer.KvCacheUtilizationScorerType, scorer.KvCacheUtilizationScorerFactory)
 	plugins.Register(scorer.QueueScorerType, scorer.QueueScorerFactory)
 	plugins.Register(scorer.LoraAffinityScorerType, scorer.LoraAffinityScorerFactory)
+	// Latency predictor plugins
+	plugins.Register(slo_aware_router.SLOAwareRouterPluginType, slo_aware_router.SLOAwareRouterFactory)
+	plugins.Register(profile.SLOAwareProfileHandlerType, profile.SLOAwareProfileHandlerFactory)
 	// register filter for test purpose only (used in conformance tests)
 	plugins.Register(testfilter.HeaderBasedTestingFilterType, testfilter.HeaderBasedTestingFilterFactory)
 	// register response received plugin for test purpose only (used in conformance tests)
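The registration calls in this hunk follow a simple name-to-factory registry pattern: a plugin type string maps to a factory that constructs the plugin. A minimal, self-contained sketch of that pattern (the `Plugin` interface and `Factory` signature here are simplified stand-ins, not the framework's actual types):

```go
package main

import "fmt"

// Plugin is a stand-in for the framework's plugin interface
// (assumption: the real interface carries scheduling hooks too).
type Plugin interface{ TypedName() string }

// Factory builds a plugin instance, mirroring the second argument
// of plugins.Register(type, factory) in runner.go.
type Factory func() Plugin

// registry maps a plugin type string to its factory.
var registry = map[string]Factory{}

func Register(pluginType string, f Factory) { registry[pluginType] = f }

type sloAwareRouter struct{}

func (s sloAwareRouter) TypedName() string { return "slo-aware-router" }

func main() {
	// Register, then look up by type string and construct — the same
	// flow the EPP runner uses when parsing its plugin configuration.
	Register("slo-aware-router", func() Plugin { return sloAwareRouter{} })
	p := registry["slo-aware-router"]()
	fmt.Println(p.TypedName()) // → slo-aware-router
}
```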

config/charts/inferencepool/README.md

Lines changed: 29 additions & 0 deletions
@@ -121,6 +121,35 @@
   oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
 ```
+
+### Install with Latency-Based Routing
+
+For full details, see the dedicated [Latency-Based Routing Guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/latency-based-predictor.md).
+
+#### Latency-Based Router Configuration
+
+The behavior of the latency-based router can be fine-tuned using the configuration parameters under `inferenceExtension.latencyPredictor.sloAwareRouting` in your `values.yaml` file.
+
+| Parameter | Description | Default |
+| --------- | ----------- | ------- |
+| `samplingMean` | The sampling mean (lambda) for the Poisson distribution of token sampling. | `100.0` |
+| `maxSampledTokens` | The maximum number of tokens to sample for TPOT prediction. | `20` |
+| `sloBufferFactor` | A buffer applied to the SLO to make it more or less strict. | `1.0` |
+| `negHeadroomTTFTWeight` | The weight given to TTFT when a pod has negative headroom. | `0.8` |
+| `negHeadroomTPOTWeight` | The weight given to TPOT when a pod has negative headroom. | `0.2` |
+| `headroomTTFTWeight` | The weight given to TTFT when a pod has positive headroom. | `0.8` |
+| `headroomTPOTWeight` | The weight given to TPOT when a pod has positive headroom. | `0.2` |
+| `headroomSelectionStrategy` | The strategy for selecting a pod based on headroom. Options: `least`, `most`, `composite-least`, `composite-most`, `composite-only`. | `least` |
+| `compositeKVWeight` | The weight for KV cache in the composite score. | `1.0` |
+| `compositeQueueWeight` | The weight for queue size in the composite score. | `1.0` |
+| `compositePrefixWeight` | The weight for prefix cache in the composite score. | `1.0` |
+| `epsilonExploreSticky` | Exploration factor for sticky sessions. | `0.01` |
+| `epsilonExploreNeg` | Exploration factor for negative headroom. | `0.01` |
+| `affinityGateTau` | Affinity gate threshold. | `0.80` |
+| `affinityGateTauGlobal` | Global affinity gate threshold. | `0.99` |
+| `selectionMode` | The selection mode (e.g., `linear`). | `linear` |
+
+**Note:** Enabling SLO-aware routing also exposes a number of Prometheus metrics for monitoring the feature, including actual vs. predicted latency, SLO violations, and more.
+
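As a rough illustration of how the `compositeKVWeight`, `compositeQueueWeight`, and `compositePrefixWeight` parameters might interact (this is an assumption from the parameter names, not the plugin's actual scoring code — the real scorer's normalization may differ), a composite strategy presumably blends normalized per-pod signals like so:

```go
package main

import "fmt"

// compositeScore blends per-pod signals using the compositeKVWeight,
// compositeQueueWeight, and compositePrefixWeight chart parameters.
// Hypothetical sketch: lower KV utilization and queue load score
// higher, a higher prefix-cache hit ratio scores higher.
func compositeScore(kvUtil, queueLoad, prefixHit, wKV, wQueue, wPrefix float64) float64 {
	score := wKV*(1-kvUtil) + wQueue*(1-queueLoad) + wPrefix*prefixHit
	return score / (wKV + wQueue + wPrefix) // normalize to [0, 1]
}

func main() {
	// Pod A: busy but with a strong prefix-cache hit; Pod B: idle, cold cache.
	// With the default equal weights, B's idleness outweighs A's cache hit.
	fmt.Printf("podA=%.3f podB=%.3f\n",
		compositeScore(0.8, 0.5, 0.9, 1.0, 1.0, 1.0),
		compositeScore(0.1, 0.0, 0.0, 1.0, 1.0, 1.0))
	// → podA=0.533 podB=0.633
}
```

Raising `compositePrefixWeight` relative to the other two would flip this comparison in favor of the cache-warm pod.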
### Install with High Availability (HA)

To deploy the EndpointPicker in a high-availability (HA) active-passive configuration set replicas to be greater than one. In such a setup, only one "leader" replica will be active and ready to process traffic at any given time. If the leader pod fails, another pod will be elected as the new leader, ensuring service continuity.
Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
+{{/*
+Latency Predictor Env
+*/}}
+{{- define "gateway-api-inference-extension.latencyPredictor.env" -}}
+{{- if .Values.inferenceExtension.latencyPredictor.enabled }}
+- name: PREDICTION_SERVER_URL
+  value: "{{- $count := int .Values.inferenceExtension.latencyPredictor.predictionServers.count -}}
+  {{- $startPort := int .Values.inferenceExtension.latencyPredictor.predictionServers.startPort -}}
+  {{- range $i := until $count -}}
+  {{- if $i }},{{ end }}http://localhost:{{ add $startPort $i }}
+  {{- end }}"
+- name: TRAINING_SERVER_URL
+  value: "http://localhost:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }}"
+{{- range $key, $value := .Values.inferenceExtension.latencyPredictor.eppEnv }}
+- name: {{ $key }}
+  value: {{ $value | quote }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Latency Predictor Sidecar Containers
+*/}}
+{{- define "gateway-api-inference-extension.latencyPredictor.containers" -}}
+{{- if .Values.inferenceExtension.latencyPredictor.enabled }}
+# Training Server Sidecar Container
+- name: training-server
+  image: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.hub }}/{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.name }}:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.tag }}
+  imagePullPolicy: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.pullPolicy }}
+  ports:
+  - containerPort: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }}
+    name: training-port
+  livenessProbe:
+    {{- toYaml .Values.inferenceExtension.latencyPredictor.trainingServer.livenessProbe | nindent 4 }}
+  readinessProbe:
+    {{- toYaml .Values.inferenceExtension.latencyPredictor.trainingServer.readinessProbe | nindent 4 }}
+  resources:
+    {{- toYaml .Values.inferenceExtension.latencyPredictor.trainingServer.resources | nindent 4 }}
+  envFrom:
+  - configMapRef:
+      name: {{ include "gateway-api-inference-extension.name" . }}-latency-predictor-training
+  env:
+  - name: POD_NAME
+    valueFrom:
+      fieldRef:
+        fieldPath: metadata.name
+  - name: SERVER_TYPE
+    value: "training"
+  volumeMounts:
+  - name: training-server-storage
+    mountPath: /models
+{{- range $i := until (int .Values.inferenceExtension.latencyPredictor.predictionServers.count) }}
+# Prediction Server Sidecar Container {{ add $i 1 }}
+- name: prediction-server-{{ add $i 1 }}
+  image: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.hub }}/{{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.name }}:{{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.tag }}
+  imagePullPolicy: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.pullPolicy }}
+  command: ["uvicorn"]
+  args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "{{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}"]
+  ports:
+  - containerPort: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}
+    name: predict-port-{{ add $i 1 }}
+  livenessProbe:
+    httpGet:
+      path: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.livenessProbe.httpGet.path }}
+      port: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}
+    initialDelaySeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.livenessProbe.initialDelaySeconds }}
+    periodSeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.livenessProbe.periodSeconds }}
+  readinessProbe:
+    httpGet:
+      path: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.httpGet.path }}
+      port: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}
+    initialDelaySeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.initialDelaySeconds }}
+    periodSeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.periodSeconds }}
+    failureThreshold: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.failureThreshold }}
+  resources:
+    {{- toYaml $.Values.inferenceExtension.latencyPredictor.predictionServers.resources | nindent 4 }}
+  envFrom:
+  - configMapRef:
+      name: {{ include "gateway-api-inference-extension.name" $ }}-latency-predictor-prediction
+  env:
+  - name: PREDICT_PORT
+    value: "{{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}"
+  - name: POD_NAME
+    valueFrom:
+      fieldRef:
+        fieldPath: metadata.name
+  - name: SERVER_TYPE
+    value: "prediction-{{ add $i 1 }}"
+  - name: TRAINING_SERVER_URL
+    value: "http://localhost:{{ $.Values.inferenceExtension.latencyPredictor.trainingServer.port }}"
+  volumeMounts:
+  - name: prediction-server-{{ add $i 1 }}-storage
+    mountPath: /server_models
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Latency Predictor Volumes
+*/}}
+{{- define "gateway-api-inference-extension.latencyPredictor.volumes" -}}
+{{- if .Values.inferenceExtension.latencyPredictor.enabled }}
+- name: training-server-storage
+  emptyDir:
+    sizeLimit: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.volumeSize }}
+{{- range $i := until (int .Values.inferenceExtension.latencyPredictor.predictionServers.count) }}
+- name: prediction-server-{{ add $i 1 }}-storage
+  emptyDir:
+    sizeLimit: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.volumeSize }}
+{{- end }}
+{{- end }}
+{{- end }}
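The `PREDICTION_SERVER_URL` template in this new file renders a comma-separated list of localhost URLs, one per prediction-server sidecar. The Helm `range`/`until` loop is equivalent to this small Go sketch (`count` and `startPort` stand in for the chart's `predictionServers.count` and `predictionServers.startPort` values):

```go
package main

import (
	"fmt"
	"strings"
)

// predictionServerURLs mirrors the Helm loop that builds
// PREDICTION_SERVER_URL: one http://localhost:<port> entry per
// prediction-server sidecar, joined by commas.
func predictionServerURLs(count, startPort int) string {
	urls := make([]string, 0, count)
	for i := 0; i < count; i++ {
		urls = append(urls, fmt.Sprintf("http://localhost:%d", startPort+i))
	}
	return strings.Join(urls, ",")
}

func main() {
	// Chart defaults are count=10, startPort=8001; use 3 here for brevity.
	fmt.Println(predictionServerURLs(3, 8001))
	// → http://localhost:8001,http://localhost:8002,http://localhost:8003
}
```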

config/charts/inferencepool/templates/epp-config.yaml

Lines changed: 61 additions & 0 deletions
@@ -11,7 +11,45 @@ data:
     - type: queue-scorer
     - type: kv-cache-utilization-scorer
     - type: prefix-cache-scorer
+{{- if .Values.inferenceExtension.latencyPredictor.enabled }}
+    - type: predicted-latency-scorer
+      parameters:
+{{- with .Values.inferenceExtension.latencyPredictor.sloAwareRouting | default dict }}
+        samplingMean: {{ .samplingMean | default 100.0 }}
+        maxSampledTokens: {{ .maxSampledTokens | default 20 }}
+        sloBufferFactor: {{ .sloBufferFactor | default 1.0 }}
+        negHeadroomTTFTWeight: {{ .negHeadroomTTFTWeight | default 0.8 }}
+        negHeadroomTPOTWeight: {{ .negHeadroomTPOTWeight | default 0.2 }}
+        headroomTTFTWeight: {{ .headroomTTFTWeight | default 0.8 }}
+        headroomTPOTWeight: {{ .headroomTPOTWeight | default 0.2 }}
+        headroomSelectionStrategy: {{ .headroomSelectionStrategy | default "least" | quote }}
+        compositeKVWeight: {{ .compositeKVWeight | default 1.0 }}
+        compositeQueueWeight: {{ .compositeQueueWeight | default 1.0 }}
+        compositePrefixWeight: {{ .compositePrefixWeight | default 1.0 }}
+        epsilonExploreSticky: {{ .epsilonExploreSticky | default 0.01 }}
+        epsilonExploreNeg: {{ .epsilonExploreNeg | default 0.01 }}
+        affinityGateTau: {{ .affinityGateTau | default 0.80 }}
+        affinityGateTauGlobal: {{ .affinityGateTauGlobal | default 0.99 }}
+        selectionMode: {{ .selectionMode | default "linear" | quote }}
+{{- end }}
+    - type: predicted-latency-profile-handler
+{{- end }}
     schedulingProfiles:
+{{- if .Values.inferenceExtension.latencyPredictor.enabled }}
+    - name: predicted-latency-prefix
+      plugins:
+      - pluginRef: prefix-cache-scorer
+    - name: predicted-latency-no-routing
+      plugins:
+      - pluginRef: prefix-cache-scorer
+      - pluginRef: predicted-latency-scorer
+        weight: 0
+      - pluginRef: queue-scorer
+      - pluginRef: kv-cache-utilization-scorer
+    - name: predicted-latency-routing
+      plugins:
+      - pluginRef: predicted-latency-scorer
+{{- else }}
     - name: default
       plugins:
       - pluginRef: queue-scorer
@@ -20,6 +58,7 @@ data:
         weight: 2
       - pluginRef: prefix-cache-scorer
         weight: 3
+{{- end }}
 {{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }}
 {{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }}
 {{- end }}
@@ -34,3 +73,25 @@ metadata:
 data:
 {{- .Values.inferenceExtension.sidecar.configMap.data | toYaml | nindent 2 }}
 {{- end }}
+---
+{{- if .Values.inferenceExtension.latencyPredictor.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "gateway-api-inference-extension.name" . }}-latency-predictor-training
+  namespace: {{ .Release.Namespace }}
+data:
+{{- range $key, $value := .Values.inferenceExtension.latencyPredictor.trainingServer.config }}
+  {{ $key }}: {{ $value | quote }}
+{{- end }}
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "gateway-api-inference-extension.name" . }}-latency-predictor-prediction
+  namespace: {{ .Release.Namespace }}
+data:
+{{- range $key, $value := .Values.inferenceExtension.latencyPredictor.predictionServers.config }}
+  {{ $key }}: {{ $value | quote }}
+{{- end }}
+{{- end }}

config/charts/inferencepool/templates/epp-deployment.yaml

Lines changed: 3 additions & 0 deletions
@@ -151,6 +151,7 @@ spec:
           valueFrom:
             fieldRef:
               fieldPath: metadata.name
+        {{- include "gateway-api-inference-extension.latencyPredictor.env" . | nindent 8 }}
         {{- if .Values.inferenceExtension.tracing.enabled }}
         - name: OTEL_SERVICE_NAME
           value: "gateway-api-inference-extension"
@@ -181,13 +182,15 @@ spec:
         volumeMounts:
         - name: plugins-config-volume
           mountPath: "/config"
+      {{- include "gateway-api-inference-extension.latencyPredictor.containers" . | nindent 6 }}
       volumes:
      {{- if .Values.inferenceExtension.sidecar.volumes }}
      {{- tpl (toYaml .Values.inferenceExtension.sidecar.volumes) $ | nindent 6 }}
      {{- end }}
      - name: plugins-config-volume
        configMap:
          name: {{ include "gateway-api-inference-extension.name" . }}
+      {{- include "gateway-api-inference-extension.latencyPredictor.volumes" . | nindent 6 }}
      {{- if .Values.inferenceExtension.affinity }}
      affinity:
        {{- toYaml .Values.inferenceExtension.affinity | nindent 8 }}

config/charts/inferencepool/values.yaml

Lines changed: 83 additions & 0 deletions
@@ -71,6 +71,89 @@ inferenceExtension:
     sampler: "parentbased_traceidratio"
     samplerArg: "0.1"
 
+  # Latency Predictor Configuration
+  latencyPredictor:
+    enabled: false
+
+    # Training Server Configuration
+    trainingServer:
+      image:
+        hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars
+        name: latencypredictor-training-server
+        tag: latest
+        pullPolicy: Always
+      port: 8000
+      resources:
+        requests:
+          cpu: "2000m"
+          memory: "4Gi"
+        limits:
+          cpu: "4000m"
+          memory: "8Gi"
+      livenessProbe:
+        httpGet:
+          path: /healthz
+          port: 8000
+        initialDelaySeconds: 30
+        periodSeconds: 20
+      readinessProbe:
+        httpGet:
+          path: /readyz
+          port: 8000
+        initialDelaySeconds: 45
+        periodSeconds: 10
+      volumeSize: "20Gi"
+      config:
+        LATENCY_RETRAINING_INTERVAL_SEC: "1"
+        LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100"
+        LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib"
+        LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib"
+        LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib"
+        LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib"
+        LATENCY_MODEL_TYPE: "xgboost"
+        LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000"
+        LATENCY_QUANTILE_ALPHA: "0.9"
+
+    # Prediction Server Configuration
+    predictionServers:
+      count: 10
+      startPort: 8001
+      image:
+        hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars
+        name: latencypredictor-prediction-server
+        tag: latest
+        pullPolicy: Always
+      resources:
+        requests:
+          cpu: "500m"
+          memory: "1Gi"
+        limits:
+          cpu: "1000m"
+          memory: "2Gi"
+      livenessProbe:
+        httpGet:
+          path: /healthz
+        initialDelaySeconds: 15
+        periodSeconds: 15
+      readinessProbe:
+        httpGet:
+          path: /readyz
+        initialDelaySeconds: 10
+        periodSeconds: 5
+        failureThreshold: 10
+      volumeSize: "10Gi"
+      config:
+        LATENCY_MODEL_TYPE: "xgboost"
+        PREDICT_HOST: "0.0.0.0"
+        LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib"
+        LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib"
+        LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib"
+        LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib"
+
+    # EPP Environment Variables for Latency Predictor
+    eppEnv:
+      LATENCY_MAX_SAMPLE_SIZE: "10000"
+
 inferencePool:
   targetPorts:
   - number: 8000
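To turn the feature on, a user-supplied values override only needs to flip the flag and point the sidecar images at a real registry, since the chart ships placeholder `hub` paths. A minimal sketch, assuming the value keys added above (the registry path shown is hypothetical):

```yaml
inferenceExtension:
  latencyPredictor:
    enabled: true
    trainingServer:
      image:
        hub: us-docker.pkg.dev/my-project/sidecars  # hypothetical registry
    predictionServers:
      count: 4  # run fewer prediction sidecars than the default 10
      image:
        hub: us-docker.pkg.dev/my-project/sidecars  # hypothetical registry
```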

docs/proposals/003-model-server-protocol/README.md

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ effort.
 | Metric | Type | Description | vLLM metric | Triton TensorRT-LLM | SGLang |
 | ----- | ---- | ------------ | ---- | ---- | ---- |
 | TotalQueuedRequests | Gauge | The current total number of requests in the queue. | `vllm:num_requests_waiting` | `nv_trt_llm_request_metrics{request_type=waiting}` | `sglang:num_queue_reqs` |
+| TotalRunningRequests | Gauge | The current total number of requests actively being served on the model server. | `vllm:num_requests_running` | `nv_trt_llm_request_metrics{request_type=scheduled}` | `sglang:num_running_reqs` |
 | KVCacheUtilization | Gauge | The current KV cache utilization in percentage. | `vllm:gpu_cache_usage_perc` | `nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}` | `sglang:token_usage` |
 | [Optional] BlockSize | Labeled | The block size in tokens to allocate memory, used by the prefix cache scorer. If this metric is not available, the BlockSize will be derived from the [prefix plugin config](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/prefix-aware/#customize-the-prefix-cache-plugin). | name: `vllm:cache_config_info`, label name: `block_size` | | |
 | [Optional] NumGPUBlocks | Labeled | The total number of blocks in the HBM KV cache, used by the prefix cache scorer. If this metric is not available, the NumGPUBlocks will be derived from the [prefix plugin config](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/prefix-aware/#customize-the-prefix-cache-plugin). | name: `vllm:cache_config_info`, label name: `num_gpu_blocks` | | |
