Skip to content

Commit 523c566

Browse files
authored
Merge branch 'main' into ciexempt
2 parents 12b0678 + 07aeb2b commit 523c566

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+1468
-173
lines changed

.github/workflows/crd-validation.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,4 @@ jobs:
8080
fi
8181
fi
8282
83-
echo "All CRDs are compatible."
83+
echo "All CRDs are compatible."

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ COPY internal ./internal
2424
COPY apix ./apix
2525
COPY api ./api
2626
COPY version ./version
27+
COPY sidecars ./sidecars
2728
WORKDIR /src/cmd/epp
2829
RUN go build -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/version.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/version.BuildRef=${BUILD_REF}" -o /epp
2930

cmd/epp/runner/runner.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ import (
6969
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector"
7070
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
7171
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
72+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router"
7273
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
7374
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile"
7475
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/scorer"
@@ -126,6 +127,7 @@ var (
126127
"then a self-signed certificate is used.")
127128
// metric flags
128129
totalQueuedRequestsMetric = flag.String("total-queued-requests-metric", runserver.DefaultTotalQueuedRequestsMetric, "Prometheus metric for the number of queued requests.")
130+
totalRunningRequestsMetric = flag.String("total-running-requests-metric", runserver.DefaultTotalRunningRequestsMetric, "Prometheus metric for the number of running requests.")
129131
kvCacheUsagePercentageMetric = flag.String("kv-cache-usage-percentage-metric", runserver.DefaultKvCacheUsagePercentageMetric, "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).")
130132
// LoRA metrics
131133
loraInfoMetric = flag.String("lora-info-metric", runserver.DefaultLoraInfoMetric, "Prometheus metric for the LoRA info metrics (must be in vLLM label format).")
@@ -139,8 +141,9 @@ var (
139141
configFile = flag.String("config-file", runserver.DefaultConfigFile, "The path to the configuration file")
140142
configText = flag.String("config-text", runserver.DefaultConfigText, "The configuration specified as text, in lieu of a file")
141143

142-
modelServerMetricsPort = flag.Int("model-server-metrics-port", 0, "Port to scrape metrics from pods. "+
143-
"Default value will be set to the InferencePool.Spec.TargetPorts[0].Number if not set.")
144+
modelServerMetricsPort = flag.Int("model-server-metrics-port", 0, "[DEPRECATED] Port to scrape metrics from pods. "+
145+
"Default value will be set to the InferencePool.Spec.TargetPorts[0].Number if not set."+
146+
"This option will be removed in the next release.")
144147
modelServerMetricsPath = flag.String("model-server-metrics-path", "/metrics", "Path to scrape metrics from pods")
145148
modelServerMetricsScheme = flag.String("model-server-metrics-scheme", "http", "Scheme to scrape metrics from pods")
146149
modelServerMetricsHttpsInsecureSkipVerify = flag.Bool("model-server-metrics-https-insecure-skip-verify", true, "When using 'https' scheme for 'model-server-metrics-scheme', configure 'InsecureSkipVerify' (default to true)")
@@ -198,6 +201,8 @@ func (r *Runner) Run(ctx context.Context) error {
198201
flag.Parse()
199202
initLogging(&opts)
200203

204+
r.deprecatedFlagsHandler(setupLog)
205+
201206
if *tracing {
202207
err := common.InitTracing(ctx, setupLog)
203208
if err != nil {
@@ -426,6 +431,9 @@ func (r *Runner) registerInTreePlugins() {
426431
plugins.Register(scorer.KvCacheUtilizationScorerType, scorer.KvCacheUtilizationScorerFactory)
427432
plugins.Register(scorer.QueueScorerType, scorer.QueueScorerFactory)
428433
plugins.Register(scorer.LoraAffinityScorerType, scorer.LoraAffinityScorerFactory)
434+
// Latency predictor plugins
435+
plugins.Register(slo_aware_router.SLOAwareRouterPluginType, slo_aware_router.SLOAwareRouterFactory)
436+
plugins.Register(profile.SLOAwareProfileHandlerType, profile.SLOAwareProfileHandlerFactory)
429437
// register filter for test purpose only (used in conformance tests)
430438
plugins.Register(testfilter.HeaderBasedTestingFilterType, testfilter.HeaderBasedTestingFilterFactory)
431439
// register response received plugin for test purpose only (used in conformance tests)
@@ -478,6 +486,10 @@ func (r *Runner) parseConfigurationPhaseTwo(ctx context.Context, rawConfig *conf
478486

479487
// Add requestControl plugins
480488
r.requestControlConfig.AddPlugins(handle.GetAllPlugins()...)
489+
// Sort prepare data plugins in DAG order (topological sort). Also check prepare data plugins for cycles.
490+
if r.requestControlConfig.PrepareDataPluginGraph() != nil {
491+
return nil, errors.New("failed to load the configuration - prepare data plugins have cyclic dependencies")
492+
}
481493

482494
// Handler deprecated configuration options
483495
r.deprecatedConfigurationHelper(cfg, logger)
@@ -486,6 +498,14 @@ func (r *Runner) parseConfigurationPhaseTwo(ctx context.Context, rawConfig *conf
486498
return cfg, nil
487499
}
488500

501+
func (r *Runner) deprecatedFlagsHandler(logger logr.Logger) {
502+
flag.Visit(func(f *flag.Flag) {
503+
if f.Name == "model-server-metrics-port" { // future: use map/set to store deprecated flags (and replacements?)
504+
logger.Info("deprecated option will be removed in the next release.", "option", f.Name)
505+
}
506+
})
507+
}
508+
489509
func (r *Runner) deprecatedConfigurationHelper(cfg *config.Config, logger logr.Logger) {
490510
// Handle deprecated environment variable based feature flags
491511

@@ -538,6 +558,7 @@ func (r *Runner) setupMetricsCollection(setupLog logr.Logger, useExperimentalDat
538558
func setupMetricsV1(setupLog logr.Logger) (datalayer.EndpointFactory, error) {
539559
mapping, err := backendmetrics.NewMetricMapping(
540560
*totalQueuedRequestsMetric,
561+
*totalRunningRequestsMetric,
541562
*kvCacheUsagePercentageMetric,
542563
*loraInfoMetric,
543564
*cacheInfoMetric,
@@ -586,6 +607,7 @@ func setupDatalayer(logger logr.Logger) (datalayer.EndpointFactory, error) {
586607
*modelServerMetricsHttpsInsecureSkipVerify,
587608
nil)
588609
extractor, err := dlmetrics.NewExtractor(*totalQueuedRequestsMetric,
610+
*totalRunningRequestsMetric,
589611
*kvCacheUsagePercentageMetric,
590612
*loraInfoMetric, *cacheInfoMetric)
591613

config/charts/body-based-routing/README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,27 @@ $ helm install body-based-router oci://us-central1-docker.pkg.dev/k8s-staging-im
2424
--set provider.name=[gke|istio]
2525
```
2626

27+
### Install with Custom Cmd-line Flags
28+
29+
To set command-line flags, use the `--set` option for each flag, e.g.:
30+
31+
```txt
32+
$ helm install body-based-router ./config/charts/body-based-routing \
33+
--set provider.name=[gke|istio] \
34+
  --set inferenceGateway.name=inference-gateway \
35+
--set bbr.flags.<FLAG_NAME>=<FLAG_VALUE>
36+
```
37+
38+
Alternatively, you can define flags in the `values.yaml` file:
39+
40+
```yaml
41+
bbr:
42+
flags:
43+
FLAG_NAME: <FLAG_VALUE>
44+
v: 3 ## Log verbosity
45+
...
46+
```
47+
2748
## Uninstall
2849

2950
Run the following command to uninstall the chart:
@@ -46,6 +67,7 @@ The following table list the configurable parameters of the chart.
4667
| `bbr.image.hub` | Registry URL where the image is hosted. |
4768
| `bbr.image.tag` | Image tag. |
4869
| `bbr.image.pullPolicy` | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`. |
70+
| `bbr.flags` | Map of flags that are passed through to BBR. Refer to [runner.go](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/cmd/bbr/runner/runner.go) for the complete list. |
4971
| `provider.name` | Name of the Inference Gateway implementation being used. Possible values: `istio`, `gke`. Defaults to `none`. |
5072
| `inferenceGateway.name` | The name of the Gateway. Defaults to `inference-gateway`. |
5173

config/charts/body-based-routing/templates/bbr.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,11 @@ spec:
1919
imagePullPolicy: {{ .Values.bbr.image.pullPolicy | default "Always" }}
2020
args:
2121
- "--streaming"
22-
- "--v"
23-
- "3"
22+
# Pass additional flags via the bbr.flags field in values.yaml.
23+
{{- range $key, $value := .Values.bbr.flags }}
24+
- --{{ $key }}
25+
- "{{ $value }}"
26+
{{- end }}
2427
ports:
2528
- containerPort: {{ .Values.bbr.port }}
2629
# health check

config/charts/body-based-routing/values.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ bbr:
99
port: 9004
1010
healthCheckPort: 9005
1111

12+
flags:
13+
# Log verbosity
14+
v: 3
15+
1216
provider:
1317
name: none
1418

config/charts/inferencepool/README.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,35 @@ $ helm install triton-llama3-8b-instruct \
121121
oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
122122
```
123123

124+
### Install with Latency-Based Routing
125+
126+
For full details see the dedicated [Latency-Based Routing Guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/latency-based-predictor.md)
127+
128+
#### Latency-Based Router Configuration
129+
130+
The behavior of the latency-based router can be fine-tuned using the configuration parameters under `inferenceExtension.latencyPredictor.sloAwareRouting` in your `values.yaml` file.
131+
132+
| Parameter | Description | Default |
133+
| -------------------------------- | ------------------------------------------------------------------------------------------------------- | ----------- |
134+
| `samplingMean` | The sampling mean (lambda) for the Poisson distribution of token sampling. | `100.0` |
135+
| `maxSampledTokens` | The maximum number of tokens to sample for TPOT prediction. | `20` |
136+
| `sloBufferFactor` | A buffer to apply to the SLO to make it more or less strict. | `1.0` |
137+
| `negHeadroomTTFTWeight` | The weight to give to the TTFT when a pod has negative headroom. | `0.8` |
138+
| `negHeadroomTPOTWeight` | The weight to give to the TPOT when a pod has negative headroom. | `0.2` |
139+
| `headroomTTFTWeight` | The weight to give to the TTFT when a pod has positive headroom. | `0.8` |
140+
| `headroomTPOTWeight` | The weight to give to the TPOT when a pod has positive headroom. | `0.2` |
141+
| `headroomSelectionStrategy` | The strategy to use for selecting a pod based on headroom. Options: `least`, `most`, `composite-least`, `composite-most`, `composite-only`. | `least` |
142+
| `compositeKVWeight` | The weight for KV cache in the composite score. | `1.0` |
143+
| `compositeQueueWeight` | The weight for queue size in the composite score. | `1.0` |
144+
| `compositePrefixWeight` | The weight for prefix cache in the composite score. | `1.0` |
145+
| `epsilonExploreSticky` | Exploration factor for sticky sessions. | `0.01` |
146+
| `epsilonExploreNeg` | Exploration factor for negative headroom. | `0.01` |
147+
| `affinityGateTau` | Affinity gate threshold. | `0.80` |
148+
| `affinityGateTauGlobal` | Global affinity gate threshold. | `0.99` |
149+
| `selectionMode` | The mode for selection (e.g., "linear"). | `linear` |
150+
151+
**Note:** Enabling SLO-aware routing also exposes a number of Prometheus metrics for monitoring the feature, including actual vs. predicted latency, SLO violations, and more.
152+
124153
### Install with High Availability (HA)
125154

126155
To deploy the EndpointPicker in a high-availability (HA) active-passive configuration set replicas to be greater than one. In such a setup, only one "leader" replica will be active and ready to process traffic at any given time. If the leader pod fails, another pod will be elected as the new leader, ensuring service continuity.
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
{{/*
2+
Latency Predictor Env
3+
*/}}
4+
{{- define "gateway-api-inference-extension.latencyPredictor.env" -}}
5+
{{- if .Values.inferenceExtension.latencyPredictor.enabled }}
6+
- name: PREDICTION_SERVER_URL
7+
value: "{{- $count := int .Values.inferenceExtension.latencyPredictor.predictionServers.count -}}
8+
{{- $startPort := int .Values.inferenceExtension.latencyPredictor.predictionServers.startPort -}}
9+
{{- range $i := until $count -}}
10+
{{- if $i }},{{ end }}http://localhost:{{ add $startPort $i }}
11+
{{- end }}"
12+
- name: TRAINING_SERVER_URL
13+
value: "http://localhost:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }}"
14+
{{- range $key, $value := .Values.inferenceExtension.latencyPredictor.eppEnv }}
15+
- name: {{ $key }}
16+
value: {{ $value | quote }}
17+
{{- end }}
18+
{{- end }}
19+
{{- end }}
20+
21+
{{/*
22+
Latency Predictor Sidecar Containers
23+
*/}}
24+
{{- define "gateway-api-inference-extension.latencyPredictor.containers" -}}
25+
{{- if .Values.inferenceExtension.latencyPredictor.enabled }}
26+
# Training Server Sidecar Container
27+
- name: training-server
28+
image: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.hub }}/{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.name }}:{{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.tag }}
29+
imagePullPolicy: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.image.pullPolicy }}
30+
ports:
31+
- containerPort: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.port }}
32+
name: training-port
33+
livenessProbe:
34+
{{- toYaml .Values.inferenceExtension.latencyPredictor.trainingServer.livenessProbe | nindent 4 }}
35+
readinessProbe:
36+
{{- toYaml .Values.inferenceExtension.latencyPredictor.trainingServer.readinessProbe | nindent 4 }}
37+
resources:
38+
{{- toYaml .Values.inferenceExtension.latencyPredictor.trainingServer.resources | nindent 4 }}
39+
envFrom:
40+
- configMapRef:
41+
name: {{ include "gateway-api-inference-extension.name" . }}-latency-predictor-training
42+
env:
43+
- name: POD_NAME
44+
valueFrom:
45+
fieldRef:
46+
fieldPath: metadata.name
47+
- name: SERVER_TYPE
48+
value: "training"
49+
volumeMounts:
50+
- name: training-server-storage
51+
mountPath: /models
52+
{{- range $i := until (int .Values.inferenceExtension.latencyPredictor.predictionServers.count) }}
53+
# Prediction Server Sidecar Container {{ add $i 1 }}
54+
- name: prediction-server-{{ add $i 1 }}
55+
image: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.hub }}/{{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.name }}:{{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.tag }}
56+
imagePullPolicy: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.image.pullPolicy }}
57+
command: ["uvicorn"]
58+
args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "{{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}"]
59+
ports:
60+
- containerPort: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}
61+
name: predict-port-{{ add $i 1 }}
62+
livenessProbe:
63+
httpGet:
64+
path: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.livenessProbe.httpGet.path }}
65+
port: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}
66+
initialDelaySeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.livenessProbe.initialDelaySeconds }}
67+
periodSeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.livenessProbe.periodSeconds }}
68+
readinessProbe:
69+
httpGet:
70+
path: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.httpGet.path }}
71+
port: {{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}
72+
initialDelaySeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.initialDelaySeconds }}
73+
periodSeconds: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.periodSeconds }}
74+
failureThreshold: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.readinessProbe.failureThreshold }}
75+
resources:
76+
{{- toYaml $.Values.inferenceExtension.latencyPredictor.predictionServers.resources | nindent 4 }}
77+
envFrom:
78+
- configMapRef:
79+
name: {{ include "gateway-api-inference-extension.name" $ }}-latency-predictor-prediction
80+
env:
81+
- name: PREDICT_PORT
82+
value: "{{ add $.Values.inferenceExtension.latencyPredictor.predictionServers.startPort $i }}"
83+
- name: POD_NAME
84+
valueFrom:
85+
fieldRef:
86+
fieldPath: metadata.name
87+
- name: SERVER_TYPE
88+
value: "prediction-{{ add $i 1 }}"
89+
- name: TRAINING_SERVER_URL
90+
value: "http://localhost:{{ $.Values.inferenceExtension.latencyPredictor.trainingServer.port }}"
91+
volumeMounts:
92+
- name: prediction-server-{{ add $i 1 }}-storage
93+
mountPath: /server_models
94+
{{- end }}
95+
{{- end }}
96+
{{- end }}
97+
98+
{{/*
99+
Latency Predictor Volumes
100+
*/}}
101+
{{- define "gateway-api-inference-extension.latencyPredictor.volumes" -}}
102+
{{- if .Values.inferenceExtension.latencyPredictor.enabled }}
103+
- name: training-server-storage
104+
emptyDir:
105+
sizeLimit: {{ .Values.inferenceExtension.latencyPredictor.trainingServer.volumeSize }}
106+
{{- range $i := until (int .Values.inferenceExtension.latencyPredictor.predictionServers.count) }}
107+
- name: prediction-server-{{ add $i 1 }}-storage
108+
emptyDir:
109+
sizeLimit: {{ $.Values.inferenceExtension.latencyPredictor.predictionServers.volumeSize }}
110+
{{- end }}
111+
{{- end }}
112+
{{- end }}

0 commit comments

Comments
 (0)