Skip to content

Commit 1eb5d8a

Browse files
Rename latency prediction plugins, change docs accordingly, make sidecars not fail immediately during EPP spinup
1 parent 5844d8f commit 1eb5d8a

File tree

6 files changed

+29
-30
lines changed

6 files changed

+29
-30
lines changed

config/charts/inferencepool/templates/epp-config.yaml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ data:
1212
- type: kv-cache-utilization-scorer
1313
- type: prefix-cache-scorer
1414
{{- if .Values.inferenceExtension.latencyPredictor.enabled }}
15-
- type: slo-aware-routing
15+
- type: predicted-latency-scorer
1616
parameters:
1717
{{- with .Values.inferenceExtension.latencyPredictor.sloAwareRouting | default dict }}
1818
samplingMean: {{ .samplingMean | default 100.0 }}
@@ -32,23 +32,23 @@ data:
3232
affinityGateTauGlobal: {{ .affinityGateTauGlobal | default 0.99 }}
3333
selectionMode: {{ .selectionMode | default "linear" | quote }}
3434
{{- end }}
35-
- type: slo-aware-profile-handler
35+
- type: predicted-latency-profile-handler
3636
{{- end }}
3737
schedulingProfiles:
3838
{{- if .Values.inferenceExtension.latencyPredictor.enabled }}
39-
- name: prefix
39+
- name: predicted-latency-prefix
4040
plugins:
4141
- pluginRef: prefix-cache-scorer
42-
- name: default
42+
- name: predicted-latency-no-routing
4343
plugins:
4444
- pluginRef: prefix-cache-scorer
45-
- pluginRef: slo-aware-routing
45+
- pluginRef: predicted-latency-scorer
4646
weight: 0
4747
- pluginRef: queue-scorer
4848
- pluginRef: kv-cache-utilization-scorer
49-
- name: routing
49+
- name: predicted-latency-routing
5050
plugins:
51-
- pluginRef: slo-aware-routing
51+
- pluginRef: predicted-latency-scorer
5252
{{- else }}
5353
- name: default
5454
plugins:

pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/scorer_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -375,8 +375,8 @@ func TestSLOAwareRouter_TypedName(t *testing.T) {
375375
router := NewSLOAwareRouter(cfg, predictor)
376376

377377
tn := router.TypedName()
378-
assert.Equal(t, "slo-aware-routing", tn.Type, "Type should be slo-aware-routing")
379-
assert.Equal(t, "slo-aware-routing", tn.Name, "Default name should be slo-aware-routing")
378+
assert.Equal(t, "predicted-latency-scorer", tn.Type, "Type should be predicted-latency-scorer")
379+
assert.Equal(t, "predicted-latency-scorer", tn.Name, "Default name should be predicted-latency-scorer")
380380
}
381381

382382
func TestSLOAwareRouter_WithName(t *testing.T) {
@@ -389,7 +389,7 @@ func TestSLOAwareRouter_WithName(t *testing.T) {
389389
router = router.WithName(customName)
390390

391391
tn := router.TypedName()
392-
assert.Equal(t, "slo-aware-routing", tn.Type, "Type should remain slo-aware-routing")
392+
assert.Equal(t, "predicted-latency-scorer", tn.Type, "Type should remain predicted-latency-scorer")
393393
assert.Equal(t, customName, tn.Name, "Name should be updated to custom name")
394394
}
395395

pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ const (
4343
)
4444

4545
const (
46-
SLOAwareRouterPluginType = "slo-aware-routing"
46+
SLOAwareRouterPluginType = "predicted-latency-scorer"
4747
eps = 1e-9
4848
wMax = 100
4949
minWeight = 1

pkg/epp/scheduling/framework/plugins/profile/slo_aware_profile_handler.go

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@ import (
3232
)
3333

3434
const (
35-
SLOAwareProfileHandlerType = "slo-aware-profile-handler"
36-
DefaultProfileName = "default"
37-
PrefixProfileName = "prefix"
38-
SLOProfileName = "routing"
35+
SLOAwareProfileHandlerType = "predicted-latency-profile-handler"
36+
NoLatencyRoutingProfileName = "predicted-latency-no-routing"
37+
PrefixProfileName = "predicted-latency-prefix"
38+
LatencyRoutingProfileName = "predicted-latency-routing"
3939

4040
// Boolean header string for whether to use predictor based scheduling
4141
PreictionBasedSchedulingHeaderKey = "x-prediction-based-scheduling"
@@ -89,7 +89,7 @@ func (h *SLOAwareProfileHandler) Pick(ctx context.Context, _ *types.CycleState,
8989

9090
if predictorBasedScheduling {
9191
_, prefixExecuted := profileResults[PrefixProfileName]
92-
_, routingExecuted := profileResults[SLOProfileName]
92+
_, routingExecuted := profileResults[LatencyRoutingProfileName]
9393
if prefixExecuted && routingExecuted { // both routing profiles have been executed already in previous call
9494
return map[string]*framework.SchedulerProfile{}
9595
}
@@ -103,13 +103,13 @@ func (h *SLOAwareProfileHandler) Pick(ctx context.Context, _ *types.CycleState,
103103

104104
// otherwise, return only the SLO profile to be executed next
105105
return map[string]*framework.SchedulerProfile{
106-
SLOProfileName: profiles[SLOProfileName],
106+
LatencyRoutingProfileName: profiles[LatencyRoutingProfileName],
107107
}
108108
}
109109

110110
// If predictor based scheduling is not requested, proceed with only default profile
111111
return map[string]*framework.SchedulerProfile{
112-
DefaultProfileName: profiles[DefaultProfileName],
112+
NoLatencyRoutingProfileName: profiles[NoLatencyRoutingProfileName],
113113
}
114114
}
115115

@@ -129,22 +129,22 @@ func (h *SLOAwareProfileHandler) ProcessResults(ctx context.Context, _ *types.Cy
129129
}
130130

131131
if predictorBasedScheduling { // TODO grab header directly from request.Headers instead of request field
132-
if profileResults[SLOProfileName] == nil { // there was an error while running the SLO profile
133-
return nil, fmt.Errorf("failed to run scheduler profile '%s'", SLOProfileName)
132+
if profileResults[LatencyRoutingProfileName] == nil { // there was an error while running the SLO profile
133+
return nil, fmt.Errorf("failed to run scheduler profile '%s'", LatencyRoutingProfileName)
134134
}
135135
return &types.SchedulingResult{
136136
ProfileResults: profileResults,
137-
PrimaryProfileName: SLOProfileName,
137+
PrimaryProfileName: LatencyRoutingProfileName,
138138
}, nil
139139
}
140140

141-
if profileResults[DefaultProfileName] == nil { // there was an error while running the default profile
142-
return nil, fmt.Errorf("failed to run scheduler profile '%s'", DefaultProfileName)
141+
if profileResults[NoLatencyRoutingProfileName] == nil { // there was an error while running the default profile
142+
return nil, fmt.Errorf("failed to run scheduler profile '%s'", NoLatencyRoutingProfileName)
143143
}
144144

145145
return &types.SchedulingResult{
146146
ProfileResults: profileResults,
147-
PrimaryProfileName: DefaultProfileName,
147+
PrimaryProfileName: NoLatencyRoutingProfileName,
148148
}, nil
149149
}
150150

sidecars/latencypredictorasync/latencypredictor_async.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ package latencypredictorasync
1919
import (
2020
"context"
2121
"errors"
22-
"fmt"
2322
"math/rand"
2423
"net/http"
2524
"sync"
@@ -68,12 +67,12 @@ func New(config *Config, logger logr.Logger) *Predictor {
6867
func (p *Predictor) Start(ctx context.Context) error {
6968
// Get initial server status
7069
if err := p.refreshServerStatus(ctx); err != nil {
71-
return fmt.Errorf("failed to get initial server status: %v", err)
70+
p.logger.Error(err, "failed to get initial server status (will retry in background)")
7271
}
7372

7473
// Get initial model info if training server is available
7574
if err := p.refreshModelInfo(ctx); err != nil {
76-
return fmt.Errorf("failed to get initial model info: %v", err)
75+
p.logger.Error(err, "failed to get initial model info (will retry in background)")
7776
}
7877

7978
p.logger.Info("Latency predictor async client started.",

site-src/guides/latency-based-predictor.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ The latency-based routing plugin provides several strategies for selecting a mod
3636
- `composite-most`: A strategy that considers a composite score of various metrics, and prefers the pod with the highest score.
3737
- `composite-only`: This strategy only uses the composite score and ignores latency predictions.
3838

39-
The selection strategy can be configured via the `HEADROOM_SELECTION_STRATEGY` environment variable in the Endpoint Picker deployment.
39+
The selection strategy can be configured via the `headroomSelectionStrategy` plugin config variable in the EPP Helm chart (see deployment details below).
4040

4141
## Deploying with Latency-Based Routing
4242

@@ -46,7 +46,7 @@ Before you begin, ensure you have a functional Inference Gateway with at least o
4646

4747
### Deployment
4848

49-
To enable latency-based routing, you must enable the latency predictor in the chart and have built the images for the training/prediction sidecars, which are then deployed as containers alongside the Endpoint Picker. When the latency predictor is enabled, the `slo-aware-routing` and `slo-aware-profile-handler` plugins are automatically configured.
49+
To enable latency-based routing, you must enable the latency predictor in the chart and have built the images for the training/prediction sidecars, which are then deployed as containers alongside the Endpoint Picker. When the latency predictor is enabled, the `predicted-latency-scorer` and `predicted-latency-profile-handler` plugins are automatically configured.
5050

5151
#### Steps:
5252

@@ -67,7 +67,7 @@ helm install vllm-llama3-8b-instruct . \
6767

6868
After these steps, Inference Gateway will be prepared to predict, train, and route requests based on their SLOs.
6969

70-
For details on configuring specific environment variables for latency-based routing, refer to the [InferencePool Helm Chart README](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/config/charts/inferencepool/README.md#latency-based-router-environment-variables).
70+
For details on specific plugin config variables for latency-based routing, refer to the [InferencePool Helm Chart README](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/config/charts/inferencepool/README.md#latency-based-router-configuration).
7171

7272
### Sending Requests
7373

0 commit comments

Comments
 (0)