Rename latency prediction plugins, change docs accordingly, make sidecars not fail immediately during EPP spinup

BenjaminBraunDev · BenjaminBraunDev · commit 1eb5d8a89bea · 2025-11-25T23:00:52.000Z
diff --git a/config/charts/inferencepool/templates/epp-config.yaml b/config/charts/inferencepool/templates/epp-config.yaml
@@ -12,7 +12,7 @@ data:
     - type: kv-cache-utilization-scorer
     - type: prefix-cache-scorer
     {{- if .Values.inferenceExtension.latencyPredictor.enabled }}
-    - type: slo-aware-routing
+    - type: predicted-latency-scorer
       parameters:
         {{- with .Values.inferenceExtension.latencyPredictor.sloAwareRouting | default dict }}
         samplingMean: {{ .samplingMean | default 100.0 }}
@@ -32,23 +32,23 @@ data:
         affinityGateTauGlobal: {{ .affinityGateTauGlobal | default 0.99 }}
         selectionMode: {{ .selectionMode | default "linear" | quote }}
         {{- end }}
-    - type: slo-aware-profile-handler
+    - type: predicted-latency-profile-handler
     {{- end }}
     schedulingProfiles:
     {{- if .Values.inferenceExtension.latencyPredictor.enabled }}
-    - name: prefix
+    - name: predicted-latency-prefix
       plugins:
       - pluginRef: prefix-cache-scorer
-    - name: default
+    - name: predicted-latency-no-routing
       plugins:
       - pluginRef: prefix-cache-scorer
-      - pluginRef: slo-aware-routing
+      - pluginRef: predicted-latency-scorer
         weight: 0
       - pluginRef: queue-scorer
       - pluginRef: kv-cache-utilization-scorer
-    - name: routing
+    - name: predicted-latency-routing
       plugins:
-      - pluginRef: slo-aware-routing
+      - pluginRef: predicted-latency-scorer
     {{- else }}
     - name: default
       plugins:
diff --git a/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/scorer_test.go b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/scorer_test.go
@@ -375,8 +375,8 @@ func TestSLOAwareRouter_TypedName(t *testing.T) {
 	router := NewSLOAwareRouter(cfg, predictor)
 
 	tn := router.TypedName()
-	assert.Equal(t, "slo-aware-routing", tn.Type, "Type should be slo-aware-routing")
-	assert.Equal(t, "slo-aware-routing", tn.Name, "Default name should be slo-aware-routing")
+	assert.Equal(t, "predicted-latency-scorer", tn.Type, "Type should be predicted-latency-scorer")
+	assert.Equal(t, "predicted-latency-scorer", tn.Name, "Default name should be predicted-latency-scorer")
 }
 
 func TestSLOAwareRouter_WithName(t *testing.T) {
@@ -389,7 +389,7 @@ func TestSLOAwareRouter_WithName(t *testing.T) {
 	router = router.WithName(customName)
 
 	tn := router.TypedName()
-	assert.Equal(t, "slo-aware-routing", tn.Type, "Type should remain slo-aware-routing")
+	assert.Equal(t, "predicted-latency-scorer", tn.Type, "Type should remain predicted-latency-scorer")
 	assert.Equal(t, customName, tn.Name, "Name should be updated to custom name")
 }
 
diff --git a/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/types.go b/pkg/epp/scheduling/framework/plugins/multi/slo_aware_router/types.go
@@ -43,7 +43,7 @@ const (
 )
 
 const (
-	SLOAwareRouterPluginType = "slo-aware-routing"
+	SLOAwareRouterPluginType = "predicted-latency-scorer"
 	eps                      = 1e-9
 	wMax                     = 100
 	minWeight                = 1
diff --git a/pkg/epp/scheduling/framework/plugins/profile/slo_aware_profile_handler.go b/pkg/epp/scheduling/framework/plugins/profile/slo_aware_profile_handler.go
@@ -32,10 +32,10 @@ import (
 )
 
 const (
-	SLOAwareProfileHandlerType = "slo-aware-profile-handler"
-	DefaultProfileName         = "default"
-	PrefixProfileName          = "prefix"
-	SLOProfileName             = "routing"
+	SLOAwareProfileHandlerType  = "predicted-latency-profile-handler"
+	NoLatencyRoutingProfileName = "predicted-latency-no-routing"
+	PrefixProfileName           = "predicted-latency-prefix"
+	LatencyRoutingProfileName   = "predicted-latency-routing"
 
 	// Boolean header string for whether to use predictor based scheduling
 	PreictionBasedSchedulingHeaderKey = "x-prediction-based-scheduling"
@@ -89,7 +89,7 @@ func (h *SLOAwareProfileHandler) Pick(ctx context.Context, _ *types.CycleState,
 
 	if predictorBasedScheduling {
 		_, prefixExecuted := profileResults[PrefixProfileName]
-		_, routingExecuted := profileResults[SLOProfileName]
+		_, routingExecuted := profileResults[LatencyRoutingProfileName]
 		if prefixExecuted && routingExecuted { // both routing profiles have been executed already in previous call
 			return map[string]*framework.SchedulerProfile{}
 		}
@@ -103,13 +103,13 @@ func (h *SLOAwareProfileHandler) Pick(ctx context.Context, _ *types.CycleState,
 
 		// otherwise, return only the SLO profile to be executed next
 		return map[string]*framework.SchedulerProfile{
-			SLOProfileName: profiles[SLOProfileName],
+			LatencyRoutingProfileName: profiles[LatencyRoutingProfileName],
 		}
 	}
 
 	// If predictor based scheduling is not requested, proceed with only default profile
 	return map[string]*framework.SchedulerProfile{
-		DefaultProfileName: profiles[DefaultProfileName],
+		NoLatencyRoutingProfileName: profiles[NoLatencyRoutingProfileName],
 	}
 }
 
@@ -129,22 +129,22 @@ func (h *SLOAwareProfileHandler) ProcessResults(ctx context.Context, _ *types.Cy
 	}
 
 	if predictorBasedScheduling { // TODO grab header directly from request.Headers instead of request field
-		if profileResults[SLOProfileName] == nil { // there was an error while running the SLO profile
-			return nil, fmt.Errorf("failed to run scheduler profile '%s'", SLOProfileName)
+		if profileResults[LatencyRoutingProfileName] == nil { // there was an error while running the latency routing profile
+			return nil, fmt.Errorf("failed to run scheduler profile '%s'", LatencyRoutingProfileName)
 		}
 		return &types.SchedulingResult{
 			ProfileResults:     profileResults,
-			PrimaryProfileName: SLOProfileName,
+			PrimaryProfileName: LatencyRoutingProfileName,
 		}, nil
 	}
 
-	if profileResults[DefaultProfileName] == nil { // there was an error while running the default profile
-		return nil, fmt.Errorf("failed to run scheduler profile '%s'", DefaultProfileName)
+	if profileResults[NoLatencyRoutingProfileName] == nil { // there was an error while running the no-latency-routing profile
+		return nil, fmt.Errorf("failed to run scheduler profile '%s'", NoLatencyRoutingProfileName)
 	}
 
 	return &types.SchedulingResult{
 		ProfileResults:     profileResults,
-		PrimaryProfileName: DefaultProfileName,
+		PrimaryProfileName: NoLatencyRoutingProfileName,
 	}, nil
 }
 
diff --git a/sidecars/latencypredictorasync/latencypredictor_async.go b/sidecars/latencypredictorasync/latencypredictor_async.go
@@ -19,7 +19,6 @@ package latencypredictorasync
 import (
 	"context"
 	"errors"
-	"fmt"
 	"math/rand"
 	"net/http"
 	"sync"
@@ -68,12 +67,12 @@ func New(config *Config, logger logr.Logger) *Predictor {
 func (p *Predictor) Start(ctx context.Context) error {
 	// Get initial server status
 	if err := p.refreshServerStatus(ctx); err != nil {
-		return fmt.Errorf("failed to get initial server status: %v", err)
+		p.logger.Error(err, "failed to get initial server status (will retry in background)")
 	}
 
 	// Get initial model info if training server is available
 	if err := p.refreshModelInfo(ctx); err != nil {
-		return fmt.Errorf("failed to get initial model info: %v", err)
+		p.logger.Error(err, "failed to get initial model info (will retry in background)")
 	}
 
 	p.logger.Info("Latency predictor async client started.",
diff --git a/site-src/guides/latency-based-predictor.md b/site-src/guides/latency-based-predictor.md
@@ -36,7 +36,7 @@ The latency-based routing plugin provides several strategies for selecting a mod
 -   `composite-most`: A strategy that considers a composite score of various metrics, and prefers the pod with the highest score.
 -   `composite-only`: This strategy only uses the composite score and ignores latency predictions.
 
-The selection strategy can be configured via the `HEADROOM_SELECTION_STRATEGY` environment variable in the Endpoint Picker deployment.
+The selection strategy can be configured via the `headroomSelectionStrategy` plugin config variable in the EPP Helm chart (see deployment details below).
 
 ## Deploying with Latency-Based Routing
 
@@ -46,7 +46,7 @@ Before you begin, ensure you have a functional Inference Gateway with at least o
 
 ### Deployment
 
-To enable latency-based routing, you must enable the latency predictor in the chart and have built the images for the training/prediction sidecars, which are then deployed as containers alongside the Endpoint Picker. When the latency predictor is enabled, the `slo-aware-routing` and `slo-aware-profile-handler` plugins are automatically configured.
+To enable latency-based routing, you must enable the latency predictor in the chart and have built the images for the training/prediction sidecars, which are then deployed as containers alongside the Endpoint Picker. When the latency predictor is enabled, the `predicted-latency-scorer` and `predicted-latency-profile-handler` plugins are automatically configured.
 
 #### Steps:
 
@@ -67,7 +67,7 @@ helm install vllm-llama3-8b-instruct . \
 
 After these steps, Inference Gateway will be prepared to predict, train, and route requests based on their SLOs.
 
-For details on configuring specific environment variables for latency-based routing, refer to the [InferencePool Helm Chart README](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/config/charts/inferencepool/README.md#latency-based-router-environment-variables).
+For details on specific plugin config variables for latency-based routing, refer to the [InferencePool Helm Chart README](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/config/charts/inferencepool/README.md#latency-based-router-configuration).
 
 ### Sending Requests
 

Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,7 @@ const (`
`43`	`43`	`)`
`44`	`44`
`45`	`45`	`const (`
`46`		`- SLOAwareRouterPluginType = "slo-aware-routing"`
	`46`	`+ SLOAwareRouterPluginType = "predicted-latency-scorer"`
`47`	`47`	`eps = 1e-9`
`48`	`48`	`wMax = 100`
`49`	`49`	`minWeight = 1`
Original file line number	Diff line number	Diff line change
`@@ -32,10 +32,10 @@ import (`
`32`	`32`	`)`
`33`	`33`
`34`	`34`	`const (`
`35`		`- SLOAwareProfileHandlerType = "slo-aware-profile-handler"`
`36`		`- DefaultProfileName = "default"`
`37`		`- PrefixProfileName = "prefix"`
`38`		`- SLOProfileName = "routing"`
	`35`	`+ SLOAwareProfileHandlerType = "predicted-latency-profile-handler"`
	`36`	`+ NoLatencyRoutingProfileName = "predicted-latency-no-routing"`
	`37`	`+ PrefixProfileName = "predicted-latency-prefix"`
	`38`	`+ LatencyRoutingProfileName = "predicted-latency-routing"`
`39`	`39`
`40`	`40`	`// Boolean header string for whether to use predictor based scheduling`
`41`	`41`	`PreictionBasedSchedulingHeaderKey = "x-prediction-based-scheduling"`
`@@ -89,7 +89,7 @@ func (h SLOAwareProfileHandler) Pick(ctx context.Context, _ types.CycleState,`
`89`	`89`
`90`	`90`	`if predictorBasedScheduling {`
`91`	`91`	`_, prefixExecuted := profileResults[PrefixProfileName]`
`92`		`- _, routingExecuted := profileResults[SLOProfileName]`
	`92`	`+ _, routingExecuted := profileResults[LatencyRoutingProfileName]`
`93`	`93`	`if prefixExecuted && routingExecuted { // both routing profiles have been executed already in previous call`
`94`	`94`	`return map[string]*framework.SchedulerProfile{}`
`95`	`95`	`}`
`@@ -103,13 +103,13 @@ func (h SLOAwareProfileHandler) Pick(ctx context.Context, _ types.CycleState,`
`103`	`103`
`104`	`104`	`// otherwise, return only the SLO profile to be executed next`
`105`	`105`	`return map[string]*framework.SchedulerProfile{`
`106`		`- SLOProfileName: profiles[SLOProfileName],`
	`106`	`+ LatencyRoutingProfileName: profiles[LatencyRoutingProfileName],`
`107`	`107`	`}`
`108`	`108`	`}`
`109`	`109`
`110`	`110`	`// If predictor based scheduling is not requested, proceed with only default profile`
`111`	`111`	`return map[string]*framework.SchedulerProfile{`
`112`		`- DefaultProfileName: profiles[DefaultProfileName],`
	`112`	`+ NoLatencyRoutingProfileName: profiles[NoLatencyRoutingProfileName],`
`113`	`113`	`}`
`114`	`114`	`}`
`115`	`115`
`@@ -129,22 +129,22 @@ func (h SLOAwareProfileHandler) ProcessResults(ctx context.Context, _ types.Cy`
`129`	`129`	`}`
`130`	`130`
`131`	`131`	`if predictorBasedScheduling { // TODO grab header directly from request.Headers instead of request field`
`132`		`- if profileResults[SLOProfileName] == nil { // there was an error while running the SLO profile`
`133`		`- return nil, fmt.Errorf("failed to run scheduler profile '%s'", SLOProfileName)`
	`132`	`+ if profileResults[LatencyRoutingProfileName] == nil { // there was an error while running the latency routing profile`
	`133`	`+ return nil, fmt.Errorf("failed to run scheduler profile '%s'", LatencyRoutingProfileName)`
`134`	`134`	`}`
`135`	`135`	`return &types.SchedulingResult{`
`136`	`136`	`ProfileResults: profileResults,`
`137`		`- PrimaryProfileName: SLOProfileName,`
	`137`	`+ PrimaryProfileName: LatencyRoutingProfileName,`
`138`	`138`	`}, nil`
`139`	`139`	`}`
`140`	`140`
`141`		`- if profileResults[DefaultProfileName] == nil { // there was an error while running the default profile`
`142`		`- return nil, fmt.Errorf("failed to run scheduler profile '%s'", DefaultProfileName)`
	`141`	`+ if profileResults[NoLatencyRoutingProfileName] == nil { // there was an error while running the no-latency-routing profile`
	`142`	`+ return nil, fmt.Errorf("failed to run scheduler profile '%s'", NoLatencyRoutingProfileName)`
`143`	`143`	`}`
`144`	`144`
`145`	`145`	`return &types.SchedulingResult{`
`146`	`146`	`ProfileResults: profileResults,`
`147`		`- PrimaryProfileName: DefaultProfileName,`
	`147`	`+ PrimaryProfileName: NoLatencyRoutingProfileName,`
`148`	`148`	`}, nil`
`149`	`149`	`}`
`150`	`150`