Support fake metrics (#144)

irar2 · web-flow · commit 430992528240 · 2025-08-19T14:13:17.000+03:00
* Support fake metrics

Signed-off-by: Ira <IRAR@il.ibm.com>

* Readme

Signed-off-by: Ira <IRAR@il.ibm.com>

* Removed commented out code

Signed-off-by: Ira <IRAR@il.ibm.com>

---------

Signed-off-by: Ira <IRAR@il.ibm.com>
diff --git a/README.md b/README.md
@@ -125,6 +125,16 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `zmq-endpoint`: ZMQ address to publish events
 - `event-batch-size`: the maximum number of kv-cache events to be sent together, defaults to 16
 -->
+- `fake-metrics`: a predefined set of metrics to be sent to Prometheus instead of the real data. When specified, only these fake metrics are reported — real metrics and fake metrics are never reported simultaneously. The set should include values for:
+    - `running-requests`
+    - `waiting-requests`
+    - `kv-cache-usage`
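+    - `loras` - an array containing LoRA information objects, each with `running` (a comma-separated list of active LoRAs), `waiting` (a comma-separated list of LoRAs on hold), and a numeric `timestamp` (e.g. Unix time in seconds, as in the example below). In a YAML configuration file each element of `loras` is written as a JSON string (see `manifests/config_with_fake.yaml`).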
+
+    Example:
+      {"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}
+      
+
 In addition, as we are using klog, the following parameters are available:
 - `add_dir_header`: if true, adds the file directory to the header of the log messages
 - `alsologtostderr`: log to standard error as well as files (no effect when -logtostderr=true)
diff --git a/manifests/config_with_fake.yaml b/manifests/config_with_fake.yaml
@@ -0,0 +1,16 @@
+model: "Qwen/Qwen2-0.5B"
+max-loras: 2
+max-cpu-loras: 5
+max-num-seqs: 5
+mode: "random"
+time-to-first-token: 2000
+inter-token-latency: 1000
+kv-cache-transfer-latency: 100
+seed: 100100100
+fake-metrics: 
+  running-requests: 16
+  waiting-requests: 3 
+  kv-cache-usage: 0.3
+  loras:
+  - '{"running":"lora1,lora2","waiting":"lora3","timestamp":1257894567}'
+  - '{"running":"lora1,lora3","waiting":"","timestamp":1257894569}'
diff --git a/pkg/common/config.go b/pkg/common/config.go
@@ -34,6 +34,7 @@ const (
 	vLLMDefaultPort = 8000
 	ModeRandom      = "random"
 	ModeEcho        = "echo"
+	dummy           = "dummy"
 )
 
 type Configuration struct {
@@ -127,6 +128,30 @@ type Configuration struct {
 	ZMQEndpoint string `yaml:"zmq-endpoint"`
 	// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
 	EventBatchSize int `yaml:"event-batch-size"`
+
+	// FakeMetrics is a set of metrics to send to Prometheus instead of the real data
+	FakeMetrics *Metrics `yaml:"fake-metrics"`
+}
+
+// Metrics is a set of metric values that are reported to Prometheus in place
+// of the real ones (see Configuration.FakeMetrics).
+type Metrics struct {
+	// LoraMetrics is the list of LoRA metrics entries. It is decoded directly from
+	// the "loras" JSON array, or built by unmarshalLoraFakeMetrics from LorasString,
+	// which holds the raw "loras" entries read from YAML (one JSON object per string).
+	LoraMetrics []LorasMetrics `json:"loras"`
+	LorasString []string       `yaml:"loras"`
+	// RunningRequests is the number of inference requests that are currently being processed
+	RunningRequests int64 `yaml:"running-requests" json:"running-requests"`
+	// WaitingRequests is the number of inference requests that are waiting to be processed
+	WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"`
+	// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
+	KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"`
+}
+
+// LorasMetrics is a single fake LoRA metrics entry, decoded from a JSON object
+// with the keys "running", "waiting" and "timestamp".
+type LorasMetrics struct {
+	// RunningLoras is a comma separated list of running LoRAs
+	RunningLoras string `json:"running"`
+	// WaitingLoras is a comma separated list of waiting LoRAs
+	WaitingLoras string `json:"waiting"`
+	// Timestamp is the timestamp of the metric; presumably Unix time in
+	// seconds, as in the README example (TODO confirm)
+	Timestamp float64 `json:"timestamp"`
 }
 
 type LoraModule struct {
@@ -168,6 +193,29 @@ func (c *Configuration) unmarshalLoras() error {
 	return nil
 }
 
+// unmarshalFakeMetrics parses fakeMetricsString, a JSON encoded Metrics object,
+// and stores the result in c.FakeMetrics, replacing any fake metrics that were
+// already set there.
+// It returns an error, wrapping the JSON decoding error, if the string cannot be
+// decoded into Metrics.
+func (c *Configuration) unmarshalFakeMetrics(fakeMetricsString string) error {
+	// Decoding through a pointer means that a JSON null results in nil fake metrics
+	var metrics *Metrics
+	if err := json.Unmarshal([]byte(fakeMetricsString), &metrics); err != nil {
+		return fmt.Errorf("failed to unmarshal fake metrics: %w", err)
+	}
+	c.FakeMetrics = metrics
+	return nil
+}
+
+// unmarshalLoraFakeMetrics converts the raw LoRA entries of the fake metrics
+// (LorasString, one JSON object per entry) into LoraMetrics.
+// It does nothing when no fake metrics are configured. If an entry cannot be
+// decoded, it returns an error naming the index of that entry and leaves
+// LoraMetrics unchanged.
+func (c *Configuration) unmarshalLoraFakeMetrics() error {
+	if c.FakeMetrics == nil {
+		return nil
+	}
+	entries := c.FakeMetrics.LorasString
+	// The result is a non-nil slice even when there are no entries
+	loras := make([]LorasMetrics, 0, len(entries))
+	for i, entry := range entries {
+		var lora LorasMetrics
+		if err := json.Unmarshal([]byte(entry), &lora); err != nil {
+			return fmt.Errorf("failed to unmarshal fake LoRA metrics entry %d: %w", i, err)
+		}
+		loras = append(loras, lora)
+	}
+	c.FakeMetrics.LoraMetrics = loras
+	return nil
+}
+
 func newConfig() *Configuration {
 	return &Configuration{
 		Port:                                vLLMDefaultPort,
@@ -199,7 +247,14 @@ func (c *Configuration) load(configFile string) error {
 		return fmt.Errorf("failed to unmarshal configuration: %s", err)
 	}
 
-	return c.unmarshalLoras()
+	if err := c.unmarshalLoras(); err != nil {
+		return err
+	}
+	if err := c.unmarshalLoraFakeMetrics(); err != nil {
+		return err
+	}
+
+	return nil
 }
 
 func (c *Configuration) validate() error {
@@ -299,6 +354,15 @@ func (c *Configuration) validate() error {
 	if c.EventBatchSize < 1 {
 		return errors.New("event batch size cannot less than 1")
 	}
+
+	// Fake metrics are optional; when present, the request counters must not be
+	// negative and the KV cache usage must be a fraction between 0 and 1
+	if c.FakeMetrics != nil {
+		if c.FakeMetrics.RunningRequests < 0 || c.FakeMetrics.WaitingRequests < 0 {
+			return errors.New("fake metrics request counters cannot be negative")
+		}
+		if c.FakeMetrics.KVCacheUsagePercentage < 0 || c.FakeMetrics.KVCacheUsagePercentage > 1 {
+			return errors.New("fake metrics KV cache usage must be between 0 and 1")
+		}
+	}
 	return nil
 }
 
@@ -316,6 +380,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 
 	servedModelNames := getParamValueFromArgs("served-model-name")
 	loraModuleNames := getParamValueFromArgs("lora-modules")
+	fakeMetrics := getParamValueFromArgs("fake-metrics")
 
 	f := pflag.NewFlagSet("llm-d-inference-sim flags", pflag.ContinueOnError)
 
@@ -358,9 +423,11 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	var dummyMultiString multiString
 	f.Var(&dummyMultiString, "served-model-name", "Model names exposed by the API (a list of space-separated strings)")
 	f.Var(&dummyMultiString, "lora-modules", "List of LoRA adapters (a list of space-separated JSON strings)")
+	f.Var(&dummyMultiString, "fake-metrics", "A set of metrics to send to Prometheus instead of the real data")
 	// In order to allow empty arguments, we set a dummy NoOptDefVal for these flags
-	f.Lookup("served-model-name").NoOptDefVal = "dummy"
-	f.Lookup("lora-modules").NoOptDefVal = "dummy"
+	f.Lookup("served-model-name").NoOptDefVal = dummy
+	f.Lookup("lora-modules").NoOptDefVal = dummy
+	f.Lookup("fake-metrics").NoOptDefVal = dummy
 
 	flagSet := flag.NewFlagSet("simFlagSet", flag.ExitOnError)
 	klog.InitFlags(flagSet)
@@ -381,6 +448,11 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 			return nil, err
 		}
 	}
+	if fakeMetrics != nil {
+		if err := config.unmarshalFakeMetrics(fakeMetrics[0]); err != nil {
+			return nil, err
+		}
+	}
 	if servedModelNames != nil {
 		config.ServedModelNames = servedModelNames
 	}
diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go
@@ -51,7 +51,6 @@ func createDefaultConfig(model string) *Configuration {
 	c.KVCacheTransferLatency = 100
 	c.Seed = 100100100
 	c.LoraModules = []LoraModule{}
-
 	return c
 }
 
@@ -173,20 +172,90 @@ var _ = Describe("Simulator configuration", func() {
 	}
 	tests = append(tests, test)
 
-	// Config from config.yaml file plus command line args with time to copy cache
+	// Config from basic-config.yaml file plus command line args with time to copy cache
 	c = createDefaultConfig(qwenModelName)
 	c.Port = 8001
 	// basic config file does not contain properties related to lora
 	c.MaxLoras = 1
 	c.MaxCPULoras = 1
 	c.KVCacheTransferLatency = 50
 	test = testCase{
-		name:           "config file with command line args with time to transfer kv-cache",
+		name:           "basic config file with command line args with time to transfer kv-cache",
 		args:           []string{"cmd", "--config", "../../manifests/basic-config.yaml", "--kv-cache-transfer-latency", "50"},
 		expectedConfig: c,
 	}
 	tests = append(tests, test)
 
+	// Config from config_with_fake.yaml file
+	c = createDefaultConfig(qwenModelName)
+	c.FakeMetrics = &Metrics{
+		RunningRequests:        16,
+		WaitingRequests:        3,
+		KVCacheUsagePercentage: float32(0.3),
+		LoraMetrics: []LorasMetrics{
+			{RunningLoras: "lora1,lora2", WaitingLoras: "lora3", Timestamp: 1257894567},
+			{RunningLoras: "lora1,lora3", WaitingLoras: "", Timestamp: 1257894569},
+		},
+		LorasString: []string{
+			"{\"running\":\"lora1,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567}",
+			"{\"running\":\"lora1,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}",
+		},
+	}
+	test = testCase{
+		name:           "config with fake metrics file",
+		args:           []string{"cmd", "--config", "../../manifests/config_with_fake.yaml"},
+		expectedConfig: c,
+	}
+	tests = append(tests, test)
+
+	// Fake metrics from command line
+	c = newConfig()
+	c.Model = model
+	c.ServedModelNames = []string{c.Model}
+	c.MaxCPULoras = 1
+	c.Seed = 100
+	c.FakeMetrics = &Metrics{
+		RunningRequests:        10,
+		WaitingRequests:        30,
+		KVCacheUsagePercentage: float32(0.4),
+		LoraMetrics: []LorasMetrics{
+			{RunningLoras: "lora4,lora2", WaitingLoras: "lora3", Timestamp: 1257894567},
+			{RunningLoras: "lora4,lora3", WaitingLoras: "", Timestamp: 1257894569},
+		},
+		LorasString: nil,
+	}
+	test = testCase{
+		name: "metrics from command line",
+		args: []string{"cmd", "--model", model, "--seed", "100",
+			"--fake-metrics",
+			"{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}]}",
+		},
+		expectedConfig: c,
+	}
+	tests = append(tests, test)
+
+	// Fake metrics from both the config file and command line
+	c = createDefaultConfig(qwenModelName)
+	c.FakeMetrics = &Metrics{
+		RunningRequests:        10,
+		WaitingRequests:        30,
+		KVCacheUsagePercentage: float32(0.4),
+		LoraMetrics: []LorasMetrics{
+			{RunningLoras: "lora4,lora2", WaitingLoras: "lora3", Timestamp: 1257894567},
+			{RunningLoras: "lora4,lora3", WaitingLoras: "", Timestamp: 1257894569},
+		},
+		LorasString: nil,
+	}
+	test = testCase{
+		name: "metrics from config file and command line",
+		args: []string{"cmd", "--config", "../../manifests/config_with_fake.yaml",
+			"--fake-metrics",
+			"{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}]}",
+		},
+		expectedConfig: c,
+	}
+	tests = append(tests, test)
+
 	for _, test := range tests {
 		When(test.name, func() {
 			It("should create correct configuration", func() {
@@ -298,6 +367,16 @@ var _ = Describe("Simulator configuration", func() {
 			args: []string{"cmd", "--event-batch-size", "-35",
 				"--config", "../../manifests/config.yaml"},
 		},
+		{
+			name: "invalid fake metrics: negative running requests",
+			args: []string{"cmd", "--fake-metrics", "{\"running-requests\":-10,\"waiting-requests\":30,\"kv-cache-usage\":0.4}",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid fake metrics: kv cache usage",
+			args: []string{"cmd", "--fake-metrics", "{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":40}",
+				"--config", "../../manifests/config.yaml"},
+		},
 	}
 
 	for _, test := range invalidTests {
diff --git a/pkg/llm-d-inference-sim/metrics.go b/pkg/llm-d-inference-sim/metrics.go
@@ -96,25 +96,40 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 	return nil
 }
 
-// setInitialPrometheusMetrics send default values to prometheus
+// setInitialPrometheusMetrics sets the initial values of the Prometheus metrics:
+// the fake metrics if they are configured, the default values otherwise
 func (s *VllmSimulator) setInitialPrometheusMetrics() {
+	// The gauges below start at zero unless fake metrics override them
+	var nRunningReqs, nWaitingReqs, kvCacheUsage float64
+	if s.config.FakeMetrics != nil {
+		nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
+		nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
+		kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
+	}
 	modelName := s.getDisplayedModelName(s.config.Model)
-	s.loraInfo.WithLabelValues(
-		strconv.Itoa(s.config.MaxLoras),
-		"",
-		"").Set(float64(time.Now().Unix()))
-
-	s.nRunningReqs = 0
-	s.runningRequests.WithLabelValues(
-		modelName).Set(float64(s.nRunningReqs))
-	s.waitingRequests.WithLabelValues(
-		modelName).Set(float64(0))
-	s.kvCacheUsagePercentage.WithLabelValues(
-		modelName).Set(float64(0))
+	s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
+	s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
+	s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
+
+	if s.config.FakeMetrics != nil && len(s.config.FakeMetrics.LoraMetrics) != 0 {
+		// One LoRA info value per fake entry, set to the timestamp of the entry
+		for _, metrics := range s.config.FakeMetrics.LoraMetrics {
+			s.loraInfo.WithLabelValues(
+				strconv.Itoa(s.config.MaxLoras),
+				metrics.RunningLoras,
+				metrics.WaitingLoras).Set(metrics.Timestamp)
+		}
+	} else {
+		// No fake LoRA entries: report empty running and waiting LoRA lists
+		// with the current Unix time as the value
+		s.loraInfo.WithLabelValues(
+			strconv.Itoa(s.config.MaxLoras),
+			"",
+			"").Set(float64(time.Now().Unix()))
+	}
 }
 
 // reportLoras sets information about loaded LoRA adapters
 func (s *VllmSimulator) reportLoras() {
+	if s.config.FakeMetrics != nil {
+		return
+	}
 	if s.loraInfo == nil {
 		// Happens in the tests
 		return
@@ -138,6 +153,9 @@ func (s *VllmSimulator) reportLoras() {
 
 // reportRunningRequests sets information about running completion requests
 func (s *VllmSimulator) reportRunningRequests() {
+	if s.config.FakeMetrics != nil {
+		return
+	}
 	if s.runningRequests != nil {
 		nRunningReqs := atomic.LoadInt64(&(s.nRunningReqs))
 		s.runningRequests.WithLabelValues(
@@ -147,6 +165,9 @@ func (s *VllmSimulator) reportRunningRequests() {
 
 // reportWaitingRequests sets information about waiting completion requests
 func (s *VllmSimulator) reportWaitingRequests() {
+	if s.config.FakeMetrics != nil {
+		return
+	}
 	if s.waitingRequests != nil {
 		nWaitingReqs := atomic.LoadInt64(&(s.nWaitingReqs))
 		s.waitingRequests.WithLabelValues(
diff --git a/pkg/llm-d-inference-sim/simulator_test.go b/pkg/llm-d-inference-sim/simulator_test.go