Skip to content

Commit 4309925

Browse files
authored
Support fake metrics (#144)
* Support fake metrics Signed-off-by: Ira <IRAR@il.ibm.com> * Readme Signed-off-by: Ira <IRAR@il.ibm.com> * Removed commented out code Signed-off-by: Ira <IRAR@il.ibm.com> --------- Signed-off-by: Ira <IRAR@il.ibm.com>
1 parent a080a17 commit 4309925

File tree

6 files changed

+257
-21
lines changed

6 files changed

+257
-21
lines changed

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,16 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
125125
- `zmq-endpoint`: ZMQ address to publish events
126126
- `event-batch-size`: the maximum number of kv-cache events to be sent together, defaults to 16
127127
-->
128+
- `fake-metrics`: a predefined set of metrics to be sent to Prometheus instead of the actual data. When specified, only these fake metrics are reported — real metrics and fake metrics are never reported simultaneously. On the command line the set is passed as a single JSON string (see the example below). The set should include values for:
129+
- `running-requests`
130+
- `waiting-requests`
131+
- `kv-cache-usage`
132+
- `loras` - an array of LoRA information objects, each with `running` (a comma-separated list of active LoRAs), `waiting` (a comma-separated list of LoRAs on hold), and a `timestamp` (a Unix timestamp in seconds, e.g. `1257894567`, reported as the value of the LoRA metric).
133+
134+
Example:
135+
{"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}
136+
137+
128138
In addition, as we are using klog, the following parameters are available:
129139
- `add_dir_header`: if true, adds the file directory to the header of the log messages
130140
- `alsologtostderr`: log to standard error as well as files (no effect when -logtostderr=true)

manifests/config_with_fake.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
model: "Qwen/Qwen2-0.5B"
2+
max-loras: 2
3+
max-cpu-loras: 5
4+
max-num-seqs: 5
5+
mode: "random"
6+
time-to-first-token: 2000
7+
inter-token-latency: 1000
8+
kv-cache-transfer-latency: 100
9+
seed: 100100100
10+
fake-metrics:
11+
running-requests: 16
12+
waiting-requests: 3
13+
kv-cache-usage: 0.3
14+
loras:
15+
- '{"running":"lora1,lora2","waiting":"lora3","timestamp":1257894567}'
16+
- '{"running":"lora1,lora3","waiting":"","timestamp":1257894569}'

pkg/common/config.go

Lines changed: 75 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ const (
3434
vLLMDefaultPort = 8000
3535
ModeRandom = "random"
3636
ModeEcho = "echo"
37+
dummy = "dummy"
3738
)
3839

3940
type Configuration struct {
@@ -127,6 +128,30 @@ type Configuration struct {
127128
ZMQEndpoint string `yaml:"zmq-endpoint"`
128129
// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
129130
EventBatchSize int `yaml:"event-batch-size"`
131+
132+
// FakeMetrics is a set of metrics to send to Prometheus instead of the real data
133+
FakeMetrics *Metrics `yaml:"fake-metrics"`
134+
}
135+
136+
// Metrics is the set of fake metric values that can be supplied through the
// `fake-metrics` configuration property, either as JSON (json tags) or as part
// of the YAML configuration file (yaml tags).
type Metrics struct {
137+
// LoraMetrics is the parsed list of LoRA metrics. It is decoded directly from
// the "loras" key when the metrics are given as JSON; for YAML input it is
// rebuilt from LorasString by unmarshalLoraFakeMetrics.
138+
LoraMetrics []LorasMetrics `json:"loras"`
139+
// LorasString is the raw "loras" list as read from YAML: each element is a
// JSON string describing one LorasMetrics entry.
LorasString []string `yaml:"loras"`
140+
// RunningRequests is the number of inference requests that are currently being processed
141+
RunningRequests int64 `yaml:"running-requests" json:"running-requests"`
142+
// WaitingRequests is the number of inference requests that are waiting to be processed
143+
WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"`
144+
// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
145+
KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"`
146+
}
147+
148+
// LorasMetrics describes one fake LoRA metric sample. It is decoded from a JSON
// object with the keys "running", "waiting" and "timestamp".
type LorasMetrics struct {
149+
// RunningLoras is a comma separated list of running LoRAs
150+
RunningLoras string `json:"running"`
151+
// WaitingLoras is a comma separated list of waiting LoRAs
152+
WaitingLoras string `json:"waiting"`
153+
// Timestamp is the timestamp of the metric; it is kept as float64 because it
// is reported as the value of the Prometheus LoRA metric.
154+
Timestamp float64 `json:"timestamp"`
130155
}
131156

132157
type LoraModule struct {
@@ -168,6 +193,29 @@ func (c *Configuration) unmarshalLoras() error {
168193
return nil
169194
}
170195

196+
func (c *Configuration) unmarshalFakeMetrics(fakeMetricsString string) error {
197+
var metrics *Metrics
198+
if err := json.Unmarshal([]byte(fakeMetricsString), &metrics); err != nil {
199+
return err
200+
}
201+
c.FakeMetrics = metrics
202+
return nil
203+
}
204+
205+
func (c *Configuration) unmarshalLoraFakeMetrics() error {
206+
if c.FakeMetrics != nil {
207+
c.FakeMetrics.LoraMetrics = make([]LorasMetrics, 0)
208+
for _, jsonStr := range c.FakeMetrics.LorasString {
209+
var lora LorasMetrics
210+
if err := json.Unmarshal([]byte(jsonStr), &lora); err != nil {
211+
return err
212+
}
213+
c.FakeMetrics.LoraMetrics = append(c.FakeMetrics.LoraMetrics, lora)
214+
}
215+
}
216+
return nil
217+
}
218+
171219
func newConfig() *Configuration {
172220
return &Configuration{
173221
Port: vLLMDefaultPort,
@@ -199,7 +247,14 @@ func (c *Configuration) load(configFile string) error {
199247
return fmt.Errorf("failed to unmarshal configuration: %s", err)
200248
}
201249

202-
return c.unmarshalLoras()
250+
if err := c.unmarshalLoras(); err != nil {
251+
return err
252+
}
253+
if err := c.unmarshalLoraFakeMetrics(); err != nil {
254+
return err
255+
}
256+
257+
return nil
203258
}
204259

205260
func (c *Configuration) validate() error {
@@ -299,6 +354,15 @@ func (c *Configuration) validate() error {
299354
if c.EventBatchSize < 1 {
300355
return errors.New("event batch size cannot less than 1")
301356
}
357+
358+
if c.FakeMetrics != nil {
359+
if c.FakeMetrics.RunningRequests < 0 || c.FakeMetrics.WaitingRequests < 0 {
360+
return errors.New("fake metrics request counters cannot be negative")
361+
}
362+
if c.FakeMetrics.KVCacheUsagePercentage < 0 || c.FakeMetrics.KVCacheUsagePercentage > 1 {
363+
return errors.New("fake metrics KV cache usage must be between 0 ans 1")
364+
}
365+
}
302366
return nil
303367
}
304368

@@ -316,6 +380,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
316380

317381
servedModelNames := getParamValueFromArgs("served-model-name")
318382
loraModuleNames := getParamValueFromArgs("lora-modules")
383+
fakeMetrics := getParamValueFromArgs("fake-metrics")
319384

320385
f := pflag.NewFlagSet("llm-d-inference-sim flags", pflag.ContinueOnError)
321386

@@ -358,9 +423,11 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
358423
var dummyMultiString multiString
359424
f.Var(&dummyMultiString, "served-model-name", "Model names exposed by the API (a list of space-separated strings)")
360425
f.Var(&dummyMultiString, "lora-modules", "List of LoRA adapters (a list of space-separated JSON strings)")
426+
f.Var(&dummyMultiString, "fake-metrics", "A set of metrics to send to Prometheus instead of the real data")
361427
// In order to allow empty arguments, we set a dummy NoOptDefVal for these flags
362-
f.Lookup("served-model-name").NoOptDefVal = "dummy"
363-
f.Lookup("lora-modules").NoOptDefVal = "dummy"
428+
f.Lookup("served-model-name").NoOptDefVal = dummy
429+
f.Lookup("lora-modules").NoOptDefVal = dummy
430+
f.Lookup("fake-metrics").NoOptDefVal = dummy
364431

365432
flagSet := flag.NewFlagSet("simFlagSet", flag.ExitOnError)
366433
klog.InitFlags(flagSet)
@@ -381,6 +448,11 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
381448
return nil, err
382449
}
383450
}
451+
if fakeMetrics != nil {
452+
if err := config.unmarshalFakeMetrics(fakeMetrics[0]); err != nil {
453+
return nil, err
454+
}
455+
}
384456
if servedModelNames != nil {
385457
config.ServedModelNames = servedModelNames
386458
}

pkg/common/config_test.go

Lines changed: 82 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ func createDefaultConfig(model string) *Configuration {
5151
c.KVCacheTransferLatency = 100
5252
c.Seed = 100100100
5353
c.LoraModules = []LoraModule{}
54-
5554
return c
5655
}
5756

@@ -173,20 +172,90 @@ var _ = Describe("Simulator configuration", func() {
173172
}
174173
tests = append(tests, test)
175174

176-
// Config from config.yaml file plus command line args with time to copy cache
175+
// Config from basic-config.yaml file plus command line args with time to copy cache
177176
c = createDefaultConfig(qwenModelName)
178177
c.Port = 8001
179178
// basic config file does not contain properties related to lora
180179
c.MaxLoras = 1
181180
c.MaxCPULoras = 1
182181
c.KVCacheTransferLatency = 50
183182
test = testCase{
184-
name: "config file with command line args with time to transfer kv-cache",
183+
name: "basic config file with command line args with time to transfer kv-cache",
185184
args: []string{"cmd", "--config", "../../manifests/basic-config.yaml", "--kv-cache-transfer-latency", "50"},
186185
expectedConfig: c,
187186
}
188187
tests = append(tests, test)
189188

189+
// Config from config_with_fake.yaml file
190+
c = createDefaultConfig(qwenModelName)
191+
c.FakeMetrics = &Metrics{
192+
RunningRequests: 16,
193+
WaitingRequests: 3,
194+
KVCacheUsagePercentage: float32(0.3),
195+
LoraMetrics: []LorasMetrics{
196+
{RunningLoras: "lora1,lora2", WaitingLoras: "lora3", Timestamp: 1257894567},
197+
{RunningLoras: "lora1,lora3", WaitingLoras: "", Timestamp: 1257894569},
198+
},
199+
LorasString: []string{
200+
"{\"running\":\"lora1,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567}",
201+
"{\"running\":\"lora1,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}",
202+
},
203+
}
204+
test = testCase{
205+
name: "config with fake metrics file",
206+
args: []string{"cmd", "--config", "../../manifests/config_with_fake.yaml"},
207+
expectedConfig: c,
208+
}
209+
tests = append(tests, test)
210+
211+
// Fake metrics from command line
212+
c = newConfig()
213+
c.Model = model
214+
c.ServedModelNames = []string{c.Model}
215+
c.MaxCPULoras = 1
216+
c.Seed = 100
217+
c.FakeMetrics = &Metrics{
218+
RunningRequests: 10,
219+
WaitingRequests: 30,
220+
KVCacheUsagePercentage: float32(0.4),
221+
LoraMetrics: []LorasMetrics{
222+
{RunningLoras: "lora4,lora2", WaitingLoras: "lora3", Timestamp: 1257894567},
223+
{RunningLoras: "lora4,lora3", WaitingLoras: "", Timestamp: 1257894569},
224+
},
225+
LorasString: nil,
226+
}
227+
test = testCase{
228+
name: "metrics from command line",
229+
args: []string{"cmd", "--model", model, "--seed", "100",
230+
"--fake-metrics",
231+
"{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}]}",
232+
},
233+
expectedConfig: c,
234+
}
235+
tests = append(tests, test)
236+
237+
// Fake metrics from both the config file and command line
238+
c = createDefaultConfig(qwenModelName)
239+
c.FakeMetrics = &Metrics{
240+
RunningRequests: 10,
241+
WaitingRequests: 30,
242+
KVCacheUsagePercentage: float32(0.4),
243+
LoraMetrics: []LorasMetrics{
244+
{RunningLoras: "lora4,lora2", WaitingLoras: "lora3", Timestamp: 1257894567},
245+
{RunningLoras: "lora4,lora3", WaitingLoras: "", Timestamp: 1257894569},
246+
},
247+
LorasString: nil,
248+
}
249+
test = testCase{
250+
name: "metrics from config file and command line",
251+
args: []string{"cmd", "--config", "../../manifests/config_with_fake.yaml",
252+
"--fake-metrics",
253+
"{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}]}",
254+
},
255+
expectedConfig: c,
256+
}
257+
tests = append(tests, test)
258+
190259
for _, test := range tests {
191260
When(test.name, func() {
192261
It("should create correct configuration", func() {
@@ -298,6 +367,16 @@ var _ = Describe("Simulator configuration", func() {
298367
args: []string{"cmd", "--event-batch-size", "-35",
299368
"--config", "../../manifests/config.yaml"},
300369
},
370+
{
371+
name: "invalid fake metrics: negative running requests",
372+
args: []string{"cmd", "--fake-metrics", "{\"running-requests\":-10,\"waiting-requests\":30,\"kv-cache-usage\":0.4}",
373+
"--config", "../../manifests/config.yaml"},
374+
},
375+
{
376+
name: "invalid fake metrics: kv cache usage",
377+
args: []string{"cmd", "--fake-metrics", "{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":40}",
378+
"--config", "../../manifests/config.yaml"},
379+
},
301380
}
302381

303382
for _, test := range invalidTests {

pkg/llm-d-inference-sim/metrics.go

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -96,25 +96,40 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
9696
return nil
9797
}
9898

99-
// setInitialPrometheusMetrics send default values to prometheus
99+
// setInitialPrometheusMetrics sends the default values to prometheus or
100+
// the fake metrics if set
100101
func (s *VllmSimulator) setInitialPrometheusMetrics() {
102+
var nRunningReqs, nWaitingReqs, kvCacheUsage float64
103+
if s.config.FakeMetrics != nil {
104+
nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
105+
nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
106+
kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
107+
}
101108
modelName := s.getDisplayedModelName(s.config.Model)
102-
s.loraInfo.WithLabelValues(
103-
strconv.Itoa(s.config.MaxLoras),
104-
"",
105-
"").Set(float64(time.Now().Unix()))
106-
107-
s.nRunningReqs = 0
108-
s.runningRequests.WithLabelValues(
109-
modelName).Set(float64(s.nRunningReqs))
110-
s.waitingRequests.WithLabelValues(
111-
modelName).Set(float64(0))
112-
s.kvCacheUsagePercentage.WithLabelValues(
113-
modelName).Set(float64(0))
109+
s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
110+
s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
111+
s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
112+
113+
if s.config.FakeMetrics != nil && len(s.config.FakeMetrics.LoraMetrics) != 0 {
114+
for _, metrics := range s.config.FakeMetrics.LoraMetrics {
115+
s.loraInfo.WithLabelValues(
116+
strconv.Itoa(s.config.MaxLoras),
117+
metrics.RunningLoras,
118+
metrics.WaitingLoras).Set(metrics.Timestamp)
119+
}
120+
} else {
121+
s.loraInfo.WithLabelValues(
122+
strconv.Itoa(s.config.MaxLoras),
123+
"",
124+
"").Set(float64(time.Now().Unix()))
125+
}
114126
}
115127

116128
// reportLoras sets information about loaded LoRA adapters
117129
func (s *VllmSimulator) reportLoras() {
130+
if s.config.FakeMetrics != nil {
131+
return
132+
}
118133
if s.loraInfo == nil {
119134
// Happens in the tests
120135
return
@@ -138,6 +153,9 @@ func (s *VllmSimulator) reportLoras() {
138153

139154
// reportRunningRequests sets information about running completion requests
140155
func (s *VllmSimulator) reportRunningRequests() {
156+
if s.config.FakeMetrics != nil {
157+
return
158+
}
141159
if s.runningRequests != nil {
142160
nRunningReqs := atomic.LoadInt64(&(s.nRunningReqs))
143161
s.runningRequests.WithLabelValues(
@@ -147,6 +165,9 @@ func (s *VllmSimulator) reportRunningRequests() {
147165

148166
// reportWaitingRequests sets information about waiting completion requests
149167
func (s *VllmSimulator) reportWaitingRequests() {
168+
if s.config.FakeMetrics != nil {
169+
return
170+
}
150171
if s.waitingRequests != nil {
151172
nWaitingReqs := atomic.LoadInt64(&(s.nWaitingReqs))
152173
s.waitingRequests.WithLabelValues(

0 commit comments

Comments
 (0)