Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,16 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
- `zmq-endpoint`: ZMQ address to publish events
- `event-batch-size`: the maximum number of kv-cache events to be sent together, defaults to 16
-->
- `fake-metrics`: represents a predefined set of metrics to be sent to Prometheus as a substitute for the actual data. When specified, only these fake metrics will be reported — real metrics and fake metrics will never be reported simultaneously. The set should include values for
- `running-requests`
- `waiting-requests`
- `kv-cache-usage`
- `loras` - an array containing LoRA information objects, each with `running` (a comma-separated list of active LoRAs), `waiting` (a comma-separated list of LoRAs on hold), and a `timestamp`.

Example:
{"running-requests":10,"waiting-requests":30,"kv-cache-usage":0.4,"loras":[{"running":"lora4,lora2","waiting":"lora3","timestamp":1257894567},{"running":"lora4,lora3","waiting":"","timestamp":1257894569}]}


In addition, as we are using klog, the following parameters are available:
- `add_dir_header`: if true, adds the file directory to the header of the log messages
- `alsologtostderr`: log to standard error as well as files (no effect when -logtostderr=true)
Expand Down
16 changes: 16 additions & 0 deletions manifests/config_with_fake.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Simulator configuration exercising the fake-metrics feature.
model: "Qwen/Qwen2-0.5B"
max-loras: 2
max-cpu-loras: 5
max-num-seqs: 5
mode: "random"
time-to-first-token: 2000
inter-token-latency: 1000
kv-cache-transfer-latency: 100
seed: 100100100
# When fake-metrics is set, the simulator reports these values to
# Prometheus instead of the real measurements.
fake-metrics:
  running-requests: 16
  waiting-requests: 3
  kv-cache-usage: 0.3
  # Each entry is a JSON string that is decoded into a LoRA metrics object
  # after the config file is loaded.
  loras:
    - '{"running":"lora1,lora2","waiting":"lora3","timestamp":1257894567}'
    - '{"running":"lora1,lora3","waiting":"","timestamp":1257894569}'
78 changes: 75 additions & 3 deletions pkg/common/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ const (
vLLMDefaultPort = 8000
ModeRandom = "random"
ModeEcho = "echo"
dummy = "dummy"
)

type Configuration struct {
Expand Down Expand Up @@ -127,6 +128,30 @@ type Configuration struct {
ZMQEndpoint string `yaml:"zmq-endpoint"`
// EventBatchSize is the maximum number of kv-cache events to be sent together, defaults to 16
EventBatchSize int `yaml:"event-batch-size"`

// FakeMetrics is a set of metrics to send to Prometheus instead of the real data
FakeMetrics *Metrics `yaml:"fake-metrics"`
}

// Metrics is a set of fake metric values that, when configured, are
// reported to Prometheus instead of the simulator's real measurements.
type Metrics struct {
	// LoraMetrics is the structured form of the LoRA metrics. It is filled
	// either directly from JSON (command-line --fake-metrics) or decoded
	// from LorasString after a YAML config file is loaded.
	LoraMetrics []LorasMetrics `json:"loras"`
	// LorasString holds the raw JSON strings for the LoRA metrics as read
	// from a YAML config file; each entry is later unmarshaled into LoraMetrics.
	LorasString []string `yaml:"loras"`
	// RunningRequests is the number of inference requests that are currently being processed
	RunningRequests int64 `yaml:"running-requests" json:"running-requests"`
	// WaitingRequests is the number of inference requests that are waiting to be processed
	WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"`
	// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
	KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"`
}

// LorasMetrics is a single fake sample of LoRA adapter state.
type LorasMetrics struct {
	// RunningLoras is a comma separated list of running LoRAs
	RunningLoras string `json:"running"`
	// WaitingLoras is a comma separated list of waiting LoRAs
	WaitingLoras string `json:"waiting"`
	// Timestamp is the timestamp of the metric
	Timestamp float64 `json:"timestamp"`
}

type LoraModule struct {
Expand Down Expand Up @@ -168,6 +193,29 @@ func (c *Configuration) unmarshalLoras() error {
return nil
}

// unmarshalFakeMetrics parses a JSON-encoded fake-metrics definition
// (as passed on the command line) and stores it in the configuration.
// A JSON "null" leaves FakeMetrics nil.
func (c *Configuration) unmarshalFakeMetrics(fakeMetricsString string) error {
	var parsed *Metrics
	err := json.Unmarshal([]byte(fakeMetricsString), &parsed)
	if err != nil {
		return err
	}
	c.FakeMetrics = parsed
	return nil
}

// unmarshalLoraFakeMetrics decodes the raw JSON strings collected in
// FakeMetrics.LorasString (from a YAML config file) into structured
// LorasMetrics entries. It is a no-op when fake metrics are not configured.
// Returns the first JSON decoding error encountered, if any.
func (c *Configuration) unmarshalLoraFakeMetrics() error {
	if c.FakeMetrics == nil {
		return nil
	}
	// Pre-size to the known number of entries to avoid re-allocations.
	c.FakeMetrics.LoraMetrics = make([]LorasMetrics, 0, len(c.FakeMetrics.LorasString))
	for _, jsonStr := range c.FakeMetrics.LorasString {
		var lora LorasMetrics
		if err := json.Unmarshal([]byte(jsonStr), &lora); err != nil {
			return err
		}
		c.FakeMetrics.LoraMetrics = append(c.FakeMetrics.LoraMetrics, lora)
	}
	return nil
}

func newConfig() *Configuration {
return &Configuration{
Port: vLLMDefaultPort,
Expand Down Expand Up @@ -199,7 +247,14 @@ func (c *Configuration) load(configFile string) error {
return fmt.Errorf("failed to unmarshal configuration: %s", err)
}

return c.unmarshalLoras()
if err := c.unmarshalLoras(); err != nil {
return err
}
if err := c.unmarshalLoraFakeMetrics(); err != nil {
return err
}

return nil
}

func (c *Configuration) validate() error {
Expand Down Expand Up @@ -299,6 +354,15 @@ func (c *Configuration) validate() error {
if c.EventBatchSize < 1 {
return errors.New("event batch size cannot less than 1")
}

if c.FakeMetrics != nil {
if c.FakeMetrics.RunningRequests < 0 || c.FakeMetrics.WaitingRequests < 0 {
return errors.New("fake metrics request counters cannot be negative")
}
if c.FakeMetrics.KVCacheUsagePercentage < 0 || c.FakeMetrics.KVCacheUsagePercentage > 1 {
return errors.New("fake metrics KV cache usage must be between 0 ans 1")
}
}
return nil
}

Expand All @@ -316,6 +380,7 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {

servedModelNames := getParamValueFromArgs("served-model-name")
loraModuleNames := getParamValueFromArgs("lora-modules")
fakeMetrics := getParamValueFromArgs("fake-metrics")

f := pflag.NewFlagSet("llm-d-inference-sim flags", pflag.ContinueOnError)

Expand Down Expand Up @@ -358,9 +423,11 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
var dummyMultiString multiString
f.Var(&dummyMultiString, "served-model-name", "Model names exposed by the API (a list of space-separated strings)")
f.Var(&dummyMultiString, "lora-modules", "List of LoRA adapters (a list of space-separated JSON strings)")
f.Var(&dummyMultiString, "fake-metrics", "A set of metrics to send to Prometheus instead of the real data")
// In order to allow empty arguments, we set a dummy NoOptDefVal for these flags
f.Lookup("served-model-name").NoOptDefVal = "dummy"
f.Lookup("lora-modules").NoOptDefVal = "dummy"
f.Lookup("served-model-name").NoOptDefVal = dummy
f.Lookup("lora-modules").NoOptDefVal = dummy
f.Lookup("fake-metrics").NoOptDefVal = dummy

flagSet := flag.NewFlagSet("simFlagSet", flag.ExitOnError)
klog.InitFlags(flagSet)
Expand All @@ -381,6 +448,11 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
return nil, err
}
}
if fakeMetrics != nil {
if err := config.unmarshalFakeMetrics(fakeMetrics[0]); err != nil {
return nil, err
}
}
if servedModelNames != nil {
config.ServedModelNames = servedModelNames
}
Expand Down
85 changes: 82 additions & 3 deletions pkg/common/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ func createDefaultConfig(model string) *Configuration {
c.KVCacheTransferLatency = 100
c.Seed = 100100100
c.LoraModules = []LoraModule{}

return c
}

Expand Down Expand Up @@ -173,20 +172,90 @@ var _ = Describe("Simulator configuration", func() {
}
tests = append(tests, test)

// Config from config.yaml file plus command line args with time to copy cache
// Config from basic-config.yaml file plus command line args with time to copy cache
c = createDefaultConfig(qwenModelName)
c.Port = 8001
// basic config file does not contain properties related to lora
c.MaxLoras = 1
c.MaxCPULoras = 1
c.KVCacheTransferLatency = 50
test = testCase{
name: "config file with command line args with time to transfer kv-cache",
name: "basic config file with command line args with time to transfer kv-cache",
args: []string{"cmd", "--config", "../../manifests/basic-config.yaml", "--kv-cache-transfer-latency", "50"},
expectedConfig: c,
}
tests = append(tests, test)

// Config from config_with_fake.yaml file
c = createDefaultConfig(qwenModelName)
c.FakeMetrics = &Metrics{
RunningRequests: 16,
WaitingRequests: 3,
KVCacheUsagePercentage: float32(0.3),
LoraMetrics: []LorasMetrics{
{RunningLoras: "lora1,lora2", WaitingLoras: "lora3", Timestamp: 1257894567},
{RunningLoras: "lora1,lora3", WaitingLoras: "", Timestamp: 1257894569},
},
LorasString: []string{
"{\"running\":\"lora1,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567}",
"{\"running\":\"lora1,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}",
},
}
test = testCase{
name: "config with fake metrics file",
args: []string{"cmd", "--config", "../../manifests/config_with_fake.yaml"},
expectedConfig: c,
}
tests = append(tests, test)

// Fake metrics from command line
c = newConfig()
c.Model = model
c.ServedModelNames = []string{c.Model}
c.MaxCPULoras = 1
c.Seed = 100
c.FakeMetrics = &Metrics{
RunningRequests: 10,
WaitingRequests: 30,
KVCacheUsagePercentage: float32(0.4),
LoraMetrics: []LorasMetrics{
{RunningLoras: "lora4,lora2", WaitingLoras: "lora3", Timestamp: 1257894567},
{RunningLoras: "lora4,lora3", WaitingLoras: "", Timestamp: 1257894569},
},
LorasString: nil,
}
test = testCase{
name: "metrics from command line",
args: []string{"cmd", "--model", model, "--seed", "100",
"--fake-metrics",
"{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}]}",
},
expectedConfig: c,
}
tests = append(tests, test)

// Fake metrics from both the config file and command line
c = createDefaultConfig(qwenModelName)
c.FakeMetrics = &Metrics{
RunningRequests: 10,
WaitingRequests: 30,
KVCacheUsagePercentage: float32(0.4),
LoraMetrics: []LorasMetrics{
{RunningLoras: "lora4,lora2", WaitingLoras: "lora3", Timestamp: 1257894567},
{RunningLoras: "lora4,lora3", WaitingLoras: "", Timestamp: 1257894569},
},
LorasString: nil,
}
test = testCase{
name: "metrics from config file and command line",
args: []string{"cmd", "--config", "../../manifests/config_with_fake.yaml",
"--fake-metrics",
"{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":0.4,\"loras\":[{\"running\":\"lora4,lora2\",\"waiting\":\"lora3\",\"timestamp\":1257894567},{\"running\":\"lora4,lora3\",\"waiting\":\"\",\"timestamp\":1257894569}]}",
},
expectedConfig: c,
}
tests = append(tests, test)

for _, test := range tests {
When(test.name, func() {
It("should create correct configuration", func() {
Expand Down Expand Up @@ -298,6 +367,16 @@ var _ = Describe("Simulator configuration", func() {
args: []string{"cmd", "--event-batch-size", "-35",
"--config", "../../manifests/config.yaml"},
},
{
name: "invalid fake metrics: negative running requests",
args: []string{"cmd", "--fake-metrics", "{\"running-requests\":-10,\"waiting-requests\":30,\"kv-cache-usage\":0.4}",
"--config", "../../manifests/config.yaml"},
},
{
name: "invalid fake metrics: kv cache usage",
args: []string{"cmd", "--fake-metrics", "{\"running-requests\":10,\"waiting-requests\":30,\"kv-cache-usage\":40}",
"--config", "../../manifests/config.yaml"},
},
}

for _, test := range invalidTests {
Expand Down
47 changes: 34 additions & 13 deletions pkg/llm-d-inference-sim/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,25 +96,40 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
return nil
}

// setInitialPrometheusMetrics sends the default values to prometheus or
// the fake metrics if set.
// NOTE(review): this span contained interleaved removed+added diff lines
// (duplicate gauge initialization); reconstructed to the post-merge version.
func (s *VllmSimulator) setInitialPrometheusMetrics() {
	// Gauges start at zero unless fake metrics override them.
	var nRunningReqs, nWaitingReqs, kvCacheUsage float64
	if s.config.FakeMetrics != nil {
		nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
		nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
		kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
	}
	modelName := s.getDisplayedModelName(s.config.Model)
	s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
	s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
	s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)

	if s.config.FakeMetrics != nil && len(s.config.FakeMetrics.LoraMetrics) != 0 {
		// Report one loraInfo sample per fake LoRA metrics entry, using the
		// entry's own timestamp as the gauge value.
		for _, metrics := range s.config.FakeMetrics.LoraMetrics {
			s.loraInfo.WithLabelValues(
				strconv.Itoa(s.config.MaxLoras),
				metrics.RunningLoras,
				metrics.WaitingLoras).Set(metrics.Timestamp)
		}
	} else {
		// No fake LoRA data: report an empty LoRA state stamped with "now".
		s.loraInfo.WithLabelValues(
			strconv.Itoa(s.config.MaxLoras),
			"",
			"").Set(float64(time.Now().Unix()))
	}
}

// reportLoras sets information about loaded LoRA adapters
func (s *VllmSimulator) reportLoras() {
if s.config.FakeMetrics != nil {
return
}
if s.loraInfo == nil {
// Happens in the tests
return
Expand All @@ -138,6 +153,9 @@ func (s *VllmSimulator) reportLoras() {

// reportRunningRequests sets information about running completion requests
func (s *VllmSimulator) reportRunningRequests() {
if s.config.FakeMetrics != nil {
return
}
if s.runningRequests != nil {
nRunningReqs := atomic.LoadInt64(&(s.nRunningReqs))
s.runningRequests.WithLabelValues(
Expand All @@ -147,6 +165,9 @@ func (s *VllmSimulator) reportRunningRequests() {

// reportWaitingRequests sets information about waiting completion requests
func (s *VllmSimulator) reportWaitingRequests() {
if s.config.FakeMetrics != nil {
return
}
if s.waitingRequests != nil {
nWaitingReqs := atomic.LoadInt64(&(s.nWaitingReqs))
s.waitingRequests.WithLabelValues(
Expand Down
Loading
Loading