
Commit de71f5d

Authored by Maya Barnea <mayab@il.ibm.com>
Additional latency related metrics (#237)
* Add e2e request latency histogram to Prometheus metrics; add a reportHistogramValue function to be used for reporting values in histogram metrics
* Additional metrics: vllm:request_queue_time_seconds, vllm:request_inference_time_seconds, vllm:request_prefill_time_seconds, and vllm:request_decode_time_seconds
* Fix typo in metric name
* Initial tests for new metrics + create constant for part of metrics names
* Fix bug in metrics test + add latency test for streaming mode
* Move common simulator test helper functions to test_utils.go, use the same model name in all tests, refactoring in server start functions
* Add tests for vllm:request_queue_time_seconds and vllm:request_inference_time_seconds
* Define constants for metrics names, use helper functions in metrics test for histogram buckets validation
* Add full list of supported metrics to README; create constants for all metrics; define all latency related fake metrics in config; add validation for new fake metrics in config
* Add license to test_utils.go
* Set fake latency metrics if defined in configuration, add tests for latency fake metrics
* Add fake latency metrics test
* Fix sending latency metrics, use the WriteToChannel function
* Fix merge

Signed-off-by: Maya Barnea <mayab@il.ibm.com>
1 parent cd611e8 commit de71f5d

17 files changed, +1193 -706 lines changed

README.md

Lines changed: 12 additions & 1 deletion
@@ -26,7 +26,18 @@ In addition, it supports a subset of vLLM's Prometheus metrics. These metrics ar
 | vllm:lora_requests_info | Running stats on LoRA requests |
 | vllm:num_requests_running | Number of requests currently running on GPU |
 | vllm:num_requests_waiting | Prometheus metric for the number of queued requests |
+| vllm:e2e_request_latency_seconds | Histogram of end to end request latency in seconds |
+| vllm:request_inference_time_seconds | Histogram of time spent in RUNNING phase for request |
+| vllm:request_queue_time_seconds | Histogram of time spent in WAITING phase for request |
+| vllm:request_prefill_time_seconds | Histogram of time spent in PREFILL phase for request |
+| vllm:request_decode_time_seconds | Histogram of time spent in DECODE phase for request |
+| vllm:time_to_first_token_seconds | Histogram of time to first token in seconds |
+| vllm:time_per_output_token_seconds | Histogram of time per output token in seconds |
+| vllm:request_generation_tokens | Number of generation tokens processed |
+| vllm:request_params_max_tokens | Histogram of the max_tokens request parameter |
+| vllm:request_prompt_tokens | Number of prefill tokens processed |
+| vllm:request_success_total | Count of successfully processed requests |

 The simulated inference has no connection with the model and LoRA adapters specified in the command line parameters or via the /v1/load_lora_adapter HTTP REST endpoint. The /v1/models endpoint returns simulated results based on those same command line parameters and those loaded via the /v1/load_lora_adapter HTTP REST endpoint.

 The simulator supports two modes of operation:
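To sanity-check that the new latency histograms are actually exposed, one can scrape the simulator's /metrics endpoint and filter for one of the names in the table above. The snippet below is a minimal sketch, not part of this commit; it assumes a simulator instance listening on http://localhost:8000.

```go
package main

import (
	"bufio"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// Assumes the simulator is running locally; adjust the address as needed.
	resp, err := http.Get("http://localhost:8000/metrics")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Print only the e2e request latency histogram series (buckets, sum, count).
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		if strings.Contains(line, "vllm:e2e_request_latency_seconds") {
			fmt.Println(line)
		}
	}
	if err := scanner.Err(); err != nil {
		panic(err)
	}
}
```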

pkg/common/config.go

Lines changed: 53 additions & 5 deletions
@@ -232,16 +232,17 @@ type Metrics struct {
 	WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"`
 	// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
 	KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"`
-	// TTFTBuckets is an array of values for time-to-first-token buckets,
-	// each value in this array is a value for the corresponding bucket.
+
+	// Histogram metrics - defined by array of values.
+	// Each value in this array is a value for the corresponding bucket.
 	// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
+
+	// TTFTBuckets is an array of values for time-to-first-token buckets.
 	// Buckets upper boundaries in seconds are:
 	// 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
 	// 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0, +Inf
 	TTFTBucketValues []int `yaml:"ttft-buckets-values" json:"ttft-buckets-values"`
-	// TPOTBuckets is an array of values for time-per-output-token buckets,
-	// each value in this array is a value for the corresponding bucket.
-	// Array may contain less values than number of buckets, all trailing missing values assumed as 0.
+	// TPOTBuckets is an array of values for time-per-output-token buckets.
 	// Buckets upper boundaries in seconds are:
 	// 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
 	// 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, +Inf

@@ -253,6 +254,21 @@ type Metrics struct {
 	RequestParamsMaxTokens []int `yaml:"request-params-max-tokens" json:"request-params-max-tokens"` // max_tokens parameter samples
 	// RequestSuccessTotal is the number of successful requests, key: finish-reason (stop, length, etc.).
 	RequestSuccessTotal map[string]int64 `yaml:"request-success-total" json:"request-success-total"`
+
+	// Latency histograms - have same buckets upper boundaries in seconds are:
+	// 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
+	// 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0, +Inf
+
+	// E2ERequestLatencyBucketValues is an array of values for e2e request latency buckets.
+	E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values" json:"e2erl-buckets-values"`
+	// ReqQueueTimeBucketValues is an array of values for request queue time buckets.
+	ReqQueueTimeBucketValues []int `yaml:"queue-time-buckets-values" json:"queue-time-buckets-values"`
+	// ReqInfTimeBucketValues is an array of values for request inference time buckets.
+	ReqInfTimeBucketValues []int `yaml:"inf-time-buckets-values" json:"inf-time-buckets-values"`
+	// ReqPrefillTimeBucketValues is an array of values for request prefill time buckets.
+	ReqPrefillTimeBucketValues []int `yaml:"prefill-time-buckets-values" json:"prefill-time-buckets-values"`
+	// ReqDecodeTimeBucketValues is an array of values for request decode time buckets.
+	ReqDecodeTimeBucketValues []int `yaml:"decode-time-buckets-values" json:"decode-time-buckets-values"`
 }

 type LorasMetrics struct {
@@ -588,6 +604,38 @@ func (c *Configuration) validate() error {
 				return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
 			}
 		}
+
+		for _, v := range c.FakeMetrics.RequestParamsMaxTokens {
+			if v < 0 {
+				return errors.New("fake metrics request-params-max-tokens cannot contain negative values")
+			}
+		}
+
+		for _, v := range c.FakeMetrics.E2ERequestLatencyBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics e2erl-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqQueueTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics queue-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqInfTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics inf-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqPrefillTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics prefill-time-buckets-values cannot contain negative values")
+			}
+		}
+		for _, v := range c.FakeMetrics.ReqDecodeTimeBucketValues {
+			if v < 0 {
+				return errors.New("fake metrics decode-time-buckets-values cannot contain negative values")
+			}
+		}
 	}

 	if c.DPSize < 1 || c.DPSize > 8 {
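Given the yaml tags added to the Metrics struct above, a fake latency-metrics fragment could look like the sketch below. This is an illustrative, hypothetical example (the surrounding fake-metrics configuration layout is not shown in this diff): it mirrors only the new fields and their yaml keys, and relies on the documented semantics that each array entry is the count for the corresponding bucket, with trailing buckets defaulting to 0.

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// fakeLatencyMetrics mirrors only the new latency-histogram fields of the
// Metrics struct above (field names and yaml tags taken from the diff).
type fakeLatencyMetrics struct {
	E2ERequestLatencyBucketValues []int `yaml:"e2erl-buckets-values"`
	ReqQueueTimeBucketValues      []int `yaml:"queue-time-buckets-values"`
	ReqInfTimeBucketValues        []int `yaml:"inf-time-buckets-values"`
	ReqPrefillTimeBucketValues    []int `yaml:"prefill-time-buckets-values"`
	ReqDecodeTimeBucketValues     []int `yaml:"decode-time-buckets-values"`
}

func main() {
	// Hypothetical fragment: per-bucket counts; buckets not listed default to 0.
	doc := `
e2erl-buckets-values: [0, 3, 10, 5]
queue-time-buckets-values: [7, 2]
inf-time-buckets-values: [1, 4, 4]
prefill-time-buckets-values: [6]
decode-time-buckets-values: [0, 0, 9]
`
	var m fakeLatencyMetrics
	if err := yaml.Unmarshal([]byte(doc), &m); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", m)
}
```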

pkg/common/utils.go

Lines changed: 3 additions & 0 deletions
@@ -32,6 +32,9 @@ var TTFTBucketsBoundaries = []float64{0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08
 var TPOTBucketsBoundaries = []float64{0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
 	1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0}

+var RequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
+	20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0}
+
 // ValidateContextWindow checks if the request fits within the model's context window
 // Returns validation result, actual completion tokens, and total tokens
 func ValidateContextWindow(promptTokens int, maxCompletionTokens *int64, maxModelLen int) (bool, int64, int64) {
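The commit message also mentions a reportHistogramValue helper used to record values into histogram metrics. Its implementation is not shown in this view, so the following is only a rough sketch of how the new RequestLatencyBucketsBoundaries could back a client_golang histogram such as vllm:e2e_request_latency_seconds; the metric variable, label set, and helper signature here are assumptions, not the simulator's actual code.

```go
package main

import (
	"github.com/prometheus/client_golang/prometheus"
)

// Bucket upper boundaries as added in pkg/common/utils.go.
var RequestLatencyBucketsBoundaries = []float64{0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0,
	20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0}

// e2eRequestLatency is a stand-in for the simulator's vllm:e2e_request_latency_seconds
// histogram; the real metric definition and labels may differ.
var e2eRequestLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{
	Name:    "vllm:e2e_request_latency_seconds",
	Help:    "Histogram of end to end request latency in seconds.",
	Buckets: RequestLatencyBucketsBoundaries,
}, []string{"model_name"})

// reportHistogramValue records one observation on a histogram; this is a guess at the
// helper's intent, not its actual signature.
func reportHistogramValue(o prometheus.Observer, value float64) {
	o.Observe(value)
}

func main() {
	registry := prometheus.NewRegistry()
	registry.MustRegister(e2eRequestLatency)

	// Record a 1.7s end-to-end latency for one request of a hypothetical model.
	reportHistogramValue(e2eRequestLatency.WithLabelValues("my-model"), 1.7)
}
```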

pkg/llm-d-inference-sim/failures_test.go

Lines changed: 23 additions & 23 deletions
@@ -126,15 +126,15 @@ var _ = Describe("Failures", func() {
 		BeforeEach(func() {
 			ctx = context.Background()
 			var err error
-			client, err = startServerWithArgs(ctx, "", []string{
-				"cmd", "--model", model,
+			client, err = startServerWithArgs(ctx, []string{
+				"cmd", "--model", testModel,
 				"--failure-injection-rate", "100",
-			}, nil)
+			})
 			Expect(err).ToNot(HaveOccurred())
 		})

 		It("should always return an error response for chat completions", func() {
-			openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false)
+			openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false)
 			_, err := openaiClient.Chat.Completions.New(ctx, params)
 			Expect(err).To(HaveOccurred())

@@ -147,7 +147,7 @@ var _ = Describe("Failures", func() {
 		})

 		It("should always return an error response for text completions", func() {
-			openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false)
+			openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false)
 			_, err := openaiClient.Chat.Completions.New(ctx, params)
 			Expect(err).To(HaveOccurred())

@@ -164,16 +164,16 @@ var _ = Describe("Failures", func() {
 		BeforeEach(func() {
 			ctx = context.Background()
 			var err error
-			client, err = startServerWithArgs(ctx, "", []string{
-				"cmd", "--model", model,
+			client, err = startServerWithArgs(ctx, []string{
+				"cmd", "--model", testModel,
 				"--failure-injection-rate", "100",
 				"--failure-types", common.FailureTypeRateLimit,
-			}, nil)
+			})
 			Expect(err).ToNot(HaveOccurred())
 		})

 		It("should return only rate limit errors", func() {
-			openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false)
+			openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false)
 			_, err := openaiClient.Chat.Completions.New(ctx, params)
 			Expect(err).To(HaveOccurred())

@@ -182,24 +182,24 @@ var _ = Describe("Failures", func() {
 			Expect(ok).To(BeTrue())
 			Expect(openaiError.StatusCode).To(Equal(429))
 			Expect(openaiError.Type).To(Equal(openaiserverapi.ErrorCodeToType(429)))
-			Expect(strings.Contains(openaiError.Message, model)).To(BeTrue())
+			Expect(strings.Contains(openaiError.Message, testModel)).To(BeTrue())
 		})
 	})

 	Context("with multiple specific failure types", func() {
 		BeforeEach(func() {
 			ctx = context.Background()
 			var err error
-			client, err = startServerWithArgs(ctx, "", []string{
-				"cmd", "--model", model,
+			client, err = startServerWithArgs(ctx, []string{
+				"cmd", "--model", testModel,
 				"--failure-injection-rate", "100",
 				"--failure-types", common.FailureTypeInvalidAPIKey, common.FailureTypeServerError,
-			}, nil)
+			})
 			Expect(err).ToNot(HaveOccurred())
 		})

 		It("should return only specified error types", func() {
-			openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false)
+			openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false)

 			// Make multiple requests to verify we get the expected error types
 			for i := 0; i < 10; i++ {

@@ -222,35 +222,35 @@ var _ = Describe("Failures", func() {
 		BeforeEach(func() {
 			ctx = context.Background()
 			var err error
-			client, err = startServerWithArgs(ctx, "", []string{
-				"cmd", "--model", model,
+			client, err = startServerWithArgs(ctx, []string{
+				"cmd", "--model", testModel,
 				"--failure-injection-rate", "0",
-			}, nil)
+			})
 			Expect(err).ToNot(HaveOccurred())
 		})

 		It("should never return errors and behave like random mode", func() {
-			openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false)
+			openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false)
 			resp, err := openaiClient.Chat.Completions.New(ctx, params)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(resp.Choices).To(HaveLen(1))
 			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
-			Expect(resp.Model).To(Equal(model))
+			Expect(resp.Model).To(Equal(testModel))
 		})
 	})

 	Context("testing all predefined failure types", func() {
 		DescribeTable("should return correct error for each failure type",
 			func(failureType string, expectedStatusCode int, expectedErrorType string) {
 				ctx := context.Background()
-				client, err := startServerWithArgs(ctx, "", []string{
-					"cmd", "--model", model,
+				client, err := startServerWithArgs(ctx, []string{
+					"cmd", "--model", testModel,
 					"--failure-injection-rate", "100",
 					"--failure-types", failureType,
-				}, nil)
+				})
 				Expect(err).ToNot(HaveOccurred())

-				openaiClient, params := getOpenAIClientAndChatParams(client, model, userMessage, false)
+				openaiClient, params := getOpenAIClientAndChatParams(client, testModel, testUserMessage, false)
 				_, err = openaiClient.Chat.Completions.New(ctx, params)
 				Expect(err).To(HaveOccurred())
pkg/llm-d-inference-sim/lora_test.go

Lines changed: 6 additions & 6 deletions
@@ -34,22 +34,22 @@ var _ = Describe("LoRAs", func() {
 	Context("LoRAs config and load", func() {
 		It("Should config, load and load LoRAs correctly", func() {
 			ctx := context.TODO()
-			client, err := startServerWithArgs(ctx, "",
-				[]string{"cmd", "--model", model, "--mode", common.ModeEcho,
+			client, err := startServerWithArgs(ctx,
+				[]string{"cmd", "--model", testModel, "--mode", common.ModeEcho,
 					"--lora-modules", "{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}",
-					"{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}"}, nil)
+					"{\"name\":\"lora4\",\"path\":\"/path/to/lora4\"}"})
 			Expect(err).NotTo(HaveOccurred())

 			// Request to lora3
-			openaiclient, params := getOpenAIClientAndChatParams(client, "lora3", userMessage, false)
+			openaiclient, params := getOpenAIClientAndChatParams(client, "lora3", testUserMessage, false)
 			resp, err := openaiclient.Chat.Completions.New(ctx, params)
 			Expect(err).ToNot(HaveOccurred())

 			Expect(resp.Choices).ShouldNot(BeEmpty())
 			Expect(string(resp.Object)).To(Equal(chatCompletionObject))

 			msg := resp.Choices[0].Message.Content
-			Expect(msg).Should(Equal(userMessage))
+			Expect(msg).Should(Equal(testUserMessage))

 			// Unknown model, should return 404
 			params.Model = "lora1"

@@ -88,7 +88,7 @@ var _ = Describe("LoRAs", func() {
 			Expect(string(resp.Object)).To(Equal(chatCompletionObject))

 			msg = resp.Choices[0].Message.Content
-			Expect(msg).Should(Equal(userMessage))
+			Expect(msg).Should(Equal(testUserMessage))

 			// Unload lora3
 			payload = map[string]string{