1 change: 1 addition & 0 deletions README.md
@@ -98,6 +98,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
- `max-cpu-loras`: maximum number of LoRAs to store in CPU memory, optional, must be >= max-loras, default is max-loras
- `max-model-len`: model's context window, maximum number of tokens in a single request including input and output, optional, default is 1024
- `max-num-seqs`: maximum number of sequences per iteration (maximum number of inference requests that could be processed at the same time), default is 5
- `max-waiting-queue-length`: maximum length of the waiting queue for inference requests, default is 1000 (see the sketch below)
- `mode`: the simulator mode, optional, by default `random`
- `echo`: returns the same text that was sent in the request
- `random`: returns a sentence chosen at random from a set of pre-defined sentences
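
To illustrate what `max-waiting-queue-length` bounds, here is a minimal standalone sketch of a bounded waiting queue built on a buffered Go channel. This is an illustrative assumption about the mechanism, not the simulator's actual implementation; all names in it are hypothetical, and how overflowing requests are actually handled is not shown in this diff.

```go
package main

import (
	"errors"
	"fmt"
)

// enqueue tries to add a request ID to the waiting queue without blocking;
// when the queue is already at capacity, the request is turned away.
func enqueue(queue chan string, reqID string) error {
	select {
	case queue <- reqID:
		return nil
	default:
		return errors.New("waiting queue is full: " + reqID)
	}
}

func main() {
	// Hypothetical stand-in for the configured max-waiting-queue-length.
	maxWaitingQueueLength := 3
	queue := make(chan string, maxWaitingQueueLength)

	for i := 0; i < 5; i++ {
		if err := enqueue(queue, fmt.Sprintf("req-%d", i)); err != nil {
			fmt.Println(err) // req-3 and req-4 do not fit
		}
	}
}
```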
10 changes: 9 additions & 1 deletion pkg/common/config.go
@@ -80,6 +80,8 @@ type Configuration struct {
// MaxNumSeqs is maximum number of sequences per iteration (the maximum
// number of inference requests that could be processed at the same time)
MaxNumSeqs int `yaml:"max-num-seqs" json:"max-num-seqs"`
// MaxWaitingQueueLength defines the maximum size of the waiting requests queue
MaxWaitingQueueLength int `yaml:"max-waiting-queue-length" json:"max-waiting-queue-length"`
// MaxModelLen is the model's context window, the maximum number of tokens
// in a single request including input and output. Default value is 1024.
MaxModelLen int `yaml:"max-model-len" json:"max-model-len"`
@@ -329,6 +331,7 @@ func newConfig() *Configuration {
Port: vLLMDefaultPort,
MaxLoras: 1,
MaxNumSeqs: 5,
MaxWaitingQueueLength: 1000,
MaxModelLen: 1024,
Mode: ModeRandom,
Seed: time.Now().UnixNano(),
@@ -458,6 +461,10 @@ func (c *Configuration) validate() error {
return errors.New("max num seqs cannot be less than 1")
}

if c.MaxWaitingQueueLength < 1 {
return errors.New("max waiting queue length cannot be less than 1")
}

for _, lora := range c.LoraModules {
if lora.Name == "" {
return errors.New("empty LoRA name")
@@ -637,7 +644,8 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {

f.IntVar(&config.Port, "port", config.Port, "Port")
f.StringVar(&config.Model, "model", config.Model, "Currently 'loaded' model")
f.IntVar(&config.MaxNumSeqs, "max-num-seqs", config.MaxNumSeqs, "Maximum number of inference requests that could be processed at the same time (parameter to simulate requests waiting queue)")
f.IntVar(&config.MaxNumSeqs, "max-num-seqs", config.MaxNumSeqs, "Maximum number of inference requests that could be processed at the same time")
f.IntVar(&config.MaxWaitingQueueLength, "max-waiting-queue-length", config.MaxWaitingQueueLength, "Maximum length of the waiting queue for inference requests")
f.IntVar(&config.MaxLoras, "max-loras", config.MaxLoras, "Maximum number of LoRAs in a single batch")
f.IntVar(&config.MaxCPULoras, "max-cpu-loras", config.MaxCPULoras, "Maximum number of LoRAs to store in CPU memory")
f.IntVar(&config.MaxModelLen, "max-model-len", config.MaxModelLen, "Model's context window, maximum number of tokens in a single request including input and output")
10 changes: 10 additions & 0 deletions pkg/common/config_test.go
@@ -452,6 +452,16 @@ var _ = Describe("Simulator configuration", func() {
args: []string{"cmd", "--max-num-seqs", "-1",
"--config", "../../manifests/config.yaml"},
},
{
name: "invalid max-waiting-queue-length",
args: []string{"cmd", "--max-waiting-queue-length", "0",
"--config", "../../manifests/config.yaml"},
},
{
name: "invalid max-waiting-queue-length",
args: []string{"cmd", "--max-waiting-queue-length", "-1",
"--config", "../../manifests/config.yaml"},
},
{
name: "invalid time-factor-under-load",
args: []string{"cmd", "--time-factor-under-load", "0",
2 changes: 1 addition & 1 deletion pkg/llm-d-inference-sim/latencies.go
@@ -23,7 +23,7 @@ func (s *VllmSimulator) getCurrLoadFactor() float64 {
if s.config.MaxNumSeqs <= 1 {
return 1.0
}
return 1 + (s.config.TimeFactorUnderLoad-1)*float64(s.nRunningReqs-1)/float64(s.config.MaxNumSeqs-1)
return 1 + (s.config.TimeFactorUnderLoad-1)*float64(s.metrics.nRunningReqs-1)/float64(s.config.MaxNumSeqs-1)
}

func (s *VllmSimulator) getTimeToFirstToken() int {
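For intuition about `getCurrLoadFactor` above: it interpolates linearly between 1.0 (a single running request) and `TimeFactorUnderLoad` (all `MaxNumSeqs` slots busy). A small standalone sketch of the same formula, checked against the values used in the tests below:

```go
package main

import "fmt"

// loadFactor reproduces the interpolation from getCurrLoadFactor:
// 1.0 at one running request, timeFactorUnderLoad at full load.
func loadFactor(timeFactorUnderLoad float64, maxNumSeqs int, nRunningReqs int64) float64 {
	if maxNumSeqs <= 1 {
		return 1.0
	}
	return 1 + (timeFactorUnderLoad-1)*float64(nRunningReqs-1)/float64(maxNumSeqs-1)
}

func main() {
	// With TimeFactorUnderLoad=2.0 and MaxNumSeqs=11, as in the tests below:
	fmt.Println(loadFactor(2.0, 11, 1))  // 1.0: effectively idle
	fmt.Println(loadFactor(2.0, 11, 6))  // 1.5: partially loaded
	fmt.Println(loadFactor(2.0, 11, 11)) // 2.0: fully loaded
}
```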
20 changes: 11 additions & 9 deletions pkg/llm-d-inference-sim/latencies_test.go
@@ -41,6 +41,8 @@ var _ = Describe("Check random latencies", Ordered, func() {
KVCacheTransferLatencyStdDev: 2048,
}

simulator.metrics.runReqChan = make(chan int64, 100)

common.InitRandom(time.Now().UnixNano())
})

@@ -245,7 +247,7 @@ var _ = Describe("Check random latencies", Ordered, func() {
simulator.config.TimeToFirstTokenStdDev = 0
simulator.config.TimeFactorUnderLoad = 1.0

simulator.runReqChan <- 100
simulator.metrics.runReqChan <- 100

ttft := simulator.getWaitTimeToFirstToken(128, 0, false)
Expect(ttft).To(Equal(42))
@@ -257,11 +259,11 @@ var _ = Describe("Check random latencies", Ordered, func() {
simulator.config.TimeFactorUnderLoad = 100.0
simulator.config.MaxNumSeqs = 1

for len(simulator.runReqChan) > 0 {
<-simulator.runReqChan
for len(simulator.metrics.runReqChan) > 0 {
<-simulator.metrics.runReqChan
}

simulator.runReqChan <- 1
simulator.metrics.runReqChan <- 1

ttft := simulator.getWaitTimeToFirstToken(128, 0, false)
Expect(ttft).To(Equal(42))
@@ -273,7 +275,7 @@ var _ = Describe("Check random latencies", Ordered, func() {
simulator.config.TimeToFirstTokenStdDev = 0
simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
simulator.config.MaxNumSeqs = maxNumOfReq
simulator.nRunningReqs = int64(maxNumOfReq)
simulator.metrics.nRunningReqs = int64(maxNumOfReq)

ttft := simulator.getWaitTimeToFirstToken(128, 0, false)
Expect(ttft).To(Equal(int(float64(42) * timeFactorUnderLoad)))
@@ -296,7 +298,7 @@ var _ = Describe("Check random latencies", Ordered, func() {
simulator.config.TimeToFirstTokenStdDev = 0
simulator.config.TimeFactorUnderLoad = timeFactorUnderLoad
simulator.config.MaxNumSeqs = maxNumOfReq
simulator.nRunningReqs = int64(nCurrNumOfReq)
simulator.metrics.nRunningReqs = int64(nCurrNumOfReq)

ttft := simulator.getWaitTimeToFirstToken(128, 0, false)
max := timeFactorUnderLoad * float64(42)
@@ -318,7 +320,7 @@ var _ = Describe("Check random latencies", Ordered, func() {
It("when TimeFactorUnderLoad is 1.0, calcLoadFactor should give 1", func() {
simulator.config.TimeFactorUnderLoad = 1.0
simulator.config.MaxNumSeqs = 11
simulator.nRunningReqs = 3
simulator.metrics.nRunningReqs = 3

factor := simulator.getCurrLoadFactor()
Expect(factor).To(BeNumerically("==", 1.0))
@@ -327,7 +329,7 @@ var _ = Describe("Check random latencies", Ordered, func() {
It("when TimeFactorUnderLoad is > 1.0, and sim is fully loaded, calcLoadFactor should give TimeFactorUnderLoad", func() {
simulator.config.TimeFactorUnderLoad = 2.0
simulator.config.MaxNumSeqs = 11
simulator.nRunningReqs = 11
simulator.metrics.nRunningReqs = 11

factor := simulator.getCurrLoadFactor()
Expect(factor).To(BeNumerically("==", simulator.config.TimeFactorUnderLoad))
@@ -337,7 +339,7 @@ var _ = Describe("Check random latencies", Ordered, func() {
It("when TimeFactorUnderLoad is > 1.0, and sim is partially loaded, calcLoadFactor should give a value between 1 and TimeFactorUnderLoad", func() {
simulator.config.TimeFactorUnderLoad = 2.0
simulator.config.MaxNumSeqs = 11
simulator.nRunningReqs = 6
simulator.metrics.nRunningReqs = 6

factor := simulator.getCurrLoadFactor()
Expect(factor).To(BeNumerically(">", 1.0))
76 changes: 74 additions & 2 deletions pkg/llm-d-inference-sim/lora.go
@@ -47,7 +47,7 @@ func (s *VllmSimulator) getLoras() []string {
return loras
}

func (s *VllmSimulator) loadLora(ctx *fasthttp.RequestCtx) {
func (s *VllmSimulator) loadLoraAdaptor(ctx *fasthttp.RequestCtx) {
var req loadLoraRequest
err := json.Unmarshal(ctx.Request.Body(), &req)
if err != nil {
@@ -59,7 +59,7 @@ func (s *VllmSimulator) loadLora(ctx *fasthttp.RequestCtx) {
s.loraAdaptors.Store(req.LoraName, "")
}

func (s *VllmSimulator) unloadLora(ctx *fasthttp.RequestCtx) {
func (s *VllmSimulator) unloadLoraAdaptor(ctx *fasthttp.RequestCtx) {
var req unloadLoraRequest
err := json.Unmarshal(ctx.Request.Body(), &req)
if err != nil {
@@ -70,3 +70,75 @@ func (s *VllmSimulator) unloadLora(ctx *fasthttp.RequestCtx) {

s.loraAdaptors.Delete(req.LoraName)
}

// loraIsLoaded returns true if the given model is not a LoRA, or is a LoRA
// adaptor that is currently loaded
func (s *VllmSimulator) loraIsLoaded(model string) bool {
if !s.isLora(model) {
return true
}

s.loras.mux.RLock()
defer s.loras.mux.RUnlock()

_, ok := s.loras.loadedLoras[model]
return ok
}

// loadLora loads the LoRA adaptor if possible and returns false if it could
// not be loaded. For non-LoRA models it always returns true.
func (s *VllmSimulator) loadLora(model string) bool {
if !s.isLora(model) {
return true
}

s.loras.mux.Lock()
defer s.loras.mux.Unlock()

// check if this LoRA is already loaded or within maxLoras slots
_, ok := s.loras.loadedLoras[model]
ok = ok || len(s.loras.loadedLoras) < s.loras.maxLoras
if !ok {
// if this LoRA is not loaded, and the number of loaded LoRAs reached
// maxLoras, try to find a LoRA that is not in use, and unload it
for lora, count := range s.loras.loadedLoras {
if count == 0 {
delete(s.loras.loadedLoras, lora)
ok = true
break
}
}
}
if ok {
s.loras.loadedLoras[model]++
}
return ok
}

// incrementLora increments the count of running requests using the model
// (if the model is a LoRA). Can be called only for loaded LoRAs (that are
// already in loras.loadedLoras)
func (s *VllmSimulator) incrementLora(model string) {
if !s.isLora(model) {
return
}

s.loras.mux.Lock()
defer s.loras.mux.Unlock()
s.loras.loadedLoras[model]++
}

// decrementLora decrements the count of running requests using the model
// (if the model is a LoRA)
func (s *VllmSimulator) decrementLora(model string) {
if model == "" || !s.isLora(model) {
return
}

s.loras.mux.Lock()
defer s.loras.mux.Unlock()

s.loras.loadedLoras[model]--
if s.loras.loadedLoras[model] <= 0 {
// last usage of this LoRA
s.loras.loraRemovable <- 1
}
}
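
The three helpers above implement a small reference-counting scheme: `loadedLoras` maps each loaded adaptor to the number of running requests using it, at most `maxLoras` adaptors are loaded at once, and idle (zero-count) adaptors can be evicted to make room. A self-contained sketch of the same idea, using a simplified stand-in struct rather than `VllmSimulator` itself:

```go
package main

import (
	"fmt"
	"sync"
)

// loraRegistry is a simplified stand-in for the loras state used above:
// loadedLoras maps an adaptor name to the number of running requests using
// it, and at most maxLoras adaptors may be loaded at once.
type loraRegistry struct {
	mux         sync.Mutex
	loadedLoras map[string]int
	maxLoras    int
}

// load acquires a slot for the named adaptor, evicting an idle (zero-count)
// adaptor when all slots are taken. It returns false if no slot is free.
func (r *loraRegistry) load(name string) bool {
	r.mux.Lock()
	defer r.mux.Unlock()

	_, ok := r.loadedLoras[name]
	ok = ok || len(r.loadedLoras) < r.maxLoras
	if !ok {
		// all slots taken: evict any adaptor with no running requests
		for lora, count := range r.loadedLoras {
			if count == 0 {
				delete(r.loadedLoras, lora)
				ok = true
				break
			}
		}
	}
	if ok {
		r.loadedLoras[name]++
	}
	return ok
}

// release decrements the usage count when a request finishes.
func (r *loraRegistry) release(name string) {
	r.mux.Lock()
	defer r.mux.Unlock()
	r.loadedLoras[name]--
}

func main() {
	r := &loraRegistry{loadedLoras: map[string]int{}, maxLoras: 1}
	fmt.Println(r.load("lora-a")) // true: a free slot exists
	fmt.Println(r.load("lora-b")) // false: lora-a is still in use
	r.release("lora-a")
	fmt.Println(r.load("lora-b")) // true: idle lora-a gets evicted
}
```

The signal on `loraRemovable` in `decrementLora` presumably lets a consumer elsewhere in the simulator react when an adaptor becomes idle; that consumer is not part of this diff.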