KV cache and tokenization related configuration (llm-d#125)

irar2 · smarunich · commit 25bde53717c3 · 2025-08-14T11:55:29.000-04:00
Signed-off-by: Ira <IRAR@il.ibm.com>
Signed-off-by: Sergey Marunich <marunich.s@gmail.com>
diff --git a/README.md b/README.md
@@ -117,7 +117,12 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `tool-call-not-required-param-probability`: the probability to add a parameter, that is not required, in a tool call, optional, defaults to 50
 - `object-tool-call-not-required-field-probability`: the probability to add a field, that is not required, in an object in a tool call, optional, defaults to 50
 - `enable-kvcache`: if true, the KV cache support will be enabled in the simulator. In this case, the KV cache will be simulated, and ZMQ events will be published when a KV cache block is added or evicted.
-	
+- `kv-cache-size`: the maximum number of token blocks in the KV cache, optional, defaults to 1024
+- `block-size`: token block size for contiguous chunks of tokens, possible values: 8, 16, 32, 64, 128, optional, defaults to 16
+- `tokenizers-cache-dir`: the directory for caching tokenizers, optional
+- `hash-seed`: seed for hash generation, optional (if not set, it is read from the PYTHONHASHSEED environment variable)
+- `zmq-endpoint`: ZMQ address to publish events to, optional, defaults to tcp://localhost:5557
+
 In addition, as we are using klog, the following parameters are available:
 - `add_dir_header`: if true, adds the file directory to the header of the log messages
 - `alsologtostderr`: log to standard error as well as files (no effect when -logtostderr=true)
diff --git a/pkg/common/config.go b/pkg/common/config.go
@@ -113,6 +113,18 @@ type Configuration struct {
 
 	// EnableKVCache defines if kv cache feature will be enabled
 	EnableKVCache bool `yaml:"enable-kvcache"`
+	// KVCacheSize is the maximum number of token blocks in kv cache, the default value is 1024
+	KVCacheSize int `yaml:"kv-cache-size"`
+
+	// TokenizersCacheDir is the directory for caching tokenizers
+	TokenizersCacheDir string `yaml:"tokenizers-cache-dir"`
+	// TokenBlockSize is token block size for contiguous chunks of tokens, possible values: 8,16,32,64,128, defaults to 16
+	TokenBlockSize int `yaml:"block-size"`
+	// HashSeed is the seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)
+	HashSeed string `yaml:"hash-seed"`
+
+	// ZMQEndpoint is the ZMQ address to publish events, the default value is tcp://localhost:5557
+	ZMQEndpoint string `yaml:"zmq-endpoint"`
 }
 
 type LoraModule struct {
@@ -168,6 +180,9 @@ func newConfig() *Configuration {
 		MinToolCallArrayParamLength:         1,
 		ToolCallNotRequiredParamProbability: 50,
 		ObjectToolCallNotRequiredParamProbability: 50,
+		KVCacheSize:    1024,
+		TokenBlockSize: 16,
+		ZMQEndpoint:    "tcp://localhost:5557",
 	}
 }
 
@@ -269,6 +284,15 @@ func (c *Configuration) validate() error {
 	if c.ObjectToolCallNotRequiredParamProbability < 0 || c.ObjectToolCallNotRequiredParamProbability > 100 {
 		return errors.New("ObjectToolCallNotRequiredParamProbability should be between 0 and 100")
 	}
+
+	if c.TokenBlockSize != 8 && c.TokenBlockSize != 16 && c.TokenBlockSize != 32 &&
+		c.TokenBlockSize != 64 && c.TokenBlockSize != 128 {
+		return errors.New("token block size should be one of the following: 8, 16, 32, 64, 128")
+	}
+
+	if c.KVCacheSize < 0 {
+		return errors.New("KV cache size cannot be negative")
+	}
 	return nil
 }
 
@@ -313,7 +337,13 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 	f.IntVar(&config.MinToolCallArrayParamLength, "min-tool-call-array-param-length", config.MinToolCallArrayParamLength, "Minimum possible length of array parameters in a tool call")
 	f.IntVar(&config.ToolCallNotRequiredParamProbability, "tool-call-not-required-param-probability", config.ToolCallNotRequiredParamProbability, "Probability to add a parameter, that is not required, in a tool call")
 	f.IntVar(&config.ObjectToolCallNotRequiredParamProbability, "object-tool-call-not-required-field-probability", config.ObjectToolCallNotRequiredParamProbability, "Probability to add a field, that is not required, in an object in a tool call")
+
 	f.BoolVar(&config.EnableKVCache, "enable-kvcache", config.EnableKVCache, "Defines if KV cache feature is enabled")
+	f.IntVar(&config.KVCacheSize, "kv-cache-size", config.KVCacheSize, "Maximum number of token blocks in kv cache")
+	f.IntVar(&config.TokenBlockSize, "block-size", config.TokenBlockSize, "Token block size for contiguous chunks of tokens, possible values: 8,16,32,64,128")
+	f.StringVar(&config.TokenizersCacheDir, "tokenizers-cache-dir", config.TokenizersCacheDir, "Directory for caching tokenizers")
+	f.StringVar(&config.HashSeed, "hash-seed", config.HashSeed, "Seed for hash generation (if not set, is read from PYTHONHASHSEED environment variable)")
+	f.StringVar(&config.ZMQEndpoint, "zmq-endpoint", config.ZMQEndpoint, "ZMQ address to publish events")
 
 	// These values were manually parsed above in getParamValueFromArgs, we leave this in order to get these flags in --help
 	var dummyString string
@@ -348,6 +378,13 @@ func ParseCommandParamsAndLoadConfig() (*Configuration, error) {
 		config.ServedModelNames = servedModelNames
 	}
 
+	if config.HashSeed == "" {
+		hashSeed := os.Getenv("PYTHONHASHSEED")
+		if hashSeed != "" {
+			config.HashSeed = hashSeed
+		}
+	}
+
 	if err := config.validate(); err != nil {
 		return nil, err
 	}
diff --git a/pkg/common/config_test.go b/pkg/common/config_test.go
@@ -281,6 +281,16 @@ var _ = Describe("Simulator configuration", func() {
 			args: []string{"cmd", "--kv-cache-transfer-latency-std-dev", "-35",
 				"--config", "../../manifests/config.yaml"},
 		},
+		{
+			name: "invalid (negative) kv-cache-size",
+			args: []string{"cmd", "--kv-cache-size", "-35",
+				"--config", "../../manifests/config.yaml"},
+		},
+		{
+			name: "invalid block-size",
+			args: []string{"cmd", "--block-size", "35",
+				"--config", "../../manifests/config.yaml"},
+		},
 	}
 
 	for _, test := range invalidTests {
diff --git a/pkg/kv-cache/kv_cache.go b/pkg/kv-cache/kv_cache.go
@@ -21,29 +21,31 @@ import (
 	"fmt"
 
 	"github.com/go-logr/logr"
+	"github.com/llm-d/llm-d-inference-sim/pkg/common"
 	openaiserverapi "github.com/llm-d/llm-d-inference-sim/pkg/openai-server-api"
 	"github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvblock"
 	"github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization"
 )
 
-const (
-	// TODO move it to configuration
-	maxBlocks = 100
-)
-
 type KVCacheHelper struct {
 	tokenizer       tokenization.Tokenizer
 	tokensProcessor kvblock.TokenProcessor // turns tokens to kv block keys
 	logger          logr.Logger
 	blockCache      *blockCache
 }
 
-func NewKVCacheHelper(logger logr.Logger) (*KVCacheHelper, error) {
-	// TODO update config by command line params
+func NewKVCacheHelper(config *common.Configuration, logger logr.Logger) (*KVCacheHelper, error) {
 	tokenProcConfig := kvblock.DefaultTokenProcessorConfig()
+	tokenProcConfig.BlockSize = config.TokenBlockSize
+	if config.HashSeed != "" {
+		tokenProcConfig.HashSeed = config.HashSeed
+	}
 	tokensProcessor := kvblock.NewChunkedTokenDatabase(tokenProcConfig)
 
 	tokenizationConfig := tokenization.DefaultConfig()
+	if config.TokenizersCacheDir != "" {
+		tokenizationConfig.TokenizersCacheDir = config.TokenizersCacheDir
+	}
 	tokenizer, err := tokenization.NewCachedHFTokenizer(tokenizationConfig.HFTokenizerConfig)
 
 	if err != nil {
@@ -53,7 +55,7 @@ func NewKVCacheHelper(logger logr.Logger) (*KVCacheHelper, error) {
 	return &KVCacheHelper{
 		tokenizer:       tokenizer,
 		tokensProcessor: tokensProcessor,
-		blockCache:      newBlockCache(maxBlocks, logger),
+		blockCache:      newBlockCache(config.KVCacheSize, logger),
 		logger:          logger,
 	}, nil
 }
diff --git a/pkg/llm-d-inference-sim/simulator.go b/pkg/llm-d-inference-sim/simulator.go
@@ -118,7 +118,7 @@ func (s *VllmSimulator) Start(ctx context.Context) error {
 	}
 
 	if s.config.EnableKVCache {
-		s.kvcacheHelper, err = kvcache.NewKVCacheHelper(s.logger)
+		s.kvcacheHelper, err = kvcache.NewKVCacheHelper(s.config, s.logger)
 		if err != nil {
 			return err
 		}

Original file line number	Diff line number	Diff line change
`@@ -118,7 +118,7 @@ func (s *VllmSimulator) Start(ctx context.Context) error {`
`118`	`118`	`}`
`119`	`119`
`120`	`120`	`if s.config.EnableKVCache {`
`121`		`- s.kvcacheHelper, err = kvcache.NewKVCacheHelper(s.logger)`
	`121`	`+ s.kvcacheHelper, err = kvcache.NewKVCacheHelper(s.config, s.logger)`
`122`	`122`	`if err != nil {`
`123`	`123`	`return err`
`124`	`124`	`}`