Skip to content

Commit

Permalink
reduce split kv amount
Browse files: browse the repository at this point in the history
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
  • Loading branch information
LucasWilkinson committed Feb 1, 2025
1 parent 5fe1d1d commit 5d5071c
Showing 1 changed file with 1 addition and 3 deletions.
4 changes: 1 addition & 3 deletions vllm/attention/backends/triton_mla.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,8 +622,6 @@ def build(self, seq_lens: List[int], query_lens: List[int],
self.multimodal_placeholder_maps.items()
}

- num_kv_splits = 8

return TritonMLAMetadata(
num_prefills=self.num_prefills,
slot_mapping=slot_mapping_tensor,
Expand All @@ -643,7 +641,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
context_lens_tensor=context_lens_tensor,
block_tables=block_tables,
use_cuda_graph=use_captured_graph,
- num_kv_splits=num_kv_splits,
+ num_kv_splits=4, # TODO(lucas) add heuristic
head_dim=self.runner.model_config.get_head_size(),
)

Expand Down

0 comments on commit 5d5071c

Please sign in to comment.