diff --git a/lightllm/server/router/manager.py b/lightllm/server/router/manager.py
index a941a6609..b620aaabd
--- a/lightllm/server/router/manager.py
+++ b/lightllm/server/router/manager.py
@@ -307,7 +307,7 @@ async def _step(self):
             paused_reqs = select_paused_reqs(
                 self.running_batch, self.pause_strategy, self.req_queue, self.max_total_token_num
             )
-            await self._pause_reqs(self.running_batch, paused_reqs)
+            await self._pause_reqs(paused_reqs)
             logger.debug(f"pasued req num: {self.req_queue.get_paused_req_num()}")
             self.has_wait_tokens = 0
             return
@@ -342,9 +342,9 @@ async def _decode_batch(self, batch: Batch):
         )
         return
 
-    async def _pause_reqs(self, batch: Batch, pasue_reqs):
+    async def _pause_reqs(self, pasue_reqs):
         pasue_req_ids = [r.request_id for r in pasue_reqs]
-        await self.model_rpc_client.pause_reqs(batch.batch_id, pasue_req_ids)
+        await self.model_rpc_client.pause_reqs(pasue_req_ids)
         return
 
     def _filter_runing_batch(self):
diff --git a/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py b/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
index d3bb5403e..3e91c5bf7
--- a/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
+++ b/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
@@ -54,6 +54,8 @@ def post_handel(self, run_reqs: List[InferReq], next_token_ids, next_token_logpr
             req_obj.cur_kv_len = len(req_obj.get_chuncked_input_token_ids())
 
             if req_obj.cur_kv_len < req_obj.get_cur_total_len():
+                if self.tp_rank < self.dp_size:
+                    req_obj.shm_req.shm_cur_kv_len = req_obj.cur_kv_len
                 continue
 
             req_obj.set_next_gen_token_id(next_token_id, next_token_logprob)
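
For reviewers, a minimal sketch of the simplified pause path after this change. The `Req`, `FakeRpcClient`, and `Manager` classes below are invented scaffolding, not the actual lightllm types: the point is only that pausing is now keyed by request ids alone, so callers no longer thread a `Batch`/`batch_id` through to the RPC layer.

```python
import asyncio
from dataclasses import dataclass
from typing import List


@dataclass
class Req:
    request_id: int


class FakeRpcClient:
    # Stand-in for model_rpc_client: after this diff, pause_reqs takes
    # only the list of request ids, with no batch_id argument.
    async def pause_reqs(self, req_ids: List[int]) -> None:
        print(f"pausing reqs: {req_ids}")


class Manager:
    def __init__(self) -> None:
        self.model_rpc_client = FakeRpcClient()

    async def _pause_reqs(self, pause_reqs: List[Req]) -> None:
        # Pausing needs only the request ids; no Batch is required.
        pause_req_ids = [r.request_id for r in pause_reqs]
        await self.model_rpc_client.pause_reqs(pause_req_ids)


asyncio.run(Manager()._pause_reqs([Req(1), Req(2)]))
```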