From 1c7502b5d315ac22067531f6aff5608046b4ba71 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Mon, 25 Nov 2024 07:48:24 +0000 Subject: [PATCH 1/5] update DygraphInferencePredictor --- llm/predict/predictor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llm/predict/predictor.py b/llm/predict/predictor.py index 4294b7b60ab3..1ba61c80bec1 100644 --- a/llm/predict/predictor.py +++ b/llm/predict/predictor.py @@ -719,10 +719,9 @@ def _infer(self, inputs: dict[str, paddle.Tensor]): inputs[key] = paddle.to_tensor(inputs[key]) inputs["cache_kvs"] = self.cache_kvs - self.model.generate( + return self.model.generate( **inputs, ) - return None class BlockInferencePredictorMixin(BasePredictor): From 31d3d621c89d7178e312cdc90ce405faf35e7b25 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Tue, 26 Nov 2024 05:28:07 +0000 Subject: [PATCH 2/5] update batch_size --- llm/predict/predictor.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/llm/predict/predictor.py b/llm/predict/predictor.py index 1ba61c80bec1..99297f57752c 100644 --- a/llm/predict/predictor.py +++ b/llm/predict/predictor.py @@ -903,6 +903,8 @@ def _preprocess(self, input_text: list[str]): input_text = [input_text] if isinstance(input_text, str) else input_text input_text = [self.tokenizer.apply_chat_template(sentence, tokenize=False) for sentence in input_text] + input_text_batch_size = len(input_text) + input_ids = [] for text in input_text: tokens = self.tokenizer( @@ -922,28 +924,24 @@ def _preprocess(self, input_text: list[str]): self.model_inputs["block_tables"][:][:] = -1 free_list = list(range(self.max_block_nums)) - for i in range(self.config.batch_size): + for i in range(input_text_batch_size): for j in range( (seq_lens[i] + self.config.max_length + self.config.block_size - 1) // self.config.block_size ): used_block_id = free_list.pop() self.model_inputs["block_tables"][i, j] = used_block_id + # fmt:off self.model_inputs["seq_lens_this_time"] = paddle.to_tensor(np.array(seq_lens).astype("int32").reshape(-1, 1)) self.model_inputs["seq_lens_encoder"] = paddle.to_tensor(np.array(seq_lens).astype("int32").reshape(-1, 1)) - self.model_inputs["seq_lens_decoder"] = paddle.full( - shape=[self.config.batch_size, 1], fill_value=0, dtype="int32" - ) - self.model_inputs["step_idx"] = paddle.full(shape=[self.config.batch_size, 1], fill_value=0, dtype="int64") + self.model_inputs["seq_lens_decoder"] = paddle.full(shape=[input_text_batch_size, 1], fill_value=0, dtype="int32") + self.model_inputs["step_idx"] = paddle.full(shape=[input_text_batch_size, 1], fill_value=0, dtype="int64") self.model_inputs["not_need_stop"] = paddle.full(shape=[1], fill_value=True, dtype="bool") - self.model_inputs["stop_flags"] = paddle.full( - shape=[self.config.batch_size, 1], fill_value=False, dtype="bool" - ) - self.model_inputs["stop_nums"] = paddle.full(shape=[1], fill_value=self.config.batch_size, dtype="int64") - self.model_inputs["pre_ids"] = paddle.full( - shape=[self.config.batch_size, self.config.max_length], fill_value=-1, dtype="int64" - ) - self.model_inputs["next_tokens"] = paddle.full(shape=[self.config.batch_size, 1], fill_value=-1, dtype="int64") + self.model_inputs["stop_flags"] = paddle.full(shape=[input_text_batch_size, 1], fill_value=False, dtype="bool") + self.model_inputs["stop_nums"] = paddle.full(shape=[1], fill_value=input_text_batch_size, dtype="int64") + self.model_inputs["pre_ids"] = paddle.full(shape=[input_text_batch_size, self.config.max_length], fill_value=-1, 
dtype="int64") + self.model_inputs["next_tokens"] = paddle.full(shape=[input_text_batch_size, 1], fill_value=-1, dtype="int64") + # fmt:on if self.config.mode == "static": for k, v in self.model_inputs.items(): @@ -1007,8 +1005,8 @@ def predict(self, input_texts: list[str], return_tokens=False): if self.tensor_parallel_rank == 0: outputs = [] output_tokens = [] - while len(outputs) < self.batch_size: - result = result_queue.get(timeout=1) + while len(outputs) < len(input_texts): + result = result_queue.get(timeout=10) outputs.append(result[-1]) output_tokens.append(result[-2]) From 33da8ddbeeca01e7bd1103716ed0223425d1c6cb Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Fri, 29 Nov 2024 09:00:27 +0000 Subject: [PATCH 3/5] fix int8 setup --- csrc/gpu/quant_int8.cu | 4 ++++ csrc/setup_cuda.py | 17 ++++++++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/csrc/gpu/quant_int8.cu b/csrc/gpu/quant_int8.cu index c34c6b701af9..8155c2198fe7 100644 --- a/csrc/gpu/quant_int8.cu +++ b/csrc/gpu/quant_int8.cu @@ -65,7 +65,11 @@ __forceinline__ __device__ hip_bfloat16 add_mul(hip_bfloat16 a, hi #else template<> __forceinline__ __device__ __nv_bfloat16 add_mul<__nv_bfloat16>(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) { + #if __CUDA_ARCH__ >= 800 return __hmul(__hadd(a, b), c); + #else + return (static_cast(a) + static_cast(b)) * static_cast(c); + #endif } #endif diff --git a/csrc/setup_cuda.py b/csrc/setup_cuda.py index 40a5ff3a0f7e..58966fbafacc 100644 --- a/csrc/setup_cuda.py +++ b/csrc/setup_cuda.py @@ -57,8 +57,7 @@ def strtobool(v): def get_gencode_flags(): if not strtobool(os.getenv("FLAG_LLM_PDC", "False")): - prop = paddle.device.cuda.get_device_properties() - cc = prop.major * 10 + prop.minor + cc = get_sm_version() return ["-gencode", "arch=compute_{0},code=sm_{0}".format(cc)] else: # support more cuda archs @@ -75,6 +74,7 @@ def get_gencode_flags(): gencode_flags = get_gencode_flags() library_path = os.environ.get("LD_LIBRARY_PATH", "/usr/local/cuda/lib64") +sm_version = get_sm_version() sources = [ "./gpu/save_with_output.cc", @@ -102,16 +102,11 @@ def get_gencode_flags(): "./gpu/dequant_int8.cu", "./gpu/flash_attn_bwd.cc", "./gpu/tune_cublaslt_gemm.cu", - "./gpu/append_attention.cu", - "./gpu/append_attn/get_block_shape_and_split_kv_block.cu", - "./gpu/append_attn/decoder_write_cache_with_rope_kernel.cu", - "./gpu/append_attn/speculate_write_cache_with_rope_kernel.cu", "./gpu/sample_kernels/top_p_sampling_reject.cu", "./gpu/update_inputs_v2.cu", "./gpu/set_preids_token_penalty_multi_scores.cu", "./gpu/speculate_decoding_kernels/ngram_match.cc", ] -sources += find_end_files("./gpu/append_attn/template_instantiation", ".cu") sources += find_end_files("./gpu/speculate_decoding_kernels", ".cu") nvcc_compile_args = gencode_flags @@ -138,6 +133,14 @@ def get_gencode_flags(): if cc >= 80: sources += ["gpu/int8_gemm_with_cutlass/gemm_dequant.cu"] + sources += [ + "./gpu/append_attention.cu", + "./gpu/append_attn/get_block_shape_and_split_kv_block.cu", + "./gpu/append_attn/decoder_write_cache_with_rope_kernel.cu", + "./gpu/append_attn/speculate_write_cache_with_rope_kernel.cu", + ] + sources += find_end_files("./gpu/append_attn/template_instantiation", ".cu") + if cc >= 89 and cuda_version >= 12.4: os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels.py") os.system("python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py") From 7b83fb1f9e9a7668fde2374bc4744ee08f4e3e94 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Fri, 29 Nov 2024 09:51:26 +0000 Subject: [PATCH 
4/5] revert fix by padding input_text --- llm/predict/predictor.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/llm/predict/predictor.py b/llm/predict/predictor.py index e25f9218b370..a9172c4e2271 100644 --- a/llm/predict/predictor.py +++ b/llm/predict/predictor.py @@ -139,9 +139,7 @@ class PredictorArgument: ) speculate_method: str = field( default=None, - metadata={ - "help": "speculate method, it should be one of ['None', 'autoregressive', 'inference_with_reference']" - }, + metadata={"help": "speculate method, it should be one of ['None', 'inference_with_reference']"}, ) speculate_max_draft_token_num: int = field( default=1, @@ -735,9 +733,10 @@ def _infer(self, inputs: dict[str, paddle.Tensor]): inputs[key] = paddle.to_tensor(inputs[key]) inputs["cache_kvs"] = self.cache_kvs - return self.model.generate( + self.model.generate( **inputs, ) + return None class BlockInferencePredictorMixin(BasePredictor): @@ -915,12 +914,16 @@ def init_model_inputs(self, config: PredictorArgument): self.model_inputs["rope_emb"] = paddle.concat([src_mask.reshape([-1]), tgt_mask.reshape([-1])]) def _preprocess(self, input_text: list[str]): + len_input_text = len(input_text) + if len_input_text < self.batch_size: + padding_len = self.batch_size - len_input_text + input_text += [""] * padding_len + assert len(input_text) == self.batch_size + if self.tokenizer.chat_template is not None: input_text = [input_text] if isinstance(input_text, str) else input_text input_text = [self.tokenizer.apply_chat_template(sentence, tokenize=False) for sentence in input_text] - input_text_batch_size = len(input_text) - input_ids = [] for text in input_text: tokens = self.tokenizer( @@ -940,24 +943,28 @@ def _preprocess(self, input_text: list[str]): self.model_inputs["block_tables"][:][:] = -1 free_list = list(range(self.max_block_nums)) - for i in range(input_text_batch_size): + for i in range(self.config.batch_size): for j in range( (seq_lens[i] + self.config.max_length + self.config.block_size - 1) // self.config.block_size ): used_block_id = free_list.pop() self.model_inputs["block_tables"][i, j] = used_block_id - # fmt:off self.model_inputs["seq_lens_this_time"] = paddle.to_tensor(np.array(seq_lens).astype("int32").reshape(-1, 1)) self.model_inputs["seq_lens_encoder"] = paddle.to_tensor(np.array(seq_lens).astype("int32").reshape(-1, 1)) - self.model_inputs["seq_lens_decoder"] = paddle.full(shape=[input_text_batch_size, 1], fill_value=0, dtype="int32") - self.model_inputs["step_idx"] = paddle.full(shape=[input_text_batch_size, 1], fill_value=0, dtype="int64") + self.model_inputs["seq_lens_decoder"] = paddle.full( + shape=[self.config.batch_size, 1], fill_value=0, dtype="int32" + ) + self.model_inputs["step_idx"] = paddle.full(shape=[self.config.batch_size, 1], fill_value=0, dtype="int64") self.model_inputs["not_need_stop"] = paddle.full(shape=[1], fill_value=True, dtype="bool") - self.model_inputs["stop_flags"] = paddle.full(shape=[input_text_batch_size, 1], fill_value=False, dtype="bool") - self.model_inputs["stop_nums"] = paddle.full(shape=[1], fill_value=input_text_batch_size, dtype="int64") - self.model_inputs["pre_ids"] = paddle.full(shape=[input_text_batch_size, self.config.max_length], fill_value=-1, dtype="int64") - self.model_inputs["next_tokens"] = paddle.full(shape=[input_text_batch_size, 1], fill_value=-1, dtype="int64") - # fmt:on + self.model_inputs["stop_flags"] = paddle.full( + shape=[self.config.batch_size, 1], fill_value=False, dtype="bool" + 
) + self.model_inputs["stop_nums"] = paddle.full(shape=[1], fill_value=self.config.batch_size, dtype="int64") + self.model_inputs["pre_ids"] = paddle.full( + shape=[self.config.batch_size, self.config.max_length], fill_value=-1, dtype="int64" + ) + self.model_inputs["next_tokens"] = paddle.full(shape=[self.config.batch_size, 1], fill_value=-1, dtype="int64") # speculative decoding related parameters if self.config.speculate_method is not None: @@ -1073,7 +1080,7 @@ def predict(self, input_texts: list[str], return_tokens=False): outputs = [] output_tokens = [] while len(outputs) < len(input_texts): - result = result_queue.get(timeout=10) + result = result_queue.get(timeout=1) outputs.append(result[-1]) output_tokens.append(result[-2]) From ebe00c11469d03657b8ea97da0269777f8f27f92 Mon Sep 17 00:00:00 2001 From: DrownFish19 Date: Mon, 2 Dec 2024 05:50:10 +0000 Subject: [PATCH 5/5] fix --- llm/predict/predictor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llm/predict/predictor.py b/llm/predict/predictor.py index a9172c4e2271..593f0b286fc9 100644 --- a/llm/predict/predictor.py +++ b/llm/predict/predictor.py @@ -733,10 +733,9 @@ def _infer(self, inputs: dict[str, paddle.Tensor]): inputs[key] = paddle.to_tensor(inputs[key]) inputs["cache_kvs"] = self.cache_kvs - self.model.generate( + return self.model.generate( **inputs, ) - return None class BlockInferencePredictorMixin(BasePredictor):
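
A minimal sketch of the batching logic that patches 2/5 and 4/5 converge on, assuming the reverted padding approach. The helper names pad_to_batch_size and blocks_needed are hypothetical and do not exist in llm/predict/predictor.py; they only mirror what BlockInferencePredictorMixin._preprocess does: pad the request list with empty strings up to the static batch_size, then reserve ceil((prompt_len + max_length) / block_size) KV-cache blocks per batch slot.

# Hypothetical sketch; not code from this repository.
from typing import List


def pad_to_batch_size(input_texts: List[str], batch_size: int) -> List[str]:
    """Pad the request list with empty prompts so it always fills the static batch."""
    if len(input_texts) > batch_size:
        raise ValueError("more requests than the configured batch_size")
    return input_texts + [""] * (batch_size - len(input_texts))


def blocks_needed(prompt_len: int, max_length: int, block_size: int) -> int:
    """Ceil-divide the prompt length plus the generation budget by the block size."""
    return (prompt_len + max_length + block_size - 1) // block_size


if __name__ == "__main__":
    batch = pad_to_batch_size(["Hello"], batch_size=4)
    assert len(batch) == 4  # one real request plus three empty padding slots
    print(blocks_needed(prompt_len=13, max_length=128, block_size=64))  # 3 blocks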