Commit 3f08bc3
"fix perf"
1 parent: 4a88598

8 files changed: +34 −118 lines

custom_ops/xpu_ops/src/ops/gather_next_token.cc
Lines changed: 2 additions & 6 deletions

@@ -27,7 +27,6 @@ GatherNextToken(const paddle::Tensor &tmp_out, // [token_num, dim_embed]
     const paddle::Tensor &enc_batch_tensor,
     const paddle::Tensor &dec_batch_tensor,
     const paddle::optional<paddle::Tensor> &output_padding_offset,
-    const paddle::optional<paddle::Tensor> &token_type_ids,
     int max_input_length) {
   phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
   auto dev_ctx =
@@ -53,13 +52,11 @@ GatherNextToken(const paddle::Tensor &tmp_out, // [token_num, dim_embed]

   auto out = paddle::full({bsz, dim}, -2, tmp_out.type(), tmp_out.place());

-  const int32_t* token_type_ids_ptr = token_type_ids.get_ptr() ?
-      token_type_ids->data<int32_t>() : nullptr;
   int r = baidu::xpu::api::plugin::eb_gather_next_token<XPUType, XPUType>(
       xpu_ctx->x_context(),
       reinterpret_cast<const XPUType *>(tmp_out.data<data_t>()),
       reinterpret_cast<XPUType *>(out.data<data_t>()), encoder_seqs_lods_vp,
-      encoder_batch_map_vp, decoder_batch_map_vp, token_type_ids_ptr, dim);
+      encoder_batch_map_vp, decoder_batch_map_vp, dim);
   return {out};
 }

@@ -103,8 +100,7 @@ PD_BUILD_OP(gather_next_token)
              "decoder_batch_map", "encoder_seq_lod_cpu",
              "encoder_batch_map_cpu", "decoder_batch_map_cpu",
              "enc_batch_tensor", "dec_batch_tensor",
-             paddle::Optional("output_padding_offset"),
-             paddle::Optional("token_type_ids")})
+             paddle::Optional("output_padding_offset")})
    .Outputs({"out"})
    .Attrs({"max_input_length: int"})
    .SetKernelFn(PD_KERNEL(GatherNextToken))

custom_ops/xpu_ops/src/plugin/include/xpu/plugin.h
Lines changed: 0 additions & 1 deletion

@@ -133,7 +133,6 @@ eb_gather_next_token(Context *ctx, const TX *x, TY *y,
                      VectorParam<int32_t> &encoder_seqs_lods, // NOLINT
                      VectorParam<int32_t> &encoder_batch_map, // NOLINT
                      VectorParam<int32_t> &decoder_batch_map, // NOLINT
-                     const int32_t* token_type_ids, // for VL model
                      int64_t hidden_dim);

 template <typename TX, typename TSCALE = float, typename TY = int8_t>

custom_ops/xpu_ops/src/plugin/src/kernel/kunlun3cpp/eb_gather_next_token.xpu
Lines changed: 1 addition & 20 deletions

@@ -42,7 +42,6 @@ __global__ void eb_gather_next_token(TX* src,
                                      int* encoder_seqs_lods,
                                      int* encoder_batch_map,
                                      int* decoder_batch_map,
-                                     const int* token_type_ids,
                                      int en_batch,
                                      int de_batch,
                                      int64_t copy_size) {
@@ -51,7 +50,6 @@ __global__ void eb_gather_next_token(TX* src,
   __group_shared__ int local_lods_en[MAX_BATCH + 1];
   __group_shared__ int local_map_en[MAX_BATCH];
   __group_shared__ int local_map_de[MAX_BATCH];
-  __group_shared__ int local_token_type_ids[128]; // 128 * 4B = 0.5KB
   GM2GSM_ASYNC(encoder_seqs_lods, local_lods_en, (en_batch + 1) * sizeof(int));
   if (en_batch > 0) {
     GM2GSM_ASYNC(encoder_batch_map, local_map_en, en_batch * sizeof(int));
@@ -68,23 +66,7 @@ __global__ void eb_gather_next_token(TX* src,
   for (int i = start; i < end; i++) {
     if (i < en_batch) {
       // src encode part
-      int last_text_token = local_lods_en[i + 1] - 1;
-      if (token_type_ids != nullptr) {
-        int token_type = token_type_ids[last_text_token]; // GM2LM, size = 1
-        // token_type: 0 for text and 1 for image
-        if (__builtin_expect(token_type == 0, 1)) {
-          ; // branch prediction
-        } else {
-          // TODU(lilujia): to be optimized with aligned-gm2lm and vectorization
-          for (int id = local_lods_en[i + 1] - 1; id >= local_lods_en[i]; id--) {
-            if (token_type_ids[id] == 0) {
-              last_text_token = id;
-              break;
-            }
-          }
-        }
-      }
-      _global_ptr_ TX* cur_src = src + last_text_token * copy_size;
+      _global_ptr_ TX* cur_src = src + (local_lods_en[i + 1] - 1) * copy_size;
       _global_ptr_ TY* cur_dst = dst + local_map_en[i] * copy_size;
       do_memcpy_1d<TX, TY>(cur_src, cur_dst, copy_size);
     } else {
@@ -103,7 +85,6 @@ __global__ void eb_gather_next_token(TX* src,
                                      int* encoder_seqs_lods, \
                                      int* encoder_batch_map, \
                                      int* decoder_batch_map, \
-                                     const int* token_type_ids, \
                                      int en_batch, \
                                      int de_batch, \
                                      int64_t copy_size);
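
The hunk above is the heart of the perf fix: for VL inputs the kernel read token_type_ids[last_token] from global memory for every encoder sequence and, when a sequence ended in an image token, walked it backwards one GM2LM read at a time to find the last text token. The new code takes the final token of each lod range unconditionally. A minimal Python model of the old versus new index selection, using the removed code's convention (token type 0 = text, 1 = image):

    def last_token_old(lods, token_type_ids, i):
        # Removed behavior: index of the last *text* token of sequence i.
        last = lods[i + 1] - 1
        if token_type_ids is not None and token_type_ids[last] != 0:
            # Slow path: backward scan, one global-memory read per step.
            for t in range(lods[i + 1] - 1, lods[i] - 1, -1):
                if token_type_ids[t] == 0:
                    return t
        return last

    def last_token_new(lods, i):
        # New behavior: simply the final token of sequence i.
        return lods[i + 1] - 1

The two agree whenever a sequence ends in a text token; the image-trailing fallback is dropped here along with the rest of the token_type_ids plumbing.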

custom_ops/xpu_ops/src/plugin/src/wrapper/eb_gather_next_token.cpp
Lines changed: 8 additions & 35 deletions

@@ -23,7 +23,6 @@ template <typename TX, typename TY>
 __attribute__((global)) void
 eb_gather_next_token(TX *src, TY *dst, int *encoder_seqs_lods,
                      int *encoder_batch_map, int *decoder_batch_map,
-                     const int* token_type_ids,
                      int en_batch, int de_batch, int64_t copy_size);
 } // namespace plugin
 } // namespace xpu3
@@ -36,22 +35,13 @@ template <typename TX, typename TY>
 static int
 cpu_wrapper(api::Context *ctx, const TX *x, TY *y, const int *encoder_seqs_lods,
             const int *encoder_batch_map, const int *decoder_batch_map,
-            const int* token_type_ids, int en_batch, int de_batch,
-            int64_t hidden_dim) {
+            int en_batch, int de_batch, int64_t hidden_dim) {
   int ret = 0;
   int encoder_len_total = encoder_seqs_lods[en_batch];
   for (int i = 0; i < en_batch; i++) {
-    int last_text_token = encoder_seqs_lods[i + 1] - 1;
-    if (token_type_ids != nullptr) {
-      for (int id = encoder_seqs_lods[i + 1] - 1; id >= encoder_seqs_lods[i]; id--) {
-        if (token_type_ids[id] == 0) {
-          last_text_token = id;
-          break;
-        }
-      }
-    }
-    ret = api::cast<TX, TY>(ctx, x + last_text_token * hidden_dim,
-                            y + encoder_batch_map[i] * hidden_dim, hidden_dim);
+    ret =
+        api::cast<TX, TY>(ctx, x + (encoder_seqs_lods[i + 1] - 1) * hidden_dim,
+                          y + encoder_batch_map[i] * hidden_dim, hidden_dim);
     WRAPPER_ASSERT_SUCCESS(ctx, ret);
   }
   for (int i = 0; i < de_batch; i++) {
@@ -67,14 +57,12 @@ static int xpu3_wrapper(api::Context *ctx, const TX *x, TY *y,
                         api::VectorParam<int32_t> &encoder_seqs_lods, // NOLINT
                         api::VectorParam<int32_t> &encoder_batch_map, // NOLINT
                         api::VectorParam<int32_t> &decoder_batch_map, // NOLINT
-                        const int32_t* token_type_ids,
-                        int en_batch, int de_batch,
-                        int64_t hidden_dim) {
+                        int en_batch, int de_batch, int64_t hidden_dim) {
   auto eb_gather_next_token_kernel = xpu3::plugin::eb_gather_next_token<TX, TY>;
   // NOTE: Don't change 16 to 64, because kernel use gsm
   eb_gather_next_token_kernel<<<ctx->ncluster(), 16, ctx->xpu_stream>>>(
       const_cast<TX *>(x), y, encoder_seqs_lods.xpu, encoder_batch_map.xpu,
-      decoder_batch_map.xpu, token_type_ids, en_batch, de_batch, hidden_dim);
+      decoder_batch_map.xpu, en_batch, de_batch, hidden_dim);
   return api::SUCCESS;
 }

@@ -83,25 +71,18 @@ int eb_gather_next_token(api::Context *ctx, const TX *x, TY *y,
                          api::VectorParam<int32_t> &encoder_seqs_lods, // NOLINT
                          api::VectorParam<int32_t> &encoder_batch_map, // NOLINT
                          api::VectorParam<int32_t> &decoder_batch_map, // NOLINT
-                         const int32_t* token_type_ids, // for VL model
                          int64_t hidden_dim) {
   WRAPPER_CHECK_CTX(ctx);
   WRAPPER_DUMP_FUNCTION_T2(ctx, "eb_gather_next_token", TX, TY);
   WRAPPER_DUMP_PARAM6(ctx, x, y, encoder_seqs_lods, encoder_batch_map,
-                      decoder_batch_map, token_type_ids);
-  WRAPPER_DUMP_PARAM1(ctx, hidden_dim);
+                      decoder_batch_map, hidden_dim);
   WRAPPER_DUMP(ctx);
   int encoder_batch = encoder_batch_map.len;
   int batch = encoder_batch + decoder_batch_map.len;
   int max_encoder_lod = encoder_seqs_lods.cpu[encoder_batch];
   int m = encoder_seqs_lods.cpu[encoder_batch] + decoder_batch_map.len;
   WRAPPER_CHECK_PTR(ctx, TX, m * hidden_dim, x);
   WRAPPER_CHECK_PTR(ctx, TY, batch * hidden_dim, y);
-  if (token_type_ids != nullptr) {
-    // token_type_ids records the token type, 1 for vision and 0 for text
-    // in text model, token_type_ids is nullptr
-    WRAPPER_CHECK_PTR(ctx, int32_t, m, token_type_ids);
-  }
   WRAPPER_ASSERT_GT(ctx, hidden_dim, 0);
   // check VectorParam
   WRAPPER_ASSERT_EQ(ctx, encoder_seqs_lods.len, encoder_batch_map.len + 1);
@@ -118,15 +99,8 @@ int eb_gather_next_token(api::Context *ctx, const TX *x, TY *y,
     WRAPPER_ASSERT_GE(ctx, decoder_batch_map.cpu[i], 0);
   }
   if (ctx->dev().type() == api::kCPU) {
-    std::vector<int> token_type_ids_vec;
-    if (token_type_ids != nullptr) {
-      token_type_ids_vec.resize(m);
-      int ret = do_device2host(ctx, token_type_ids, token_type_ids_vec.data(), m * sizeof(int32_t));
-      WRAPPER_ASSERT_SUCCESS(ctx, ret);
-    }
     return cpu_wrapper<TX, TY>(ctx, x, y, encoder_seqs_lods.cpu,
                                encoder_batch_map.cpu, decoder_batch_map.cpu,
-                               token_type_ids_vec.data(),
                                encoder_batch_map.len, decoder_batch_map.len,
                                hidden_dim);
   }
@@ -140,7 +114,6 @@ int eb_gather_next_token(api::Context *ctx, const TX *x, TY *y,
                               decoder_batch_map.to_xpu(RAII_GUARD);
   return xpu3_wrapper<TX, TY>(ctx, x, y, encoder_seqs_lods_xpu,
                               encoder_batch_map_xpu, decoder_batch_map_xpu,
-                              token_type_ids,
                               encoder_batch_map.len, decoder_batch_map.len,
                               hidden_dim);
 }
@@ -149,7 +122,7 @@ int eb_gather_next_token(api::Context *ctx, const TX *x, TY *y,
 #define INSTANTIATION_EB_GATHER_NEXT_TOKEN(TX, TY) \
   template int eb_gather_next_token<TX, TY>( \
       api::Context *, const TX *, TY *, api::VectorParam<int32_t> &, \
-      api::VectorParam<int32_t> &, api::VectorParam<int32_t> &, const int32_t*, int64_t);
+      api::VectorParam<int32_t> &, api::VectorParam<int32_t> &, int64_t);

 INSTANTIATION_EB_GATHER_NEXT_TOKEN(float16, float16);
 INSTANTIATION_EB_GATHER_NEXT_TOKEN(bfloat16, bfloat16);
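
After this commit the op is a plain per-row gather. For reference, a NumPy sketch of the cpu_wrapper semantics above; the decoder indexing (one row per decoder sequence, starting at encoder_len_total) is inferred from the surrounding context lines and should be treated as an assumption:

    import numpy as np

    def eb_gather_next_token_ref(x, lods, enc_map, dec_map):
        # x: [token_num, hidden_dim] hidden states -> y: [batch, hidden_dim].
        en_batch, de_batch = len(enc_map), len(dec_map)
        y = np.zeros((en_batch + de_batch, x.shape[1]), dtype=x.dtype)
        enc_total = lods[en_batch]  # encoder_len_total in the wrapper
        for i in range(en_batch):
            y[enc_map[i]] = x[lods[i + 1] - 1]  # last token of encoder sequence i
        for i in range(de_batch):
            y[dec_map[i]] = x[enc_total + i]    # assumed: decoder rows follow encoder tokens
        return y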

fastdeploy/model_executor/forward_meta.py
Lines changed: 0 additions & 2 deletions

@@ -234,8 +234,6 @@ class XPUForwardMeta(ForwardMeta):
     total_enc_len: Optional[paddle.Tensor] = None
     # position embedding type in rope, supports 'NORMAL' or 'HALF_HEAD_DIM'
     pos_emb_type: Optional[str] = "NORMAL"
-    # used in VL model, record the token type, 1 for image, 0 for text
-    token_type_ids: Optional[paddle.Tensor] = None


 @dataclass

fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
Lines changed: 0 additions & 3 deletions

@@ -778,9 +778,6 @@ def forward(
         )
         self._input_embeddings.copy_(input_embeddings, False)

-        if vl_moe_meta.image_token_num.item() > 0:  # for XPU
-            forward_meta.token_type_ids = vl_moe_meta.token_type_ids
-
         hidden_states = self.ernie(
             input_embeddings=self._input_embeddings,
             ids_remove_padding=ids_remove_padding,

fastdeploy/worker/xpu_model_runner.py
Lines changed: 9 additions & 1 deletion

@@ -184,7 +184,6 @@ def xpu_process_output(
         xpu_forward_meta.enc_batch,
         xpu_forward_meta.dec_batch,
         None,  # output_padding_offset
-        xpu_forward_meta.token_type_ids,  # token_type_ids
         -1,  # max_input_length
     )
     return hiddden_states
@@ -1231,6 +1230,15 @@ def update_share_input_block_num(self, num_gpu_blocks: int) -> None:
             }
         )

+    def clear_block_table(self) -> None:
+        """
+        Clear the block tables and kv cache after profiling.
+        """
+        del self.share_inputs["caches"]
+        if self.forward_meta is not None:
+            del self.forward_meta.caches
+        paddle.device.xpu.empty_cache()
+
     def cal_theortical_kvcache(self):
         """
         Calculate the total block memory required at the model level
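
One point worth noting about the new clear_block_table: del self.share_inputs["caches"] only removes a Python reference, which is why forward_meta.caches is deleted as well; the allocation is freed once no references remain, and paddle.device.xpu.empty_cache() then returns it to the device. A hedged sketch of the intended call order, mirroring determine_available_memory in xpu_worker.py below (the runner variable is illustrative):

    # Sketch only, assuming a model-runner instance named runner.
    runner.prepare_profile()
    runner.profile_run()          # allocates the profiling-time KV caches
    # ... measure memory and compute the KV-cache budget ...
    runner.clear_block_table()    # drops cache references, then empties the allocator cache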

fastdeploy/worker/xpu_worker.py
Lines changed: 14 additions & 50 deletions

@@ -15,7 +15,6 @@
 """

 import gc
-import time
 from typing import List, Optional

 import paddle
@@ -101,73 +100,38 @@ def determine_available_memory(self) -> int:
             len(self.device_ids) > self.local_rank
         ), f"device number must be greater than local rank, but get device number is {len(self.device_ids)}, rank is {self.local_rank}"

-        # 1. Record memory state before profile run
-        start_time = time.perf_counter()
-        Gb = 1024**3
-        local_rank = self.local_rank % 8
-        paddle.device.xpu.reset_max_memory_reserved(local_rank)
-        paddle.device.xpu.reset_max_memory_allocated(local_rank)
-        paddle_reserved_mem_before_run = paddle.device.xpu.max_memory_reserved(local_rank)
-        paddle_allocated_mem_before_run = paddle.device.xpu.max_memory_allocated(local_rank)
-        before_run_mem_total = xpu_get_total_global_memory(int(self.device_ids[self.local_rank]))
-        before_run_mem_used = xpu_get_used_global_memory(int(self.device_ids[self.local_rank]))
-        before_run_mem_free = xpu_get_free_global_memory(int(self.device_ids[self.local_rank]))
+        total_memory = xpu_get_total_global_memory(int(self.device_ids[self.local_rank]))
+        used_memory = xpu_get_used_global_memory(int(self.device_ids[self.local_rank]))
+        free_memory = xpu_get_free_global_memory(int(self.device_ids[self.local_rank]))

         logger.info(
-            (
-                "Before running the profile, the memory usage info is as follows:",
-                f"\nDevice Total memory: {before_run_mem_total / Gb}",
-                f"\nDevice used memory: {before_run_mem_used / Gb}",
-                f"\nDevice free memory: {before_run_mem_free / Gb}",
-                f"\nPaddle reserved memory: {paddle_reserved_mem_before_run / Gb}",
-                f"\nPaddle allocated memory: {paddle_allocated_mem_before_run / Gb}",
-            )
+            f"Before warm up, total_memory: {total_memory}, \
+            used_memory: {used_memory}, free_memory: {free_memory}"
         )

-        # 2. Profile run
         self.model_runner.prepare_profile()
         if self.parallel_config.use_ep:
             logger.warning("EP mode does not support profile run.")
         else:
             self.model_runner.profile_run()
         set_random_seed(self.fd_config.model_config.seed)

-        # 3. Statistical memory information
-        paddle_reserved_mem_after_run = paddle.device.xpu.max_memory_reserved(local_rank)
-        paddle_allocated_mem_after_run = paddle.device.xpu.max_memory_allocated(local_rank)
-
+        total_available_memory = int(total_memory * self.cache_config.gpu_memory_utilization)
+        used_memory = xpu_get_used_global_memory(int(self.device_ids[self.local_rank]))
+        available_kv_cache_memory = total_available_memory - used_memory
         model_block_memory_used = self.cal_theortical_kvcache()
-        paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run
-
-        paddle.device.xpu.empty_cache()
-
-        after_run_mem_total = xpu_get_total_global_memory(int(self.device_ids[self.local_rank])).item()
-        after_run_mem_used = xpu_get_used_global_memory(int(self.device_ids[self.local_rank])).item()
-        after_run_mem_free = xpu_get_free_global_memory(int(self.device_ids[self.local_rank])).item()
-
-        available_kv_cache_memory = (
-            after_run_mem_total * self.cache_config.gpu_memory_utilization - after_run_mem_used - paddle_peak_increase
-        )
         available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
-
         if self.parallel_config.use_ep:
             available_kv_cache_memory = int(available_kv_cache_memory * 0.6)

-        end_time = time.perf_counter()
+        self.model_runner.clear_block_table()
+
         logger.info(
-            (
-                "After running the profile, the memory usage info is as follows:",
-                f"\nDevice Total memory: {after_run_mem_total / Gb}",
-                f"\nDevice used memory: {after_run_mem_used / Gb}",
-                f"\nDevice free memory: {after_run_mem_free / Gb}",
-                f"\nPaddle reserved memory: {paddle_reserved_mem_after_run / Gb}",
-                f"\nPaddle allocated memory: {paddle_allocated_mem_after_run / Gb}",
-                f"\nAvailable KV Cache meomory: {available_kv_cache_memory / Gb}",
-                f"Profile time: {end_time - start_time}",
-            )
+            f"After warm up, total_available_memory: {total_available_memory}, \
+            used_memory: {used_memory}, available_kv_cache_memory: {available_kv_cache_memory}"
         )
-
-        return available_kv_cache_memory  # return to calculate the block num in this device
+        paddle.device.xpu.empty_cache()
+        return available_kv_cache_memory  # approximate value

     def load_model(self) -> None:
         """Load model"""
