PaddlePaddle · zhangbo9674 · Aug 1, 2025 · Jul 29, 2025 · Jul 30, 2025 · Jul 30, 2025
diff --git a/python/paddle/incubate/fp8/deep_gemm/jit/compiler.py b/python/paddle/incubate/fp8/deep_gemm/jit/compiler.py
@@ -23,8 +23,6 @@
 import subprocess
 import uuid
 
-import paddle
-
 from ..utils import get_cuda_home
 from . import interleave_ffma
 from .runtime import Runtime, RuntimeCache
@@ -158,7 +156,6 @@ def build(name: str, arg_defs: tuple, code: str) -> Runtime:
         if os.getenv("DG_JIT_DEBUG", None):
             print(f"Using cached JIT runtime {name} during build")
         return runtime_cache[path]
-    paddle.base.core.nvprof_nvtx_pop()
     # Write the code
     os.makedirs(path, exist_ok=True)
     args_path = f"{path}/kernel.args"

diff --git a/python/paddle/incubate/fp8/deep_gemm/jit_kernels/gemm.py b/python/paddle/incubate/fp8/deep_gemm/jit_kernels/gemm.py
@@ -255,7 +255,6 @@ def gemm_fp8_fp8_bf16_nt(
     # NOTES: `get_tma_aligned_lhs_scales` may launch a kernel if not processed by previous kernels
     lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
     assert rhs_scales.is_contiguous()
-    paddle.base.core.nvprof_nvtx_pop()
 
     # Do nothing if `m` is zero
     if m == 0:
@@ -275,4 +274,3 @@ def gemm_fp8_fp8_bf16_nt(
 
     # Run the kernel.
     runtime(*args)
-    paddle.base.core.nvprof_nvtx_pop()