diff --git a/python/paddle/incubate/fp8/deep_gemm/jit/compiler.py b/python/paddle/incubate/fp8/deep_gemm/jit/compiler.py
index 1a9e7943453395..2d6e27707e726b 100644
--- a/python/paddle/incubate/fp8/deep_gemm/jit/compiler.py
+++ b/python/paddle/incubate/fp8/deep_gemm/jit/compiler.py
@@ -23,8 +23,6 @@
 import subprocess
 import uuid
 
-import paddle
-
 from ..utils import get_cuda_home
 from . import interleave_ffma
 from .runtime import Runtime, RuntimeCache
@@ -158,7 +156,6 @@ def build(name: str, arg_defs: tuple, code: str) -> Runtime:
         if os.getenv("DG_JIT_DEBUG", None):
             print(f"Using cached JIT runtime {name} during build")
         return runtime_cache[path]
-    paddle.base.core.nvprof_nvtx_pop()
     # Write the code
     os.makedirs(path, exist_ok=True)
     args_path = f"{path}/kernel.args"
diff --git a/python/paddle/incubate/fp8/deep_gemm/jit_kernels/gemm.py b/python/paddle/incubate/fp8/deep_gemm/jit_kernels/gemm.py
index 6bf3e39e54465d..a84fbad6e30348 100644
--- a/python/paddle/incubate/fp8/deep_gemm/jit_kernels/gemm.py
+++ b/python/paddle/incubate/fp8/deep_gemm/jit_kernels/gemm.py
@@ -255,7 +255,6 @@ def gemm_fp8_fp8_bf16_nt(
     # NOTES: `get_tma_aligned_lhs_scales` may launch a kernel if not processed by previous kernels
     lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales)
     assert rhs_scales.is_contiguous()
-    paddle.base.core.nvprof_nvtx_pop()
 
     # Do nothing if `m` is zero
     if m == 0:
@@ -275,4 +274,3 @@ def gemm_fp8_fp8_bf16_nt(
 
     # Run the kernel.
     runtime(*args)
-    paddle.base.core.nvprof_nvtx_pop()