@@ -85,21 +85,13 @@ def benchmark_shape(m: int,
 
     # === DeepGEMM Implementation ===
     def deepgemm_gemm():
-        # A quantization is inside the loop as it depends on activations
-        # A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
-        # A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
-        #     A, block_size[1])
-        # A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
-        # C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
         deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
                                        (B_deepgemm, B_scale_deepgemm),
                                        C_deepgemm)
         return C_deepgemm
 
     # === vLLM Triton Implementation ===
     def vllm_triton_gemm():
-        # A quantization is inside the loop as it depends on activations
-        # A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
         return w8a8_block_fp8_matmul(A_vllm,
                                      B_vllm,
                                      A_scale_vllm,
@@ -109,9 +101,6 @@ def vllm_triton_gemm():
 
     # === vLLM CUTLASS Implementation ===
     def vllm_cutlass_gemm():
-        # A quantization is inside the loop as it depends on activations
-        # A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
-        #     A, block_size[1], column_major_scales=True)
         return ops.cutlass_scaled_mm(A_vllm_cutlass,
                                      B_vllm.T,
                                      scale_a=A_scale_vllm_cutlass,
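
For context, the commented-out lines removed above are leftovers: the timed closures reference activation tensors (`A_deepgemm`, `A_vllm`, `A_vllm_cutlass`) and their scales that must already exist in the enclosing `benchmark_shape()` scope, so only the GEMM kernels themselves are measured. Below is a minimal, hypothetical sketch of that per-shape setup, assuming the `per_token_group_quant_fp8` and `get_col_major_tma_aligned_tensor` helpers named in the deleted comments; the import paths, example shape, and block size are illustrative assumptions, not code from this commit.

```python
# Hypothetical sketch (not part of this commit): preparing the activation
# tensors the benchmarked closures rely on, once per shape, outside the loop.
import torch
import deep_gemm  # assumed to export get_col_major_tma_aligned_tensor
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8)  # assumed module path

m, n, k = 4096, 7168, 2048   # example GEMM shape (assumption)
block_size = [128, 128]      # assumed 128x128 block quantization granularity

A = torch.randn(m, k, device='cuda', dtype=torch.bfloat16)

# Per-token-group FP8 quantization of A (call signatures taken from the
# commented-out lines this commit deletes).
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
    A, block_size[1], column_major_scales=True)

# DeepGEMM additionally expects TMA-aligned activation scales and a
# preallocated bf16 output buffer.
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
A_scale_deepgemm = deep_gemm.get_col_major_tma_aligned_tensor(A_scale_deepgemm)
C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
```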