@@ -2509,7 +2509,6 @@ at::Tensor _qscaled_dot_product_cpu(
 #ifdef CPU_CAPABILITY_AVX512
   if (at::native::cpublas::could_pack(dtype)) {
     at::Tensor output = at::empty_like(query, query.options()).transpose(1, 2);
-    std::cout << "int8_sdpa_fused_kernel" << std::endl;
     int8_sdpa_fused_kernel(output, query, key, value,
         dropout_p, is_causal, attn_mask, scale,
         q_scale, q_zp,
@@ -2520,7 +2519,6 @@ at::Tensor _qscaled_dot_product_cpu(
     return output.transpose(1, 2);
   } else {
 #endif // CPU_CAPABILITY_AVX512
-    std::cout << "int8_sdpa_math_kernel" << std::endl;
     return int8_sdpa_math_kernel(query, key, value,
         dropout_p, is_causal, attn_mask, scale,
         q_scale, q_zp,
@@ -2536,7 +2534,6 @@ at::Tensor _qscaled_dot_product_cpu(
 // CPUBLAS_BRGEMM_F8F8F32 is defined if FP8 BRGEMM is supported in PyTorch CPUBlas.
   if (at::native::cpublas::could_pack(dtype)) {
     at::Tensor output = at::empty_like(query, query.options()).transpose(1, 2);
-    std::cout << "fp8_sdpa_fused_kernel" << std::endl;
     fp8_sdpa_fused_kernel(output, query, key, value,
         dropout_p, is_causal, attn_mask, scale,
         q_scale, k_scale,
@@ -2545,7 +2542,6 @@ at::Tensor _qscaled_dot_product_cpu(
     return output.transpose(1, 2);
   } else {
 #endif // CPU_CAPABILITY_AVX512 && CPUBLAS_BRGEMM_F8F8F32
-    std::cout << "fp8_sdpa_math_kernel" << std::endl;
     return fp8_sdpa_math_kernel(query, key, value,
         dropout_p, is_causal, attn_mask, scale,
         q_scale, k_scale,
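
For context, every hunk above follows the same capability-gated dispatch shape: when the build has AVX-512 support and the dtype can be packed, a fused SDPA kernel writes into a preallocated transposed output; otherwise control falls through to a reference "math" kernel. The stray std::cout debug prints removed here sat inside both branches. Below is a minimal, self-contained C++ sketch of that dispatch pattern after cleanup; all names (could_pack_stub, fused_kernel_stub, math_kernel_stub) are illustrative stand-ins, not the actual PyTorch APIs.

// Sketch of the compile-time/runtime gated dispatch used above.
// Stand-in names only; the real code uses at::native::cpublas::could_pack
// and the *_sdpa_fused_kernel / *_sdpa_math_kernel functions.
#include <iostream>

// Stand-in for the runtime packing check on the input dtype.
bool could_pack_stub() { return true; }

// Stand-ins for the fused (vectorized) and reference (math) paths.
const char* fused_kernel_stub() { return "fused"; }
const char* math_kernel_stub()  { return "math"; }

const char* dispatch() {
// Compile-time gate: the fused path is only compiled in when the build
// has AVX-512 support; otherwise only the math fallback exists.
#ifdef CPU_CAPABILITY_AVX512
  if (could_pack_stub()) {
    return fused_kernel_stub();
  } else {
#endif
    return math_kernel_stub();
#ifdef CPU_CAPABILITY_AVX512
  }
#endif
}

int main() {
  // Prints "fused" when built with -DCPU_CAPABILITY_AVX512, else "math".
  std::cout << dispatch() << std::endl;
}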