Add UnaryOpType::Print which can be helpful for debugging #1878

Merged 3 commits on Aug 1, 2022. Changes from all commits are shown below.
1 change: 1 addition & 0 deletions torch/csrc/jit/codegen/cuda/arith.cpp
@@ -466,6 +466,7 @@ NVFUSER_DEFINE_UNARY_OP(relu, Relu)
NVFUSER_DEFINE_UNARY_OP(round, Round)
NVFUSER_DEFINE_UNARY_OP(silu, Silu)
NVFUSER_DEFINE_UNARY_OP(trunc, Trunc)
NVFUSER_DEFINE_UNARY_OP(print, Print)
#undef NVFUSER_DEFINE_UNARY_OP

Val* randlike(Val* v) {
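The NVFUSER_DEFINE_UNARY_OP macro itself is outside this hunk; as a rough sketch, the new NVFUSER_DEFINE_UNARY_OP(print, Print) line presumably expands to something like the following, assuming the macro forwards to nvFuser's generic unaryOp() overloads the way the neighboring entries do (hypothetical reconstruction, not part of the diff):

// Hypothetical expansion of NVFUSER_DEFINE_UNARY_OP(print, Print),
// assuming the macro forwards to the generic unaryOp() helper.
Val* print(Val* v) {
  return unaryOp(UnaryOpType::Print, v);
}
TensorView* print(TensorView* tv) {
  return unaryOp(UnaryOpType::Print, tv);
}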
3 changes: 3 additions & 0 deletions torch/csrc/jit/codegen/cuda/arith.h
@@ -245,6 +245,9 @@ TORCH_CUDA_CU_API TensorView* isposinf(TensorView*);
// isreal
TORCH_CUDA_CU_API Val* isreal(Val*);
TORCH_CUDA_CU_API TensorView* isreal(TensorView*);
// print
TORCH_CUDA_CU_API Val* print(Val*);
TORCH_CUDA_CU_API TensorView* print(TensorView*);

// Broadcasts inp based on bool vector. Size of broadcast bool vector should be
// the number of dims desired in the broadcasted tensor. This vector should be
100 changes: 100 additions & 0 deletions torch/csrc/jit/codegen/cuda/runtime/helpers.cu
@@ -528,3 +528,103 @@ __device__ inline int64_t readCycleCounter() {
__threadfence();
return clock64();
}

__device__ float print_impl(const char* name, float value) {
printf(
"%s = %f @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
value,
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}

__device__ double print_impl(const char* name, double value) {
printf(
"%s = %lf @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
value,
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}

__device__ int print_impl(const char* name, int value) {
printf(
"%s = %d @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
value,
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}

__device__ int64_t print_impl(const char* name, int64_t value) {
printf(
"%s = %ld @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
value,
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}

__device__ bool print_impl(const char* name, bool value) {
printf(
"%s = %s @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
value ? "true" : "false",
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}

__device__ __half print_impl(const char* name, __half value) {
printf(
"%s = %f @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
__half2float(value),
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}

__device__ __bfloat print_impl(const char* name, __bfloat value) {
printf(
"%s = %f @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
__bfloat2float(value),
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}

#define print(...) print_impl(#__VA_ARGS__, (__VA_ARGS__))
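Because the macro stringizes its argument via #__VA_ARGS__, a call like print(T3[0]) in generated kernel code becomes print_impl("T3[0]", (T3[0])), which is where the T3[0] = ... labels in the review comment quoted below come from; the value is returned unchanged, so the call can sit in the middle of a larger expression. A self-contained sketch of the same trick (hypothetical demo names, not part of the PR):

// demo.cu: standalone illustration of the stringizing pass-through macro.
#include <cstdio>

__device__ float print_impl_demo(const char* name, float value) {
  // Label the value with the exact source expression that produced it.
  printf(
      "%s = %f @ threadIdx=(%d,%d,%d)\n",
      name,
      value,
      (int)threadIdx.x,
      (int)threadIdx.y,
      (int)threadIdx.z);
  return value;  // pass-through: usable inside an expression
}

#define print_demo(...) print_impl_demo(#__VA_ARGS__, (__VA_ARGS__))

__global__ void kernel(float* out) {
  // Expands to: print_impl_demo("out[threadIdx.x] * 2.0f", (out[threadIdx.x] * 2.0f))
  out[threadIdx.x] = print_demo(out[threadIdx.x] * 2.0f);
}

int main() {
  float host[2] = {1.0f, 2.0f};
  float* dev = nullptr;
  cudaMalloc(&dev, sizeof(host));
  cudaMemcpy(dev, host, sizeof(host), cudaMemcpyHostToDevice);
  kernel<<<1, 2>>>(dev);
  cudaDeviceSynchronize();  // flush device-side printf before exit
  cudaFree(dev);
  return 0;
}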
38 changes: 38 additions & 0 deletions torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
@@ -25170,6 +25170,44 @@ TEST_F(NVFuserTest, FusionIdGraphTrivialReduction_CUDA) {
}
}

TEST_F(NVFuserTest, FusionPrint_CUDA) {
auto dtypes = {
at::kFloat,
at::kDouble,
at::kHalf,
at::kBFloat16,
at::kInt,
at::kLong,
at::kBool};
for (auto dtype : dtypes) {
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

auto tv0 = makeSymbolicTensor(1, aten_to_data_type(dtype));
fusion->addInput(tv0);
auto tv1 = print(tv0);
auto tv2 = sin(tv1);
fusion->addOutput(tv2);

// There is no way to check if anything is printed to the console, but we
// can validate that when a print exists, compilation and computation are not
// broken.
auto options = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
at::Tensor t0 = at::arange(2, options).to(dtype);

FusionExecutorCache executor_cache(std::move(fusion));
auto cg_outputs = executor_cache.runFusionWithInputs({t0});
@zasdfgbnm (Collaborator, Author) commented on Jul 29, 2022:
Running the test will print the following; I hope this doesn't matter.

T3[0] = 0.000000 @ threadIdx=(0,0,0), blockIdx=(0,0,0)
T3[0] = 1.000000 @ threadIdx=(1,0,0), blockIdx=(0,0,0)
T3[0] = 0.000000 @ threadIdx=(0,0,0), blockIdx=(0,0,0)
T3[0] = 1.000000 @ threadIdx=(1,0,0), blockIdx=(0,0,0)
T4[0] = 0.000000 @ threadIdx=(0,0,0), blockIdx=(0,0,0)
T4[0] = 1.000000 @ threadIdx=(1,0,0), blockIdx=(0,0,0)
T4[0] = 0.000000 @ threadIdx=(0,0,0), blockIdx=(0,0,0)
T4[0] = 1.000000 @ threadIdx=(1,0,0), blockIdx=(0,0,0)
T4[0] = 0 @ threadIdx=(0,0,0), blockIdx=(0,0,0)
T4[0] = 1 @ threadIdx=(1,0,0), blockIdx=(0,0,0)
T4[0] = 0 @ threadIdx=(0,0,0), blockIdx=(0,0,0)
T4[0] = 1 @ threadIdx=(1,0,0), blockIdx=(0,0,0)
T4[0] = false @ threadIdx=(0,0,0), blockIdx=(0,0,0)
T4[0] = true @ threadIdx=(1,0,0), blockIdx=(0,0,0)


testValidate(
executor_cache.fusion(),
cg_outputs,
{t0},
{t0.sin()},
__LINE__,
__FILE__);
}
}

} // namespace jit
} // namespace torch
#endif // #if defined(USE_CUDA)
3 changes: 3 additions & 0 deletions torch/csrc/jit/codegen/cuda/type.cpp
@@ -376,6 +376,7 @@ bool needFloatSuffix(UnaryOpType t) {
case UnaryOpType::IsNegInf:
case UnaryOpType::IsPosInf:
case UnaryOpType::IsReal:
case UnaryOpType::Print:
return false;
default:
return true;
@@ -432,6 +433,8 @@ static const char* unary_op_type2string(UnaryOpType t) {
return "neg";
case UnaryOpType::Not:
return "not";
case UnaryOpType::Print:
return "print";
case UnaryOpType::RandLike:
return "randLike";
case UnaryOpType::Reciprocal:
3 changes: 3 additions & 0 deletions torch/csrc/jit/codegen/cuda/type.h
@@ -179,6 +179,9 @@ enum class UnaryOpType {
Tanh,
Trunc,

// Tools to help with debugging
Print,

// Might be a bitwise operator or boolean operator.
Not,
