From a598d85e387ccdaf9e6e08b8cd816f453d919ebe Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 21 Dec 2022 08:16:35 -0500 Subject: [PATCH 01/28] Add test_gpu_match_frontend.cpp This file will hold tests that reproduce those in test_nvfuser_frontend.py. --- .../nvfuser/test/test_gpu_match_frontend.cpp | 175 ++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 third_party/nvfuser/test/test_gpu_match_frontend.cpp diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp new file mode 100644 index 000000000000..3dc4f686b808 --- /dev/null +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -0,0 +1,175 @@ +// NOTE: This was copied from test_gpu_indexing.cpp 2022-12-15 to experiment with writing tests - JH + +#if defined(USE_CUDA) +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + + +#include + +// Tests go in torch::jit +namespace torch { +namespace jit { + +using namespace torch::jit::fuser::cuda; + + +TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + auto c0 = IrBuilder::create(3.0); + + fusion.addInput(tv0); + + auto tv1 = mul(tv0, c0); + auto tv2 = sum(tv1, {-1}, false, DataType::Float); + + fusion.addOutput(tv2); + + std::cout << "Before merge/split/parallelize:" << std::endl; + fusion.printMath(); + + fusion.printTransforms(); + + // {i0, i1} + tv0->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0, i1 / blockDim.x, blockDim.x} + tv0->split(1, 1); // {i0, i1 / blockDim.x, 1, blockDim.x} + tv0->split(1, NamedScalar::getParallelDim(ParallelType::BIDx), false); // {i0, gridDim.x, (i1 / blockDim.x) / gridDim.x, 1, blockDim.x} + tv0->axis(0)->parallelize(ParallelType::BIDy); + tv0->axis(1)->parallelize(ParallelType::BIDx); + tv0->axis(4)->parallelize(ParallelType::TIDx); + + std::cout << std::endl << "After split/parallelize tv0:" << std::endl; + fusion.printTransforms(); + + tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + tv1->split(1, 1); + tv1->split(1, NamedScalar::getParallelDim(ParallelType::BIDx), false); + tv1->axis(0)->parallelize(ParallelType::BIDy); + tv1->axis(1)->parallelize(ParallelType::BIDx); + tv1->axis(4)->parallelize(ParallelType::TIDx); + + std::cout << std::endl << "After split/parallelize tv1:" << std::endl; + fusion.printTransforms(); + + // {i0, i1} + tv2->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + tv2->split(1, 1); + tv2->split(1, NamedScalar::getParallelDim(ParallelType::BIDx), false); + tv2->axis(0)->parallelize(ParallelType::BIDy); + tv2->axis(1)->parallelize(ParallelType::BIDx); + tv2->axis(3)->parallelize(ParallelType::TIDy); + tv2->axis(4)->parallelize(ParallelType::TIDx); + + std::cout << std::endl << "After split/parallelize tv2:" << std::endl; + fusion.printTransforms(); + + int x = 2, y = 4, z = 8; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({y, z}, options); + + auto t1 = t0 * 3.0; auto t2 = t1.sum({-1}, false); + + std::vector inputs = {t0}; + + FusionExecutor fe; + std::cout << "Compiling..." << std::endl; + fe.compileFusion(&fusion, inputs); + std::cout << "Running..." 
<< std::endl; + auto cg_outputs = fe.runFusion(inputs); + + testValidate( + &fusion, cg_outputs, inputs, {t2}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(3); + auto tv1 = makeSymbolicTensor(3); + auto c0 = IrBuilder::create(0.1); + + fusion.addInput(tv0); + fusion.addInput(tv1); + + auto tv2 = add(tv0, tv1); + auto tv3 = mul(tv2, c0); + auto tv4 = sum(tv3, {-1}, false, DataType::Float); + + fusion.addOutput(tv4); + + std::cout << "Before merge/split/parallelize:" << std::endl; + fusion.printMath(); + fusion.printKernel(); + + tv0->merge(0); + tv0->split(0, 128); + tv0->axis(0)->parallelize(ParallelType::BIDx); + tv0->axis(1)->parallelize(ParallelType::TIDx); + + tv1->merge(0); + tv1->split(0, 128); + tv1->axis(0)->parallelize(ParallelType::BIDx); + tv1->axis(1)->parallelize(ParallelType::TIDx); + + tv2->merge(0); + tv2->split(0, 128); + tv2->axis(0)->parallelize(ParallelType::BIDx); + tv2->axis(1)->parallelize(ParallelType::TIDx); + + tv3->merge(0); + tv3->split(0, 128); + tv3->axis(0)->parallelize(ParallelType::BIDx); + tv3->axis(1)->parallelize(ParallelType::TIDx); + + tv4->merge(0); + tv4->split(0, 128); + tv4->axis(0)->parallelize(ParallelType::BIDx); + tv4->axis(1)->parallelize(ParallelType::TIDx); + + std::cout << "After merge/split/parallelize:" << std::endl; + fusion.printMath(); + fusion.printKernel(); + + int x = 2, y = 4, z = 8; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({x, y, z}, options); + at::Tensor t1 = at::randn({x, y, z}, options); + + auto t2 = t0.add(t1); + auto t3 = t2.add(0.1); + auto t4 = t2.sum({-1}, false); + + std::vector inputs = {t0, t1}; + + FusionExecutor fe; + std::cout << "Compiling..." << std::endl; + fe.compileFusion(&fusion, inputs); + std::cout << "Running..." << std::endl; + auto cg_outputs = fe.runFusion(inputs); + + testValidate( + &fusion, cg_outputs, inputs, {t4}, __LINE__, __FILE__); +} + +} // namespace jit +} // namespace torch +#endif // #if defined(USE_CUDA) From 82904825c676df050a912365f4e1db5b4927c3d9 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 28 Dec 2022 14:22:57 -0500 Subject: [PATCH 02/28] Properly parallelize SuperBasic and Basic tests Added rFactor and computeAt along the split reduction dimension. Since the reduction dimension was split, we also need to specify a size for the blocks. 
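In outline, the scheduling pattern both tests now use is the following (a
sketch using the SuperBasic tensor names; the exact code is in the diff
below, and the /*bdimx=*/ annotation is my reading of the LaunchParams
argument order suggested by the runtime_threadIdx_dim name):

```cpp
// Split the reduction axis by blockDim.x and bind the inner piece to TIDx,
// then rFactor the serial remainder so each thread accumulates a partial
// sum before the block-level reduction.
tv2->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
tv2->axis(0)->parallelize(ParallelType::BIDx);
tv2->axis(-1)->parallelize(ParallelType::TIDx);
auto tv3 = tv2->rFactor({1});

// Since blockDim.x is now a free parameter of the kernel, it has to be
// pinned at compile/launch time; -1 leaves a dimension to be inferred.
LaunchParams lparams(-1, -1, -1, /*bdimx=*/128, -1, -1);
fe.compileFusion(&fusion, inputs, lparams);
auto cg_outputs = fe.runFusion(inputs, lparams);
```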
--- .../nvfuser/test/test_gpu_match_frontend.cpp | 133 +++++++----------- 1 file changed, 47 insertions(+), 86 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index 3dc4f686b808..dbb74764bbbb 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -1,5 +1,3 @@ -// NOTE: This was copied from test_gpu_indexing.cpp 2022-12-15 to experiment with writing tests - JH - #if defined(USE_CUDA) #include #include @@ -8,6 +6,7 @@ #include #include #include +#include #include #include @@ -15,8 +14,6 @@ #include #include -#include - #include @@ -28,74 +25,52 @@ using namespace torch::jit::fuser::cuda; TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { + int y = 4, z = 8; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({y, z}, options); + Fusion fusion; FusionGuard fg(&fusion); - auto tv0 = makeSymbolicTensor(2); + auto tv0 = makeSymbolicTensor(2); // {i0, i1} auto c0 = IrBuilder::create(3.0); fusion.addInput(tv0); - auto tv1 = mul(tv0, c0); - auto tv2 = sum(tv1, {-1}, false, DataType::Float); + auto tv1 = mul(tv0, c0); // {i0, i1} + auto tv2 = sum(tv1, {-1}, false, DataType::Float); // {i0, r1} fusion.addOutput(tv2); - std::cout << "Before merge/split/parallelize:" << std::endl; - fusion.printMath(); - - fusion.printTransforms(); - - // {i0, i1} - tv0->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0, i1 / blockDim.x, blockDim.x} - tv0->split(1, 1); // {i0, i1 / blockDim.x, 1, blockDim.x} - tv0->split(1, NamedScalar::getParallelDim(ParallelType::BIDx), false); // {i0, gridDim.x, (i1 / blockDim.x) / gridDim.x, 1, blockDim.x} - tv0->axis(0)->parallelize(ParallelType::BIDy); - tv0->axis(1)->parallelize(ParallelType::BIDx); - tv0->axis(4)->parallelize(ParallelType::TIDx); - - std::cout << std::endl << "After split/parallelize tv0:" << std::endl; - fusion.printTransforms(); - - tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); - tv1->split(1, 1); - tv1->split(1, NamedScalar::getParallelDim(ParallelType::BIDx), false); - tv1->axis(0)->parallelize(ParallelType::BIDy); - tv1->axis(1)->parallelize(ParallelType::BIDx); - tv1->axis(4)->parallelize(ParallelType::TIDx); - - std::cout << std::endl << "After split/parallelize tv1:" << std::endl; - fusion.printTransforms(); - - // {i0, i1} - tv2->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); - tv2->split(1, 1); - tv2->split(1, NamedScalar::getParallelDim(ParallelType::BIDx), false); - tv2->axis(0)->parallelize(ParallelType::BIDy); - tv2->axis(1)->parallelize(ParallelType::BIDx); - tv2->axis(3)->parallelize(ParallelType::TIDy); - tv2->axis(4)->parallelize(ParallelType::TIDx); - - std::cout << std::endl << "After split/parallelize tv2:" << std::endl; - fusion.printTransforms(); + // {i0, r1} + tv2->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0, r1 / TIDx, TIDx} + tv2->axis(0)->parallelize(ParallelType::BIDx); + tv2->axis(-1)->parallelize(ParallelType::TIDx); + auto tv3 = tv2->rFactor({1}); + //tv3->axis(-1)->parallelize(ParallelType::Unswitch); - int x = 2, y = 4, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + // propagate the mapping to other tensors + TransformPropagatorWithCheck propagator(tv3); + MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); + scheduler_utils::parallelizeAllLike(tv3, {tv0, tv1}); - at::Tensor t0 = at::randn({y, z}, 
options); + tv0->computeAt(tv3, 1, ComputeAtMode::MostInlined); auto t1 = t0 * 3.0; auto t2 = t1.sum({-1}, false); std::vector inputs = {t0}; + // Have to know TIDx at compile time? + int runtime_threadIdx_dim = 128; + LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); + FusionExecutor fe; - std::cout << "Compiling..." << std::endl; - fe.compileFusion(&fusion, inputs); - std::cout << "Running..." << std::endl; - auto cg_outputs = fe.runFusion(inputs); + fe.compileFusion(&fusion, inputs, lparams); + auto cg_outputs = fe.runFusion(inputs, lparams); testValidate( - &fusion, cg_outputs, inputs, {t2}, __LINE__, __FILE__); + &fusion, cg_outputs, inputs, {t2}, __LINE__, __FILE__, "", lparams); } TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { @@ -115,38 +90,20 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { fusion.addOutput(tv4); - std::cout << "Before merge/split/parallelize:" << std::endl; - fusion.printMath(); - fusion.printKernel(); - - tv0->merge(0); - tv0->split(0, 128); - tv0->axis(0)->parallelize(ParallelType::BIDx); - tv0->axis(1)->parallelize(ParallelType::TIDx); + // {i0, i1, i2} + tv4->merge(0, 1); // {i0*i1, i2} + tv4->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0*i1, r2 / bDx, bDx} + tv4->axis(0)->parallelize(ParallelType::BIDx); + tv4->axis(-1)->parallelize(ParallelType::TIDx); + auto tv5 = tv4->rFactor({1}); - tv1->merge(0); - tv1->split(0, 128); - tv1->axis(0)->parallelize(ParallelType::BIDx); - tv1->axis(1)->parallelize(ParallelType::TIDx); + // propagate the mapping to other tensors + TransformPropagatorWithCheck propagator(tv5); + MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator); + scheduler_utils::parallelizeAllLike(tv5, {tv0, tv1, tv2, tv3}); - tv2->merge(0); - tv2->split(0, 128); - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::TIDx); - - tv3->merge(0); - tv3->split(0, 128); - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); - - tv4->merge(0); - tv4->split(0, 128); - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(1)->parallelize(ParallelType::TIDx); + tv0->computeAt(tv5, 1, ComputeAtMode::MostInlined); - std::cout << "After merge/split/parallelize:" << std::endl; - fusion.printMath(); - fusion.printKernel(); int x = 2, y = 4, z = 8; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -155,19 +112,23 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { at::Tensor t1 = at::randn({x, y, z}, options); auto t2 = t0.add(t1); - auto t3 = t2.add(0.1); - auto t4 = t2.sum({-1}, false); + auto t3 = t2.mul(0.1); + auto t4 = t3.sum({-1}, false); std::vector inputs = {t0, t1}; + // Have to know TIDx at compile time? + int runtime_threadIdx_dim = 128; + LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); + FusionExecutor fe; std::cout << "Compiling..." << std::endl; - fe.compileFusion(&fusion, inputs); + fe.compileFusion(&fusion, inputs, lparams); std::cout << "Running..." 
<< std::endl; - auto cg_outputs = fe.runFusion(inputs); + auto cg_outputs = fe.runFusion(inputs, lparams); testValidate( - &fusion, cg_outputs, inputs, {t4}, __LINE__, __FILE__); + &fusion, cg_outputs, inputs, {t4}, __LINE__, __FILE__, "", lparams); } } // namespace jit From a10fe6a96af40ff66df0bc0985666c9e40b2dd48 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 28 Dec 2022 15:46:27 -0500 Subject: [PATCH 03/28] Add FP16 Basic and SuperBasic frontend tests Currently the SuperBasicFP16 test works fine but the BasicFP16 test fails to compile, complaining that a producer is not in global memory based on parallelization type. This is likely because I haven't parallelized the t5float tensor properly, but I wonder why I did not encounter this in the other (2d) test SuperBasicFP16. --- .../nvfuser/test/test_gpu_match_frontend.cpp | 116 +++++++++++++++++- 1 file changed, 111 insertions(+), 5 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index dbb74764bbbb..0b7f076a2eb0 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -48,7 +48,6 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(-1)->parallelize(ParallelType::TIDx); auto tv3 = tv2->rFactor({1}); - //tv3->axis(-1)->parallelize(ParallelType::Unswitch); // propagate the mapping to other tensors TransformPropagatorWithCheck propagator(tv3); @@ -73,13 +72,62 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { &fusion, cg_outputs, inputs, {t2}, __LINE__, __FILE__, "", lparams); } +TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { + int y = 4, z = 8; + auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({y, z}, options); + + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2, DataType::Half); // {i0, i1} + auto c0 = IrBuilder::create(3.0); + + fusion.addInput(tv0); + + auto tv1 = mul(tv0, c0); // {i0, i1} + auto tv2float = sum(tv1, {-1}, false, DataType::Float); // {i0, r1} + auto tv2 = castOp(DataType::Half, tv2float); + + fusion.addOutput(tv2); + + // {i0, r1} + tv2float->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0, r1 / TIDx, TIDx} + tv2float->axis(0)->parallelize(ParallelType::BIDx); + tv2float->axis(-1)->parallelize(ParallelType::TIDx); + auto tv3 = tv2float->rFactor({1}); + + // propagate the mapping to other tensors + TransformPropagatorWithCheck propagator(tv3); + MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); + scheduler_utils::parallelizeAllLike(tv3, {tv0, tv1, tv2}); + + tv0->computeAt(tv3, 1, ComputeAtMode::MostInlined); + + auto t1 = t0 * 3.0; auto t2 = t1.sum({-1}, false, c10::kFloat); + + std::vector inputs = {t0}; + + // Have to know TIDx at compile time? 
+ int runtime_threadIdx_dim = 128; + LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs, lparams); + auto cg_outputs = fe.runFusion(inputs, lparams); + + testValidate( + &fusion, cg_outputs, inputs, {t2}, __LINE__, __FILE__, "", lparams); +} + TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(3); auto tv1 = makeSymbolicTensor(3); - auto c0 = IrBuilder::create(0.1); + auto c0 = IrBuilder::create(3.0); fusion.addInput(tv0); fusion.addInput(tv1); @@ -112,7 +160,67 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { at::Tensor t1 = at::randn({x, y, z}, options); auto t2 = t0.add(t1); - auto t3 = t2.mul(0.1); + auto t3 = t2.mul(3.0); + auto t4 = t3.sum({-1}, false); + + std::vector inputs = {t0, t1}; + + // Have to know TIDx at compile time? + int runtime_threadIdx_dim = 128; + LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs, lparams); + auto cg_outputs = fe.runFusion(inputs, lparams); + + testValidate( + &fusion, cg_outputs, inputs, {t4}, __LINE__, __FILE__, "", lparams); +} + +TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(3, DataType::Half); + auto tv1 = makeSymbolicTensor(3, DataType::Half); + auto c0 = IrBuilder::create(3.0); + + fusion.addInput(tv0); + fusion.addInput(tv1); + + auto tv2 = add(tv0, tv1); + auto tv3 = mul(tv2, c0); + auto tv4float = sum(tv3, {-1}, false, DataType::Float); + auto tv4 = castOp(DataType::Half, tv4float); + + fusion.addOutput(tv4); + + // {i0, i1, i2} + tv4float->merge(0, 1); // {i0*i1, i2} + tv4float->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0*i1, r2 / bDx, bDx} + tv4float->axis(0)->parallelize(ParallelType::BIDx); + tv4float->axis(-1)->parallelize(ParallelType::TIDx); + auto tv5 = tv4float->rFactor({1}); + + // propagate the mapping to other tensors + TransformPropagatorWithCheck propagator(tv5); + MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator); + scheduler_utils::parallelizeAllLike(tv5, {tv0, tv1, tv2, tv3}); + + tv0->computeAt(tv5, 1, ComputeAtMode::MostInlined); + + fusion.printTransforms(); + fusion.printKernel(); + + + int x = 2, y = 4, z = 8; + auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({x, y, z}, options); + at::Tensor t1 = at::randn({x, y, z}, options); + + auto t2 = t0.add(t1); + auto t3 = t2.mul(3.0); auto t4 = t3.sum({-1}, false); std::vector inputs = {t0, t1}; @@ -122,9 +230,7 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); FusionExecutor fe; - std::cout << "Compiling..." << std::endl; fe.compileFusion(&fusion, inputs, lparams); - std::cout << "Running..." << std::endl; auto cg_outputs = fe.runFusion(inputs, lparams); testValidate( From d7cc4425f88d74e1dcc76fab7e87bc78aa9ddaf1 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 29 Dec 2022 11:26:55 -0500 Subject: [PATCH 04/28] Add parallelize tv4 in FusionFrontendBasicFP16_CUDA This fixes one bug but now I have variable sized static arrays popping up. Also I fixed my precommit hooks so clang-format was run and hopefully the CI will pass. 
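For reference, the parallelization fix itself is a single line: include the
half-precision cast output in the propagation (a sketch, with tensor names as
in the test; tv5 is the rFactor result and tv4 the Half cast of the Float
reduction):

```cpp
// Leaving tv4 out of this list kept its domain serial, which is what raised
// the "producer is not in global memory" error during lowering.
scheduler_utils::parallelizeAllLike(tv5, {tv0, tv1, tv2, tv3, tv4});
```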
--- .../nvfuser/test/test_gpu_match_frontend.cpp | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index 0b7f076a2eb0..e46c9752c421 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -14,7 +14,6 @@ #include #include - #include // Tests go in torch::jit @@ -23,7 +22,6 @@ namespace jit { using namespace torch::jit::fuser::cuda; - TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { int y = 4, z = 8; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -44,7 +42,9 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { fusion.addOutput(tv2); // {i0, r1} - tv2->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0, r1 / TIDx, TIDx} + tv2->split( + 1, + NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0, r1 / TIDx, TIDx} tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(-1)->parallelize(ParallelType::TIDx); auto tv3 = tv2->rFactor({1}); @@ -56,7 +56,8 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { tv0->computeAt(tv3, 1, ComputeAtMode::MostInlined); - auto t1 = t0 * 3.0; auto t2 = t1.sum({-1}, false); + auto t1 = t0 * 3.0; + auto t2 = t1.sum({-1}, false); std::vector inputs = {t0}; @@ -93,7 +94,9 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { fusion.addOutput(tv2); // {i0, r1} - tv2float->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0, r1 / TIDx, TIDx} + tv2float->split( + 1, + NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0, r1 / TIDx, TIDx} tv2float->axis(0)->parallelize(ParallelType::BIDx); tv2float->axis(-1)->parallelize(ParallelType::TIDx); auto tv3 = tv2float->rFactor({1}); @@ -105,7 +108,8 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { tv0->computeAt(tv3, 1, ComputeAtMode::MostInlined); - auto t1 = t0 * 3.0; auto t2 = t1.sum({-1}, false, c10::kFloat); + auto t1 = t0 * 3.0; + auto t2 = t1.sum({-1}, false, c10::kFloat); std::vector inputs = {t0}; @@ -140,7 +144,10 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { // {i0, i1, i2} tv4->merge(0, 1); // {i0*i1, i2} - tv4->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0*i1, r2 / bDx, bDx} + tv4->split( + 1, + NamedScalar::getParallelDim( + ParallelType::TIDx)); // {i0*i1, r2 / bDx, bDx} tv4->axis(0)->parallelize(ParallelType::BIDx); tv4->axis(-1)->parallelize(ParallelType::TIDx); auto tv5 = tv4->rFactor({1}); @@ -152,7 +159,6 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { tv0->computeAt(tv5, 1, ComputeAtMode::MostInlined); - int x = 2, y = 4, z = 8; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -195,9 +201,8 @@ TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { fusion.addOutput(tv4); - // {i0, i1, i2} - tv4float->merge(0, 1); // {i0*i1, i2} - tv4float->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0*i1, r2 / bDx, bDx} + tv4float->merge(0, 1); + tv4float->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); tv4float->axis(0)->parallelize(ParallelType::BIDx); tv4float->axis(-1)->parallelize(ParallelType::TIDx); auto tv5 = tv4float->rFactor({1}); @@ -205,14 +210,10 @@ TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { // propagate the mapping to other tensors TransformPropagatorWithCheck propagator(tv5); MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator); - scheduler_utils::parallelizeAllLike(tv5, {tv0, tv1, tv2, tv3}); + 
scheduler_utils::parallelizeAllLike(tv5, {tv0, tv1, tv2, tv3, tv4}); tv0->computeAt(tv5, 1, ComputeAtMode::MostInlined); - fusion.printTransforms(); - fusion.printKernel(); - - int x = 2, y = 4, z = 8; auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); From aa6b3783f745714d3f83027062300c7fc3c03b08 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 29 Dec 2022 11:53:20 -0500 Subject: [PATCH 05/28] Add tv1->computeAt() to BasicFP16 test This fixes that compile (in that it runs and passes), but neither of the basic tests matches the generated kernel from the pytest. --- third_party/nvfuser/test/test_gpu_match_frontend.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index e46c9752c421..8d60866e4412 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -213,6 +213,7 @@ TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { scheduler_utils::parallelizeAllLike(tv5, {tv0, tv1, tv2, tv3, tv4}); tv0->computeAt(tv5, 1, ComputeAtMode::MostInlined); + tv1->computeAt(tv5, 1, ComputeAtMode::MostInlined); int x = 2, y = 4, z = 8; auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); From 2e614f9b839a205ccbbc7865516d74077f18683f Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 29 Dec 2022 12:25:53 -0500 Subject: [PATCH 06/28] Add test: FusionFrontendCastDoubleToHalf_CUDA This test results in what looks like decent code. Again, it doesn't match the pytest code exactly. Also, I have to manually specify not only the block dimension but the grid dimension as well for this test, and I'm not 100% sure why yet. --- .../nvfuser/test/test_gpu_match_frontend.cpp | 68 +++++++++++++++++-- 1 file changed, 64 insertions(+), 4 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index 8d60866e4412..0d67e79c5cc8 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -61,7 +61,6 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { std::vector inputs = {t0}; - // Have to know TIDx at compile time? int runtime_threadIdx_dim = 128; LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); @@ -113,7 +112,6 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { std::vector inputs = {t0}; - // Have to know TIDx at compile time? int runtime_threadIdx_dim = 128; LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); @@ -171,7 +169,6 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { std::vector inputs = {t0, t1}; - // Have to know TIDx at compile time? int runtime_threadIdx_dim = 128; LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); @@ -227,7 +224,6 @@ TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { std::vector inputs = {t0, t1}; - // Have to know TIDx at compile time? 
int runtime_threadIdx_dim = 128; LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); @@ -239,6 +235,70 @@ TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { &fusion, cg_outputs, inputs, {t4}, __LINE__, __FILE__, "", lparams); } +TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2, DataType::Double); + auto tv1 = makeSymbolicTensor(2, DataType::Double); + + fusion.addInput(tv0); + fusion.addInput(tv1); + + auto tv0h = castOp(DataType::Half, tv0); + auto tv1h = castOp(DataType::Half, tv1); + auto tv0f = castOp(DataType::Float, tv0h); + auto tv1f = castOp(DataType::Float, tv1h); + auto tv2 = add(tv0f, tv1f); + auto tv3 = relu(tv2); + auto tv4 = castOp(DataType::Half, tv3); + + fusion.addOutput(tv4); + + tv4->merge(0, 1); + tv4->split(0, NamedScalar::getParallelDim(ParallelType::TIDx)); + tv4->split(0, NamedScalar::getParallelDim(ParallelType::BIDx)); + tv4->axis(1)->parallelize(ParallelType::BIDx); + tv4->axis(2)->parallelize(ParallelType::TIDx); + + // propagate the mapping to other tensors + TransformPropagatorWithCheck propagator(tv4); + MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator); + scheduler_utils::parallelizeAllLike( + tv4, {tv0, tv1, tv0h, tv1h, tv0f, tv1f, tv2, tv3}); + + tv0->computeAt(tv4, 0, ComputeAtMode::MostInlined); + tv1->computeAt(tv4, 0, ComputeAtMode::MostInlined); + + int x = 2, y = 4; + auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({x, y}, options); + at::Tensor t1 = at::randn({x, y}, options); + + auto t0h = t0.to(options.dtype(at::kHalf)); + auto t1h = t1.to(options.dtype(at::kHalf)); + + auto t2 = t0h.add(t1h); + auto t3 = t2.relu(); + auto t4 = t3.to(options); + + std::vector inputs = {t0, t1}; + + // Need to hardcode both block and grid size + int runtime_threadIdx_dim = 128; + int runtime_blockIdx_dim = 128; + LaunchParams lparams( + runtime_blockIdx_dim, -1, -1, runtime_threadIdx_dim, -1, -1); + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs, lparams); + auto cg_outputs = fe.runFusion(inputs, lparams); + + testValidate( + &fusion, cg_outputs, inputs, {t4}, __LINE__, __FILE__, "", lparams); +} + } // namespace jit } // namespace torch #endif // #if defined(USE_CUDA) From fb4fa85723837f46dc63412bb2c949e105ac0d76 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 29 Dec 2022 12:49:08 -0500 Subject: [PATCH 07/28] Remove griddim split in CastDoubleToHalf test The result is that this kernel better matches the python-generated one: it does not loop inside the kernel, and instead assumes there will be enough blocks to cover the entire input. --- .../nvfuser/test/test_gpu_match_frontend.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index 0d67e79c5cc8..8a06a68c4db7 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -22,6 +22,7 @@ namespace jit { using namespace torch::jit::fuser::cuda; +//! A very simple test computing sum(x * 3.0, dim=-1) for 2D inputs TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { int y = 4, z = 8; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -72,6 +73,8 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { &fusion, cg_outputs, inputs, {t2}, __LINE__, __FILE__, "", lparams); } +//! 
The same test as FusionFrontendSuperBasic_CUDA, but with half-precision +//! inputs and outputs TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { int y = 4, z = 8; auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); @@ -123,6 +126,7 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { &fusion, cg_outputs, inputs, {t2}, __LINE__, __FILE__, "", lparams); } +//! A simple test computing sum((x + y) * 3.0, dim=-1) for 3D inputs TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -180,6 +184,8 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { &fusion, cg_outputs, inputs, {t4}, __LINE__, __FILE__, "", lparams); } +//! The same test as FusionFrontendBasic_CUDA, but with half-precision +//! inputs and outputs TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -235,6 +241,8 @@ TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { &fusion, cg_outputs, inputs, {t4}, __LINE__, __FILE__, "", lparams); } +//! Convert double inputs to half, then do some point-wise operations and +//! output half precision TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -257,9 +265,8 @@ TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { tv4->merge(0, 1); tv4->split(0, NamedScalar::getParallelDim(ParallelType::TIDx)); - tv4->split(0, NamedScalar::getParallelDim(ParallelType::BIDx)); - tv4->axis(1)->parallelize(ParallelType::BIDx); - tv4->axis(2)->parallelize(ParallelType::TIDx); + tv4->axis(0)->parallelize(ParallelType::BIDx); + tv4->axis(1)->parallelize(ParallelType::TIDx); // propagate the mapping to other tensors TransformPropagatorWithCheck propagator(tv4); @@ -285,11 +292,8 @@ TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { std::vector inputs = {t0, t1}; - // Need to hardcode both block and grid size int runtime_threadIdx_dim = 128; - int runtime_blockIdx_dim = 128; - LaunchParams lparams( - runtime_blockIdx_dim, -1, -1, runtime_threadIdx_dim, -1, -1); + LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); FusionExecutor fe; fe.compileFusion(&fusion, inputs, lparams); From c0f211c7c2c7b386665671769813fd40189db060 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 29 Dec 2022 13:37:05 -0500 Subject: [PATCH 08/28] Add test FusionFrontendPromoteToDouble_CUDA This test runs and is very similar to the python test, but has multiple predicates instead of one outer predicate, just as the SuperBasic test does. --- .../nvfuser/test/test_gpu_match_frontend.cpp | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index 8a06a68c4db7..7bbdcbd0cfc1 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -303,6 +303,63 @@ TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { &fusion, cg_outputs, inputs, {t4}, __LINE__, __FILE__, "", lparams); } +//! Same test as FusionFrontendCastDoubleToHalf_CUDA, but with mixed inputs +//! (double and half) and without the explicit cast to half, so that +//! computation and output are all at double precision. 
+TEST_F(NVFuserTest, FusionFrontendPromoteToDouble_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0h = makeSymbolicTensor(2, DataType::Half); + auto tv1 = makeSymbolicTensor(2, DataType::Double); + + fusion.addInput(tv0h); + fusion.addInput(tv1); + + auto tv0 = castOp(DataType::Double, tv0h); + auto tv2 = add(tv0, tv1); + auto tv3 = relu(tv2); + + fusion.addOutput(tv3); + + tv3->merge(0, 1); + tv3->split(0, NamedScalar::getParallelDim(ParallelType::TIDx)); + tv3->axis(0)->parallelize(ParallelType::BIDx); + tv3->axis(1)->parallelize(ParallelType::TIDx); + + // propagate the mapping to other tensors + TransformPropagatorWithCheck propagator(tv3); + MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); + scheduler_utils::parallelizeAllLike(tv3, {tv0h, tv0, tv1, tv2}); + + tv0->computeAt(tv3, 0, ComputeAtMode::MostInlined); + tv1->computeAt(tv3, 0, ComputeAtMode::MostInlined); + + int x = 2, y = 4; + auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0); + + at::Tensor t0h = at::randn({x, y}, options.dtype(at::kHalf)); + at::Tensor t1 = at::randn({x, y}, options); + + auto t0 = t0h.to(options.dtype(at::kDouble)); + + auto t2 = t0.add(t1); + auto t3 = t2.relu(); + + std::vector inputs = {t0h, t1}; + + // Need to hardcode both block and grid size + int runtime_threadIdx_dim = 128; + LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs, lparams); + auto cg_outputs = fe.runFusion(inputs, lparams); + + testValidate( + &fusion, cg_outputs, inputs, {t3}, __LINE__, __FILE__, "", lparams); +} + } // namespace jit } // namespace torch #endif // #if defined(USE_CUDA) From 917556831d92500d5c3d07f0739db09a710f2eb8 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 3 Jan 2023 11:15:30 -0500 Subject: [PATCH 09/28] Replace computeAt(..MostInlined) with inlineMost() --- .../nvfuser/test/test_gpu_match_frontend.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index 7bbdcbd0cfc1..78d930c9d468 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -55,7 +56,7 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); scheduler_utils::parallelizeAllLike(tv3, {tv0, tv1}); - tv0->computeAt(tv3, 1, ComputeAtMode::MostInlined); + inlineMost(); auto t1 = t0 * 3.0; auto t2 = t1.sum({-1}, false); @@ -108,7 +109,7 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); scheduler_utils::parallelizeAllLike(tv3, {tv0, tv1, tv2}); - tv0->computeAt(tv3, 1, ComputeAtMode::MostInlined); + inlineMost(); auto t1 = t0 * 3.0; auto t2 = t1.sum({-1}, false, c10::kFloat); @@ -159,7 +160,7 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator); scheduler_utils::parallelizeAllLike(tv5, {tv0, tv1, tv2, tv3}); - tv0->computeAt(tv5, 1, ComputeAtMode::MostInlined); + inlineMost(); int x = 2, y = 4, z = 8; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -215,8 +216,7 @@ TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator); scheduler_utils::parallelizeAllLike(tv5, {tv0, tv1, tv2, tv3, tv4}); - 
tv0->computeAt(tv5, 1, ComputeAtMode::MostInlined); - tv1->computeAt(tv5, 1, ComputeAtMode::MostInlined); + inlineMost(); int x = 2, y = 4, z = 8; auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); @@ -274,8 +274,7 @@ TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { scheduler_utils::parallelizeAllLike( tv4, {tv0, tv1, tv0h, tv1h, tv0f, tv1f, tv2, tv3}); - tv0->computeAt(tv4, 0, ComputeAtMode::MostInlined); - tv1->computeAt(tv4, 0, ComputeAtMode::MostInlined); + inlineMost(); int x = 2, y = 4; auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0); @@ -332,8 +331,7 @@ TEST_F(NVFuserTest, FusionFrontendPromoteToDouble_CUDA) { MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); scheduler_utils::parallelizeAllLike(tv3, {tv0h, tv0, tv1, tv2}); - tv0->computeAt(tv3, 0, ComputeAtMode::MostInlined); - tv1->computeAt(tv3, 0, ComputeAtMode::MostInlined); + inlineMost(); int x = 2, y = 4; auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0); From 5c43cccfd3699f3d0b31b0ee24c2dd9fba0e1312 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 3 Jan 2023 13:28:27 -0500 Subject: [PATCH 10/28] Add stream argument to printMath, add ir_math_check The ir_math_check() function TORCH_CHECKs whether two fusions produce the same ir math string. This is useful for targeting autogenerated fusions. When translating Python frontend tests, I've first manually scheduled until the fusion compiles and runs, then attempted to match the automatically scheduled kernels. This commit makes the second stage simpler. However, if changes are made to the automatic scheduler, then the manual scheduling in these tests would need to be updated. --- third_party/nvfuser/csrc/fusion.cpp | 18 +++++---- third_party/nvfuser/csrc/fusion.h | 5 ++- .../nvfuser/test/test_gpu_match_frontend.cpp | 40 ++++++++++++++++++- 3 files changed, 52 insertions(+), 11 deletions(-) diff --git a/third_party/nvfuser/csrc/fusion.cpp b/third_party/nvfuser/csrc/fusion.cpp index 31bb763ac559..48178f2a20f3 100644 --- a/third_party/nvfuser/csrc/fusion.cpp +++ b/third_party/nvfuser/csrc/fusion.cpp @@ -13,6 +13,8 @@ #include #include +#include + namespace torch { namespace jit { namespace fuser { @@ -380,19 +382,19 @@ std::unordered_map> Fusion::bankConflictInfo( return result; } -void Fusion::printMath(bool from_outputs_only) { +void Fusion::printMath(bool from_outputs_only, std::ostream& stream) { FUSER_PERF_SCOPE("Fusion::printMath"); FusionGuard fg(this); auto exprs_for_print = exprs(); - std::cout << "Inputs:" << std::endl; + stream << "Inputs:" << std::endl; for (auto inp : inputs()) { - std::cout << " " << inp << ", " << inp->getDataType().value() << std::endl; + stream << " " << inp << ", " << inp->getDataType().value() << std::endl; } - std::cout << "Outputs:" << std::endl; + stream << "Outputs:" << std::endl; for (auto out : outputs()) { - std::cout << " " << out << ", " << out->getDataType().value() << std::endl; + stream << " " << out << ", " << out->getDataType().value() << std::endl; } // If we want everything in the fusion, grab all values without uses to @@ -407,11 +409,11 @@ void Fusion::printMath(bool from_outputs_only) { exprs_for_print = StmtSort::getExprs(this, leaf_vals); } - std::cout << "\n%kernel_math {\n"; + stream << "\n%kernel_math {\n"; for (auto expr : exprs_for_print) { - std::cout << expr; + stream << expr; } - std::cout << "}\n\n"; + stream << "}\n\n"; } std::vector Fusion::inputsAndCreated() { diff --git a/third_party/nvfuser/csrc/fusion.h 
b/third_party/nvfuser/csrc/fusion.h index d8cef33fda0d..971d9c1553e5 100644 --- a/third_party/nvfuser/csrc/fusion.h +++ b/third_party/nvfuser/csrc/fusion.h @@ -128,7 +128,10 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer { //! Print Arith exprs //! \param from_outputs_only Only print exprs reachable from outputs - void printMath(bool from_outputs_only = true); + //! \param stream Where to print output (defaults to std::cout) + void printMath( + bool from_outputs_only = true, + std::ostream& stream = std::cout); //! Print transformations used in fusion (can be very verbose) void printTransforms(); diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index 78d930c9d468..2e2c7ec2adf9 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -1,3 +1,7 @@ +//! These tests replicate those that appear in test/test_nvfuser_frontend.py +//! In this file, we manually schedule each fusion, and compare that to the +//! automatic scheduling that occurs in the python test. + #if defined(USE_CUDA) #include #include @@ -17,19 +21,41 @@ #include +#include + // Tests go in torch::jit namespace torch { namespace jit { using namespace torch::jit::fuser::cuda; +//! Compare fusions by printing the Math of each to string then doing a strcmp +void ir_math_check(Fusion& factual, Fusion& fexpected) { + std::ostringstream sactual, sexpected; + + factual.printMath(true, sactual); + fexpected.printMath(true, sexpected); + + if (sactual.str() != sexpected.str()) { + std::cerr << "========= EXPECTED ==========" << std::endl; + std::cerr << sexpected.str() << std::endl; + std::cerr << "========= ACTUAL ==========" << std::endl; + std::cerr << sactual.str() << std::endl; + TORCH_INTERNAL_ASSERT(false, "Fusion IR math does not match expected"); + } +} + //! A very simple test computing sum(x * 3.0, dim=-1) for 2D inputs TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { + // Create inputs int y = 4, z = 8; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({y, z}, options); + std::vector inputs = {t0}; + + // Define fusion Fusion fusion; FusionGuard fg(&fusion); @@ -43,6 +69,14 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { fusion.addOutput(tv2); + // Run automatic scheduler + auto fauto = Fusion(fusion); // unique_ptr to copy of fusion + auto reduction_params = getReductionHeuristics(&fauto, inputs); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + scheduleReduction(&fauto, *reduction_params); + + // Perform manual scheduling + // {i0, r1} tv2->split( 1, @@ -58,11 +92,13 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { inlineMost(); + ir_math_check(fusion, fauto); + + // Perform manual computation and verify + auto t1 = t0 * 3.0; auto t2 = t1.sum({-1}, false); - std::vector inputs = {t0}; - int runtime_threadIdx_dim = 128; LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); From 6d3d34f0ffc5159936c29fe240dafe65584c1fb3 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 3 Jan 2023 14:37:35 -0500 Subject: [PATCH 11/28] Make SuperBasic manual schedule match automatic This makes the test pass now that I'm checking that the auto and manual IRs match. 
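The auto-vs-manual check follows the pattern introduced in the previous
commit (sketched below; `inputs` is the test's input vector):

```cpp
// Copy the unscheduled fusion and let the reduction scheduler handle the
// copy, then schedule the original by hand and compare the two IRs.
auto fauto = Fusion(fusion);
auto reduction_params = getReductionHeuristics(&fauto, inputs);
TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
scheduleReduction(&fauto, *reduction_params);

// ... manual scheduling of `fusion` goes here ...

ir_math_check(fusion, fauto);
```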
--- .../nvfuser/test/test_gpu_match_frontend.cpp | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index 2e2c7ec2adf9..bf46021f19e9 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -77,18 +77,22 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { // Perform manual scheduling - // {i0, r1} - tv2->split( - 1, - NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0, r1 / TIDx, TIDx} - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - auto tv3 = tv2->rFactor({1}); + tv2->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + tv2->split(1, 1); + tv2->reorder({{-1, -2}, {-2, -1}}); + + auto tv3 = tv2->rFactor({1, 3}); + + tv3->axis(0)->parallelize(ParallelType::BIDx); + tv3->axis(2)->parallelize(ParallelType::TIDx); + tv3->axis(3)->parallelize(ParallelType::Unswitch); // propagate the mapping to other tensors TransformPropagatorWithCheck propagator(tv3); MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); - scheduler_utils::parallelizeAllLike(tv3, {tv0, tv1}); + scheduler_utils::parallelizeAllLike(tv3, {tv0, tv1, tv2}); + + tv1->computeAt(tv3, -1); inlineMost(); @@ -112,7 +116,8 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { //! The same test as FusionFrontendSuperBasic_CUDA, but with half-precision //! inputs and outputs -TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { +// TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { +void foo() { int y = 4, z = 8; auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); From b1c51c9b36a22e5342ecb3f3f452e35131c5cdae Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 3 Jan 2023 15:37:49 -0500 Subject: [PATCH 12/28] Enable dumping kernel to string, fix FP16 SuperBasic test Currently, `ir_kernel_check` is commented out since it is failing with trivial variable number bumps for the SuperBasic{,FP16} tests. It is low-priority, but at some point I'd like to understand why the equivalent kernel math leads to slightly different (but still equivalent) kernels in these cases. For now, I will move on to the other tests. --- third_party/nvfuser/csrc/fusion.cpp | 4 +- third_party/nvfuser/csrc/fusion.h | 4 +- .../nvfuser/test/test_gpu_match_frontend.cpp | 75 +++++++++++++++---- 3 files changed, 64 insertions(+), 19 deletions(-) diff --git a/third_party/nvfuser/csrc/fusion.cpp b/third_party/nvfuser/csrc/fusion.cpp index 48178f2a20f3..0de4b4da1410 100644 --- a/third_party/nvfuser/csrc/fusion.cpp +++ b/third_party/nvfuser/csrc/fusion.cpp @@ -359,13 +359,13 @@ void Fusion::print() { std::cout << "}\n\n"; } -void Fusion::printKernel(DataType index_type) { +void Fusion::printKernel(DataType index_type, std::ostream& stream) { FUSER_PERF_SCOPE("Fusion::printKernel"); TORCH_INTERNAL_ASSERT( !this->isA(), "Cannot \"print kernel\" of a kernel container. 
", "This would require lowering during lowering."); - std::cout << codegen::generateCudaKernel(GpuLower(this, index_type).kernel()); + stream << codegen::generateCudaKernel(GpuLower(this, index_type).kernel()); } std::unordered_map> Fusion::bankConflictInfo( diff --git a/third_party/nvfuser/csrc/fusion.h b/third_party/nvfuser/csrc/fusion.h index 971d9c1553e5..f995e8ee4ee0 100644 --- a/third_party/nvfuser/csrc/fusion.h +++ b/third_party/nvfuser/csrc/fusion.h @@ -137,7 +137,9 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer { void printTransforms(); //! Lower the fusion and print a kernel - void printKernel(DataType index_type = DataType::Int); + void printKernel( + DataType index_type = DataType::Int, + std::ostream& stream = std::cout); //! Returns if this fusion is noop, for example, trivially forwarding inputs, //! or all outputs are size-0 tensors, etc. diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index bf46021f19e9..670c440c9d2c 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -29,7 +29,8 @@ namespace jit { using namespace torch::jit::fuser::cuda; -//! Compare fusions by printing the Math of each to string then doing a strcmp +//! Compare fusions by printing the IR math of each to string then doing a +//! strcmp void ir_math_check(Fusion& factual, Fusion& fexpected) { std::ostringstream sactual, sexpected; @@ -45,6 +46,24 @@ void ir_math_check(Fusion& factual, Fusion& fexpected) { } } +//! Compare fusions by printing the generated CUDA kernel of each to string +//! then doing a strcmp +void ir_kernel_check(Fusion& factual, Fusion& fexpected) { + std::ostringstream sactual, sexpected; + + factual.printKernel(DataType::Int, sactual); + fexpected.printKernel(DataType::Int, sexpected); + + if (sactual.str() != sexpected.str()) { + std::cerr << "========= EXPECTED ==========" << std::endl; + std::cerr << sexpected.str() << std::endl; + std::cerr << "========= ACTUAL ==========" << std::endl; + std::cerr << sactual.str() << std::endl; + TORCH_INTERNAL_ASSERT( + false, "Generated CUDA kernel does not match expected"); + } +} + //! A very simple test computing sum(x * 3.0, dim=-1) for 2D inputs TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { // Create inputs @@ -76,7 +95,6 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { scheduleReduction(&fauto, *reduction_params); // Perform manual scheduling - tv2->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); tv2->split(1, 1); tv2->reorder({{-1, -2}, {-2, -1}}); @@ -97,9 +115,11 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { inlineMost(); ir_math_check(fusion, fauto); + // CUDA kernel is equivalent, but automatic scheduling uses i23 instead of + // i22 for the name of the index variable in the loop (rFactor, see tv3) + // ir_kernel_check(fusion, fauto); // Perform manual computation and verify - auto t1 = t0 * 3.0; auto t2 = t1.sum({-1}, false); @@ -116,13 +136,16 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { //! The same test as FusionFrontendSuperBasic_CUDA, but with half-precision //! 
inputs and outputs -// TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { -void foo() { +TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { + // Create inputs int y = 4, z = 8; auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); at::Tensor t0 = at::randn({y, z}, options); + std::vector inputs = {t0}; + + // Define fusion Fusion fusion; FusionGuard fg(&fusion); @@ -131,32 +154,52 @@ void foo() { fusion.addInput(tv0); - auto tv1 = mul(tv0, c0); // {i0, i1} + // Note: A manual schedule will run without an explicit cast here, producing + // an _implicit_ cast to float. That float tensor will not be explicitly + // parallelized to match, but will result + auto tv0float = castOp(DataType::Float, tv0); + + auto tv1 = mul(tv0float, c0); // {i0, i1} auto tv2float = sum(tv1, {-1}, false, DataType::Float); // {i0, r1} auto tv2 = castOp(DataType::Half, tv2float); fusion.addOutput(tv2); - // {i0, r1} - tv2float->split( - 1, - NamedScalar::getParallelDim(ParallelType::TIDx)); // {i0, r1 / TIDx, TIDx} - tv2float->axis(0)->parallelize(ParallelType::BIDx); - tv2float->axis(-1)->parallelize(ParallelType::TIDx); - auto tv3 = tv2float->rFactor({1}); + // Run automatic scheduler + auto fauto = Fusion(fusion); // unique_ptr to copy of fusion + auto reduction_params = getReductionHeuristics(&fauto, inputs); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + scheduleReduction(&fauto, *reduction_params); + + // Perform manual scheduling + tv2float->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + tv2float->split(1, 1); + tv2float->reorder({{-1, -2}, {-2, -1}}); + + auto tv3 = tv2float->rFactor({1, 3}); + + tv3->axis(0)->parallelize(ParallelType::BIDx); + tv3->axis(2)->parallelize(ParallelType::TIDx); + tv3->axis(3)->parallelize(ParallelType::Unswitch); // propagate the mapping to other tensors TransformPropagatorWithCheck propagator(tv3); MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); - scheduler_utils::parallelizeAllLike(tv3, {tv0, tv1, tv2}); + scheduler_utils::parallelizeAllLike(tv3, {tv0, tv0float, tv1, tv2, tv2float}); + + tv1->computeAt(tv3, -1); inlineMost(); + ir_math_check(fusion, fauto); + // CUDA kernel is equivalent, but automatic scheduling uses i31 instead of + // i30 for the name of the index variable in the loop (rFactor, see tv3) + // ir_kernel_check(fusion, fauto); + + // Perform manual computation and verify auto t1 = t0 * 3.0; auto t2 = t1.sum({-1}, false, c10::kFloat); - std::vector inputs = {t0}; - int runtime_threadIdx_dim = 128; LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); From 5ffd5a596f25f11b2c624576d3c9d6adc243f94d Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 4 Jan 2023 10:55:24 -0500 Subject: [PATCH 13/28] Add python defs to docstrings of frontend tests --- .../nvfuser/test/test_gpu_match_frontend.cpp | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index 670c440c9d2c..9f4d37f1a14d 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -65,6 +65,16 @@ void ir_kernel_check(Fusion& factual, Fusion& fexpected) { } //! A very simple test computing sum(x * 3.0, dim=-1) for 2D inputs +//! ```python +//! def fusion_func(fd: FusionDefinition): +//! t0 = fd.define_tensor(2) +//! c0 = fd.define_constant(3.0) +//! +//! t1 = fd.ops.mul(t0, c0) +//! 
t2 = fd.ops.sum(t1, [-1], False, DataType.Float) +//! +//! fd.add_output(t2) +//! ``` TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { // Create inputs int y = 4, z = 8; @@ -136,6 +146,17 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { //! The same test as FusionFrontendSuperBasic_CUDA, but with half-precision //! inputs and outputs +//! ```python +//! def fusion_func(fd: FusionDefinition): +//! t0 = fd.define_tensor(2, DataType.Half) +//! c0 = fd.define_constant(3.0) +//! +//! t1 = fd.ops.mul(t0, c0) +//! t2 = fd.ops.sum(t1, [-1], False, DataType.Float) +//! +//! t3 = fd.ops.cast(t2, DataType.Half) +//! fd.add_output(t3) +//! ``` TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { // Create inputs int y = 4, z = 8; @@ -212,6 +233,19 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { } //! A simple test computing sum((x + y) * 3.0, dim=-1) for 3D inputs +//! A simple test computing (x + y) for 3D inputs +//! ```python +//! def fusion_func(fd: FusionDefinition) : +//! t0 = fd.define_tensor(3) +//! t1 = fd.define_tensor(3) +//! c0 = fd.define_constant(3.0) +//! +//! t2 = fd.ops.add(t0, t1) +//! t3 = fd.ops.mul(t2, c0) +//! t4 = fd.ops.sum(t3, [-1], False, DataType.Float) +//! +//! fd.add_output(t4) +//! ``` TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -271,6 +305,19 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { //! The same test as FusionFrontendBasic_CUDA, but with half-precision //! inputs and outputs +//! ```python +//! def fusion_func(fd: FusionDefinition) : +//! t0 = fd.define_tensor(3, DataType.Half) +//! t1 = fd.define_tensor(3, DataType.Half) +//! c0 = fd.define_constant(3.0) +//! +//! t2 = fd.ops.add(t0, t1) +//! t3 = fd.ops.mul(t2, c0) +//! t4 = fd.ops.sum(t3, [-1], False, DataType.Float) +//! +//! t5 = fd.ops.cast(t4, DataType.Half) +//! fd.add_output(t5) +//! ``` TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -327,6 +374,19 @@ TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { //! Convert double inputs to half, then do some point-wise operations and //! output half precision +//! ```python +//! def fusion_func(fd: FusionDefinition) : +//! t0 = fd.define_tensor(2, DataType.Double) +//! t1 = fd.define_tensor(2, DataType.Double) +//! +//! t0h = fd.ops.cast(t0, DataType.Half) +//! t1h = fd.ops.cast(t1, DataType.Half) +//! t2 = fd.ops.add(t0h, t1h) +//! t3 = fd.ops.relu(t2) +//! t4 = fd.ops.cast(t3, DataType.Half) +//! +//! fd.add_output(t4) +//! ``` TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -389,6 +449,16 @@ TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { //! Same test as FusionFrontendCastDoubleToHalf_CUDA, but with mixed inputs //! (double and half) and without the explicit cast to half, so that //! computation and output are all at double precision. +//! ```python +//! def fusion_func(fd: FusionDefinition) : +//! t0 = fd.define_tensor(2, DataType.Half) +//! t1 = fd.define_tensor(2, DataType.Double) +//! +//! t2 = fd.ops.add(t0, t1) +//! t5 = fd.ops.relu(t2) +//! +//! fd.add_output(t5) +//! ``` TEST_F(NVFuserTest, FusionFrontendPromoteToDouble_CUDA) { Fusion fusion; FusionGuard fg(&fusion); From cd5fc9f8fba587e774b388bdd634c86854e347e1 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 4 Jan 2023 11:49:28 -0500 Subject: [PATCH 14/28] Add stream arg to Fusion::print{,Transforms}() This is mainly so that I can use printTransforms to compare IRs. 
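For example, the transforms of two fusions can now be captured and diffed as
strings (a sketch of the intended use; needs <sstream>):

```cpp
std::ostringstream sactual, sexpected;
factual.printTransforms(sactual);
fexpected.printTransforms(sexpected);
TORCH_INTERNAL_ASSERT(
    sactual.str() == sexpected.str(),
    "Generated transforms do not match expected");
```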
--- third_party/nvfuser/csrc/fusion.cpp | 16 ++++++++-------- third_party/nvfuser/csrc/fusion.h | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/third_party/nvfuser/csrc/fusion.cpp b/third_party/nvfuser/csrc/fusion.cpp index 0de4b4da1410..901e729f6cfc 100644 --- a/third_party/nvfuser/csrc/fusion.cpp +++ b/third_party/nvfuser/csrc/fusion.cpp @@ -346,17 +346,17 @@ void Fusion::validateInputs() { } } -void Fusion::print() { +void Fusion::print(std::ostream& stream) { FUSER_PERF_SCOPE("Fusion::print"); FusionGuard fg(this); - std::cout << "\n%kernel {\n"; - IrMathPrinter op_exprs(std::cout); + stream << "\n%kernel {\n"; + IrMathPrinter op_exprs(stream); op_exprs.handle(this); - std::cout << "\nTransformPrinter : \n"; - IrTransformPrinter t_exprs(std::cout); + stream << "\nTransformPrinter : \n"; + IrTransformPrinter t_exprs(stream); t_exprs.handle(this); - std::cout << "}\n\n"; + stream << "}\n\n"; } void Fusion::printKernel(DataType index_type, std::ostream& stream) { @@ -429,11 +429,11 @@ std::vector Fusion::inputsAndCreated() { return result; } -void Fusion::printTransforms() { +void Fusion::printTransforms(std::ostream& stream) { FUSER_PERF_SCOPE("Fusion::printTransforms"); FusionGuard fg(this); - IrTransformPrinter t_exprs(std::cout); + IrTransformPrinter t_exprs(stream); t_exprs.handle(this); } diff --git a/third_party/nvfuser/csrc/fusion.h b/third_party/nvfuser/csrc/fusion.h index f995e8ee4ee0..3912048c1e11 100644 --- a/third_party/nvfuser/csrc/fusion.h +++ b/third_party/nvfuser/csrc/fusion.h @@ -124,7 +124,7 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer { void validateInputs(); //! Print this fusion to the console - void print(); + void print(std::ostream& stream = std::cout); //! Print Arith exprs //! \param from_outputs_only Only print exprs reachable from outputs @@ -134,7 +134,7 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer { std::ostream& stream = std::cout); //! Print transformations used in fusion (can be very verbose) - void printTransforms(); + void printTransforms(std::ostream& stream = std::cout); //! Lower the fusion and print a kernel void printKernel( From e041b9fce089c111128597aa112acdd877225326 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 4 Jan 2023 11:51:40 -0500 Subject: [PATCH 15/28] Add compare_ir() which compares math, fusions & kernels --- .../nvfuser/test/test_gpu_match_frontend.cpp | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index 9f4d37f1a14d..742b2dc33270 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -31,7 +31,7 @@ using namespace torch::jit::fuser::cuda; //! Compare fusions by printing the IR math of each to string then doing a //! strcmp -void ir_math_check(Fusion& factual, Fusion& fexpected) { +void compare_ir_math(Fusion& factual, Fusion& fexpected) { std::ostringstream sactual, sexpected; factual.printMath(true, sactual); @@ -46,9 +46,26 @@ void ir_math_check(Fusion& factual, Fusion& fexpected) { } } +//! Compare fusions by printing the IR transforms of each to string then doing +//! 
a strcmp +void compare_transforms(Fusion& factual, Fusion& fexpected) { + std::ostringstream sactual, sexpected; + + factual.printTransforms(sactual); + fexpected.printTransforms(sexpected); + + if (sactual.str() != sexpected.str()) { + std::cerr << "========= EXPECTED ==========" << std::endl; + std::cerr << sexpected.str() << std::endl; + std::cerr << "========= ACTUAL ==========" << std::endl; + std::cerr << sactual.str() << std::endl; + TORCH_INTERNAL_ASSERT(false, "Generated transforms do not match expected"); + } +} + //! Compare fusions by printing the generated CUDA kernel of each to string //! then doing a strcmp -void ir_kernel_check(Fusion& factual, Fusion& fexpected) { +void compare_kernels(Fusion& factual, Fusion& fexpected) { std::ostringstream sactual, sexpected; factual.printKernel(DataType::Int, sactual); @@ -64,6 +81,12 @@ void ir_kernel_check(Fusion& factual, Fusion& fexpected) { } } +void compare_ir(Fusion& factual, Fusion& fexpected) { + compare_ir_math(factual, fexpected); + compare_transforms(factual, fexpected); + compare_kernels(factual, fexpected); +} + //! A very simple test computing sum(x * 3.0, dim=-1) for 2D inputs //! ```python //! def fusion_func(fd: FusionDefinition): @@ -124,10 +147,9 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { inlineMost(); - ir_math_check(fusion, fauto); // CUDA kernel is equivalent, but automatic scheduling uses i23 instead of // i22 for the name of the index variable in the loop (rFactor, see tv3) - // ir_kernel_check(fusion, fauto); + compare_ir(fusion, fauto); // Perform manual computation and verify auto t1 = t0 * 3.0; @@ -212,10 +234,9 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { inlineMost(); - ir_math_check(fusion, fauto); // CUDA kernel is equivalent, but automatic scheduling uses i31 instead of // i30 for the name of the index variable in the loop (rFactor, see tv3) - // ir_kernel_check(fusion, fauto); + compare_ir(fusion, fauto); // Perform manual computation and verify auto t1 = t0 * 3.0; From acbdbc962ebea1b37aee9d061a50ea74dd1a19eb Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 4 Jan 2023 12:07:53 -0500 Subject: [PATCH 16/28] Add pointwise Add() example to frontend and C++ tests --- test/test_nvfuser_frontend.py | 102 +++++++++++++++++- .../nvfuser/test/test_gpu_match_frontend.cpp | 83 ++++++++++++++ 2 files changed, 181 insertions(+), 4 deletions(-) diff --git a/test/test_nvfuser_frontend.py b/test/test_nvfuser_frontend.py index b34ac484ea40..4872f86b54db 100644 --- a/test/test_nvfuser_frontend.py +++ b/test/test_nvfuser_frontend.py @@ -61,13 +61,107 @@ def exec_nvfuser(self, fusion_func, inputs, new_fusion_expected=True) : self.assertEqual(fc.num_fusions() - before_fusions, int(new_fusion_expected)) return out, fs - def test_basic(self) : + def test_add(self): inputs = [ - torch.ones(2, 4, 8, device='cuda'), - torch.ones(2, 4, 8, device='cuda'), + torch.ones(2, 4, 8, device="cuda"), + torch.ones(2, 4, 8, device="cuda"), ] - def fusion_func(fd: FusionDefinition) : + def fusion_func(fd: FusionDefinition): + t0 = fd.define_tensor(3) + t1 = fd.define_tensor(3) + + t2 = fd.ops.add(t0, t1) + + fd.add_output(t2) + + # Expected Output is a tensor of 2's + nvf_out1, _ = self.exec_nvfuser(fusion_func, inputs) + + # Create a new fusion with the same definition, it should hit the cache! + nvf_out2, fs2 = self.exec_nvfuser( + fusion_func, inputs, new_fusion_expected=False + ) + + # Create a fusion from a fusion id and make sure it executes! 
+        fs3 = Fusion(fs2.id())
+        nvf_out3 = fs3.execute(inputs)[0]
+
+        eager_out = inputs[0] + inputs[1]
+        self.assertEqual(eager_out, nvf_out1)
+        self.assertEqual(eager_out, nvf_out2)
+        self.assertEqual(eager_out, nvf_out3)
+
+    def test_super_basic(self):
+        inputs = [
+            torch.ones(4, 8, device="cuda"),
+        ]
+
+        def fusion_func(fd: FusionDefinition):
+            t0 = fd.define_tensor(2)
+            c0 = fd.define_constant(3.0)
+
+            t1 = fd.ops.mul(t0, c0)
+            t2 = fd.ops.sum(t1, [-1], False, DataType.Float)
+
+            fd.add_output(t2)
+
+        # Expected Output is a tensor of 24's
+        nvf_out1, _ = self.exec_nvfuser(fusion_func, inputs)
+
+        # Create a new fusion with the same definition, it should hit the cache!
+        nvf_out2, fs2 = self.exec_nvfuser(
+            fusion_func, inputs, new_fusion_expected=False
+        )
+
+        # Create a fusion from a fusion id and make sure it executes!
+        fs3 = Fusion(fs2.id())
+        nvf_out3 = fs3.execute(inputs)[0]
+
+        eager_out = torch.sum(inputs[0] * 3.0, dim=-1)
+        self.assertEqual(eager_out, nvf_out1)
+        self.assertEqual(eager_out, nvf_out2)
+        self.assertEqual(eager_out, nvf_out3)
+
+    def test_super_basic_fp16(self):
+        inputs = [
+            torch.ones(4, 8, device="cuda", dtype=torch.float16),
+        ]
+
+        def fusion_func(fd: FusionDefinition):
+            t0 = fd.define_tensor(2, DataType.Half)
+            c0 = fd.define_constant(3.0)
+
+            t1 = fd.ops.mul(t0, c0)
+            t2 = fd.ops.sum(t1, [-1], False, DataType.Float)
+
+            t3 = fd.ops.cast(t2, DataType.Half)
+            fd.add_output(t3)
+
+        # Expected Output is a tensor of 24's
+        nvf_out1, _ = self.exec_nvfuser(fusion_func, inputs)
+
+        # Create a new fusion with the same definition, it should hit the cache!
+        nvf_out2, fs2 = self.exec_nvfuser(
+            fusion_func, inputs, new_fusion_expected=False
+        )
+
+        # Create a fusion from a fusion id and make sure it executes!
+        fs3 = Fusion(fs2.id())
+        nvf_out3 = fs3.execute(inputs)[0]
+
+        eager_out = torch.sum(inputs[0] * 3.0, dim=-1)
+        self.assertEqual(eager_out, nvf_out1)
+        self.assertEqual(eager_out, nvf_out2)
+        self.assertEqual(eager_out, nvf_out3)
+
+    def test_basic(self):
+        inputs = [
+            torch.ones(2, 4, 8, device="cuda"),
+            torch.ones(2, 4, 8, device="cuda"),
+        ]
+
+        def fusion_func(fd: FusionDefinition):
+            t0 = fd.define_tensor(3)
+            t1 = fd.define_tensor(3)
             c0 = fd.define_constant(3.0)
diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp
index 742b2dc33270..f867cd021ccc 100644
--- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp
+++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp
@@ -87,6 +87,89 @@ void compare_ir(Fusion& factual, Fusion& fexpected) {
   compare_kernels(factual, fexpected);
 }
 
+//! A simple point-wise test computing (x + y) for 3D inputs
+//! ```python
+//! def fusion_func(fd: FusionDefinition):
+//!     t0 = fd.define_tensor(3)
+//!     t1 = fd.define_tensor(3)
+//!
+//!     t2 = fd.ops.add(t0, t1)
+//!
+//!     fd.add_output(t2)
+//!
``` +TEST_F(NVFuserTest, FusionFrontendAdd_CUDA) { + // Create inputs + + int x = 2, y = 4, z = 8; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({x, y, z}, options); + at::Tensor t1 = at::randn({x, y, z}, options); + + std::vector inputs = {t0, t1}; + + // Define fusion + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(3); + auto tv1 = makeSymbolicTensor(3); + + fusion.addInput(tv0); + fusion.addInput(tv1); + + auto tv2 = add(tv0, tv1); + // auto tv4 = sum(tv2, {-1}, false, DataType::Float); + + fusion.addOutput(tv2); + + // Run automatic scheduler + auto fauto = Fusion(fusion); // unique_ptr to copy of fusion + auto pointwise_params = getPointwiseHeuristics(&fauto, inputs); + TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!"); + schedulePointwise(&fauto, *pointwise_params); + + // Perform manual scheduling + auto tv0p = tv0->cacheAfter(); + auto tv1p = tv1->cacheAfter(); + tv2->merge(1, 2); + tv2->merge(0, 1); + tv2->split(0, 128); + tv2->split(0, 1); + tv2->split(0, 1); + auto tv2l = tv2->cacheBefore(); + tv2->axis(0)->parallelize(ParallelType::BIDx); + tv2->axis(1)->parallelize(ParallelType::Unswitch); + tv2->axis(3)->parallelize(ParallelType::TIDx); + + inlineMost(); + tv0p->computeAt(tv2, 2); + tv1p->computeAt(tv2, 2); + + TransformPropagatorWithCheck propagator(tv2); + MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator); + scheduler_utils::parallelizeAllLike(tv2, {tv0, tv1, tv0p, tv1p, tv2l}); + + // Pointwise scheduler does not use inlineMost(), as reduction scheduler does + // inlineMost(); + + compare_ir(fusion, fauto); + + // Perform manual computation and verify + auto t2 = t0.add(t1); + // auto t4 = t2.sum({-1}, false); + + int runtime_threadIdx_dim = 128; + LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs, lparams); + auto cg_outputs = fe.runFusion(inputs, lparams); + + testValidate( + &fusion, cg_outputs, inputs, {t2}, __LINE__, __FILE__, "", lparams); +} + //! A very simple test computing sum(x * 3.0, dim=-1) for 2D inputs //! ```python //! def fusion_func(fd: FusionDefinition): From 30ed719b2f7b0b828085c9661ca3405fc106b93f Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 4 Jan 2023 12:19:14 -0500 Subject: [PATCH 17/28] Add FusionFrontendImplicitBroadcastInput_CUDA test This test required using pointwiseSchedule. Note that it does not currently pass, like many of the others, due to my misunderstanding of ca_pos. --- .../nvfuser/test/test_gpu_match_frontend.cpp | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index f867cd021ccc..f61d7dd80cd0 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -616,6 +616,93 @@ TEST_F(NVFuserTest, FusionFrontendPromoteToDouble_CUDA) { &fusion, cg_outputs, inputs, {t3}, __LINE__, __FILE__, "", lparams); } +//! Test broadcasting one input then adding another +//! ```python +//! def fusion_func(fd: FusionDefinition) : +//! t0 = fd.define_tensor(1) +//! t1 = fd.define_tensor(3) +//! +//! t0_b = fd.ops.broadcast_in_dim(t0, [2, 3, 4], [1]) +//! t2 = fd.ops.add(t0_b, t1) +//! +//! fd.add_output(t2) +//! 
``` +TEST_F(NVFuserTest, FusionFrontendImplicitBroadcastInput_CUDA) { + // Create inputs + int w = 3, x = 2, y = 3, z = 4; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({w}, options); + at::Tensor t1 = at::randn({x, y, z}, options); + + std::vector inputs = {t0, t1}; + + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(1); + auto tv1 = makeSymbolicTensor(3); + + fusion.addInput(tv0); + fusion.addInput(tv1); + + // explicitly tell tv0 to broadcast along new first and last dimensions + auto tv0_b = broadcast(tv0, {true, false, true}); + auto tv0_e = expand( + tv0_b, + {tv1->axis(0)->extent(), tv1->axis(1)->extent(), tv1->axis(2)->extent()}); + auto tv2 = add(tv0_e, tv1); + + fusion.addOutput(tv2); + + // Run automatic scheduler + auto fauto = Fusion(fusion); // unique_ptr to copy of fusion + auto pointwise_params = getPointwiseHeuristics(&fauto, inputs); + TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!"); + schedulePointwise(&fauto, *pointwise_params); + + // Perform manual scheduling + + auto tv0l = tv0->cacheAfter(); + auto tv1l = tv1->cacheAfter(); + auto tv2l = tv2->cacheBefore(); + tv2->merge(1, 2); + tv2->merge(0, 1); + tv2->split(0, 128); + tv2->split(0, 1); + tv2->split(0, 1); + tv2->axis(0)->parallelize(ParallelType::BIDx); + tv2->axis(1)->parallelize(ParallelType::Unswitch); + tv2->axis(3)->parallelize(ParallelType::TIDx); + + TransformPropagatorWithCheck propagator(tv2); + MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator); + scheduler_utils::parallelizeAllLike( + tv2, {tv0_b, tv0_e, tv0, tv1, tv0l, tv1l, tv2l}); + + inlineMost(); + tv0->computeAt(tv0l, 2); + tv1->computeAt(tv1l, 2); + + compare_ir(fusion, fauto); + + // Perform manual computation and verify + auto t0_b = t0.view({1, w, 1}); + + auto t2 = t0_b.add(t1); + + // Need to hardcode both block and grid size + int runtime_threadIdx_dim = 128; + LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs, lparams); + auto cg_outputs = fe.runFusion(inputs, lparams); + + testValidate( + &fusion, cg_outputs, inputs, {t2}, __LINE__, __FILE__, "", lparams); +} + } // namespace jit } // namespace torch #endif // #if defined(USE_CUDA) From a75c509f83b997f158dad9e04620e169ad240309 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 4 Jan 2023 12:22:35 -0500 Subject: [PATCH 18/28] Update {super,}basic tests to closer match auto sched --- .../nvfuser/test/test_gpu_match_frontend.cpp | 44 ++++++++++++++----- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index f61d7dd80cd0..b14ec9042512 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -351,6 +351,17 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { //! fd.add_output(t4) //! 
``` TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { + // Create inputs + + int x = 2, y = 4, z = 8; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({x, y, z}, options); + at::Tensor t1 = at::randn({x, y, z}, options); + + std::vector inputs = {t0, t1}; + + // Define fusion Fusion fusion; FusionGuard fg(&fusion); @@ -367,35 +378,48 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { fusion.addOutput(tv4); + // Run automatic scheduler + auto fauto = Fusion(fusion); // unique_ptr to copy of fusion + auto reduction_params = getReductionHeuristics(&fauto, inputs); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + scheduleReduction(&fauto, *reduction_params); + + // Perform manual scheduling + // // {i0, i1, i2} + tv4->merge(0, 1); // {i0*i1, i2} tv4->split( 1, NamedScalar::getParallelDim( ParallelType::TIDx)); // {i0*i1, r2 / bDx, bDx} + tv4->split(-2, 1); + tv4->reorder({{-2, -1}, {-1, -2}}); + tv4->split(0, 2); + tv4->reorder({{1, 2}, {2, 1}}); + tv4->split(0, 1); + tv4->reorder({{1, 2}, {2, 1}}); tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - auto tv5 = tv4->rFactor({1}); + tv4->axis(2)->parallelize(ParallelType::Unswitch); + tv4->axis(3)->parallelize(ParallelType::Unroll); + tv4->axis(4)->parallelize(ParallelType::TIDx); + tv4->axis(5)->parallelize(ParallelType::Unswitch); + auto tv5 = tv4->rFactor({1, 5}); // propagate the mapping to other tensors TransformPropagatorWithCheck propagator(tv5); MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator); - scheduler_utils::parallelizeAllLike(tv5, {tv0, tv1, tv2, tv3}); + scheduler_utils::parallelizeAllLike(tv5, {tv0, tv1, tv2, tv3, tv4}); inlineMost(); - int x = 2, y = 4, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({x, y, z}, options); + compare_ir(fusion, fauto); + // Perform manual computation and verify auto t2 = t0.add(t1); auto t3 = t2.mul(3.0); auto t4 = t3.sum({-1}, false); - std::vector inputs = {t0, t1}; - int runtime_threadIdx_dim = 128; LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); From cff9b5ed4573c429902f9ef4fe8cf4201c9fb628 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 4 Jan 2023 15:38:36 -0500 Subject: [PATCH 19/28] Make ImplicitBroadcastInput manual IR match auto Note that like SuperBasic{,FP16}, this test passes the math and transforms check but produces a kernel whose variables are incremented (this time by two). There are two inputs in this test where in the other case there is only one. Perhaps something is being done with the inputs by the automatic scheduler to skip a variable number when the kernel is generated. Anyway, the kernels are completely equivalent, but just not _equal_ yet. I will move on to further tests and make another pass later once I figure out what's happening with the variable names in the kernels. 
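The failure mode described here is purely cosmetic: the kernels differ only in the numeric suffixes of generated index variables (i22 vs. i23, or incremented by two in this test). If this keeps recurring, one option would be to canonicalize those names before the string comparison in compare_kernels. A sketch of such a normalization pass (not part of this series; it assumes index variables print as `i<number>`):

```cpp
#include <map>
#include <regex>
#include <string>

// Renumber index variables (i22, i23, ...) in first-appearance order so
// that two kernels differing only in variable numbering compare equal.
std::string canonicalizeIndexNames(const std::string& kernel) {
  std::regex name_re(R"(\bi(\d+)\b)");
  std::map<std::string, std::string> renames;
  std::string out;
  size_t last = 0;
  int next_id = 0;
  for (auto it = std::sregex_iterator(kernel.begin(), kernel.end(), name_re);
       it != std::sregex_iterator();
       ++it) {
    size_t pos = static_cast<size_t>(it->position());
    out.append(kernel, last, pos - last);
    // First time we see a name, assign it the next canonical id.
    auto inserted = renames.emplace(it->str(), "i" + std::to_string(next_id));
    if (inserted.second) {
      ++next_id;
    }
    out += inserted.first->second;
    last = pos + static_cast<size_t>(it->length());
  }
  out.append(kernel, last, std::string::npos);
  return out;
}
```

Running both `sactual.str()` and `sexpected.str()` through this before comparing would make compare_kernels insensitive to the renumbering while still catching real structural differences.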
--- .../nvfuser/test/test_gpu_match_frontend.cpp | 52 ++++++++++--------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index b14ec9042512..b76d9491a494 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -661,6 +661,7 @@ TEST_F(NVFuserTest, FusionFrontendImplicitBroadcastInput_CUDA) { std::vector inputs = {t0, t1}; + // Define fusion Fusion fusion; FusionGuard fg(&fusion); @@ -671,13 +672,13 @@ TEST_F(NVFuserTest, FusionFrontendImplicitBroadcastInput_CUDA) { fusion.addInput(tv1); // explicitly tell tv0 to broadcast along new first and last dimensions - auto tv0_b = broadcast(tv0, {true, false, true}); - auto tv0_e = expand( - tv0_b, + auto tv2 = broadcast(tv0, {true, false, true}); + auto tv3 = expand( + tv2, {tv1->axis(0)->extent(), tv1->axis(1)->extent(), tv1->axis(2)->extent()}); - auto tv2 = add(tv0_e, tv1); + auto tv4 = add(tv3, tv1); - fusion.addOutput(tv2); + fusion.addOutput(tv4); // Run automatic scheduler auto fauto = Fusion(fusion); // unique_ptr to copy of fusion @@ -686,31 +687,34 @@ TEST_F(NVFuserTest, FusionFrontendImplicitBroadcastInput_CUDA) { schedulePointwise(&fauto, *pointwise_params); // Perform manual scheduling + auto tv5 = tv0->cacheAfter(); + auto tv6 = tv1->cacheAfter(); + auto tv7 = tv4->cacheBefore(); + tv4->merge(1, 2); + tv4->merge(0, 1); + tv4->split(0, 128); + tv4->split(0, 1); + tv4->split(0, 1); + tv4->axis(0)->parallelize(ParallelType::BIDx); + tv4->axis(1)->parallelize(ParallelType::Unswitch); + tv4->axis(3)->parallelize(ParallelType::TIDx); - auto tv0l = tv0->cacheAfter(); - auto tv1l = tv1->cacheAfter(); - auto tv2l = tv2->cacheBefore(); - tv2->merge(1, 2); - tv2->merge(0, 1); - tv2->split(0, 128); - tv2->split(0, 1); - tv2->split(0, 1); - tv2->axis(0)->parallelize(ParallelType::BIDx); - tv2->axis(1)->parallelize(ParallelType::Unswitch); - tv2->axis(3)->parallelize(ParallelType::TIDx); + TransformPropagatorWithCheck propagator(tv4); + MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator); + scheduler_utils::parallelizeAllLike(tv4, {tv0, tv1, tv2, tv3, tv5, tv6, tv7}); - TransformPropagatorWithCheck propagator(tv2); - MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator); - scheduler_utils::parallelizeAllLike( - tv2, {tv0_b, tv0_e, tv0, tv1, tv0l, tv1l, tv2l}); + inlineMost(std::vector({tv2, tv3, tv7})); + tv0->computeAt(tv2, 2); + tv5->computeAt(tv2, 2); + tv6->computeAt(tv4, 2); + tv1->computeAt(tv6, 2); + tv4->inlineAt(2); - inlineMost(); - tv0->computeAt(tv0l, 2); - tv1->computeAt(tv1l, 2); + fusion.printTransforms(); compare_ir(fusion, fauto); - // Perform manual computation and verify + // Perform eager computation and verify auto t0_b = t0.view({1, w, 1}); auto t2 = t0_b.add(t1); From 95c0e08ffa2b2353d621469ec51c017af64f0a92 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 4 Jan 2023 15:48:07 -0500 Subject: [PATCH 20/28] Add IR comparisons to Cast and Promote tests --- .../nvfuser/test/test_gpu_match_frontend.cpp | 87 +++++++++++++------ 1 file changed, 61 insertions(+), 26 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index b76d9491a494..2022d91e4ffa 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -155,7 +155,7 @@ TEST_F(NVFuserTest, FusionFrontendAdd_CUDA) { 
compare_ir(fusion, fauto); - // Perform manual computation and verify + // Perform eager computation and verify auto t2 = t0.add(t1); // auto t4 = t2.sum({-1}, false); @@ -234,7 +234,7 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { // i22 for the name of the index variable in the loop (rFactor, see tv3) compare_ir(fusion, fauto); - // Perform manual computation and verify + // Perform eager computation and verify auto t1 = t0 * 3.0; auto t2 = t1.sum({-1}, false); @@ -321,7 +321,7 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { // i30 for the name of the index variable in the loop (rFactor, see tv3) compare_ir(fusion, fauto); - // Perform manual computation and verify + // Perform eager computation and verify auto t1 = t0 * 3.0; auto t2 = t1.sum({-1}, false, c10::kFloat); @@ -352,7 +352,6 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { //! ``` TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { // Create inputs - int x = 2, y = 4, z = 8; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -415,7 +414,7 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { compare_ir(fusion, fauto); - // Perform manual computation and verify + // Perform eager computation and verify auto t2 = t0.add(t1); auto t3 = t2.mul(3.0); auto t4 = t3.sum({-1}, false); @@ -447,6 +446,16 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { //! fd.add_output(t5) //! ``` TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { + // Create inputs + int x = 2, y = 4, z = 8; + auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({x, y, z}, options); + at::Tensor t1 = at::randn({x, y, z}, options); + + std::vector inputs = {t0, t1}; + + // Define fusion Fusion fusion; FusionGuard fg(&fusion); @@ -464,6 +473,13 @@ TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { fusion.addOutput(tv4); + // Run automatic scheduler + auto fauto = Fusion(fusion); // unique_ptr to copy of fusion + auto reduction_params = getReductionHeuristics(&fauto, inputs); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + scheduleReduction(&fauto, *reduction_params); + + // Perform manual scheduling tv4float->merge(0, 1); tv4float->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); tv4float->axis(0)->parallelize(ParallelType::BIDx); @@ -477,18 +493,13 @@ TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { inlineMost(); - int x = 2, y = 4, z = 8; - auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({x, y, z}, options); + compare_ir(fusion, fauto); + // Perform eager computation and verify auto t2 = t0.add(t1); auto t3 = t2.mul(3.0); auto t4 = t3.sum({-1}, false); - std::vector inputs = {t0, t1}; - int runtime_threadIdx_dim = 128; LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); @@ -516,6 +527,16 @@ TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { //! fd.add_output(t4) //! 
``` TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { + // Create inputs + int x = 2, y = 4; + auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({x, y}, options); + at::Tensor t1 = at::randn({x, y}, options); + + std::vector inputs = {t0, t1}; + + // Define fusion Fusion fusion; FusionGuard fg(&fusion); @@ -535,6 +556,13 @@ TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { fusion.addOutput(tv4); + // Run automatic scheduler + auto fauto = Fusion(fusion); // unique_ptr to copy of fusion + auto pointwise_params = getPointwiseHeuristics(&fauto, inputs); + TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!"); + schedulePointwise(&fauto, *pointwise_params); + + // Perform manual scheduling tv4->merge(0, 1); tv4->split(0, NamedScalar::getParallelDim(ParallelType::TIDx)); tv4->axis(0)->parallelize(ParallelType::BIDx); @@ -548,12 +576,9 @@ TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { inlineMost(); - int x = 2, y = 4; - auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y}, options); - at::Tensor t1 = at::randn({x, y}, options); + compare_ir(fusion, fauto); + // Perform eager computation and verify auto t0h = t0.to(options.dtype(at::kHalf)); auto t1h = t1.to(options.dtype(at::kHalf)); @@ -561,8 +586,6 @@ TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { auto t3 = t2.relu(); auto t4 = t3.to(options); - std::vector inputs = {t0, t1}; - int runtime_threadIdx_dim = 128; LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); @@ -588,6 +611,16 @@ TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { //! fd.add_output(t5) //! ``` TEST_F(NVFuserTest, FusionFrontendPromoteToDouble_CUDA) { + // Create inputs + int x = 2, y = 4; + auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0); + + at::Tensor t0h = at::randn({x, y}, options.dtype(at::kHalf)); + at::Tensor t1 = at::randn({x, y}, options); + + std::vector inputs = {t0h, t1}; + + // Define fusion Fusion fusion; FusionGuard fg(&fusion); @@ -603,6 +636,13 @@ TEST_F(NVFuserTest, FusionFrontendPromoteToDouble_CUDA) { fusion.addOutput(tv3); + // Run automatic scheduler + auto fauto = Fusion(fusion); // unique_ptr to copy of fusion + auto reduction_params = getReductionHeuristics(&fauto, inputs); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + scheduleReduction(&fauto, *reduction_params); + + // Perform manual scheduling tv3->merge(0, 1); tv3->split(0, NamedScalar::getParallelDim(ParallelType::TIDx)); tv3->axis(0)->parallelize(ParallelType::BIDx); @@ -615,19 +655,14 @@ TEST_F(NVFuserTest, FusionFrontendPromoteToDouble_CUDA) { inlineMost(); - int x = 2, y = 4; - auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0); - - at::Tensor t0h = at::randn({x, y}, options.dtype(at::kHalf)); - at::Tensor t1 = at::randn({x, y}, options); + compare_ir(fusion, fauto); + // Perform eager computation and verify auto t0 = t0h.to(options.dtype(at::kDouble)); auto t2 = t0.add(t1); auto t3 = t2.relu(); - std::vector inputs = {t0h, t1}; - // Need to hardcode both block and grid size int runtime_threadIdx_dim = 128; LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); From 35b872d5b6c82d3e51b7f6cb586dc0ffc9e49a22 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Thu, 5 Jan 2023 15:55:55 -0500 Subject: [PATCH 21/28] Modify CMakeLists.txt and headers following rebase. 
This is just the necessary changes to get my tests compiling after Jie's big refactor, which I've just rebased in. --- third_party/nvfuser/CMakeLists.txt | 1 + .../nvfuser/test/test_gpu_match_frontend.cpp | 20 +++++++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/third_party/nvfuser/CMakeLists.txt b/third_party/nvfuser/CMakeLists.txt index 020b3694721f..b418f9101693 100644 --- a/third_party/nvfuser/CMakeLists.txt +++ b/third_party/nvfuser/CMakeLists.txt @@ -332,6 +332,7 @@ if(BUILD_TEST) list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_indexing_ops.cpp) list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_indexing.cpp) list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_gather_ops.cpp) + list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_match_frontend.cpp) set(JIT_TEST_CU_SRCS) list(APPEND JIT_TEST_CU_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index 2022d91e4ffa..e82216a97334 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -6,18 +6,18 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include -#include +#include +#include #include From bb1d5ab5e9b11bc4601d0bbfe1105a9cebd633bd Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 17 Jan 2023 12:04:45 -0500 Subject: [PATCH 22/28] Fix Frontend{Add,SuperBasic,SuperBasicFP16} tests To make these tests pass including matching the generated kernels, I used the WIP fusion_debug dump output from https://github.com/csarofeen/pytorch/pull/2326. This let me see a noisy log of operations that revealed a few patterns I was unaware of. For example, in the FrontendAdd test, the pointwise scheduler is used, and the order of inlining at the end is not defined (and does change) due to using unordered_set. Also, notice that getPointwiseHeuristics actually creates a Val (fusion.oneVal()) so it actually does technically alter the Fusion since it adds a Val to its vals_ list. Beyond those trivial things, I noticed differences in inlining patterns between the pointwise and reduction schedulers, and also saw some different ways to call methods like parallelizeAllLike. One thing I haven't looked more into is the reorders that often happen at the beginning of scheduling both pointwise and reduction fusions. They have no effect on the generated kernels, but it is noticeable and I plan to read up on it soon. 
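The unordered_set point is worth spelling out, since it is easy to trip over when trying to reproduce a scheduler's exact operation sequence: iteration order over a std::unordered_set is unspecified and can differ between runs and builds, so any per-tensor log driven by such a loop is inherently unstable even when the result is equivalent. A small illustration, using strings as stand-ins for TensorViews:

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

int main() {
  std::unordered_set<std::string> tvs = {"tv5", "tv3", "tv7", "tv2"};

  // Iteration order here is unspecified: a pass that inlines in this
  // order can emit a differently-ordered (though equivalent) log.
  for (const auto& tv : tvs) {
    std::cout << "inline " << tv << "\n";
  }

  // Sorting by a stable key first makes the sequence reproducible.
  std::vector<std::string> ordered(tvs.begin(), tvs.end());
  std::sort(ordered.begin(), ordered.end());
  for (const auto& tv : ordered) {
    std::cout << "inline " << tv << "\n";
  }
  return 0;
}
```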
---
 .../nvfuser/test/test_gpu_match_frontend.cpp  | 226 ++++++++++++------
 1 file changed, 157 insertions(+), 69 deletions(-)

diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp
index e82216a97334..2fb2957bc10a 100644
--- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp
+++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp
@@ -108,7 +108,29 @@ TEST_F(NVFuserTest, FusionFrontendAdd_CUDA) {
 
   std::vector<c10::IValue> inputs = {t0, t1};
 
-  // Define fusion
+  // Define fusion for automatic scheduling
+  Fusion fauto;
+  {
+    FusionGuard fg(&fauto);
+
+    auto tv0 = makeSymbolicTensor(3);
+    auto tv1 = makeSymbolicTensor(3);
+
+    fauto.addInput(tv0);
+    fauto.addInput(tv1);
+
+    auto tv2 = add(tv0, tv1);
+    // auto tv4 = sum(tv2, {-1}, false, DataType::Float);
+
+    fauto.addOutput(tv2);
+
+    // Run automatic scheduler
+    auto pointwise_params = getPointwiseHeuristics(&fauto, inputs);
+    TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!");
+    schedulePointwise(&fauto, *pointwise_params);
+  }
+
+  // Repeat definition of fusion for manual scheduling
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -123,37 +145,59 @@ TEST_F(NVFuserTest, FusionFrontendAdd_CUDA) {
 
   fusion.addOutput(tv2);
 
-  // Run automatic scheduler
-  auto fauto = Fusion(fusion); // unique_ptr to copy of fusion
-  auto pointwise_params = getPointwiseHeuristics(&fauto, inputs);
-  TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!");
-  schedulePointwise(&fauto, *pointwise_params);
-
   // Perform manual scheduling
-  auto tv0p = tv0->cacheAfter();
-  auto tv1p = tv1->cacheAfter();
+
+  // Before schedulePointwise() is called, getPointwiseHeuristics() calls
+  // vectorize_helper::getExpandedVectorization() which in turn calls:
+  //   vectorize_helper::getVectorizationSize
+  //   vectorize_helper::ProjectedExtent::getNumerator
+  //   vectorize_helper::ProjectedExtent::computeNumerDenomir
+  //   IrContainer::oneVal
+  // oneVal() creates an actual Val here to hold the denominator and
+  // initializes it to 1. Since this is reflected in the fusion log, I'm
+  // inserting it here even though it has no effect on the generated kernel.
+ fusion.oneVal(); + + // scheduler_utils::cacheInputs(fusion, true); + tv0->cacheAfter(); // tv3 + tv1->cacheAfter(); // tv4 + + // scheduler_utils::cacheAndForkOutputs(fusion, true); + auto tv5 = tv2->cacheBefore(); // tv5 + tv2->merge(1, 2); tv2->merge(0, 1); + tv2->reorder({{0, -1}}); + tv2->reorder({{-1, 0}}); tv2->split(0, 128); tv2->split(0, 1); tv2->split(0, 1); - auto tv2l = tv2->cacheBefore(); tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(1)->parallelize(ParallelType::Unswitch); tv2->axis(3)->parallelize(ParallelType::TIDx); - inlineMost(); - tv0p->computeAt(tv2, 2); - tv1p->computeAt(tv2, 2); + // inlineMost(); + // tv3->computeAt(tv2, 2); + // tv4->computeAt(tv2, 2); TransformPropagatorWithCheck propagator(tv2); MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator); - scheduler_utils::parallelizeAllLike(tv2, {tv0, tv1, tv0p, tv1p, tv2l}); + scheduler_utils::parallelizeAllLike(tv2); // Pointwise scheduler does not use inlineMost(), as reduction scheduler does - // inlineMost(); + // Instead, it uses inlineAllAt followed by inlineMost(innermost_tensors) + inlineAllAt(tv2, 2, true); + inlineMost(std::vector({tv5, tv1, tv0})); - compare_ir(fusion, fauto); + // Note that inlineAllAt iterates through an unordered_set to do inlining, so + // it is not practical to match the fusion_debug log exactly when using + // pointwise scheduler + compare_ir_math(fusion, fauto); + compare_transforms(fusion, fauto); + // compare_fusion_debug(fusion, fauto); + compare_kernels(fusion, fauto); + + // compare_ir(fusion, fauto); // Perform eager computation and verify auto t2 = t0.add(t1); @@ -190,7 +234,29 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { std::vector inputs = {t0}; - // Define fusion + Fusion fauto; + { // Do automatic scheduling on fauto + FusionGuard fg(&fauto); + + auto tv0 = makeSymbolicTensor(2); // {i0, i1} + auto c0 = IrBuilder::create(3.0); + + fauto.addInput(tv0); + + auto tv1 = mul(tv0, c0); // {i0, i1} + auto tv2 = sum(tv1, {-1}, false, DataType::Float); // {i0, r1} + + fauto.addOutput(tv2); + + // Run automatic scheduler + auto reduction_params = getReductionHeuristics(&fauto, inputs); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + scheduleReduction(&fauto, *reduction_params); + } + + // Re-define the fusion exactly for manual scheduling + // This is necessary in order to catch all the constructors inside each + // Fusion independently. 
Fusion fusion; FusionGuard fg(&fusion); @@ -204,34 +270,36 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasic_CUDA) { fusion.addOutput(tv2); - // Run automatic scheduler - auto fauto = Fusion(fusion); // unique_ptr to copy of fusion - auto reduction_params = getReductionHeuristics(&fauto, inputs); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fauto, *reduction_params); - // Perform manual scheduling + + tv2->reorder({{1, 0}}); // Removing these two reorders does not effect the + // generated kernel + tv2->reorder({{1, 0}}); tv2->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + tv2->axis(2)->parallelize(ParallelType::TIDx); tv2->split(1, 1); - tv2->reorder({{-1, -2}, {-2, -1}}); + tv2->axis(2)->parallelize(ParallelType::Unswitch); + tv2->axis(0)->parallelize(ParallelType::BIDx); - auto tv3 = tv2->rFactor({1, 3}); + // tv2->reorder({{-2, -1}}) has same effect but this shows the mapping + // explicitly + tv2->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 2}}); - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(2)->parallelize(ParallelType::TIDx); - tv3->axis(3)->parallelize(ParallelType::Unswitch); + auto tv3 = tv2->rFactor({1, 3}); // propagate the mapping to other tensors TransformPropagatorWithCheck propagator(tv3); MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); - scheduler_utils::parallelizeAllLike(tv3, {tv0, tv1, tv2}); - - tv1->computeAt(tv3, -1); + scheduler_utils::parallelizeAllLike( + tv3, + {}, + allParallelTypesExcept( + {ParallelType::Unroll, + ParallelType::Vectorize, + ParallelType::MisalignedVectorize})); inlineMost(); - // CUDA kernel is equivalent, but automatic scheduling uses i23 instead of - // i22 for the name of the index variable in the loop (rFactor, see tv3) compare_ir(fusion, fauto); // Perform eager computation and verify @@ -271,7 +339,30 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { std::vector inputs = {t0}; - // Define fusion + Fusion fauto; + { // Do automatic scheduling on fauto + FusionGuard fg(&fauto); + + auto tv0 = makeSymbolicTensor(2, DataType::Half); // {i0, i1} + auto c0 = IrBuilder::create(3.0); + + fauto.addInput(tv0); + + auto tv1 = mul(tv0, c0); // {i0, i1} + auto tv2 = sum(tv1, {-1}, false, DataType::Float); // {i0, r1} + auto tv3 = castOp(DataType::Half, tv2); + + fauto.addOutput(tv3); + + // Run automatic scheduler + auto reduction_params = getReductionHeuristics(&fauto, inputs); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + scheduleReduction(&fauto, *reduction_params); + } + + // Re-define the fusion exactly for manual scheduling + // This is necessary in order to catch all the constructors inside each + // Fusion independently. Fusion fusion; FusionGuard fg(&fusion); @@ -280,50 +371,46 @@ TEST_F(NVFuserTest, FusionFrontendSuperBasicFP16_CUDA) { fusion.addInput(tv0); - // Note: A manual schedule will run without an explicit cast here, producing - // an _implicit_ cast to float. 
That float tensor will not be explicitly - // parallelized to match, but will result - auto tv0float = castOp(DataType::Float, tv0); - - auto tv1 = mul(tv0float, c0); // {i0, i1} - auto tv2float = sum(tv1, {-1}, false, DataType::Float); // {i0, r1} - auto tv2 = castOp(DataType::Half, tv2float); - - fusion.addOutput(tv2); + auto tv1 = mul(tv0, c0); // {i0, i1} + auto tv2 = sum(tv1, {-1}, false, DataType::Float); // {i0, r1} + auto tv4 = castOp(DataType::Half, tv2); - // Run automatic scheduler - auto fauto = Fusion(fusion); // unique_ptr to copy of fusion - auto reduction_params = getReductionHeuristics(&fauto, inputs); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fauto, *reduction_params); + fusion.addOutput(tv4); // Perform manual scheduling - tv2float->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); - tv2float->split(1, 1); - tv2float->reorder({{-1, -2}, {-2, -1}}); + tv2->reorder({{1, 0}}); // Removing these two reorders does not effect the + // generated kernel + tv2->reorder({{1, 0}}); + tv2->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + tv2->axis(2)->parallelize(ParallelType::TIDx); + tv2->split(1, 1); + tv2->axis(2)->parallelize(ParallelType::Unswitch); + tv2->axis(0)->parallelize(ParallelType::BIDx); - auto tv3 = tv2float->rFactor({1, 3}); + // tv2->reorder({{-2, -1}}) has same effect but this shows the mapping + // explicitly + tv2->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 2}}); - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(2)->parallelize(ParallelType::TIDx); - tv3->axis(3)->parallelize(ParallelType::Unswitch); + auto tv3 = tv2->rFactor({1, 3}); // propagate the mapping to other tensors TransformPropagatorWithCheck propagator(tv3); MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); - scheduler_utils::parallelizeAllLike(tv3, {tv0, tv0float, tv1, tv2, tv2float}); - - tv1->computeAt(tv3, -1); + scheduler_utils::parallelizeAllLike( + tv3, + {}, + allParallelTypesExcept( + {ParallelType::Unroll, + ParallelType::Vectorize, + ParallelType::MisalignedVectorize})); inlineMost(); - // CUDA kernel is equivalent, but automatic scheduling uses i31 instead of - // i30 for the name of the index variable in the loop (rFactor, see tv3) compare_ir(fusion, fauto); // Perform eager computation and verify auto t1 = t0 * 3.0; - auto t2 = t1.sum({-1}, false, c10::kFloat); + auto t2 = t1.sum({-1}, false); int runtime_threadIdx_dim = 128; LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); @@ -384,9 +471,6 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { scheduleReduction(&fauto, *reduction_params); // Perform manual scheduling - // - // {i0, i1, i2} - tv4->merge(0, 1); // {i0*i1, i2} tv4->split( 1, @@ -403,12 +487,16 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { tv4->axis(3)->parallelize(ParallelType::Unroll); tv4->axis(4)->parallelize(ParallelType::TIDx); tv4->axis(5)->parallelize(ParallelType::Unswitch); - auto tv5 = tv4->rFactor({1, 5}); + + auto tv5 = tv0->cacheAfter(); + auto tv6 = tv1->cacheAfter(); + auto tv7 = tv4->cacheBefore(); + auto tv8 = tv7->rFactor({1, 5}); // propagate the mapping to other tensors - TransformPropagatorWithCheck propagator(tv5); - MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator); - scheduler_utils::parallelizeAllLike(tv5, {tv0, tv1, tv2, tv3, tv4}); + TransformPropagatorWithCheck propagator(tv7); + MaxRootDomainInfoSpanningTree(tv7).traverse(&propagator); + scheduler_utils::parallelizeAllLike(tv7, {tv2, tv3, tv4, tv5, tv6, tv8}); inlineMost(); From 
dc6e5e035fb642cf715a97193cebf75ae18a8217 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 17 Jan 2023 13:43:32 -0500 Subject: [PATCH 23/28] Make FrontendBasic test match auto schedule Again I used the fusion_debug dump from https://github.com/csarofeen/pytorch/pull/2326 to trace what the reduction scheduler is doing. This time I learned about multiReductionInliner, which uses two calls to parallelizeAllLike for different types of ParallelTypes, followed by an undoing of unrolling and vectorization on the reference tensor. The need for the latter is still a little unclear to me. --- .../nvfuser/test/test_gpu_match_frontend.cpp | 99 +++++++++++++------ 1 file changed, 70 insertions(+), 29 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index 2fb2957bc10a..cea1071c34e8 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -447,7 +447,32 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { std::vector inputs = {t0, t1}; - // Define fusion + Fusion fauto; + { // Do automatic scheduling on fauto + FusionGuard fg(&fauto); + + auto tv0 = makeSymbolicTensor(3); + auto tv1 = makeSymbolicTensor(3); + auto c0 = IrBuilder::create(3.0); + + fauto.addInput(tv0); + fauto.addInput(tv1); + + auto tv2 = add(tv0, tv1); + auto tv3 = mul(tv2, c0); + auto tv4 = sum(tv3, {-1}, false, DataType::Float); + + fauto.addOutput(tv4); + + // Run automatic scheduler + auto reduction_params = getReductionHeuristics(&fauto, inputs); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + scheduleReduction(&fauto, *reduction_params); + } + + // Re-define the fusion exactly for manual scheduling + // This is necessary in order to catch all the constructors inside each + // Fusion independently. 
Fusion fusion; FusionGuard fg(&fusion); @@ -464,39 +489,55 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) { fusion.addOutput(tv4); - // Run automatic scheduler - auto fauto = Fusion(fusion); // unique_ptr to copy of fusion - auto reduction_params = getReductionHeuristics(&fauto, inputs); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fauto, *reduction_params); - // Perform manual scheduling - tv4->merge(0, 1); // {i0*i1, i2} - tv4->split( - 1, - NamedScalar::getParallelDim( - ParallelType::TIDx)); // {i0*i1, r2 / bDx, bDx} - tv4->split(-2, 1); - tv4->reorder({{-2, -1}, {-1, -2}}); - tv4->split(0, 2); - tv4->reorder({{1, 2}, {2, 1}}); - tv4->split(0, 1); - tv4->reorder({{1, 2}, {2, 1}}); - tv4->axis(0)->parallelize(ParallelType::BIDx); - tv4->axis(2)->parallelize(ParallelType::Unswitch); - tv4->axis(3)->parallelize(ParallelType::Unroll); - tv4->axis(4)->parallelize(ParallelType::TIDx); - tv4->axis(5)->parallelize(ParallelType::Unswitch); - auto tv5 = tv0->cacheAfter(); - auto tv6 = tv1->cacheAfter(); - auto tv7 = tv4->cacheBefore(); + auto tv5 = tv0->cacheAfter(); // tv5 + auto tv6 = tv1->cacheAfter(); // tv6 + auto tv7 = tv4->cacheBefore(); // tv7 + + tv7->reorder({{2, 0}}); + tv7->merge(1, 2); + tv7->reorder({{1, 0}}); + tv7->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + tv7->axis(2)->parallelize(ParallelType::TIDx); + tv7->split(1, 1); + tv7->axis(2)->parallelize(ParallelType::Unswitch); + tv7->split(0, 2); + tv7->axis(1)->parallelize(ParallelType::Unroll); + tv7->split(0, 1); + tv7->axis(1)->parallelize(ParallelType::Unswitch); + tv7->axis(0)->parallelize(ParallelType::BIDx); + + tv7->reorder({{0, 0}, {1, 2}, {2, 3}, {3, 1}, {4, 5}, {5, 4}}); + auto tv8 = tv7->rFactor({1, 5}); + // NOTE: see multiReductionInliner for more info on how propagation and + // inlining works in the reduction scheduler + // propagate the mapping to other tensors - TransformPropagatorWithCheck propagator(tv7); - MaxRootDomainInfoSpanningTree(tv7).traverse(&propagator); - scheduler_utils::parallelizeAllLike(tv7, {tv2, tv3, tv4, tv5, tv6, tv8}); + TransformPropagatorWithCheck propagator(tv8); + MaxRootDomainInfoSpanningTree(tv8).traverse(&propagator); + // Propagate parallelization except vectorization and unrolling + scheduler_utils::parallelizeAllLike( + tv8, + {}, + allParallelTypesExcept( + {ParallelType::Unroll, + ParallelType::Vectorize, + ParallelType::MisalignedVectorize})); + // Propagate vectorization/unrolling to those tensors that need it + scheduler_utils::parallelizeAllLike( + tv8, + {tv4, tv6, tv5}, + { + ParallelType::Unroll, + ParallelType::Vectorize, + ParallelType::MisalignedVectorize, + }); + // If reference shouldn't be unrolled, clear that parallel type. + tv8->axis(3)->parallelize(ParallelType::Serial); + tv7->axis(2)->parallelize(ParallelType::Serial); inlineMost(); From dd44075628e6eb4ec7272ecbda8196f2b8d8b226 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Tue, 17 Jan 2023 15:45:54 -0500 Subject: [PATCH 24/28] Make FrontendBasicFP16 test match auto schedule There were no real surprises here except that the representative tensor seemed to be chosen differently than in the FP32 case, maybe because the casting before cacheBefore pushed that choice further back in the pipeline. 
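The multiReductionInliner sequence referenced in the last two messages boils down to a reusable pattern, condensed below from the scheduling code in these tests. The helper name and signature are illustrative (not nvFuser API); the calls inside are the same ones used in the tests above:

```cpp
// Propagate transforms from the rFactor reference, parallelize in two
// passes (everything except unroll/vectorize across all tensors first,
// then unroll/vectorize only on the tensors that need it), and inline.
void applyReductionSchedule(
    TensorView* reference,
    const std::vector<TensorView*>& unrolled_tensors) {
  TransformPropagatorWithCheck propagator(reference);
  MaxRootDomainInfoSpanningTree(reference).traverse(&propagator);
  scheduler_utils::parallelizeAllLike(
      reference,
      {},
      allParallelTypesExcept(
          {ParallelType::Unroll,
           ParallelType::Vectorize,
           ParallelType::MisalignedVectorize}));
  scheduler_utils::parallelizeAllLike(
      reference,
      unrolled_tensors,
      {ParallelType::Unroll,
       ParallelType::Vectorize,
       ParallelType::MisalignedVectorize});
  inlineMost();
}
```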
--- .../nvfuser/test/test_gpu_match_frontend.cpp | 95 +++++++++++++++---- 1 file changed, 77 insertions(+), 18 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index cea1071c34e8..de316234fd0d 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -584,7 +584,33 @@ TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { std::vector inputs = {t0, t1}; - // Define fusion + Fusion fauto; + { // Do automatic scheduling on fauto + FusionGuard fg(&fauto); + + auto tv0 = makeSymbolicTensor(3, DataType::Half); + auto tv1 = makeSymbolicTensor(3, DataType::Half); + auto c0 = IrBuilder::create(3.0); + + fauto.addInput(tv0); + fauto.addInput(tv1); + + auto tv2 = add(tv0, tv1); + auto tv3 = mul(tv2, c0); + auto tv4 = sum(tv3, {-1}, false, DataType::Float); + auto tv5 = castOp(DataType::Half, tv4); + + fauto.addOutput(tv5); + + // Run automatic scheduler + auto reduction_params = getReductionHeuristics(&fauto, inputs); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + scheduleReduction(&fauto, *reduction_params); + } + + // Re-define the fusion exactly for manual scheduling + // This is necessary in order to catch all the constructors inside each + // Fusion independently. Fusion fusion; FusionGuard fg(&fusion); @@ -597,28 +623,61 @@ TEST_F(NVFuserTest, FusionFrontendBasicFP16_CUDA) { auto tv2 = add(tv0, tv1); auto tv3 = mul(tv2, c0); - auto tv4float = sum(tv3, {-1}, false, DataType::Float); - auto tv4 = castOp(DataType::Half, tv4float); - - fusion.addOutput(tv4); + auto tv4 = sum(tv3, {-1}, false, DataType::Float); + auto tv5 = castOp(DataType::Half, tv4); - // Run automatic scheduler - auto fauto = Fusion(fusion); // unique_ptr to copy of fusion - auto reduction_params = getReductionHeuristics(&fauto, inputs); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fauto, *reduction_params); + fusion.addOutput(tv5); // Perform manual scheduling - tv4float->merge(0, 1); - tv4float->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); - tv4float->axis(0)->parallelize(ParallelType::BIDx); - tv4float->axis(-1)->parallelize(ParallelType::TIDx); - auto tv5 = tv4float->rFactor({1}); + + auto tv6 = tv0->cacheAfter(); // tv6 + auto tv7 = tv1->cacheAfter(); // tv7 + tv5->cacheBefore(); // tv8 + + // NOTE: tv4 is now chosen as the representative tensor + tv4->reorder({{2, 0}}); + tv4->merge(1, 2); + tv4->reorder({{1, 0}}); + tv4->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + tv4->axis(2)->parallelize(ParallelType::TIDx); + tv4->split(1, 1); + tv4->axis(2)->parallelize(ParallelType::Unswitch); + tv4->split(0, 2); + tv4->axis(1)->parallelize(ParallelType::Unroll); + tv4->split(0, 1); + tv4->axis(1)->parallelize(ParallelType::Unswitch); + tv4->axis(0)->parallelize(ParallelType::BIDx); + + tv4->reorder({{0, 0}, {1, 2}, {2, 3}, {3, 1}, {4, 5}, {5, 4}}); + + auto tv9 = tv4->rFactor({1, 5}); + + // NOTE: see multiReductionInliner for more info on how propagation and + // inlining works in the reduction scheduler // propagate the mapping to other tensors - TransformPropagatorWithCheck propagator(tv5); - MaxRootDomainInfoSpanningTree(tv5).traverse(&propagator); - scheduler_utils::parallelizeAllLike(tv5, {tv0, tv1, tv2, tv3, tv4}); + TransformPropagatorWithCheck propagator(tv9); + MaxRootDomainInfoSpanningTree(tv9).traverse(&propagator); + // Propagate parallelization except 
vectorization and unrolling + scheduler_utils::parallelizeAllLike( + tv9, + {}, + allParallelTypesExcept( + {ParallelType::Unroll, + ParallelType::Vectorize, + ParallelType::MisalignedVectorize})); + // Propagate vectorization/unrolling to those tensors that need it + scheduler_utils::parallelizeAllLike( + tv9, + {tv5, tv7, tv6}, + { + ParallelType::Unroll, + ParallelType::Vectorize, + ParallelType::MisalignedVectorize, + }); + // If reference shouldn't be unrolled, clear that parallel type. + tv9->axis(3)->parallelize(ParallelType::Serial); + tv4->axis(2)->parallelize(ParallelType::Serial); inlineMost(); From 4519bd17186bee8aec56d7430376d6fe81c1adad Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 18 Jan 2023 09:27:04 -0500 Subject: [PATCH 25/28] Make FrontendCastDoubleToHalf match auto schedule This was very similar to the FrontendAdd example since it also uses the pointwise scheduler. --- .../nvfuser/test/test_gpu_match_frontend.cpp | 106 +++++++++++++----- 1 file changed, 81 insertions(+), 25 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index de316234fd0d..51201f8226af 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -724,7 +724,36 @@ TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) { std::vector inputs = {t0, t1}; - // Define fusion + Fusion fauto; + { // Do automatic scheduling on fauto + FusionGuard fg(&fauto); + + auto tv0 = makeSymbolicTensor(2, DataType::Double); + auto tv1 = makeSymbolicTensor(2, DataType::Double); + + fauto.addInput(tv0); + fauto.addInput(tv1); + + auto tv2 = castOp(DataType::Half, tv0); + auto tv3 = castOp(DataType::Half, tv1); + // implicit casts + auto tv4 = castOp(DataType::Float, tv2); + auto tv5 = castOp(DataType::Float, tv3); + auto tv6 = add(tv4, tv5); + auto tv7 = relu(tv6); + auto tv8 = castOp(DataType::Half, tv7); + + fauto.addOutput(tv8); + + // Run automatic scheduler + auto pointwise_params = getPointwiseHeuristics(&fauto, inputs); + TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!"); + schedulePointwise(&fauto, *pointwise_params); + } + + // Re-define the fusion exactly for manual scheduling + // This is necessary in order to catch all the constructors inside each + // Fusion independently. 
  Fusion fusion;
  FusionGuard fg(&fusion);
@@ -734,37 +763,64 @@ TEST_F(NVFuserTest, FusionFrontendCastDoubleToHalf_CUDA) {
   fusion.addInput(tv0);
   fusion.addInput(tv1);
 
-  auto tv0h = castOp(DataType::Half, tv0);
-  auto tv1h = castOp(DataType::Half, tv1);
-  auto tv0f = castOp(DataType::Float, tv0h);
-  auto tv1f = castOp(DataType::Float, tv1h);
-  auto tv2 = add(tv0f, tv1f);
-  auto tv3 = relu(tv2);
-  auto tv4 = castOp(DataType::Half, tv3);
+  auto tv2 = castOp(DataType::Half, tv0);
+  auto tv3 = castOp(DataType::Half, tv1);
+  // implicit casts
+  auto tv4 = castOp(DataType::Float, tv2);
+  auto tv5 = castOp(DataType::Float, tv3);
+  auto tv6 = add(tv4, tv5);
+  auto tv7 = relu(tv6);
+  auto tv8 = castOp(DataType::Half, tv7);
 
-  fusion.addOutput(tv4);
-
-  // Run automatic scheduler
-  auto fauto = Fusion(fusion); // unique_ptr to copy of fusion
-  auto pointwise_params = getPointwiseHeuristics(&fauto, inputs);
-  TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!");
-  schedulePointwise(&fauto, *pointwise_params);
+  fusion.addOutput(tv8);
 
   // Perform manual scheduling
-  tv4->merge(0, 1);
-  tv4->split(0, NamedScalar::getParallelDim(ParallelType::TIDx));
-  tv4->axis(0)->parallelize(ParallelType::BIDx);
-  tv4->axis(1)->parallelize(ParallelType::TIDx);
+
+  // Before schedulePointwise() is called, getPointwiseHeuristics() calls
+  // vectorize_helper::getExpandedVectorization() which in turn calls:
+  //   vectorize_helper::getVectorizationSize
+  //   vectorize_helper::ProjectedExtent::getNumerator
+  //   vectorize_helper::ProjectedExtent::computeNumerDenomir
+  //   IrContainer::oneVal
+  // oneVal() creates an actual Val here to hold the denominator and
+  // initializes it to 1. Since this is reflected in the fusion log, I'm
+  // inserting it here even though it has no effect on the generated kernel.
+ fusion.oneVal(); + + tv0->cacheAfter(); // tv9 + tv1->cacheAfter(); // tv10 + auto tv11 = tv8->cacheBefore(); // tv11 + + tv8->merge(0, 1); + tv8->reorder({{0, -1}}); + tv8->reorder({{-1, 0}}); + tv8->split(0, 128); + tv8->split(0, 1); + tv8->split(0, 1); + tv8->axis(0)->parallelize(ParallelType::BIDx); + tv8->axis(1)->parallelize(ParallelType::Unswitch); + tv8->axis(3)->parallelize(ParallelType::TIDx); // propagate the mapping to other tensors - TransformPropagatorWithCheck propagator(tv4); - MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator); - scheduler_utils::parallelizeAllLike( - tv4, {tv0, tv1, tv0h, tv1h, tv0f, tv1f, tv2, tv3}); + TransformPropagatorWithCheck propagator(tv8); + MaxRootDomainInfoSpanningTree(tv8).traverse(&propagator); + scheduler_utils::parallelizeAllLike(tv8); - inlineMost(); + // Pointwise scheduler does not use inlineMost(), as reduction scheduler does + // Instead, it uses inlineAllAt followed by inlineMost(innermost_tensors) + inlineAllAt(tv8, 2, true); + inlineMost( + std::vector({tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv11})); - compare_ir(fusion, fauto); + // Note that inlineAllAt iterates through an unordered_set to do inlining, so + // it is not practical to match the fusion_debug log exactly when using + // pointwise scheduler + compare_ir_math(fusion, fauto); + compare_transforms(fusion, fauto); + // compare_fusion_debug(fusion, fauto); + compare_kernels(fusion, fauto); + + // compare_ir(fusion, fauto); // Perform eager computation and verify auto t0h = t0.to(options.dtype(at::kHalf)); From 4697c30f87669ee381dfc9a3c2dd761d4370d232 Mon Sep 17 00:00:00 2001 From: Jacob Hinkle Date: Wed, 18 Jan 2023 11:05:15 -0500 Subject: [PATCH 26/28] Make FrontendPromoteToDouble test match auto sched --- .../nvfuser/test/test_gpu_match_frontend.cpp | 108 +++++++++++++----- 1 file changed, 78 insertions(+), 30 deletions(-) diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp index 51201f8226af..3138888cd537 100644 --- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp +++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp @@ -859,55 +859,103 @@ TEST_F(NVFuserTest, FusionFrontendPromoteToDouble_CUDA) { int x = 2, y = 4; auto options = at::TensorOptions().dtype(at::kDouble).device(at::kCUDA, 0); - at::Tensor t0h = at::randn({x, y}, options.dtype(at::kHalf)); + at::Tensor t0 = at::randn({x, y}, options.dtype(at::kHalf)); at::Tensor t1 = at::randn({x, y}, options); - std::vector inputs = {t0h, t1}; + std::vector inputs = {t0, t1}; - // Define fusion + Fusion fauto; + { // Do automatic scheduling on fauto + FusionGuard fg(&fauto); + + auto tv0 = makeSymbolicTensor(2, DataType::Half); + auto tv1 = makeSymbolicTensor(2, DataType::Double); + + fauto.addInput(tv0); + fauto.addInput(tv1); + + auto tv2 = castOp(DataType::Double, tv0); + auto tv3 = add(tv2, tv1); + auto tv4 = relu(tv3); + + fauto.addOutput(tv4); + + // Run automatic scheduler + auto pointwise_params = getPointwiseHeuristics(&fauto, inputs); + TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!"); + schedulePointwise(&fauto, *pointwise_params); + } + + // Re-define the fusion exactly for manual scheduling + // This is necessary in order to catch all the constructors inside each + // Fusion independently. 
Fusion fusion; FusionGuard fg(&fusion); - auto tv0h = makeSymbolicTensor(2, DataType::Half); + auto tv0 = makeSymbolicTensor(2, DataType::Half); auto tv1 = makeSymbolicTensor(2, DataType::Double); - fusion.addInput(tv0h); + fusion.addInput(tv0); fusion.addInput(tv1); - auto tv0 = castOp(DataType::Double, tv0h); - auto tv2 = add(tv0, tv1); - auto tv3 = relu(tv2); - - fusion.addOutput(tv3); + auto tv2 = castOp(DataType::Double, tv0); + auto tv3 = add(tv2, tv1); + auto tv4 = relu(tv3); - // Run automatic scheduler - auto fauto = Fusion(fusion); // unique_ptr to copy of fusion - auto reduction_params = getReductionHeuristics(&fauto, inputs); - TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); - scheduleReduction(&fauto, *reduction_params); + fusion.addOutput(tv4); // Perform manual scheduling - tv3->merge(0, 1); - tv3->split(0, NamedScalar::getParallelDim(ParallelType::TIDx)); - tv3->axis(0)->parallelize(ParallelType::BIDx); - tv3->axis(1)->parallelize(ParallelType::TIDx); + + // Before schedulePointwise() is called, getPointwiseHeuristics() calls + // vectorize_helper::getExpandedVectorization() which in turn calls: + // vectorize_helper::getVectorizationSize + // vectorize_helper::ProjectedExtent::getNumerator + // vectorize_helper::ProjectedExtent::computeNumerDenomir + // IrContainer::oneVal + // oneVal() creates an actual Val here to hold the denominator and + // initializes it to 1. Since this is reflected in the fusion log, I'm + // inserting it here even though it has not effect on the generated kernel. + fusion.oneVal(); + + tv0->cacheAfter(); // tv5 + tv1->cacheAfter(); // tv6 + auto tv7 = tv4->cacheBefore(); + + tv4->merge(0, 1); + tv4->reorder({{0, -1}}); + tv4->reorder({{-1, 0}}); + tv4->split(0, 128); + tv4->split(0, 1); + tv4->split(0, 1); + tv4->axis(0)->parallelize(ParallelType::BIDx); + tv4->axis(1)->parallelize(ParallelType::Unswitch); + tv4->axis(3)->parallelize(ParallelType::TIDx); // propagate the mapping to other tensors - TransformPropagatorWithCheck propagator(tv3); - MaxRootDomainInfoSpanningTree(tv3).traverse(&propagator); - scheduler_utils::parallelizeAllLike(tv3, {tv0h, tv0, tv1, tv2}); + TransformPropagatorWithCheck propagator(tv4); + MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator); + scheduler_utils::parallelizeAllLike(tv4); - inlineMost(); + // Pointwise scheduler does not use inlineMost(), as reduction scheduler does + // Instead, it uses inlineAllAt followed by inlineMost(innermost_tensors) + inlineAllAt(tv4, 2, true); + inlineMost(std::vector({tv0, tv1, tv2, tv3, tv7})); - compare_ir(fusion, fauto); + // Note that inlineAllAt iterates through an unordered_set to do inlining, so + // it is not practical to match the fusion_debug log exactly when using + // pointwise scheduler + // compare_fusion_debug(fusion, fauto); + compare_ir_math(fusion, fauto); + compare_transforms(fusion, fauto); + compare_kernels(fusion, fauto); - // Perform eager computation and verify - auto t0 = t0h.to(options.dtype(at::kDouble)); + // compare_ir(fusion, fauto); - auto t2 = t0.add(t1); - auto t3 = t2.relu(); + // Perform eager computation and verify + auto t2 = t0.to(options.dtype(at::kDouble)); + auto t3 = t2.add(t1); + auto t4 = t3.relu(); - // Need to hardcode both block and grid size int runtime_threadIdx_dim = 128; LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); @@ -916,7 +964,7 @@ TEST_F(NVFuserTest, FusionFrontendPromoteToDouble_CUDA) { auto cg_outputs = fe.runFusion(inputs, lparams); testValidate( - &fusion, cg_outputs, inputs, 
 //! Test broadcasting one input then adding another

From 98e5259fb382ff5b6a37201cadfcbdcb81e1477a Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Wed, 18 Jan 2023 11:19:19 -0500
Subject: [PATCH 27/28] Make FrontendImplicitBroadcastInput match auto sched

At this point `nvfuser_tests --gtest_filter='*Frontend*'` passes all 8 tests.

---
 .../nvfuser/test/test_gpu_match_frontend.cpp | 87 ++++++++++++++-----
 1 file changed, 64 insertions(+), 23 deletions(-)

diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp
index 3138888cd537..1963d4300d37 100644
--- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp
+++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp
@@ -988,7 +988,36 @@ TEST_F(NVFuserTest, FusionFrontendImplicitBroadcastInput_CUDA) {
 
   std::vector<c10::IValue> inputs = {t0, t1};
 
-  // Define fusion
+  Fusion fauto;
+  { // Do automatic scheduling on fauto
+    FusionGuard fg(&fauto);
+
+    auto tv0 = makeSymbolicTensor(1);
+    auto tv1 = makeSymbolicTensor(3);
+
+    fauto.addInput(tv0);
+    fauto.addInput(tv1);
+
+    // explicitly tell tv0 to broadcast along new first and last dimensions
+    auto tv2 = broadcast(tv0, {true, false, true});
+    auto tv3 = expand(
+        tv2,
+        {tv1->axis(0)->extent(),
+         tv1->axis(1)->extent(),
+         tv1->axis(2)->extent()});
+    auto tv4 = add(tv3, tv1);
+
+    fauto.addOutput(tv4);
+
+    // Run automatic scheduler
+    auto pointwise_params = getPointwiseHeuristics(&fauto, inputs);
+    TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!");
+    schedulePointwise(&fauto, *pointwise_params);
+  }
+
+  // Re-define the fusion exactly for manual scheduling
+  // This is necessary in order to catch all the constructors inside each
+  // Fusion independently.
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -1007,18 +1036,27 @@ TEST_F(NVFuserTest, FusionFrontendImplicitBroadcastInput_CUDA) {
 
   fusion.addOutput(tv4);
 
-  // Run automatic scheduler
-  auto fauto = Fusion(fusion); // unique_ptr to copy of fusion
-  auto pointwise_params = getPointwiseHeuristics(&fauto, inputs);
-  TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!");
-  schedulePointwise(&fauto, *pointwise_params);
-
   // Perform manual scheduling
-  auto tv5 = tv0->cacheAfter();
-  auto tv6 = tv1->cacheAfter();
+
+  // Before schedulePointwise() is called, getPointwiseHeuristics() calls
+  // vectorize_helper::getExpandedVectorization() which in turn calls:
+  //   vectorize_helper::getVectorizationSize
+  //   vectorize_helper::ProjectedExtent::getNumerator
+  //   vectorize_helper::ProjectedExtent::computeNumerDenomir
+  //   IrContainer::oneVal
+  // oneVal() creates an actual Val here to hold the denominator and
+  // initializes it to 1. Since this is reflected in the fusion log, I'm
+  // inserting it here even though it has no effect on the generated kernel.
+  fusion.oneVal();
+
+  tv0->cacheAfter(); // tv5
+  tv1->cacheAfter(); // tv6
   auto tv7 = tv4->cacheBefore();
+
   tv4->merge(1, 2);
   tv4->merge(0, 1);
+  tv4->reorder({{0, -1}});
+  tv4->reorder({{-1, 0}});
   tv4->split(0, 128);
   tv4->split(0, 1);
   tv4->split(0, 1);
@@ -1026,27 +1064,30 @@ TEST_F(NVFuserTest, FusionFrontendImplicitBroadcastInput_CUDA) {
   tv4->axis(1)->parallelize(ParallelType::Unswitch);
   tv4->axis(3)->parallelize(ParallelType::TIDx);
 
+  // propagate the mapping to other tensors
   TransformPropagatorWithCheck propagator(tv4);
   MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
-  scheduler_utils::parallelizeAllLike(tv4, {tv0, tv1, tv2, tv3, tv5, tv6, tv7});
+  scheduler_utils::parallelizeAllLike(tv4);
 
-  inlineMost(std::vector<TensorView*>({tv2, tv3, tv7}));
-  tv0->computeAt(tv2, 2);
-  tv5->computeAt(tv2, 2);
-  tv6->computeAt(tv4, 2);
-  tv1->computeAt(tv6, 2);
-  tv4->inlineAt(2);
+  // Pointwise scheduler does not use inlineMost() as the reduction scheduler
+  // does. Instead, it uses inlineAllAt followed by
+  // inlineMost(innermost_tensors)
+  inlineAllAt(tv4, 2, true);
+  inlineMost(std::vector<TensorView*>({tv0, tv1, tv2, tv3, tv7}));
 
-  fusion.printTransforms();
+  // Note that inlineAllAt iterates through an unordered_set to do inlining, so
+  // it is not practical to match the fusion_debug log exactly when using the
+  // pointwise scheduler
+  // compare_fusion_debug(fusion, fauto);
+  compare_ir_math(fusion, fauto);
+  compare_transforms(fusion, fauto);
+  compare_kernels(fusion, fauto);
 
-  compare_ir(fusion, fauto);
+  // compare_ir(fusion, fauto);
 
   // Perform eager computation and verify
-  auto t0_b = t0.view({1, w, 1});
+  auto t2 = t0.view({1, w, 1});
+  auto t4 = t2.add(t1);
 
-  auto t2 = t0_b.add(t1);
-
-  // Need to hardcode both block and grid size
   int runtime_threadIdx_dim = 128;
   LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1);
@@ -1055,7 +1096,7 @@ TEST_F(NVFuserTest, FusionFrontendImplicitBroadcastInput_CUDA) {
   auto cg_outputs = fe.runFusion(inputs, lparams);
 
   testValidate(
-      &fusion, cg_outputs, inputs, {t2}, __LINE__, __FILE__, "", lparams);
+      &fusion, cg_outputs, inputs, {t4}, __LINE__, __FILE__, "", lparams);
 }
 
 } // namespace jit

From aaf571327feb83e434fb8e2a39008346f6b5b939 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle
Date: Thu, 19 Jan 2023 09:00:18 -0500
Subject: [PATCH 28/28] Add two broadcasting frontend tests

The only new thing in these tests is that when given an explicit broadcast
dimension, we should use makeConcreteTensor with extents of 1, in which case
those IterDomains will automatically be set to broadcast.

---
 .../nvfuser/test/test_gpu_match_frontend.cpp | 263 ++++++++++++++++++
 1 file changed, 263 insertions(+)
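A sketch of that convention, using the helpers as they appear in the tests below (the isBroadcast() query on IterDomain is assumed here purely for illustration):

    // An extent of 1 in makeConcreteTensor yields an IterDomain that is
    // marked as a broadcast domain; an extent of -1 stays symbolic.
    auto tv0 = makeConcreteTensor({1, 1, -1}); // {broadcast, broadcast, symbolic}
    // A symbolic tensor of the same rank instead treats all three dimensions
    // as ordinary iteration domains:
    auto tv1 = makeSymbolicTensor(3);
    TORCH_CHECK(tv0->axis(0)->isBroadcast() && tv0->axis(1)->isBroadcast());
    TORCH_CHECK(!tv1->axis(0)->isBroadcast());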
diff --git a/third_party/nvfuser/test/test_gpu_match_frontend.cpp b/third_party/nvfuser/test/test_gpu_match_frontend.cpp
index 1963d4300d37..32a27e6a55cb 100644
--- a/third_party/nvfuser/test/test_gpu_match_frontend.cpp
+++ b/third_party/nvfuser/test/test_gpu_match_frontend.cpp
@@ -1099,6 +1099,269 @@ TEST_F(NVFuserTest, FusionFrontendImplicitBroadcastInput_CUDA) {
       &fusion, cg_outputs, inputs, {t4}, __LINE__, __FILE__, "", lparams);
 }
 
+//! Test broadcasting an input with existing broadcast dimensions, then adding
+//! ```python
+//! inputs = [
+//!     torch.randn(1, 1, 4, device='cuda'),
+//!     torch.randn(2, 3, 4, device='cuda'),
+//! ]
+//!
+//! def fusion_func(fd: FusionDefinition) :
+//!     t0 = fd.define_tensor(sizes=inputs[0].size(), strides=inputs[0].stride())
+//!     t1 = fd.define_tensor(sizes=inputs[1].size(), strides=inputs[1].stride())
+//!
+//!     t0_b = fd.ops.broadcast_in_dim(t0, inputs[1].size(), [0, 1, 2])
+//!     t2 = fd.ops.add(t0_b, t1)
+//!
+//!     fd.add_output(t2)
+//! ```
+TEST_F(NVFuserTest, FusionFrontendExplicitBroadcastInput_CUDA) {
+  // Create inputs
+  int x = 2, y = 3, z = 4;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({1, 1, z}, options);
+  at::Tensor t1 = at::randn({x, y, z}, options);
+
+  std::vector<c10::IValue> inputs = {t0, t1};
+
+  Fusion fauto;
+  { // Do automatic scheduling on fauto
+    FusionGuard fg(&fauto);
+
+    // We depend on the input having size 1 in the first two dimensions, so we
+    // create a concrete tensor instead of using makeSymbolicTensor. The last
+    // dimension is still free.
+    auto tv0 = makeConcreteTensor({1, 1, -1});
+    auto tv1 = makeSymbolicTensor(3);
+
+    fauto.addInput(tv0);
+    fauto.addInput(tv1);
+
+    // The following line is unnecessary, but matches what is done in the
+    // frontend's broadcast_in_dim
+    auto tv2 = broadcast(tv0, {false, false, false});
+
+    auto tv3 = expand_as(tv2, tv1);
+    auto tv4 = add(tv3, tv1);
+
+    fauto.addOutput(tv4);
+
+    // Run automatic scheduler
+    auto pointwise_params = getPointwiseHeuristics(&fauto, inputs);
+    TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!");
+    schedulePointwise(&fauto, *pointwise_params);
+  }
+
+  // Re-define the fusion exactly for manual scheduling
+  // This is necessary in order to catch all the constructors inside each
+  // Fusion independently.
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({1, 1, -1});
+  auto tv1 = makeSymbolicTensor(3);
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  // The following line is unnecessary, but matches what is done in the
+  // frontend's broadcast_in_dim
+  auto tv2 = broadcast(tv0, {false, false, false});
+
+  auto tv3 = expand_as(tv2, tv1);
+  auto tv4 = add(tv3, tv1);
+
+  fusion.addOutput(tv4);
+
+  // Perform manual scheduling
+
+  // Before schedulePointwise() is called, getPointwiseHeuristics() calls
+  // vectorize_helper::getExpandedVectorization() which in turn calls:
+  //   vectorize_helper::getVectorizationSize
+  //   vectorize_helper::ProjectedExtent::getNumerator
+  //   vectorize_helper::ProjectedExtent::computeNumerDenomir
+  //   IrContainer::oneVal
+  // oneVal() creates an actual Val here to hold the denominator and
+  // initializes it to 1. Since this is reflected in the fusion log, I'm
+  // inserting it here even though it has no effect on the generated kernel.
+  fusion.oneVal();
+
+  tv0->cacheAfter(); // tv5
+  tv1->cacheAfter(); // tv6
+  auto tv7 = tv4->cacheBefore();
+
+  tv4->merge(1, 2);
+  tv4->merge(0, 1);
+  tv4->reorder({{0, -1}});
+  tv4->reorder({{-1, 0}});
+  tv4->split(0, 128);
+  tv4->split(0, 1);
+  tv4->split(0, 1);
+  tv4->axis(0)->parallelize(ParallelType::BIDx);
+  tv4->axis(1)->parallelize(ParallelType::Unswitch);
+  tv4->axis(3)->parallelize(ParallelType::TIDx);
+
+  // propagate the mapping to other tensors
+  TransformPropagatorWithCheck propagator(tv4);
+  MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
+  scheduler_utils::parallelizeAllLike(tv4);
+
+  // Pointwise scheduler does not use inlineMost() as the reduction scheduler
+  // does. Instead, it uses inlineAllAt followed by
+  // inlineMost(innermost_tensors)
+  inlineAllAt(tv4, 2, true);
+  inlineMost(std::vector<TensorView*>({tv0, tv1, tv2, tv3, tv7}));
+
+  // Note that inlineAllAt iterates through an unordered_set to do inlining, so
+  // it is not practical to match the fusion_debug log exactly when using the
+  // pointwise scheduler
+  // compare_fusion_debug(fusion, fauto);
+  compare_ir_math(fusion, fauto);
+  compare_transforms(fusion, fauto);
+  compare_kernels(fusion, fauto);
+
+  // compare_ir(fusion, fauto);
+
+  // Perform eager computation and verify
+  auto t4 = t1.add(t0);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, inputs);
+  auto cg_outputs = fe.runFusion(inputs);
+
+  testValidate(&fusion, cg_outputs, inputs, {t4}, __LINE__, __FILE__);
+}
+
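For the mixed-broadcast test that follows, it may help to see the eager-mode equivalent of the frontend op fd.ops.broadcast_in_dim(t1, [3, 3], [0]): the single input axis maps to output axis 0 and a new broadcast axis is appended (a sketch in ATen; the tensors mirror the Python snippet below):

    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
    at::Tensor t1 = at::randn({3}, options);
    // broadcast_in_dim(t1, [3, 3], broadcast_dims=[0]) keeps t1's only axis as
    // output axis 0 and materializes a broadcast along the new axis 1:
    auto t1_b = t1.unsqueeze(-1).expand({3, 3}); // t1_b[i][j] == t1[i]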
+//! Test adding implicitly and explicitly broadcast tensors together
+//! ```python
+//! inputs = [
+//!     torch.randn(3, 1, device='cuda'),
+//!     torch.randn(3, device='cuda'),
+//! ]
+//!
+//! def fusion_func(fd: FusionDefinition) :
+//!     t0 = fd.define_tensor([3, 1], [1, 1])
+//!     t1 = fd.define_tensor(1)
+//!
+//!     t1_b = fd.ops.broadcast_in_dim(t1, [3, 3], [0])
+//!     t2 = fd.ops.add(t0, t1_b)
+//!
+//!     fd.add_output(t2)
+//! ```
+TEST_F(NVFuserTest, FusionFrontendBroadcastMixing_CUDA) {
+  // Create inputs
+  int x = 3;
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+  at::Tensor t0 = at::randn({x, 1}, options);
+  at::Tensor t1 = at::randn({x}, options);
+
+  std::vector<c10::IValue> inputs = {t0, t1};
+
+  Fusion fauto;
+  { // Do automatic scheduling on fauto
+    FusionGuard fg(&fauto);
+
+    auto tv0 = makeConcreteTensor({-1, 1});
+    auto tv1 = makeSymbolicTensor(1);
+
+    fauto.addInput(tv0);
+    fauto.addInput(tv1);
+
+    // Here the broadcast and expand are both needed: tv0's second axis is
+    // itself a broadcast, so the expanded extent is what resolves the output
+    // shape. This also matches what is done in the frontend's
+    // broadcast_in_dim
+    auto tv2 = broadcast(tv1, {false, true});
+    auto xc = IrBuilder::create<Int>(x);
+    auto tv3 = expand(tv2, {tv2->axis(0)->extent(), xc});
+    auto tv4 = add(tv0, tv3);
+
+    fauto.addOutput(tv4);
+
+    // Run automatic scheduler
+    auto pointwise_params = getPointwiseHeuristics(&fauto, inputs);
+    TORCH_CHECK(pointwise_params, "Pointwise schedule was not generated!");
+    schedulePointwise(&fauto, *pointwise_params);
+  }
+
+  // Re-define the fusion exactly for manual scheduling
+  // This is necessary in order to catch all the constructors inside each
+  // Fusion independently.
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({-1, 1});
+  auto tv1 = makeSymbolicTensor(1);
+
+  fusion.addInput(tv0);
+  fusion.addInput(tv1);
+
+  // Here the broadcast and expand are both needed: tv0's second axis is
+  // itself a broadcast, so the expanded extent is what resolves the output
+  // shape. This also matches what is done in the frontend's broadcast_in_dim
+  auto tv2 = broadcast(tv1, {false, true});
+  auto xc = IrBuilder::create<Int>(x);
+  auto tv3 = expand(tv2, {tv2->axis(0)->extent(), xc});
+  auto tv4 = add(tv0, tv3);
+
+  fusion.addOutput(tv4);
+
+  // Perform manual scheduling
+
+  // Before schedulePointwise() is called, getPointwiseHeuristics() calls
+  // vectorize_helper::getExpandedVectorization() which in turn calls:
+  //   vectorize_helper::getVectorizationSize
+  //   vectorize_helper::ProjectedExtent::getNumerator
+  //   vectorize_helper::ProjectedExtent::computeNumerDenomir
+  //   IrContainer::oneVal
+  // oneVal() creates an actual Val here to hold the denominator and
+  // initializes it to 1. Since this is reflected in the fusion log, I'm
+  // inserting it here even though it has no effect on the generated kernel.
+  fusion.oneVal();
+
+  tv0->cacheAfter(); // tv5
+  tv1->cacheAfter(); // tv6
+  auto tv7 = tv4->cacheBefore();
+
+  tv4->merge(0, 1);
+  tv4->reorder({{0, -1}});
+  tv4->reorder({{-1, 0}});
+  tv4->split(0, 128);
+  tv4->split(0, 1);
+  tv4->split(0, 1);
+  tv4->axis(0)->parallelize(ParallelType::BIDx);
+  tv4->axis(1)->parallelize(ParallelType::Unswitch);
+  tv4->axis(3)->parallelize(ParallelType::TIDx);
+
+  // propagate the mapping to other tensors
+  TransformPropagatorWithCheck propagator(tv4);
+  MaxRootDomainInfoSpanningTree(tv4).traverse(&propagator);
+  scheduler_utils::parallelizeAllLike(tv4);
+
+  // Pointwise scheduler does not use inlineMost() as the reduction scheduler
+  // does. Instead, it uses inlineAllAt followed by
+  // inlineMost(innermost_tensors)
+  inlineAllAt(tv4, 2, true);
+  inlineMost(std::vector<TensorView*>({tv0, tv1, tv2, tv3, tv7}));
+
+  // Note that inlineAllAt iterates through an unordered_set to do inlining, so
+  // it is not practical to match the fusion_debug log exactly when using the
+  // pointwise scheduler
+  // compare_fusion_debug(fusion, fauto);
+  compare_ir_math(fusion, fauto);
+  compare_transforms(fusion, fauto);
+  compare_kernels(fusion, fauto);
+
+  // compare_ir(fusion, fauto);
+
+  // Perform eager computation and verify
+  auto t4 = t1.view({x, 1}).add(t0.expand({x, x}));
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion, inputs);
+  auto cg_outputs = fe.runFusion(inputs);
+
+  testValidate(&fusion, cg_outputs, inputs, {t4}, __LINE__, __FILE__);
+}
+
 } // namespace jit
 } // namespace torch
 #endif // #if defined(USE_CUDA)