Welford Scheduling Support #561

Merged 48 commits on Feb 18, 2021
Commits
d321193
introduce MultiScanOp
shmsong Nov 13, 2020
821e176
device-to-device schedule
shmsong Nov 16, 2020
c3b85a3
Merge branch 'multi_output_scan' of https://github.com/csarofeen/pyto…
shmsong Dec 8, 2020
9b0b404
fix codegen
shmsong Dec 8, 2020
227ffa7
swap in welfordOp
shmsong Dec 10, 2020
798d49a
Merge branch '20_12_3_devel' into multi_output_scan
shmsong Dec 10, 2020
ff2758a
format
shmsong Dec 10, 2020
3c16587
convert multiscan to welford
shmsong Dec 16, 2020
e5a42b9
preliminary kernel gen
shmsong Dec 16, 2020
b2c9c45
fix serial welford
shmsong Dec 17, 2020
2b5d469
Merge remote-tracking branch 'origin/20_12_3_devel' into welford_rebase
shmsong Dec 17, 2020
27f7ecc
add initialization
shmsong Dec 17, 2020
1bdc500
format
shmsong Dec 17, 2020
9cbc07b
use independent index lowering
shmsong Dec 17, 2020
4170533
format
shmsong Dec 17, 2020
17da83c
add serial welford test
shmsong Dec 18, 2020
aa997d7
add scheduling primitives
shmsong Jan 4, 2021
2a3cc45
Merge branch '20_12_3_devel' into welford_rebase
shmsong Jan 4, 2021
919931f
fix rfactor indexing
shmsong Jan 5, 2021
c3d3969
remove unwanted changes
shmsong Jan 8, 2021
7fc4926
cleanup && clang-tidy
shmsong Jan 8, 2021
f387381
fix sync_flag allocation
shmsong Jan 8, 2021
65b332e
Merge branch '20_12_3_devel' of https://github.com/csarofeen/pytorch …
shmsong Jan 8, 2021
486bc8f
format
shmsong Jan 8, 2021
5fb554b
refactor allocation
shmsong Jan 8, 2021
fa15a61
refactor alloc
shmsong Jan 8, 2021
0786658
Merge remote-tracking branch 'origin/20_12_3_devel' into welford_rebase2
shmsong Jan 18, 2021
b434d65
change welford API
shmsong Jan 18, 2021
ef2dbfe
revise rfactor interface
shmsong Jan 18, 2021
f843551
revise welford root domain map
shmsong Jan 18, 2021
8126155
add assertions and cleanup conditionals
shmsong Jan 18, 2021
850e768
rename helper function
shmsong Jan 19, 2021
8396373
minor fix
shmsong Jan 19, 2021
06e6bfb
change rfactor interface
shmsong Jan 22, 2021
77094da
add a scheduleReduction Test
shmsong Jan 24, 2021
df05bf9
change schedule
shmsong Jan 24, 2021
3fe883f
minor cleanup
shmsong Feb 11, 2021
4dffcec
Merge remote-tracking branch 'origin/20_12_3_devel' into welford_refa…
shmsong Feb 11, 2021
6ae1010
minor cleanup
shmsong Feb 12, 2021
d79df8c
update kernel summary pass
shmsong Feb 12, 2021
99b648c
fix codegen ; cleanup test
shmsong Feb 12, 2021
ff68685
bug fix
shmsong Feb 15, 2021
c73a9b3
thread_predicate bugfix; cleanup
shmsong Feb 15, 2021
f5e32ae
clang format
shmsong Feb 15, 2021
a049fb3
update comments
shmsong Feb 16, 2021
519fa2f
minor cleanup
shmsong Feb 16, 2021
7872914
Macro Names
shmsong Feb 16, 2021
3d92cf0
minor fix
shmsong Feb 16, 2021
359 changes: 359 additions & 0 deletions test/cpp/jit/test_gpu.cpp
@@ -10852,6 +10852,365 @@ __global__ void kernel1(
TORCH_CHECK(in0.mean(dims).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6));
}

TEST(NVFuserTest, FusionWelfordOp_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);

int M = 64, N = 128;

auto tv0 = makeSymbolicTensor(2);
fusion.addInput(tv0);
auto tv1 = mul(tv0, new Double(1));
auto tvs = Welford(tv1, {1});
auto tv_M2 = tvs.var;
auto tv_avg = tvs.avg;
auto tv_N = tvs.n;
fusion.addOutput(tv_M2);
fusion.addOutput(tv_avg);
fusion.addOutput(tv_N);

tv_avg->split(1, 32);
tv_avg->split(0, 32);
tv_avg->split(0, 4);
tv_avg->reorder({{-1, -3}, {-3, -1}});
tv1->computeAt(tv_avg, -1);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
at::manual_seed(0);
at::Tensor t0 = at::randn({M, N}, options);
at::Tensor t_var = at::empty({M}, options);
at::Tensor t_avg = at::empty({M}, options);
at::Tensor t_N = at::empty({M}, options_int);

FusionExecutor fe;
fe.compileFusion(&fusion);
auto outputs = fe.runFusion({t0});

// By default, Welford outputs the sum of squared differences, so divide by
// N to get the variance.
outputs[0] /= N;

testValidate(
&fusion,
outputs,
{t0},
{t0.var({1}, false), t0.mean({1}), at::ones({M}, options_int) * N},
__LINE__,
__FILE__);
}
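
The division by N above reflects how the Welford op reports its result: the M2 output is the running sum of squared differences from the mean, not the variance itself. For reference, here is a minimal standalone sketch of the single-pass Welford recurrence (illustrative only, not part of this PR; the struct and function names are made up):

// Minimal sketch of the single-pass Welford recurrence. The M2 accumulator
// is the running sum of squared differences from the mean, so the variance
// is M2 / n, matching the "outputs[0] /= N" normalization in these tests.
#include <cassert>
#include <cmath>
#include <vector>

struct WelfordResult {
  double avg = 0.0; // running mean
  double M2 = 0.0;  // running sum of squared differences from the mean
  long n = 0;       // running element count
};

WelfordResult welford(const std::vector<double>& xs) {
  WelfordResult w;
  for (double x : xs) {
    w.n += 1;
    double delta = x - w.avg;
    w.avg += delta / w.n;
    w.M2 += delta * (x - w.avg); // note: uses the updated mean
  }
  return w;
}

int main() {
  auto w = welford({1.0, 2.0, 4.0, 7.0});
  double var = w.M2 / w.n; // population variance (unbiased = false)
  assert(std::abs(w.avg - 3.5) < 1e-12);
  assert(std::abs(var - 5.25) < 1e-12);
  return 0;
}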

TEST(NVFuserTest, FusionBlockWelfordOp_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);

int M = 64, N = 128;

auto tv0 = makeSymbolicTensor(2);
fusion.addInput(tv0);
auto tv1 = mul(tv0, new Double(1));
auto tvs = Welford(tv1, {1});
auto tv_M2 = tvs.var;
auto tv_avg = tvs.avg;
auto tv_N = tvs.n;
fusion.addOutput(tv_M2);
fusion.addOutput(tv_avg);
fusion.addOutput(tv_N);

tv_avg->axis(-1)->parallelize(ParallelType::TIDx);

tv1->computeAt(tv_avg, -1);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
at::manual_seed(0);
at::Tensor t0 = at::randn({M, N}, options);
at::Tensor t_var = at::empty({M}, options);
at::Tensor t_avg = at::empty({M}, options);
at::Tensor t_N = at::empty({M}, options_int);

FusionExecutor fe;
fe.compileFusion(&fusion);
auto outputs = fe.runFusion({t0});

// By default, Welford outputs the sum of squared differences, so divide by
// N to get the variance.
outputs[0] /= N;

testValidate(
&fusion,
outputs,
{t0},
{t0.var({1}, false), t0.mean({1}), at::ones({M}, options_int) * N},
__LINE__,
__FILE__);
}

TEST(NVFuserTest, FusionGridWelfordOp_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);

int M = 64, N = 128;

auto tv0 = makeSymbolicTensor(2);
fusion.addInput(tv0);
auto tv1 = mul(tv0, new Double(1));
auto tvs = Welford(tv1, {1});
auto tv_M2 = tvs.var;
auto tv_avg = tvs.avg;
auto tv_N = tvs.n;
fusion.addOutput(tv_M2);
fusion.addOutput(tv_avg);
fusion.addOutput(tv_N);

tv_avg->axis(0)->parallelize(ParallelType::TIDx);
tv_avg->axis(-1)->parallelize(ParallelType::BIDx);

tv1->computeAt(tv_avg, -1);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
at::manual_seed(0);
at::Tensor t0 = at::randn({M, N}, options);
at::Tensor t_var = at::empty({M}, options);
at::Tensor t_avg = at::empty({M}, options);
at::Tensor t_N = at::empty({M}, options_int);

FusionExecutor fe;
fe.compileFusion(&fusion);
auto outputs = fe.runFusion({t0});

// By default, Welford outputs the sum of squared differences, so divide by
// N to get the variance.
outputs[0] /= N;

testValidate(
&fusion,
outputs,
{t0},
{t0.var({1}, false), t0.mean({1}), at::ones({M}, options_int) * N},
__LINE__,
__FILE__);
}

TEST(NVFuserTest, FusionRfactorWelfordOp_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);

int M = 64, N = 128;

auto tv0 = makeSymbolicTensor(2);
fusion.addInput(tv0);
auto tv1 = mul(tv0, new Double(1));
auto tvs = Welford(tv1, {1});
auto tv_M2 = tvs.var;
auto tv_avg = tvs.avg;
auto tv_N = tvs.n;
fusion.addOutput(tv_M2);
fusion.addOutput(tv_avg);
fusion.addOutput(tv_N);

tv_avg->split(1, 4);
auto rtvs = tvs.rFactor({2});
tv1->computeAt(tv_avg, -1);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
at::manual_seed(0);
at::Tensor t0 = at::randn({M, N}, options);
at::Tensor t_var = at::empty({M}, options);
at::Tensor t_avg = at::empty({M}, options);
at::Tensor t_N = at::empty({M}, options_int);

FusionExecutor fe;
fe.compileFusion(&fusion);
auto outputs = fe.runFusion({t0});

// By default, Welford outputs the sum of squared differences, so divide by
// N to get the variance.
outputs[0] /= N;

testValidate(
&fusion,
outputs,
{t0},
{t0.var({1}, false), t0.mean({1}), at::ones({M}, options_int) * N},
__LINE__,
__FILE__);
}

TEST(NVFuserTest, FusionWelfordSchedule_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);

int M = 64, N = 128;

auto tv0 = makeSymbolicTensor(2);
fusion.addInput(tv0);
auto tv1 = mul(tv0, new Double(1));
auto tvs = Welford(tv1, {1});
auto tv_M2 = tvs.var;
auto tv_avg = tvs.avg;
auto tv_N = tvs.n;
fusion.addOutput(tv_M2);
fusion.addOutput(tv_N);
fusion.addOutput(tv_avg);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
at::manual_seed(0);
at::Tensor t0 = at::randn({M, N}, options);
auto red_params = getReductionHeuristics(&fusion, {t0}, tv_avg);

tv_avg->split(1, 4);
tv_avg->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
tv_avg->split(0, NamedScalar::getParallelDim(ParallelType::TIDy));

auto rtvs = tvs.rFactor({-3, -1});

rtvs.avg->computeAt(tv_avg, -1);

rtvs.avg->axis(-1)->parallelize(ParallelType::Unroll);

tv_avg->axis(0)->parallelize(ParallelType::BIDx);
tv_avg->axis(1)->parallelize(ParallelType::TIDy);
tv_avg->axis(-1)->parallelize(ParallelType::TIDx);

tv1->computeAt(rtvs.avg, -1);

FusionExecutor fe;
fe.compileFusion(&fusion);
auto outputs = fe.runFusion({t0}, red_params.value().lparams);

// By default, Welford outputs the sum of squared differences, so divide by
// N to get the variance.
outputs[0] /= N;

auto at_var = t0.var({1}, false);
auto at_avg = t0.mean({1});
auto at_n = at::ones({M}, options_int) * N;

testValidate(
&fusion,
outputs,
{t0},
{at_var, at_n, at_avg},
__LINE__,
__FILE__,
"validate welford",
red_params.value().lparams);
}

namespace {
void testWelford(DataType dtype, int red_axis, int odim, int rdim) {
const int axis = red_axis;
at::ScalarType aten_dtype = data_type_to_aten(dtype);

Fusion fusion;
FusionGuard fg(&fusion);
TensorView* tv0 = makeSymbolicTensor(2, dtype);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv0_cast = tv0;
if (is_fp16) {
tv0_cast = castOp(DataType::Float, tv0);
}
fusion.addInput(tv0);
auto tv1 = mul(tv0_cast, new Double(1));
auto tvs = Welford(tv1, {axis});
auto tv_M2 = tvs.var;
auto tv_avg = tvs.avg;
auto tv_N = tvs.n;

TensorView* avg_cast = tv_avg;
TensorView* M2_cast = tv_M2;

if (is_fp16) {
avg_cast = castOp(DataType::Half, tv_avg);
M2_cast = castOp(DataType::Half, tv_M2);
}

fusion.addOutput(M2_cast);
fusion.addOutput(tv_N);
fusion.addOutput(avg_cast);

auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);
auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
at::manual_seed(0);
std::vector<TensorView*> outputs_of_red;
at::Tensor aten_input =
(axis ? at::randn({odim, rdim}, options)
: at::randn({rdim, odim}, options));

if (is_fp16) {
outputs_of_red.push_back(avg_cast);
outputs_of_red.push_back(M2_cast);
}

auto reduction_params = getReductionHeuristics(&fusion, {aten_input}, tv_avg);
scheduleReduction(&fusion, reduction_params.value(), tv_avg, outputs_of_red);

auto lparams = reduction_params.value().lparams;

FusionExecutor fe;
fe.compileFusion(&fusion);
auto outputs = fe.runFusion({aten_input}, lparams);

// By default, Welford outputs the sum of squared differences, so divide
// by the reduction size to get the variance.

outputs[0] /= rdim;

auto at_var = aten_input.var({axis}, false);
auto at_avg = aten_input.mean({axis});
auto at_n =
(axis ? at::ones({odim, rdim}, options)
: at::ones({rdim, odim}, options));
at_n = at_n.sum({axis});

testValidate(
&fusion,
outputs,
{aten_input},
{at_var, at_n, at_avg},
__LINE__,
__FILE__,
"validate welford",
reduction_params.value().lparams);
}
} // namespace

TEST(NVFuserTest, FusionWelfordShmoo_CUDA) {
std::vector<DataType> dtypes = {
DataType::Double, DataType::Float, DataType::Half};
std::vector<int> red_axis = {1, 0};
std::vector<int> output_dims = {160, 320};
std::vector<int> red_dims;

// Cut down the number of iterations by only testing every other
// power of 2.
for (int i = 1; i <= 1024 * 1024; i <<= 2) {
red_dims.push_back(i);
}

for (auto dtype : dtypes) {
for (auto& axis : red_axis) {
for (auto& odim : output_dims) {
for (auto& rdim : red_dims) {
// TODO: the original Welford algorithm keeps a running sum of squared
// differences, i.e. M_{2,n} in the notation of
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance,
// which can reach inf for large reduction sizes with half precision.
// Skipping overly large volumes for half for now; may need further
// numerical experiments to re-design this.
if (rdim > 32768 && dtype == DataType::Half) {
continue;
}

testWelford(dtype, axis, odim, rdim);
}
}
}
}
}
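
The TODO above skips large reduction sizes for half precision because the M2 accumulator grows with the reduction size. A rough back-of-envelope check (illustrative only, not part of this PR) of why rdim > 32768 is excluded for Half:

// Rough sanity check: for roughly unit-variance inputs, the running M2 is
// on the order of the reduction size, so it overflows the half-precision
// range (largest finite value ~65504) well before rdim reaches 1M.
#include <cstdio>

int main() {
  const double fp16_max = 65504.0; // largest finite IEEE half value
  for (long rdim : {1024L, 32768L, 65536L, 1048576L}) {
    // For standard-normal data, E[M2] = (rdim - 1) * variance ~= rdim.
    double expected_m2 = static_cast<double>(rdim);
    std::printf(
        "rdim=%8ld  expected M2 ~ %10.0f  (%s fp16 range)\n",
        rdim,
        expected_m2,
        expected_m2 > fp16_max ? "exceeds" : "within");
  }
  return 0;
}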

TEST(NVFuserTest, FusionTranspose1_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);