diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp index 5226e1778b770..e16beb70302d2 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp @@ -9636,7 +9636,6 @@ TEST_F(NVFuserTest, FusionPersistentNormLocalShared_CUDA) { torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion, aten_inputs); - fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out}); auto properties = at::cuda::getDeviceProperties(0); // Require 70KB of smem to run test @@ -9645,6 +9644,8 @@ TEST_F(NVFuserTest, FusionPersistentNormLocalShared_CUDA) { GTEST_SKIP() << "not enough shared memory space on device to run test"; } + fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out}); + auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1); auto at_var = at::var(aten_input.to(at::kDouble), -1, false).unsqueeze(1); auto at_rvar = at::rsqrt(at::add(at_var, kEps));