csarofeen · naoyam · Mar 9, 2023 · Mar 9, 2023 · Michoumichmich · Mar 10, 2023
diff --git a/third_party/nvfuser/csrc/executor.cpp b/third_party/nvfuser/csrc/executor.cpp
@@ -229,7 +229,9 @@ void FusionExecutor::compileFusion(
   // TODO: refactor the options_ passed through
   options_.device = c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex());
   options_.index_mode = args.getIndexMode();
-  compile_params.index_type = DataType::Index;
+  compile_params.index_type =
+      (options_.index_mode == KernelIndexMode::INT64 ? DataType::Int
+                                                     : DataType::Int32);
   c10::DeviceGuard dg(options_.device);
 
   TORCH_INTERNAL_ASSERT(
@@ -1102,68 +1104,11 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
     launch_params_ =
         computeLaunchParams(launch_constraints, expr_eval, warp_size_);
 
-    auto alias_indices_entry =
-        executor_utils::caching::ExecutorCompileTimeEntry<
-            executor_utils::caching::InputAliasIndices>(
-            compileTimeDataCache(), [&]() {
-              return std::make_unique<std::vector<std::pair<int, int>>>(
-                  fusion_->getInputAliasIndices());
-            });
-
-    auto& alias_indices = alias_indices_entry.get();
-
-    // We need to push all of the outputs to the KernelArgumentHolder before
-    // checking at the Indexing Type. NOLINTNEXTLINE(bugprone-branch-clone)
-    if (outputs.empty()) {
-      auto output_alias_indices_entry =
-          executor_utils::caching::ExecutorCompileTimeEntry<
-              executor_utils::caching::OutputAliasIndices>(
-              compileTimeDataCache(), [&]() {
-                return std::make_unique<std::unordered_set<int>>(
-                    fusion_->getOutputAliasIndices());
-              });
-
-      auto& output_alias_indices = output_alias_indices_entry.get();
-
-      allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);
-
-      for (const auto& entry : alias_indices) {
-        auto aliased_output_index = entry.first;
-        auto aliased_input_index = entry.second;
-        auto tensor_arg_abstract =
-            dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]);
-        TORCH_INTERNAL_ASSERT(
-            tensor_arg_abstract, "alias io only supports tensor");
-        allocated_outputs[aliased_output_index] =
-            tensor_arg_abstract->getTensor();
-      }
-      args.push(allocated_outputs);
-    } else {
-      allocated_outputs = outputs;
-      args.push(outputs);
-      executor_utils::validateKernelOutputs(
-          fusion_, allocated_outputs, options_.device);
-    }
-
     // Recompile the kernel if the number of threads in the block has increased
-    // or maxrregcount has changed or nvfuser_index_t size changed
-    bool need_to_recompile =
-        launch_params_.nThreads() > block_size_high_water_mark ||
-        compile_params.maxrregcount != maxrregcount_high_water_mark;
-
-    // we recompile if the index type shrinks too, as it may lead to faster
-    // code.
-    if (args.getIndexMode() != options_.index_mode) {
-      need_to_recompile = true;
-      options_.index_mode = args.getIndexMode();
-    }
-
-    if (need_to_recompile) {
+    // or maxrregcount has changed
+    if (launch_params_.nThreads() > block_size_high_water_mark ||
+        compile_params.maxrregcount != maxrregcount_high_water_mark) {
       const auto kernel = lowered_->kernel();
-      // index type is contained in the kernel name and as a result in
-      // kernel_code which can then be used as a key in KernelDb. TODO This
-      // needs to be cleaned-up so that KernelDb's key is not only the kernel
-      // (we also need the GPU Arch, maxrregcount, ...)
       kernel_code_ = codegen::generateCudaKernel(kernel, kernelName());
       const auto structured_code = getStructuredCode(kernel_code_);
       block_size_high_water_mark = launch_params_.nThreads();
@@ -1218,6 +1163,48 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
         compileTimeDataCache(),
         expr_eval);
 
+    auto alias_indices_entry =
+        executor_utils::caching::ExecutorCompileTimeEntry<
+            executor_utils::caching::InputAliasIndices>(
+            compileTimeDataCache(), [&]() {
+              return std::make_unique<std::vector<std::pair<int, int>>>(
+                  fusion_->getInputAliasIndices());
+            });
+
+    auto& alias_indices = alias_indices_entry.get();
+
+    // NOLINTNEXTLINE(bugprone-branch-clone)
+    if (outputs.empty()) {
+      auto output_alias_indices_entry =
+          executor_utils::caching::ExecutorCompileTimeEntry<
+              executor_utils::caching::OutputAliasIndices>(
+              compileTimeDataCache(), [&]() {
+                return std::make_unique<std::unordered_set<int>>(
+                    fusion_->getOutputAliasIndices());
+              });
+
+      auto& output_alias_indices = output_alias_indices_entry.get();
+
+      allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);
+
+      for (const auto& entry : alias_indices) {
+        auto aliased_output_index = entry.first;
+        auto aliased_input_index = entry.second;
+        auto tensor_arg_abstract =
+            dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]);
+        TORCH_INTERNAL_ASSERT(
+            tensor_arg_abstract, "alias io only supports tensor");
+        allocated_outputs[aliased_output_index] =
+            tensor_arg_abstract->getTensor();
+      }
+      args.push(allocated_outputs);
+    } else {
+      allocated_outputs = outputs;
+      args.push(outputs);
+      executor_utils::validateKernelOutputs(
+          fusion_, allocated_outputs, options_.device);
+    }
+
     global_buffers = allocGlobalVals(expr_eval);
 
     if (kernel()->summary().max_rng_offsets >= 0) {

diff --git a/third_party/nvfuser/csrc/executor.h b/third_party/nvfuser/csrc/executor.h
@@ -206,16 +206,9 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
         last_compiled_binary_, "-fun 1 -c");
   }
 
-  //! Returns a kernel name.
-  //! The kernel name has a suffix that represents the choosen indexing mode:
-  //! int32 or int64.
-  //! TODO: the suffix is currently used principally as a workaround so we can
-  //! use the kernel string as a key in KernelDb. This should be re-worked.
   std::string kernelName() const {
-    const char* index_type_suffix =
-        options_.index_mode == KernelIndexMode::INT64 ? "_int64" : "_int32";
     std::stringstream ss;
-    ss << "kernel" << fusion_id_ << index_type_suffix;
+    ss << "kernel" << fusion_id_;
     return ss.str();
   }
 

diff --git a/third_party/nvfuser/csrc/executor_kernel_arg.cpp b/third_party/nvfuser/csrc/executor_kernel_arg.cpp
@@ -152,14 +152,6 @@ KernelArgumentHolder KernelArgumentHolder::createKernelArgumentHolder(
 
 // Push a tensor to the arguments
 void KernelArgumentHolder::push(const at::Tensor& tensor) {
-  // Deferred update of index_mode.
-  // Index mode size depends on the fusion inputs AND outputs.
-  // TODO We should also take into account the other intermediary buffers
-  if (index_mode_ == KernelIndexMode::INT32 &&
-      collectIndexMode({tensor}) == KernelIndexMode::INT64) {
-    setIndexMode(KernelIndexMode::INT64);
-  }
-
   changed_ = true;
   if (is_cpu_scalar(tensor)) {
     switch (tensor.scalar_type()) {
@@ -291,44 +283,6 @@ void** KernelArgumentHolder::getBuffer() {
   return void_ptrs_.data();
 }
 
-void KernelArgumentHolder::updateTensorIndexModes() {
-  for (auto& arg : arguments_) {
-    TensorArgAbstract* tensor_arg_old =
-        dynamic_cast<TensorArgAbstract*>(arg.get());
-    if (tensor_arg_old == nullptr)
-      continue;
-    auto tensor = tensor_arg_old->getTensor();
-    int nDims = tensor.ndimension();
-    c10::ScalarType dtype = tensor.scalar_type();
-    std::unique_ptr<TensorArgAbstract> tensor_arg =
-        getTensorArg(dtype, nDims, index_mode_);
-    tensor_arg->setTensor(tensor);
-    tensor_arg->setPointer(tensor.data_ptr());
-    tensor_arg->setDataType(aten_to_data_type(dtype));
-    for (const auto i : c10::irange(nDims)) {
-      tensor_arg->setSize(i, tensor.sizes()[i]);
-      tensor_arg->setStride(i, tensor.strides()[i]);
-    }
-    arg = std::move(tensor_arg);
-  }
-}
-
-KernelIndexMode KernelArgumentHolder::getSmallestIndexModeRequired() const {
-  KernelIndexMode smallest = KernelIndexMode::INT32;
-  for (auto& arg : arguments_) {
-    TensorArgAbstract* tensor_arg_old =
-        dynamic_cast<TensorArgAbstract*>(arg.get());
-    if (tensor_arg_old == nullptr)
-      continue;
-    auto tensor = tensor_arg_old->getTensor();
-    auto mode = collectIndexMode({tensor});
-    if (mode == KernelIndexMode::INT64) {
-      smallest = KernelIndexMode::INT64;
-    }
-  }
-  return smallest;
-}
-
 void KernelArgumentHolder::push(const c10::ArrayRef<c10::IValue>& args) {
   // Naive I/O setup, I'm ignoring all the potential transformation (i.e. I/O
   // allocated here from the subgraph could be, and very likely are, different

diff --git a/third_party/nvfuser/csrc/executor_kernel_arg.h b/third_party/nvfuser/csrc/executor_kernel_arg.h
@@ -284,37 +284,6 @@ class TORCH_CUDA_CU_API KernelArgumentHolder {
     return index_mode_;
   }
 
-  // Allows the user to specify a larger index mode than reqd
-  void setIndexMode(KernelIndexMode mode) {
-    // index_mode_ is deduced on args. We won't allow setting it to a smaller
-    // size than required
-    auto smallest = getSmallestIndexModeRequired();
-
-    // well, turns out index type was too small
-    if (smallest == KernelIndexMode::INT64 &&
-        index_mode_ == KernelIndexMode::INT32) {
-      TORCH_INTERNAL_ASSERT(
-          false,
-          "index_mode_ is INT32, but given the state of KernelArgumentHolder it should have already been INT64.");
-    }
-
-    // we can't narrow down index type
-    if (mode == KernelIndexMode::INT32 && smallest == KernelIndexMode::INT64) {
-      return;
-    }
-
-    // no-op
-    if (mode == index_mode_) {
-      return;
-    }
-
-    // Increasing or decreasing index size.
-    if (mode == smallest || mode == KernelIndexMode::INT64) {
-      index_mode_ = mode;
-      updateTensorIndexModes();
-    }
-  }
-
   explicit KernelArgumentHolder(KernelIndexMode index_mode)
       : index_mode_(index_mode) {}
 
@@ -363,12 +332,6 @@ class TORCH_CUDA_CU_API KernelArgumentHolder {
     return arguments_.back().get();
   }
 
-  // Goes through all tensors and changes index mode
-  void updateTensorIndexModes();
-
-  // Checks all tensors held to find the smallest index mode required.
-  KernelIndexMode getSmallestIndexModeRequired() const;
-
   void appendPhiloxRNGSeed(uint64_t rand_offset);
 
   const ArgAbstract* operator[](int ind) const {

diff --git a/third_party/nvfuser/csrc/utils.cpp b/third_party/nvfuser/csrc/utils.cpp
@@ -323,11 +323,11 @@ KernelIndexMode collectIndexMode(const at::ArrayRef<c10::IValue>& inputs) {
           if (tensor_input.stride(dim_i) > 0) {
             // Acuumulate positive stride
             tensor_most_positive_index +=
-                tensor_input.size(dim_i) * tensor_input.stride(dim_i);
+                (tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i);
           } else {
             // Acuumulate negative stride
             tensor_most_negative_index +=
-                tensor_input.size(dim_i) * tensor_input.stride(dim_i);
+                (tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i);
           }
         }
       }

diff --git a/third_party/nvfuser/test/test_gpu3.cpp b/third_party/nvfuser/test/test_gpu3.cpp
@@ -7750,62 +7750,6 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitGlobal_CUDA) {
       fe.kernel(), cg_outputs, inputs, {ref_t1, ref_t3}, __LINE__, __FILE__);
 }
 
-TEST_F(NVFuserTest, FusionExecutorCache_IndexType_Recompilation) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
-
-  auto tv0 = makeContigTensor(2, DataType::Half);
-  auto tv1 = makeContigTensor(2, DataType::Half);
-  auto tv2 = add(tv0, tv1);
-  fusion->addInput(tv0);
-  fusion->addInput(tv1);
-  fusion->addOutput(tv2);
-
-  auto fec = std::make_unique<FusionExecutorCache>(std::move(fusion));
-
-  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
-  auto input_small = at::ones({1024, 1024}, options);
-  auto input_large = at::ones({1024, 4 * 1 << 20}, options);
-
-  KernelArgumentHolder small_holder =
-      KernelArgumentHolder::createKernelArgumentHolder({input_small});
-  TORCH_CHECK(
-      small_holder.getIndexMode() ==
-      KernelIndexMode::INT32); // Index mode should be INT32
-
-  small_holder.setIndexMode(
-      KernelIndexMode::INT64); // Index mode should go up in size.
-  TORCH_CHECK(small_holder.getIndexMode() == KernelIndexMode::INT64);
-
-  small_holder.push(input_large);
-  small_holder.setIndexMode(KernelIndexMode::INT32);
-  TORCH_CHECK(
-      small_holder.getIndexMode() ==
-      KernelIndexMode::INT64); // Index mode shouldn't be changed.
-
-  // first run with the smaller input
-  fec->runFusionWithInputs({input_small, input_small});
-  auto kernel_runtime_for_input_small = fec->getMostRecentKernelRuntime();
-
-  // run with the larger input
-  fec->runFusionWithInputs({input_large, input_large});
-  auto kernel_runtime_for_input_large = fec->getMostRecentKernelRuntime();
-
-  // make sure they correspond to different runtimes, just in case
-  TORCH_CHECK(
-      kernel_runtime_for_input_small != kernel_runtime_for_input_large,
-      "Unexpected kernel runtime");
-
-  // second run with the smaller input
-  fec->runFusionWithInputs({input_small, input_small});
-  auto second_kernel_runtime_for_input_small =
-      fec->getMostRecentKernelRuntime();
-
-  TORCH_CHECK(
-      kernel_runtime_for_input_small == second_kernel_runtime_for_input_small,
-      "Expected to reuse the same runtime");
-}
-
 // Test file size should be up to 10K LoC. Create a new file for more tests.
 
 } // namespace nvfuser