Fixing and improving indexing type handling #2522

Merged
merged 3 commits on Mar 8, 2023
109 changes: 61 additions & 48 deletions third_party/nvfuser/csrc/executor.cpp
@@ -245,9 +245,7 @@ void FusionExecutor::compileFusion(
// TODO: refactor the options_ passed through
options_.device = c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex());
options_.index_mode = args.getIndexMode();
compile_params.index_type =
(options_.index_mode == KernelIndexMode::INT64 ? DataType::Int
: DataType::Int32);
compile_params.index_type = DataType::Index;
c10::DeviceGuard dg(options_.device);

TORCH_INTERNAL_ASSERT(
@@ -1111,11 +1109,68 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
launch_params_ =
computeLaunchParams(launch_constraints, expr_eval, warp_size_);

auto alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::InputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::vector<std::pair<int, int>>>(
fusion_->getInputAliasIndices());
});

auto& alias_indices = alias_indices_entry.get();

// We need to push all of the outputs to the KernelArgumentHolder before
// checking the index type.
// NOLINTNEXTLINE(bugprone-branch-clone)
if (outputs.empty()) {
auto output_alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::OutputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::unordered_set<int>>(
fusion_->getOutputAliasIndices());
});

auto& output_alias_indices = output_alias_indices_entry.get();

allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);

for (const auto& entry : alias_indices) {
auto aliased_output_index = entry.first;
auto aliased_input_index = entry.second;
auto tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]);
TORCH_INTERNAL_ASSERT(
tensor_arg_abstract, "alias io only supports tensor");
allocated_outputs[aliased_output_index] =
tensor_arg_abstract->getTensor();
}
args.push(allocated_outputs);
} else {
allocated_outputs = outputs;
args.push(outputs);
executor_utils::validateKernelOutputs(
fusion_, allocated_outputs, options_.device);
}

// Recompile the kernel if the number of threads in the block has increased
// or maxrregcount has changed
if (launch_params_.nThreads() > block_size_high_water_mark ||
compile_params.maxrregcount != maxrregcount_high_water_mark) {
// or maxrregcount has changed or nvfuser_index_t size changed
bool need_to_recompile =
launch_params_.nThreads() > block_size_high_water_mark ||
compile_params.maxrregcount != maxrregcount_high_water_mark;

// We also recompile if the index type shrinks, as the smaller index type
// may lead to faster code.
if (args.getIndexMode() != options_.index_mode) {
need_to_recompile = true;
options_.index_mode = args.getIndexMode();
}

if (need_to_recompile) {
const auto kernel = lowered_->kernel();
// The index type is contained in the kernel name and, as a result, in
// kernel_code, which can then be used as a key in KernelDb. TODO: this
// needs to be cleaned up so that KernelDb's key is not only the kernel
// (we also need the GPU arch, maxrregcount, ...).
kernel_code_ = codegen::generateCudaKernel(kernel, kernelName());
const auto structured_code = getStructuredCode(kernel_code_);
block_size_high_water_mark = launch_params_.nThreads();
@@ -1170,48 +1225,6 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
compileTimeDataCache(),
expr_eval);

auto alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::InputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::vector<std::pair<int, int>>>(
fusion_->getInputAliasIndices());
});

auto& alias_indices = alias_indices_entry.get();

// NOLINTNEXTLINE(bugprone-branch-clone)
if (outputs.empty()) {
auto output_alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::OutputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::unordered_set<int>>(
fusion_->getOutputAliasIndices());
});

auto& output_alias_indices = output_alias_indices_entry.get();

allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);

for (const auto& entry : alias_indices) {
auto aliased_output_index = entry.first;
auto aliased_input_index = entry.second;
auto tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]);
TORCH_INTERNAL_ASSERT(
tensor_arg_abstract, "alias io only supports tensor");
allocated_outputs[aliased_output_index] =
tensor_arg_abstract->getTensor();
}
args.push(allocated_outputs);
} else {
allocated_outputs = outputs;
args.push(outputs);
executor_utils::validateKernelOutputs(
fusion_, allocated_outputs, options_.device);
}

global_buffers = allocGlobalVals(expr_eval);

if (kernel()->summary().max_rng_offsets >= 0) {
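The recompilation check introduced above boils down to a single predicate. Below is a minimal, self-contained sketch of it (the enum and function names are illustrative, not the actual FusionExecutor members): the kernel is rebuilt when the launch exceeds the block-size high-water mark, when maxrregcount changes, or when the index mode requested by the arguments differs from the one the kernel was compiled with; the last case covers both growing and shrinking the index type, since a smaller index type may yield faster code.

#include <cstdint>

enum class KernelIndexMode { INT32, INT64 }; // mirrors nvfuser's enum

// Returns true when the cached kernel can no longer be reused as-is.
bool needToRecompile(
    int64_t n_threads,
    int64_t block_size_high_water_mark,
    int64_t maxrregcount,
    int64_t maxrregcount_high_water_mark,
    KernelIndexMode arg_index_mode,
    KernelIndexMode compiled_index_mode) {
  return n_threads > block_size_high_water_mark ||
      maxrregcount != maxrregcount_high_water_mark ||
      arg_index_mode != compiled_index_mode;
}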
4 changes: 3 additions & 1 deletion third_party/nvfuser/csrc/executor.h
@@ -189,8 +189,10 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
}

std::string kernelName() const {
const char* index_type_suffix =
options_.index_mode == KernelIndexMode::INT64 ? "_int64" : "_int32";
std::stringstream ss;
ss << "kernel" << fusion_id_;
ss << "kernel" << fusion_id_ << index_type_suffix;
return ss.str();
}

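The suffix added above keeps the INT32 and INT64 variants of the same fusion distinct wherever the kernel name is used as a key (the name ends up in kernel_code, which serves as the KernelDb key). A minimal sketch of the resulting naming scheme, written as an illustrative free function rather than the actual member:

#include <sstream>
#include <string>

enum class KernelIndexMode { INT32, INT64 }; // mirrors nvfuser's enum

// Illustrative only: reproduces the naming scheme of kernelName() above.
std::string kernelNameFor(int fusion_id, KernelIndexMode index_mode) {
  std::stringstream ss;
  ss << "kernel" << fusion_id
     << (index_mode == KernelIndexMode::INT64 ? "_int64" : "_int32");
  return ss.str(); // e.g. "kernel3_int32" or "kernel3_int64"
}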
46 changes: 46 additions & 0 deletions third_party/nvfuser/csrc/executor_kernel_arg.cpp
@@ -152,6 +152,14 @@ KernelArgumentHolder KernelArgumentHolder::createKernelArgumentHolder(

// Push a tensor to the arguments
void KernelArgumentHolder::push(const at::Tensor& tensor) {
// Deferred update of index_mode_.
// The required index mode depends on the fusion inputs AND outputs.
// TODO: We should also take into account other intermediate buffers.
if (index_mode_ == KernelIndexMode::INT32 &&
collectIndexMode({tensor}) == KernelIndexMode::INT64) {
setIndexMode(KernelIndexMode::INT64);
}

changed_ = true;
if (is_cpu_scalar(tensor)) {
switch (tensor.scalar_type()) {
@@ -283,6 +291,44 @@ void** KernelArgumentHolder::getBuffer() {
return void_ptrs_.data();
}

void KernelArgumentHolder::updateTensorIndexModes() {
for (auto& arg : arguments_) {
TensorArgAbstract* tensor_arg_old =
dynamic_cast<TensorArgAbstract*>(arg.get());
if (tensor_arg_old == nullptr)
continue;
auto tensor = tensor_arg_old->getTensor();
int nDims = tensor.ndimension();
c10::ScalarType dtype = tensor.scalar_type();
std::unique_ptr<TensorArgAbstract> tensor_arg =
getTensorArg(dtype, nDims, index_mode_);
tensor_arg->setTensor(tensor);
tensor_arg->setPointer(tensor.data_ptr());
tensor_arg->setDataType(aten_to_data_type(dtype));
for (const auto i : c10::irange(nDims)) {
tensor_arg->setSize(i, tensor.sizes()[i]);
tensor_arg->setStride(i, tensor.strides()[i]);
}
arg = std::move(tensor_arg);
}
}

KernelIndexMode KernelArgumentHolder::getSmallestIndexModeRequired() const {
KernelIndexMode smallest = KernelIndexMode::INT32;
for (auto& arg : arguments_) {
TensorArgAbstract* tensor_arg_old =
dynamic_cast<TensorArgAbstract*>(arg.get());
if (tensor_arg_old == nullptr)
continue;
auto tensor = tensor_arg_old->getTensor();
auto mode = collectIndexMode({tensor});
if (mode == KernelIndexMode::INT64) {
smallest = KernelIndexMode::INT64;
}
}
return smallest;
}

void KernelArgumentHolder::push(const c10::ArrayRef<c10::IValue>& args) {
// Naive I/O setup, I'm ignoring all the potential transformation (i.e. I/O
// allocated here from the subgraph could be, and very likely are, different
37 changes: 37 additions & 0 deletions third_party/nvfuser/csrc/executor_kernel_arg.h
@@ -284,6 +284,37 @@ class TORCH_CUDA_CU_API KernelArgumentHolder {
return index_mode_;
}

// Allows the user to specify a larger index mode than required.
void setIndexMode(KernelIndexMode mode) {
// index_mode_ is deduced from the arguments. We won't allow setting it to a
// smaller size than required.
auto smallest = getSmallestIndexModeRequired();

// The arguments already require INT64 indexing, so promote regardless of
// the requested mode.
if (smallest == KernelIndexMode::INT64 &&
index_mode_ != KernelIndexMode::INT64) {
index_mode_ = KernelIndexMode::INT64;
updateTensorIndexModes();
return;
}

// We can't narrow the index type below what the arguments require.
if (mode == KernelIndexMode::INT32 && smallest == KernelIndexMode::INT64) {
return;
}

// no-op
if (mode == index_mode_) {
return;
}

// Grow to INT64, or shrink back to the smallest mode the arguments require.
if (mode == smallest || mode == KernelIndexMode::INT64) {
index_mode_ = mode;
updateTensorIndexModes();
}
}

explicit KernelArgumentHolder(KernelIndexMode index_mode)
: index_mode_(index_mode) {}

@@ -332,6 +363,12 @@ class TORCH_CUDA_CU_API KernelArgumentHolder {
return arguments_.back().get();
}

// Goes through all held tensor arguments and rebuilds them with the current index mode.
void updateTensorIndexModes();

// Checks all tensors held to find the smallest index mode required.
KernelIndexMode getSmallestIndexModeRequired() const;

void appendPhiloxRNGSeed(uint64_t rand_offset);

const ArgAbstract* operator[](int ind) const {
4 changes: 2 additions & 2 deletions third_party/nvfuser/csrc/utils.cpp
@@ -322,11 +322,11 @@ KernelIndexMode collectIndexMode(const at::ArrayRef<c10::IValue>& inputs) {
if (tensor_input.stride(dim_i) > 0) {
// Accumulate positive stride
tensor_most_positive_index +=
(tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i);
tensor_input.size(dim_i) * tensor_input.stride(dim_i);
} else {
// Accumulate negative stride
tensor_most_negative_index +=
(tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i);
tensor_input.size(dim_i) * tensor_input.stride(dim_i);
}
}
}
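The change above drops the subtraction of 1 from each dimension's contribution, so the accumulated bound on the most positive (or most negative) linear index becomes slightly conservative: size * stride can only overestimate the magnitude compared to (size - 1) * stride, which errs on the side of selecting 64-bit indexing. A minimal, self-contained sketch of the overall check, assuming the bounds are compared against the int32_t range (the enum and helper name are illustrative, not the exact nvfuser code):

#include <cstdint>
#include <limits>
#include <vector>

enum class KernelIndexMode { INT32, INT64 }; // mirrors nvfuser's enum

// Conservative per-tensor check: accumulate size * stride per dimension and
// require 64-bit indexing if either bound falls outside the int32_t range.
KernelIndexMode indexModeForTensor(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& strides) {
  int64_t most_positive_index = 0;
  int64_t most_negative_index = 0;
  for (size_t i = 0; i < sizes.size(); ++i) {
    if (strides[i] > 0) {
      most_positive_index += sizes[i] * strides[i];
    } else {
      most_negative_index += sizes[i] * strides[i];
    }
  }
  if (most_positive_index > std::numeric_limits<int32_t>::max() ||
      most_negative_index < std::numeric_limits<int32_t>::min()) {
    return KernelIndexMode::INT64;
  }
  return KernelIndexMode::INT32;
}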
53 changes: 53 additions & 0 deletions third_party/nvfuser/test/test_gpu3.cpp
@@ -7774,6 +7774,59 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitGlobal_CUDA) {
fe.kernel(), cg_outputs, inputs, {ref_t1, ref_t3}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionExecutorCache_IndexType_Recompilation) {
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

auto tv0 = makeContigTensor(2, DataType::Half);
auto tv1 = makeContigTensor(2, DataType::Half);
auto tv2 = add(tv0, tv1);
fusion->addInput(tv0);
fusion->addInput(tv1);
fusion->addOutput(tv2);

auto fec = std::make_unique<FusionExecutorCache>(std::move(fusion));

auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto input_small = at::ones({1024, 1024}, options);
auto input_large = at::ones({1024, 4 * 1 << 20}, options);

KernelArgumentHolder small_holder =
KernelArgumentHolder::createKernelArgumentHolder({input_small});
TORCH_CHECK(
small_holder.getIndexMode() ==
KernelIndexMode::INT32); // Index mode should be INT32

small_holder.setIndexMode(
KernelIndexMode::INT64); // Index mode should go up in size.
TORCH_CHECK(small_holder.getIndexMode() == KernelIndexMode::INT64);

small_holder.push(input_large);
small_holder.setIndexMode(KernelIndexMode::INT32);
TORCH_CHECK(
small_holder.getIndexMode() ==
KernelIndexMode::INT64); // Index mode shouldn't be changed.

// Nothing should be compiled prior to the first run
TORCH_CHECK(!fec->isCompiled({input_small, input_small}));
TORCH_CHECK(!fec->isCompiled({input_large, input_large}));

// Only the INT32 indexing version should be compiled
fec->runFusionWithInputs({input_small, input_small});
TORCH_CHECK(fec->isCompiled({input_small, input_small}));
TORCH_CHECK(!fec->isCompiled({input_large, input_large}));

// Running an input that requires INT64 should not delete the cache entry
// for INT32
fec->runFusionWithInputs({input_large, input_large});
TORCH_CHECK(fec->isCompiled({input_small, input_small}));
TORCH_CHECK(fec->isCompiled({input_large, input_large}));

// Running in the other order should also not delete the entry
fec->runFusionWithInputs({input_small, input_small});
TORCH_CHECK(fec->isCompiled({input_small, input_small}));
TORCH_CHECK(fec->isCompiled({input_large, input_large}));
}

// Test file size should be up to 10K LoC. Create a new file for more tests.

} // namespace nvfuser