Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 48 additions & 61 deletions third_party/nvfuser/csrc/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,9 @@ void FusionExecutor::compileFusion(
// TODO: refactor the options_ passed through
options_.device = c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex());
options_.index_mode = args.getIndexMode();
compile_params.index_type = DataType::Index;
compile_params.index_type =

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! I will check tomorrow, I think it's that

(options_.index_mode == KernelIndexMode::INT64 ? DataType::Int
: DataType::Int32);
c10::DeviceGuard dg(options_.device);

TORCH_INTERNAL_ASSERT(
Expand Down Expand Up @@ -1102,68 +1104,11 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
launch_params_ =
computeLaunchParams(launch_constraints, expr_eval, warp_size_);

auto alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::InputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::vector<std::pair<int, int>>>(
fusion_->getInputAliasIndices());
});

auto& alias_indices = alias_indices_entry.get();

// We need to push all of the outputs to the KernelArgumentHolder before
// checking at the Indexing Type. NOLINTNEXTLINE(bugprone-branch-clone)
if (outputs.empty()) {
auto output_alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::OutputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::unordered_set<int>>(
fusion_->getOutputAliasIndices());
});

auto& output_alias_indices = output_alias_indices_entry.get();

allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);

for (const auto& entry : alias_indices) {
auto aliased_output_index = entry.first;
auto aliased_input_index = entry.second;
auto tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]);
TORCH_INTERNAL_ASSERT(
tensor_arg_abstract, "alias io only supports tensor");
allocated_outputs[aliased_output_index] =
tensor_arg_abstract->getTensor();
}
args.push(allocated_outputs);
} else {
allocated_outputs = outputs;
args.push(outputs);
executor_utils::validateKernelOutputs(
fusion_, allocated_outputs, options_.device);
}

// Recompile the kernel if the number of threads in the block has increased
// or maxrregcount has changed or nvfuser_index_t size changed
bool need_to_recompile =
launch_params_.nThreads() > block_size_high_water_mark ||
compile_params.maxrregcount != maxrregcount_high_water_mark;

// we recompile if the index type shrinks too, as it may lead to faster
// code.
if (args.getIndexMode() != options_.index_mode) {
need_to_recompile = true;
options_.index_mode = args.getIndexMode();
}

if (need_to_recompile) {
// or maxrregcount has changed
if (launch_params_.nThreads() > block_size_high_water_mark ||
compile_params.maxrregcount != maxrregcount_high_water_mark) {
const auto kernel = lowered_->kernel();
// index type is contained in the kernel name and as a result in
// kernel_code which can then be used as a key in KernelDb. TODO This
// needs to be cleaned-up so that KernelDb's key is not only the kernel
// (we also need the GPU Arch, maxrregcount, ...)
kernel_code_ = codegen::generateCudaKernel(kernel, kernelName());
const auto structured_code = getStructuredCode(kernel_code_);
block_size_high_water_mark = launch_params_.nThreads();
Expand Down Expand Up @@ -1218,6 +1163,48 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
compileTimeDataCache(),
expr_eval);

auto alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::InputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::vector<std::pair<int, int>>>(
fusion_->getInputAliasIndices());
});

auto& alias_indices = alias_indices_entry.get();

// NOLINTNEXTLINE(bugprone-branch-clone)
if (outputs.empty()) {
auto output_alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::OutputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::unordered_set<int>>(
fusion_->getOutputAliasIndices());
});

auto& output_alias_indices = output_alias_indices_entry.get();

allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);

for (const auto& entry : alias_indices) {
auto aliased_output_index = entry.first;
auto aliased_input_index = entry.second;
auto tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]);
TORCH_INTERNAL_ASSERT(
tensor_arg_abstract, "alias io only supports tensor");
allocated_outputs[aliased_output_index] =
tensor_arg_abstract->getTensor();
}
args.push(allocated_outputs);
} else {
allocated_outputs = outputs;
args.push(outputs);
executor_utils::validateKernelOutputs(
fusion_, allocated_outputs, options_.device);
}

global_buffers = allocGlobalVals(expr_eval);

if (kernel()->summary().max_rng_offsets >= 0) {
Expand Down
9 changes: 1 addition & 8 deletions third_party/nvfuser/csrc/executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,16 +206,9 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
last_compiled_binary_, "-fun 1 -c");
}

//! Returns a kernel name.
//! The kernel name has a suffix that represents the chosen indexing mode:
//! int32 or int64.
//! TODO: the suffix is currently used principally as a workaround so we can
//! use the kernel string as a key in KernelDb. This should be re-worked.
std::string kernelName() const {
// Pick the suffix from the index mode currently stored in options_.
const char* index_type_suffix =
options_.index_mode == KernelIndexMode::INT64 ? "_int64" : "_int32";
std::stringstream ss;
ss << "kernel" << fusion_id_ << index_type_suffix;
// NOTE(review): this second write appends "kernel" << fusion_id_ again after
// the suffixed name; it looks like the pre- and post-change versions of one
// statement -- confirm which of the two is intended.
ss << "kernel" << fusion_id_;
return ss.str();
}

Expand Down
46 changes: 0 additions & 46 deletions third_party/nvfuser/csrc/executor_kernel_arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,14 +152,6 @@ KernelArgumentHolder KernelArgumentHolder::createKernelArgumentHolder(

// Push a tensor to the arguments
void KernelArgumentHolder::push(const at::Tensor& tensor) {
// Deferred update of index_mode.
// Index mode size depends on the fusion inputs AND outputs.
// TODO We should also take into account the other intermediary buffers
if (index_mode_ == KernelIndexMode::INT32 &&
collectIndexMode({tensor}) == KernelIndexMode::INT64) {
setIndexMode(KernelIndexMode::INT64);
}

changed_ = true;
if (is_cpu_scalar(tensor)) {
switch (tensor.scalar_type()) {
Expand Down Expand Up @@ -291,44 +283,6 @@ void** KernelArgumentHolder::getBuffer() {
return void_ptrs_.data();
}

void KernelArgumentHolder::updateTensorIndexModes() {
for (auto& arg : arguments_) {
TensorArgAbstract* tensor_arg_old =
dynamic_cast<TensorArgAbstract*>(arg.get());
if (tensor_arg_old == nullptr)
continue;
auto tensor = tensor_arg_old->getTensor();
int nDims = tensor.ndimension();
c10::ScalarType dtype = tensor.scalar_type();
std::unique_ptr<TensorArgAbstract> tensor_arg =
getTensorArg(dtype, nDims, index_mode_);
tensor_arg->setTensor(tensor);
tensor_arg->setPointer(tensor.data_ptr());
tensor_arg->setDataType(aten_to_data_type(dtype));
for (const auto i : c10::irange(nDims)) {
tensor_arg->setSize(i, tensor.sizes()[i]);
tensor_arg->setStride(i, tensor.strides()[i]);
}
arg = std::move(tensor_arg);
}
}

// Reports the narrowest index mode that covers every tensor currently held.
//
// Each held argument that is a TensorArgAbstract is passed, one tensor at a
// time, to collectIndexMode(). Non-tensor arguments are ignored.
//
// Returns KernelIndexMode::INT64 if any tensor is reported as INT64, and
// KernelIndexMode::INT32 otherwise (including when no tensors are held).
KernelIndexMode KernelArgumentHolder::getSmallestIndexModeRequired() const {
  for (const auto& arg : arguments_) {
    const auto* tensor_arg = dynamic_cast<const TensorArgAbstract*>(arg.get());
    if (tensor_arg == nullptr) {
      continue;
    }
    auto tensor = tensor_arg->getTensor();
    // A single tensor that needs INT64 settles the answer, so stop scanning
    // here rather than inspecting the remaining arguments.
    if (collectIndexMode({tensor}) == KernelIndexMode::INT64) {
      return KernelIndexMode::INT64;
    }
  }
  return KernelIndexMode::INT32;
}

void KernelArgumentHolder::push(const c10::ArrayRef<c10::IValue>& args) {
// Naive I/O setup, I'm ignoring all the potential transformation (i.e. I/O
// allocated here from the subgraph could be, and very likely are, different
Expand Down
37 changes: 0 additions & 37 deletions third_party/nvfuser/csrc/executor_kernel_arg.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,37 +284,6 @@ class TORCH_CUDA_CU_API KernelArgumentHolder {
return index_mode_;
}

// Requests that this holder use `mode` as its index mode.
//
// The mode is never narrowed below what the held tensors need: asking for
// INT32 while getSmallestIndexModeRequired() reports INT64 is ignored. When
// the mode does change, every held tensor argument is rebuilt through
// updateTensorIndexModes().
void setIndexMode(KernelIndexMode mode) {
  // index_mode_ is deduced from the arguments, so first find out what they
  // currently require.
  const auto required = getSmallestIndexModeRequired();
  const bool needs_int64 = (required == KernelIndexMode::INT64);

  // The stored mode must already be wide enough for the held arguments.
  if (needs_int64 && index_mode_ == KernelIndexMode::INT32) {
    TORCH_INTERNAL_ASSERT(
        false,
        "index_mode_ is INT32, but given the state of KernelArgumentHolder it should have already been INT64.");
  }

  // Refuse to narrow below the requirement, and skip requests that would not
  // change anything.
  const bool would_narrow = (mode == KernelIndexMode::INT32) && needs_int64;
  if (would_narrow || mode == index_mode_) {
    return;
  }

  // Only the smallest sufficient mode or INT64 may be applied.
  if (mode != required && mode != KernelIndexMode::INT64) {
    return;
  }

  index_mode_ = mode;
  updateTensorIndexModes();
}

// Constructs a holder whose index_mode_ is initialized to `index_mode`.
explicit KernelArgumentHolder(KernelIndexMode index_mode)
: index_mode_(index_mode) {}

Expand Down Expand Up @@ -363,12 +332,6 @@ class TORCH_CUDA_CU_API KernelArgumentHolder {
return arguments_.back().get();
}

// Goes through all tensors and changes index mode
void updateTensorIndexModes();

// Checks all tensors held to find the smallest index mode required.
KernelIndexMode getSmallestIndexModeRequired() const;

void appendPhiloxRNGSeed(uint64_t rand_offset);

const ArgAbstract* operator[](int ind) const {
Expand Down
4 changes: 2 additions & 2 deletions third_party/nvfuser/csrc/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,11 +323,11 @@ KernelIndexMode collectIndexMode(const at::ArrayRef<c10::IValue>& inputs) {
if (tensor_input.stride(dim_i) > 0) {
// Accumulate positive stride
tensor_most_positive_index +=
tensor_input.size(dim_i) * tensor_input.stride(dim_i);
(tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i);
} else {
// Accumulate negative stride
tensor_most_negative_index +=
tensor_input.size(dim_i) * tensor_input.stride(dim_i);
(tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i);
}
}
}
Expand Down
56 changes: 0 additions & 56 deletions third_party/nvfuser/test/test_gpu3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7750,62 +7750,6 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitGlobal_CUDA) {
fe.kernel(), cg_outputs, inputs, {ref_t1, ref_t3}, __LINE__, __FILE__);
}

// Exercises index-mode handling in two places:
//  1. KernelArgumentHolder: the mode can be widened on request, but cannot be
//     narrowed once a tensor that needs INT64 has been pushed.
//  2. FusionExecutorCache: small and large inputs are served by different
//     kernel runtimes, and re-running the small input reuses its runtime.
TEST_F(NVFuserTest, FusionExecutorCache_IndexType_Recompilation) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  // Fusion under test: out = lhs + rhs on 2-D contiguous half tensors.
  auto lhs = makeContigTensor(2, DataType::Half);
  auto rhs = makeContigTensor(2, DataType::Half);
  auto sum = add(lhs, rhs);
  fusion->addInput(lhs);
  fusion->addInput(rhs);
  fusion->addOutput(sum);

  auto executor_cache =
      std::make_unique<FusionExecutorCache>(std::move(fusion));

  auto half_on_gpu =
      at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
  auto input_small = at::ones({1024, 1024}, half_on_gpu);
  // 1024 x 4Mi elements.
  auto input_large = at::ones({1024, 4 * (1 << 20)}, half_on_gpu);

  // --- KernelArgumentHolder behaviour ---
  KernelArgumentHolder holder =
      KernelArgumentHolder::createKernelArgumentHolder({input_small});
  // Index mode should be INT32
  TORCH_CHECK(holder.getIndexMode() == KernelIndexMode::INT32);

  // Index mode should go up in size.
  holder.setIndexMode(KernelIndexMode::INT64);
  TORCH_CHECK(holder.getIndexMode() == KernelIndexMode::INT64);

  // After the large tensor is pushed, a request for INT32 must be ignored.
  holder.push(input_large);
  holder.setIndexMode(KernelIndexMode::INT32);
  // Index mode shouldn't be changed.
  TORCH_CHECK(holder.getIndexMode() == KernelIndexMode::INT64);

  // --- FusionExecutorCache behaviour ---
  // first run with the smaller input
  executor_cache->runFusionWithInputs({input_small, input_small});
  auto runtime_small_first = executor_cache->getMostRecentKernelRuntime();

  // run with the larger input
  executor_cache->runFusionWithInputs({input_large, input_large});
  auto runtime_large = executor_cache->getMostRecentKernelRuntime();

  // make sure they correspond to different runtimes, just in case
  TORCH_CHECK(
      runtime_small_first != runtime_large, "Unexpected kernel runtime");

  // second run with the smaller input
  executor_cache->runFusionWithInputs({input_small, input_small});
  auto runtime_small_second = executor_cache->getMostRecentKernelRuntime();

  TORCH_CHECK(
      runtime_small_first == runtime_small_second,
      "Expected to reuse the same runtime");
}

// Test file size should be up to 10K LoC. Create a new file for more tests.

} // namespace nvfuser