Fixing and improving indexing type handling #2522

Merged
merged 3 commits on Mar 8, 2023
109 changes: 61 additions & 48 deletions third_party/nvfuser/csrc/executor.cpp
@@ -245,9 +245,7 @@ void FusionExecutor::compileFusion(
// TODO: refactor the options_ passed through
options_.device = c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex());
options_.index_mode = args.getIndexMode();
compile_params.index_type =
(options_.index_mode == KernelIndexMode::INT64 ? DataType::Int
: DataType::Int32);
compile_params.index_type = DataType::Index;
c10::DeviceGuard dg(options_.device);

TORCH_INTERNAL_ASSERT(
@@ -1111,11 +1109,68 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
launch_params_ =
computeLaunchParams(launch_constraints, expr_eval, warp_size_);

auto alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::InputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::vector<std::pair<int, int>>>(
fusion_->getInputAliasIndices());
});

auto& alias_indices = alias_indices_entry.get();

// We need to push all of the outputs to the KernelArgumentHolder before
// checking the index type.
// NOLINTNEXTLINE(bugprone-branch-clone)
if (outputs.empty()) {
auto output_alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::OutputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::unordered_set<int>>(
fusion_->getOutputAliasIndices());
});

auto& output_alias_indices = output_alias_indices_entry.get();

allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);

for (const auto& entry : alias_indices) {
auto aliased_output_index = entry.first;
auto aliased_input_index = entry.second;
auto tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]);
TORCH_INTERNAL_ASSERT(
tensor_arg_abstract, "alias io only supports tensor");
allocated_outputs[aliased_output_index] =
tensor_arg_abstract->getTensor();
}
args.push(allocated_outputs);
} else {
allocated_outputs = outputs;
args.push(outputs);
executor_utils::validateKernelOutputs(
fusion_, allocated_outputs, options_.device);
}

// Recompile the kernel if the number of threads in the block has increased
// or maxrregcount has changed
if (launch_params_.nThreads() > block_size_high_water_mark ||
compile_params.maxrregcount != maxrregcount_high_water_mark) {
// or maxrregcount has changed or nvfuser_index_t size changed
bool need_to_recompile =
launch_params_.nThreads() > block_size_high_water_mark ||
compile_params.maxrregcount != maxrregcount_high_water_mark;

// We also recompile if the index type shrinks, as the smaller index type
// may lead to faster code.
if (args.getIndexMode() != options_.index_mode) {
need_to_recompile = true;
options_.index_mode = args.getIndexMode();
}

if (need_to_recompile) {
const auto kernel = lowered_->kernel();
// The index type is contained in the kernel name and, as a result, in
// kernel_code, which can then be used as a key in KernelDb. TODO: this
// needs to be cleaned up so that KernelDb's key is not only the kernel
// (we also need the GPU arch, maxrregcount, ...).
kernel_code_ = codegen::generateCudaKernel(kernel, kernelName());
const auto structured_code = getStructuredCode(kernel_code_);
block_size_high_water_mark = launch_params_.nThreads();
@@ -1170,48 +1225,6 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
compileTimeDataCache(),
expr_eval);

auto alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::InputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::vector<std::pair<int, int>>>(
fusion_->getInputAliasIndices());
});

auto& alias_indices = alias_indices_entry.get();

// NOLINTNEXTLINE(bugprone-branch-clone)
if (outputs.empty()) {
auto output_alias_indices_entry =
executor_utils::caching::ExecutorCompileTimeEntry<
executor_utils::caching::OutputAliasIndices>(
compileTimeDataCache(), [&]() {
return std::make_unique<std::unordered_set<int>>(
fusion_->getOutputAliasIndices());
});

auto& output_alias_indices = output_alias_indices_entry.get();

allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);

for (const auto& entry : alias_indices) {
auto aliased_output_index = entry.first;
auto aliased_input_index = entry.second;
auto tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]);
TORCH_INTERNAL_ASSERT(
tensor_arg_abstract, "alias io only supports tensor");
allocated_outputs[aliased_output_index] =
tensor_arg_abstract->getTensor();
}
args.push(allocated_outputs);
} else {
allocated_outputs = outputs;
args.push(outputs);
executor_utils::validateKernelOutputs(
fusion_, allocated_outputs, options_.device);
}

global_buffers = allocGlobalVals(expr_eval);

if (kernel()->summary().max_rng_offsets >= 0) {
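The recompilation check introduced above boils down to a single predicate. Below is a minimal, self-contained sketch of it (the enum and function names are illustrative, not the actual FusionExecutor members): the kernel is rebuilt when the launch exceeds the block-size high-water mark, when maxrregcount changes, or when the index mode requested by the arguments differs from the one the kernel was compiled with; the last case covers both growing and shrinking the index type, since a smaller index type may yield faster code.

#include <cstdint>

enum class KernelIndexMode { INT32, INT64 }; // mirrors nvfuser's enum

// Returns true when the cached kernel can no longer be reused as-is.
bool needToRecompile(
    int64_t n_threads,
    int64_t block_size_high_water_mark,
    int64_t maxrregcount,
    int64_t maxrregcount_high_water_mark,
    KernelIndexMode arg_index_mode,
    KernelIndexMode compiled_index_mode) {
  return n_threads > block_size_high_water_mark ||
      maxrregcount != maxrregcount_high_water_mark ||
      arg_index_mode != compiled_index_mode;
}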
4 changes: 3 additions & 1 deletion third_party/nvfuser/csrc/executor.h
@@ -189,8 +189,10 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
}

std::string kernelName() const {
const char* index_type_suffix =
options_.index_mode == KernelIndexMode::INT64 ? "_int64" : "_int32";
std::stringstream ss;
ss << "kernel" << fusion_id_;
ss << "kernel" << fusion_id_ << index_type_suffix;
return ss.str();
}

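The suffix added above keeps the INT32 and INT64 variants of the same fusion distinct wherever the kernel name is used as a key (the name ends up in kernel_code, which serves as the KernelDb key). A minimal sketch of the resulting naming scheme, written as an illustrative free function rather than the actual member:

#include <sstream>
#include <string>

enum class KernelIndexMode { INT32, INT64 }; // mirrors nvfuser's enum

// Illustrative only: reproduces the naming scheme of kernelName() above.
std::string kernelNameFor(int fusion_id, KernelIndexMode index_mode) {
  std::stringstream ss;
  ss << "kernel" << fusion_id
     << (index_mode == KernelIndexMode::INT64 ? "_int64" : "_int32");
  return ss.str(); // e.g. "kernel3_int32" or "kernel3_int64"
}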
46 changes: 46 additions & 0 deletions third_party/nvfuser/csrc/executor_kernel_arg.cpp
@@ -152,6 +152,14 @@ KernelArgumentHolder KernelArgumentHolder::createKernelArgumentHolder(

// Push a tensor to the arguments
void KernelArgumentHolder::push(const at::Tensor& tensor) {
// Deferred update of index_mode_.
// The required index mode depends on the fusion inputs AND outputs.
// TODO: We should also take into account other intermediate buffers.
if (index_mode_ == KernelIndexMode::INT32 &&
collectIndexMode({tensor}) == KernelIndexMode::INT64) {
setIndexMode(KernelIndexMode::INT64);
}

changed_ = true;
if (is_cpu_scalar(tensor)) {
switch (tensor.scalar_type()) {
@@ -283,6 +291,44 @@ void** KernelArgumentHolder::getBuffer() {
return void_ptrs_.data();
}

void KernelArgumentHolder::updateTensorIndexModes() {
for (auto& arg : arguments_) {
TensorArgAbstract* tensor_arg_old =
dynamic_cast<TensorArgAbstract*>(arg.get());
if (tensor_arg_old == nullptr)
continue;
auto tensor = tensor_arg_old->getTensor();
int nDims = tensor.ndimension();
c10::ScalarType dtype = tensor.scalar_type();
std::unique_ptr<TensorArgAbstract> tensor_arg =
getTensorArg(dtype, nDims, index_mode_);
tensor_arg->setTensor(tensor);
tensor_arg->setPointer(tensor.data_ptr());
tensor_arg->setDataType(aten_to_data_type(dtype));
for (const auto i : c10::irange(nDims)) {
tensor_arg->setSize(i, tensor.sizes()[i]);
tensor_arg->setStride(i, tensor.strides()[i]);
}
arg = std::move(tensor_arg);
}
}

KernelIndexMode KernelArgumentHolder::getSmallestIndexModeRequired() const {
KernelIndexMode smallest = KernelIndexMode::INT32;
for (auto& arg : arguments_) {
TensorArgAbstract* tensor_arg_old =
dynamic_cast<TensorArgAbstract*>(arg.get());
if (tensor_arg_old == nullptr)
continue;
auto tensor = tensor_arg_old->getTensor();
auto mode = collectIndexMode({tensor});
if (mode == KernelIndexMode::INT64) {
smallest = KernelIndexMode::INT64;
}
}
return smallest;
}

void KernelArgumentHolder::push(const c10::ArrayRef<c10::IValue>& args) {
// Naive I/O setup, I'm ignoring all the potential transformation (i.e. I/O
// allocated here from the subgraph could be, and very likely are, different
37 changes: 37 additions & 0 deletions third_party/nvfuser/csrc/executor_kernel_arg.h
@@ -284,6 +284,37 @@ class TORCH_CUDA_CU_API KernelArgumentHolder {
return index_mode_;
}

// Allows the user to specify a larger index mode than required.
void setIndexMode(KernelIndexMode mode) {
// index_mode_ is deduced from the arguments. We won't allow setting it to a
// smaller size than required.
auto smallest = getSmallestIndexModeRequired();

// The arguments already require INT64 indexing, so promote regardless of
// the requested mode.
if (smallest == KernelIndexMode::INT64 &&
index_mode_ != KernelIndexMode::INT64) {
index_mode_ = KernelIndexMode::INT64;
updateTensorIndexModes();
return;
}

// We can't narrow the index type below what the arguments require.
if (mode == KernelIndexMode::INT32 && smallest == KernelIndexMode::INT64) {
return;
}

// no-op
if (mode == index_mode_) {
return;
}

// Grow to INT64, or shrink back to the smallest mode the arguments require.
if (mode == smallest || mode == KernelIndexMode::INT64) {
index_mode_ = mode;
updateTensorIndexModes();
}
}

explicit KernelArgumentHolder(KernelIndexMode index_mode)
: index_mode_(index_mode) {}

@@ -332,6 +363,12 @@ class TORCH_CUDA_CU_API KernelArgumentHolder {
return arguments_.back().get();
}

// Goes through all held tensor arguments and rebuilds them with the current index mode.
void updateTensorIndexModes();

// Checks all tensors held to find the smallest index mode required.
KernelIndexMode getSmallestIndexModeRequired() const;

void appendPhiloxRNGSeed(uint64_t rand_offset);

const ArgAbstract* operator[](int ind) const {
4 changes: 2 additions & 2 deletions third_party/nvfuser/csrc/utils.cpp
@@ -322,11 +322,11 @@ KernelIndexMode collectIndexMode(const at::ArrayRef<c10::IValue>& inputs) {
if (tensor_input.stride(dim_i) > 0) {
// Accumulate positive stride
tensor_most_positive_index +=
(tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i);
tensor_input.size(dim_i) * tensor_input.stride(dim_i);
} else {
// Accumulate negative stride
tensor_most_negative_index +=
(tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i);
tensor_input.size(dim_i) * tensor_input.stride(dim_i);
}
}
}
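The change above drops the subtraction of 1 from each dimension's contribution, so the accumulated bound on the most positive (or most negative) linear index becomes slightly conservative: size * stride can only overestimate the magnitude compared to (size - 1) * stride, which errs on the side of selecting 64-bit indexing. A minimal, self-contained sketch of the overall check, assuming the bounds are compared against the int32_t range (the enum and helper name are illustrative, not the exact nvfuser code):

#include <cstdint>
#include <limits>
#include <vector>

enum class KernelIndexMode { INT32, INT64 }; // mirrors nvfuser's enum

// Conservative per-tensor check: accumulate size * stride per dimension and
// require 64-bit indexing if either bound falls outside the int32_t range.
KernelIndexMode indexModeForTensor(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& strides) {
  int64_t most_positive_index = 0;
  int64_t most_negative_index = 0;
  for (size_t i = 0; i < sizes.size(); ++i) {
    if (strides[i] > 0) {
      most_positive_index += sizes[i] * strides[i];
    } else {
      most_negative_index += sizes[i] * strides[i];
    }
  }
  if (most_positive_index > std::numeric_limits<int32_t>::max() ||
      most_negative_index < std::numeric_limits<int32_t>::min()) {
    return KernelIndexMode::INT64;
  }
  return KernelIndexMode::INT32;
}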
53 changes: 53 additions & 0 deletions third_party/nvfuser/test/test_gpu3.cpp
@@ -7774,6 +7774,59 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitGlobal_CUDA) {
fe.kernel(), cg_outputs, inputs, {ref_t1, ref_t3}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionExecutorCache_IndexType_Recompilation) {
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

auto tv0 = makeContigTensor(2, DataType::Half);
auto tv1 = makeContigTensor(2, DataType::Half);
auto tv2 = add(tv0, tv1);
fusion->addInput(tv0);
fusion->addInput(tv1);
fusion->addOutput(tv2);

auto fec = std::make_unique<FusionExecutorCache>(std::move(fusion));

auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto input_small = at::ones({1024, 1024}, options);
auto input_large = at::ones({1024, 4 * 1 << 20}, options);

KernelArgumentHolder small_holder =
KernelArgumentHolder::createKernelArgumentHolder({input_small});
TORCH_CHECK(
small_holder.getIndexMode() ==
KernelIndexMode::INT32); // Index mode should be INT32

small_holder.setIndexMode(
KernelIndexMode::INT64); // Index mode should go up in size.
TORCH_CHECK(small_holder.getIndexMode() == KernelIndexMode::INT64);

small_holder.push(input_large);
small_holder.setIndexMode(KernelIndexMode::INT32);
TORCH_CHECK(
small_holder.getIndexMode() ==
KernelIndexMode::INT64); // Index mode shouldn't be changed.

// Nothing should be compiled prior to the first run
TORCH_CHECK(!fec->isCompiled({input_small, input_small}));
TORCH_CHECK(!fec->isCompiled({input_large, input_large}));

// Only the INT32 indexing version should be compiled
fec->runFusionWithInputs({input_small, input_small});
TORCH_CHECK(fec->isCompiled({input_small, input_small}));
TORCH_CHECK(!fec->isCompiled({input_large, input_large}));

// Running an input that requires INT64 should not delete the cache entry
// for INT32
fec->runFusionWithInputs({input_large, input_large});
TORCH_CHECK(fec->isCompiled({input_small, input_small}));
TORCH_CHECK(fec->isCompiled({input_large, input_large}));

// Running in the other order should also not delete the entry
fec->runFusionWithInputs({input_small, input_small});
TORCH_CHECK(fec->isCompiled({input_small, input_small}));
TORCH_CHECK(fec->isCompiled({input_large, input_large}));
}

// Test file size should be up to 10K LoC. Create a new file for more tests.

} // namespace nvfuser