Commit
Output allocate patch (#1790)
Caching strides along with sizes. This is to support the current expand, which introduces non-contiguous output tensors.
jjsjann123 authored Jun 30, 2022
1 parent fe93bf5 commit 59f3c32
Showing 3 changed files with 38 additions and 1 deletion.
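
For context on why strides must now be cached: expand creates a view whose broadcast dimensions get stride 0, so the resulting tensor is non-contiguous and a plain contiguous allocation cannot reproduce its layout. A minimal standalone ATen sketch of that stride pattern (illustration only, not part of this commit):

#include <ATen/ATen.h>
#include <iostream>

int main() {
  // A contiguous {4, 1, 1} tensor has strides {1, 1, 1}; expanding the
  // size-1 dimensions to {4, 3, 2} sets their strides to 0 instead of
  // copying data, leaving the result non-contiguous.
  at::Tensor x = at::randn({4, 1, 1});
  at::Tensor y = x.expand({4, 3, 2});
  std::cout << y.strides() << "\n";        // [1, 0, 0]
  std::cout << y.is_contiguous() << "\n";  // 0 (false)
  return 0;
}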
4 changes: 3 additions & 1 deletion torch/csrc/jit/codegen/cuda/executor.cpp
@@ -762,8 +762,9 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
     if (outputs.empty()) {
       FUSER_PERF_SCOPE("ExecutorRunFusion::OutputAlloc");
       for (const auto i : c10::irange(executor_entry->output_sizes.size())) {
-        allocated_outputs.push_back(at::native::empty_cuda(
+        allocated_outputs.push_back(at::native::empty_strided_cuda(
             executor_entry->output_sizes[i],
+            executor_entry->output_strides[i],
             executor_entry->output_types[i],
             c10::nullopt,
             options_.device,
@@ -934,6 +935,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
     executor_entry->io_alias_indices = alias_indices;
     for (const auto& output : allocated_outputs) {
       executor_entry->output_sizes.push_back(output.sizes().vec());
+      executor_entry->output_strides.push_back(output.strides().vec());
       executor_entry->output_types.push_back(output.scalar_type());
     }

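The switch above from at::native::empty_cuda to at::native::empty_strided_cuda matters because the strided variant accepts an explicit stride vector and can reproduce an arbitrary layout, including expand's zero strides. A hedged sketch using the public at::empty_strided counterpart (illustration only; the call in the diff is the internal CUDA-specific variant):

#include <ATen/ATen.h>

// Allocate an output matching expand's layout: sizes {4, 3, 2} with
// strides {1, 0, 0}. The storage only needs max offset + 1 = 4 floats,
// not 24, because the zero-stride dimensions alias the same memory.
at::Tensor alloc_expanded_output() {
  return at::empty_strided(
      {4, 3, 2}, {1, 0, 0},
      at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0));
}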
1 change: 1 addition & 0 deletions torch/csrc/jit/codegen/cuda/executor.h
@@ -74,6 +74,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
     LaunchParams launch_params;
     std::vector<std::pair<int, int>> io_alias_indices;
     std::vector<std::vector<int64_t>> output_sizes;
+    std::vector<std::vector<int64_t>> output_strides;
     std::vector<at::ScalarType> output_types;
     std::vector<std::vector<int64_t>> buffer_sizes;
     std::vector<at::ScalarType> buffer_types;
34 changes: 34 additions & 0 deletions torch/csrc/jit/codegen/cuda/test/test_gpu_view.cpp
@@ -972,6 +972,40 @@ TEST_F(NVFuserTest, FusionComputeAtRootDomainMapWithView_CUDA) {
       tv1->axis(1)->toString());
 }
 
+TEST_F(NVFuserTest, FusionExpandRepro_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  const std::vector<int64_t> input_shape1{4, 1, 1};
+  const std::vector<int64_t> input_shape2{4, 3, 2};
+
+  auto tv0 = makeConcreteTensor({-1, 1, 1});
+  fusion.addInput(tv0);
+  auto tv1 = makeSymbolicTensor(3);
+  fusion.addInput(tv1);
+
+  auto tv2 = expand_as(tv0, tv1);
+  fusion.addOutput(tv2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor at_x = at::randn(input_shape1, options);
+  at::Tensor at_y = at::randn(input_shape2, options);
+  std::vector<IValue> aten_inputs = {at_x, at_y};
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+  LaunchParams l_params;
+  auto outputs = fe.runFusion(aten_inputs, {}, l_params, 0);
+
+  auto out = at_x.expand_as(at_y);
+
+  testValidate(&fusion, outputs, aten_inputs, {out}, __LINE__, __FILE__);
+
+  // second run to verify cached output allocation
+  outputs = fe.runFusion(aten_inputs, {}, l_params, 0);
+  testValidate(&fusion, outputs, aten_inputs, {out}, __LINE__, __FILE__);
+}
+
 } // namespace jit
 } // namespace torch
 #endif // #if defined(USE_CUDA)
