Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Runtime] Extend Graph Runtime To Support Cuda Graph Launch #7616

Merged
merged 19 commits into from
Mar 17, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" O
tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF)
tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON)
tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF)
tvm_option(USE_GRAPH_RUNTIME_CUGRAPH "Build with tiny graph runtime cuGraph launch mode" OFF)
tvm_option(USE_OPENMP "Build with OpenMP thread pool implementation" OFF)
tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." OFF)
tvm_option(USE_RTTI "Build with RTTI" ON)
Expand Down Expand Up @@ -321,6 +322,16 @@ if(USE_GRAPH_RUNTIME)
set_source_files_properties(${RUNTIME_GRAPH_SRCS}
PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_DEBUG")
endif(USE_GRAPH_RUNTIME_DEBUG)

if(USE_CUDA)
  if(USE_GRAPH_RUNTIME_CUGRAPH)
    message(STATUS "Build with Graph runtime cuGraph support...")
    file(GLOB RUNTIME_CUGRAPH_SRCS src/runtime/graph/cugraph/*.cc)
    list(APPEND RUNTIME_SRCS ${RUNTIME_CUGRAPH_SRCS})
    # Tag the cuGraph sources themselves with the feature macro. The original
    # applied this to RUNTIME_GRAPH_SRCS, which marked the wrong files and
    # overwrote the TVM_GRAPH_RUNTIME_DEBUG definition set just above
    # (set_source_files_properties replaces COMPILE_DEFINITIONS, it does not append).
    set_source_files_properties(${RUNTIME_CUGRAPH_SRCS}
      PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_CUGRAPH")
  endif()
elseif(USE_GRAPH_RUNTIME_CUGRAPH)
  # Fail loudly instead of silently ignoring the option when CUDA is disabled
  # (raised in review: a silent no-op here confuses users).
  message(FATAL_ERROR
    "USE_GRAPH_RUNTIME_CUGRAPH requires CUDA; please configure with USE_CUDA=ON.")
endif()
endif(USE_GRAPH_RUNTIME)

if(USE_VM_PROFILER)
Expand Down
3 changes: 3 additions & 0 deletions cmake/config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ set(USE_GRAPH_RUNTIME ON)
# Whether enable additional graph debug functions
set(USE_GRAPH_RUNTIME_DEBUG OFF)

# Whether to enable the tiny graph runtime for cudaGraph launch (experimental)
set(USE_GRAPH_RUNTIME_CUGRAPH OFF)

# Whether enable additional vm profiler functions
set(USE_VM_PROFILER OFF)

Expand Down
62 changes: 62 additions & 0 deletions python/tvm/contrib/cu_graph/cugraph_runtime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Graph runtime test cuGraph"""
import tvm._ffi

from tvm._ffi.base import string_types
from tvm.contrib import graph_runtime


def create(graph_json_str, libmod, ctx):
    """Create a cuGraph-enabled graph runtime module.

    Parameters
    ----------
    graph_json_str : str
        The serialized graph JSON produced by the graph compiler.
    libmod : tvm.runtime.Module
        The compiled module containing the operator kernels.
    ctx : TVMContext or list of TVMContext
        The context(s) to deploy the module on.

    Returns
    -------
    GraphModuleCuGraph
        A graph module exposing CUDA-graph capture/launch hooks.

    Raises
    ------
    ValueError
        If TVM was built without USE_GRAPH_RUNTIME_CUGRAPH.
    RuntimeError
        If every supplied context is a remote (RPC) context, which the
        cuGraph runtime does not support.
    """
    assert isinstance(graph_json_str, string_types)
    try:
        ctx, num_rpc_ctx, device_type_id = graph_runtime.get_device_ctx(libmod, ctx)
        if num_rpc_ctx == len(ctx):
            # Original code fell through with `pass` here, leaving `fcreate`
            # unbound and crashing with NameError below. Fail explicitly.
            raise RuntimeError("cuGraph runtime does not support pure RPC contexts")
        fcreate = tvm._ffi.get_global_func("tvm.graph_runtime_cugraph.create")
    except ValueError:
        raise ValueError(
            "To enable CuGraph (experimental), please set "
            "'(USE_GRAPH_RUNTIME_CUGRAPH ON)' in config.cmake and rebuild TVM"
        )

    func_obj = fcreate(graph_json_str, libmod, *device_type_id)
    return GraphModuleCuGraph(func_obj, ctx, graph_json_str)


class GraphModuleCuGraph(graph_runtime.GraphModule):
    """Graph runtime wrapper exposing CUDA-graph capture and launch hooks.

    Wraps the C++ ``GraphRuntimeCuGraph`` module and caches its packed
    functions so they can be invoked as plain Python methods.
    """

    def __init__(self, module, ctx, graph_json_str):
        # Cache the cuGraph-specific packed functions exported by the
        # underlying runtime module before delegating to the base class.
        self._start_capture = module["start_capture"]
        self._end_capture = module["end_capture"]
        self._run_cuda_graph = module["run_cuda_graph"]
        graph_runtime.GraphModule.__init__(self, module)

    def capture_cuda_graph(self):
        """Warm up once, then replay the graph under CUDA stream capture."""
        self._run()  # call cuModuleLoadData before cudaStream API
        print("====== Start Stream Capture ======")
        self._start_capture()
        print("====== Start Run Ops On Stream ======")
        self._run()
        print("====== End Stream Capture ======")
        self._end_capture()

    def run_cuda_graph(self):
        """Launch the CUDA graph captured by :meth:`capture_cuda_graph`."""
        self._run_cuda_graph()
116 changes: 116 additions & 0 deletions src/runtime/graph/cugraph/graph_runtime_cugraph.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* \file graph_runtime_cugraph.cc
*/

#include <tvm/runtime/registry.h>

#include "../../cuda/cuda_common.h"
#include "../graph_runtime.h"

namespace tvm {
namespace runtime {

class GraphRuntimeCuGraph : public GraphRuntime {
public:
int StartCapture() {
zhuochenKIDD marked this conversation as resolved.
Show resolved Hide resolved
const TVMContext& ctx = data_entry_[entry_id(0, 0)]->ctx;

TVMStreamCreate(ctx.device_type, ctx.device_id, &capture_stream_);
TVMSetStream(ctx.device_type, ctx.device_id, capture_stream_);

CUDA_CALL(cudaStreamBeginCapture(static_cast<cudaStream_t>(capture_stream_),
cudaStreamCaptureModeGlobal));
return 0;
zhuochenKIDD marked this conversation as resolved.
Show resolved Hide resolved
}

int RunCudaGraph() {
cudaStream_t cuStream = static_cast<cudaStream_t>(capture_stream_);
CUDA_CALL(cudaGraphLaunch(cu_graph_exec_, cuStream));
CUDA_CALL(cudaStreamSynchronize(cuStream));
return 0;
}

int EndCapture() {
cudaGraph_t graph;
CUDA_CALL(cudaStreamEndCapture(static_cast<cudaStream_t>(capture_stream_), &graph));

cudaGraphNode_t* nodes = NULL;
size_t numNodes = 0;
CUDA_CALL(cudaGraphGetNodes(graph, nodes, &numNodes));
LOG(INFO) << "Num of nodes in the cuda graph created using stream capture API = " << numNodes;

CUDA_CALL(cudaGraphInstantiate(&cu_graph_exec_, graph, NULL, NULL, 0));
return 0;
}

/*!
* \brief GetFunction Get the function based on input.
* \param name The function which needs to be invoked.
* \param sptr_to_self Packed function pointer.
*/
PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self);

private:
TVMStreamHandle capture_stream_;
zhuochenKIDD marked this conversation as resolved.
Show resolved Hide resolved
cudaGraphExec_t cu_graph_exec_;
};

PackedFunc GraphRuntimeCuGraph::GetFunction(const std::string& name,
                                            const ObjectPtr<Object>& sptr_to_self) {
  // Dispatch the cuGraph-specific packed functions; anything else is
  // delegated to the base GraphRuntime implementation.
  if (name == "start_capture") {
    return PackedFunc(
        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->StartCapture(); });
  }
  if (name == "end_capture") {
    return PackedFunc(
        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->EndCapture(); });
  }
  if (name == "run_cuda_graph") {
    return PackedFunc(
        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->RunCudaGraph(); });
  }
  return GraphRuntime::GetFunction(name, sptr_to_self);
}

/*!
 * \brief Construct a cuGraph-enabled graph runtime module.
 * \param sym_json Serialized graph JSON.
 * \param m Compiled module holding the operator kernels.
 * \param ctxs Contexts the runtime executes on.
 * \param lookup_linked_param_func Optional linked-parameter lookup function.
 * \return The wrapped runtime Module.
 */
Module GraphRuntimeCuGraphCreate(const std::string& sym_json, const tvm::runtime::Module& m,
                                 const std::vector<TVMContext>& ctxs,
                                 PackedFunc lookup_linked_param_func) {
  ObjectPtr<GraphRuntimeCuGraph> exec = make_object<GraphRuntimeCuGraph>();
  exec->Init(sym_json, m, ctxs, lookup_linked_param_func);
  return Module(exec);
}

TVM_REGISTER_GLOBAL("tvm.graph_runtime_cugraph.create").set_body([](TVMArgs args, TVMRetValue* rv) {
  // Expected args: (graph_json, module, [lookup_linked_param_func,] device pairs...).
  // Fixed: the original error message referred to "graph_runtime.create",
  // which is a different global function.
  ICHECK_GE(args.num_args, 4) << "The expected number of arguments for "
                                 "graph_runtime_cugraph.create is at least 4, but it has "
                              << args.num_args;
  PackedFunc lookup_linked_param_func;
  int ctx_start_arg = 2;
  // A PackedFunc at position 2 is the optional linked-parameter lookup; the
  // device (type, id) pairs start after it.
  if (args[2].type_code() == kTVMPackedFuncHandle) {
    lookup_linked_param_func = args[2];
    ctx_start_arg++;
  }

  *rv = GraphRuntimeCuGraphCreate(args[0], args[1], GetAllContext(args, ctx_start_arg),
                                  lookup_linked_param_func);
});
} // namespace runtime
} // namespace tvm
92 changes: 92 additions & 0 deletions tests/python/unittest/test_runtime_graph_cugraph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import json
import os
import re
import sys
import time

import pytest

import tvm
import tvm.testing
from tvm import te
import numpy as np

from tvm.contrib import utils, graph_runtime
from tvm.contrib.cu_graph import cugraph_runtime


bx = te.thread_axis("blockIdx.x")
tx = te.thread_axis("threadIdx.x")


@tvm.testing.requires_cuda
def test_graph_simple():
    """Build a one-op ("myadd") graph JSON by hand and verify that the
    cuGraph runtime captures and replays it, producing x + 1."""
    n = 32
    A = te.placeholder((n,), name="A")
    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
    s = te.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=8)
    s[B].bind(xo, bx)
    s[B].bind(xi, tx)

    # Hand-written graph: one null input node feeding a single tvm_op node.
    shape = (n,)
    graph = json.dumps(
        {
            "nodes": [
                {"op": "null", "name": "x", "inputs": []},
                {
                    "op": "tvm_op",
                    "name": "add",
                    "inputs": [[0, 0, 0]],
                    "attrs": {
                        "func_name": "myadd",
                        "flatten_data": "1",
                        "num_inputs": "1",
                        "num_outputs": "1",
                    },
                },
            ],
            "arg_nodes": [0],
            "node_row_ptr": [0, 1, 2],
            "heads": [[1, 0, 0]],
            "attrs": {
                "shape": ["list_shape", [shape, shape]],
                "dltype": ["list_str", ["float32", "float32"]],
                "storage_id": ["list_int", [0, 1]],
            },
        }
    )

    def check_verify():
        mlib = tvm.build(s, [A, B], "cuda", name="myadd")
        ctx = tvm.gpu(0)
        try:
            mod = cugraph_runtime.create(graph, mlib, ctx)
        except ValueError:
            # TVM was built without USE_GRAPH_RUNTIME_CUGRAPH; skip silently.
            return
        mod.capture_cuda_graph()
        a = np.random.uniform(size=(n,)).astype(A.dtype)
        mod.set_input(x=a)
        mod.run_cuda_graph()
        out = mod.get_output(0, tvm.nd.empty((n,)))
        np.testing.assert_equal(out.asnumpy(), a + 1)

    check_verify()


# Allow running this test directly (outside of pytest).
if __name__ == "__main__":
    test_graph_simple()