From 0b30034bac7334bdeb361b69f84fdcc002449019 Mon Sep 17 00:00:00 2001
From: huajsj
Date: Sun, 17 Jul 2022 00:06:56 -0700
Subject: [PATCH] change into cutlass

---
 .../using_with_pipeline_executor.py           | 96 ++++++++++++++-----
 python/tvm/contrib/pipeline_executor.py       |  9 +-
 python/tvm/contrib/pipeline_executor_build.py | 14 ++-
 3 files changed, 89 insertions(+), 30 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index f693b76126b6..f58edfc43021 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -46,9 +46,9 @@ def get_network():
     out_channels = 16
     batch_size = 1
 
-    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, img_size), "float32"))
+    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, img_size), "float16"))
     dense_weight = relay.var(
-        "data", relay.TensorType((batch_size, 16 * img_size * img_size), "float32")
+        "dweight", relay.TensorType((batch_size, 16 * img_size * img_size), "float16")
     )
     weight = relay.var("weight")
     second_weight = relay.var("second_weight")
@@ -92,20 +92,22 @@ def get_network():
 """
 #subgraphs[0]
 
-def @main(%data: Tensor[(1, 3, img_size, img_size), float32]) {
-  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float32] */;
-  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float32] */, meta[relay.Constant][2] /* ty=Tensor[(16), float32] */, meta[relay.Constant][3] /* ty=Tensor[(16), float32] */, meta[relay.Constant][4] /* ty=Tensor[(16), float32] */) /* ty=(Tensor[(1, 16, img_size, img_size), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
+def @main(%data: Tensor[(1, 3, img_size, img_size), float16]) {
+  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float16] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float16] */;
+  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float16] */, meta[relay.Constant][2] /* ty=Tensor[(16), float16] */, meta[relay.Constant][3] /* ty=Tensor[(16), float16] */, meta[relay.Constant][4] /* ty=Tensor[(16), float16] */) /* ty=(Tensor[(1, 16, img_size, img_size), float16], Tensor[(16), float16], Tensor[(16), float16]) */;
   %2 = %1.0;
-  nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float32] */
+  nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float16] */
 }
 
 #subgraphs[1]
 
-def @main(%data_n_0: Tensor[(1, 16, img_size, img_size), float32]) {
-  nn.conv2d(%data_n_0, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float32] */
-}
+def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 8), float16] */) {
+  %0 = nn.batch_flatten(%data_n_0) /* ty=Tensor[(1, 1024), float16] */;
+  nn.dense(%0, meta[relay.Constant][0] /* ty=Tensor[(1, 1024), float16] */, units=None) /* ty=Tensor[(1, 1), float16] */
+}
+
 """
 # sphinx_gallery_start_ignore
@@ -113,13 +115,40 @@ def @main(%data_n_0: Tensor[(1, 16, img_size, img_size), float32]) {
 testing.utils.install_request_hook(depth=3)
 # sphinx_gallery_end_ignore
 
+#########################################
+# Build the subgraph with cutlass target.
+# ---------------------------------------
+cutlass = tvm.target.Target(
+    {
+        "kind": "cutlass",
+        # Compute capability of the target GPU, e.g. 80 for Ampere.
+        "sm": 80,
+        "use_3xtf32": True,
+        "split_k_slices": [1],
+        "profile_all_alignments": False,
+        # Stop profiling at the first valid kernel.
+        "find_first_valid": True,
+        "use_multiprocessing": True,
+        "use_fast_math": False,
+        # Directory holding the generated cutlass kernels.
+        "tmp_dir": "./tmp",
+    },
+    host=tvm.target.Target("llvm"),
+)
+
+
+def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"):
+    # Build with a composite target list so the cutlass-partitioned regions are
+    # compiled by the cutlass BYOC backend and the rest by the base target.
+    target = [target, cutlass]
+    lib = relay.build_module.build(
+        mod, target=target, params=params, target_host=target_host, mod_name=mod_name
+    )
+    return lib
+
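+# For illustration only -- a minimal sketch (not part of the pipeline flow below)
+# of how a single cutlass-partitioned module can be built and finalized on its
+# own; partition_for_cutlass and finalize_modules are the TVM helpers from
+# tvm.relay.op.contrib.cutlass and tvm.contrib.cutlass:
+#
+#   mod = partition_for_cutlass(mod)
+#   lib = cutlass_build(mod, tvm.target.Target("cuda", host="llvm"), params=params)
+#   lib = finalize_modules(lib, "compile.so", "./tmp")
+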
 ###########################################################
 # Run the two subgraphs in pipeline with pipeline executor.
 # ---------------------------------------------------------
 # Define a function to do all the codegen and pipeline executor work.
-# To run pipeline executor with dnnl, USE_PIPELINE_EXECUTOR need to get set as ON.
-# and the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+# To run the pipeline executor with cutlass, both 'USE_PIPELINE_EXECUTOR' and
+# 'USE_CUTLASS' need to be set to ON in config.cmake.
 def run_pipeline():
     from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
+    from tvm.contrib.cutlass import finalize_modules
+    from tvm.relay.op.contrib.cutlass import partition_for_cutlass
@@ -127,12 +156,9 @@ def run_pipeline():
     # Create subgraph pipeline configuration.
     # Associate the subgraph module with a target.
     # Use BYOC to set the codegen of the second subgraph module.
-    # To use dnnl the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+    # To use cutlass, 'USE_CUTLASS' should be set to ON in config.cmake.
     mod0, mod1 = subgraphs[0], subgraphs[1]
-    # mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
-    # mod0 = relay.transform.AnnotateTarget(["cutlass"])(mod0)
-    # mod0 = relay.transform.MergeCompilerRegions()(mod0)
-    # mod0 = relay.transform.PartitionGraph()(mod0)
+    # Apply cutlass as the codegen of the second subgraph module.
     mod1 = partition_for_cutlass(mod1)
     #################################################
     # Get the pipeline executor configuration object.
     pipe_config = pipeline_executor_build.PipelineConfig()
     ###############################################################################
     # Set the cpu affinity of the control flow, for example using cpu 0.
     pipe_config[mod0].cpu_affinity = "0"
+    pipe_config[mod0].export_cc = None
     ##############################################################
-    # Set the compile target of the second subgraph module as LLVM.
+    # Set the compile target of the second subgraph module as CUDA.
     pipe_config[mod1].target = "cuda"
     pipe_config[mod1].dev = tvm.device("cuda", 0)
+    # Build the cutlass-partitioned module with the customized build function.
+    pipe_config[mod1].build_func = cutlass_build
+    # Export the module with nvcc so its cutlass kernels can be compiled.
+    pipe_config[mod1].export_cc = "nvcc"
     #################################################################################
     # Set the cpu affinity of the control flow, for example using cpu 1.
     pipe_config[mod1].cpu_affinity = "1"
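+    # The dataflow connections between the subgraphs are unchanged context elided
+    # from this diff; as in the existing tutorial they look roughly like:
+    #   pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
+    #   pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
+    #   pipe_config[mod1]["output"][0].connect(pipe_config["output"][0])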
@@ -171,7 +200,7 @@ def run_pipeline():
     # sphinx_gallery_start_ignore
     from tvm import testing
 
-    testing.utils.install_request_hook(depth=3)
+    # testing.utils.install_request_hook(depth=3)
     # sphinx_gallery_end_ignore
     ##############################
     # Build the pipeline executor.
@@ -195,7 +224,7 @@ def run_pipeline():
     # Run the pipeline executor.
     # --------------------------
-    # Allocated a input data.
-    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+    # Allocate input data.
+    data = np.random.uniform(-1, 1, size=data_shape).astype("float16")
     pipeline_module.set_input("data", tvm.nd.array(data))
     ##########################################################################
     # Run the two subgraphs in pipeline mode and get the output asynchronously.
@@ -209,18 +238,39 @@ def run_pipeline():
     # Use graph_executor for verification.
     # ------------------------------------
     # Run these two subgraphs in sequence with graph_executor to get the output.
     target = "llvm"
-    dev = tvm.device(target, 0)
+    dev0 = tvm.device(target, 0)
     lib0 = relay.build_module.build(mod0, target, params=params)
-    lib1 = relay.build_module.build(mod1, target, params=params)
-    module0 = runtime.GraphModule(lib0["default"](dev))
-    module1 = runtime.GraphModule(lib1["default"](dev))
+    module0 = runtime.GraphModule(lib0["default"](dev0))
+    # Reuse the module-level cutlass target defined above so the verification
+    # build uses the same cutlass settings as the pipeline build.
+    cuda = tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
+    lib1 = relay.build_module.build(mod1, [cuda, cutlass], params=params)
+    lib1 = finalize_modules(lib1, "compile.so", "./tmp")
+
+    dev1 = tvm.device("cuda", 0)
+
+    module1 = runtime.GraphModule(lib1["default"](dev1))
+
     module0.set_input("data", data)
     module0.run()
     out_shape = (1, 16, img_size, img_size)
-    out = module0.get_output(0, tvm.nd.empty(out_shape))
+    out = module0.get_output(0, tvm.nd.empty(out_shape, "float16"))
     module1.set_input("data_n_0", out)
     module1.run()
-    out = module1.get_output(0, tvm.nd.empty(out_shape))
+    # The second subgraph ends in a dense layer, so its output shape is (1, 1).
+    out_shape = (1, 1)
+    out = module1.get_output(0, tvm.nd.empty(out_shape, "float16"))
     ####################
     # Verify the result.
     tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())
diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py
index 5ef309bb2808..f1c4e98a51d7 100644
--- a/python/tvm/contrib/pipeline_executor.py
+++ b/python/tvm/contrib/pipeline_executor.py
@@ -302,11 +302,16 @@ def export_library(self, directory_path):
             self.pipeline_mods[lib_index]["dev"].device_type,
             self.pipeline_mods[lib_index]["dev"].device_id,
         )
-        # Get the graph, lib, and parameters from GraphExecutorFactoryModule.
         lib = self.pipeline_mods[lib_index]["lib"]
         # Export the lib, graph, and parameters to disk.
-        lib.export_library(mconfig["lib_name"])
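+        # If "export_cc" is set (e.g. "nvcc"), use it as the compiler for the
+        # exported shared library; a module containing CUDA/cutlass kernels
+        # cannot be compiled by the default host compiler.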
+        if self.pipeline_mods[lib_index]["export_cc"]:
+            lib.export_library(
+                mconfig["lib_name"], cc=self.pipeline_mods[lib_index]["export_cc"]
+            )
+        else:
+            lib.export_library(mconfig["lib_name"])
+
         with open(mconfig["json_name"], "w") as file_handle:
             file_handle.write(lib.graph_json)
         with open(mconfig["params_name"], "wb") as file_handle:
diff --git a/python/tvm/contrib/pipeline_executor_build.py b/python/tvm/contrib/pipeline_executor_build.py
index 520156b47406..324383ab7ce3 100644
--- a/python/tvm/contrib/pipeline_executor_build.py
+++ b/python/tvm/contrib/pipeline_executor_build.py
@@ -86,7 +86,12 @@ def build(pipe_configs):
         # Use "mod_idx" as the key to create a "module_connection" map which is not only
         # for the module index but also for the module connection used to build the pipeline.
         module_string_config[mod_idx] = pipe_config
-        libs[mod_idx] = {"lib": lib, "dev": dev, "fcompile": mod_config["fcompile"]}
+        libs[mod_idx] = {
+            "lib": lib,
+            "dev": dev,
+            "fcompile": mod_config["fcompile"],
+            "export_cc": mod_config["export_cc"],
+        }
 
     # Creating a text form configuration to record the "input_connection" and the
     # "module_connection" information. The "input_connection" is used to record the
@@ -132,10 +137,7 @@ def export_library(factory, directory_path):
         mconfig["json_name"] = "{}/json{}".format(directory_path, lib_index)
         mconfig["params_name"] = "{}/params{}".format(directory_path, lib_index)
         lib_config = factory.pipeline_mods[lib_index]
-        mconfig["dev"] = "{},{}".format(
-            lib_config["dev"].device_type,
-            lib_config["dev"].device_id,
-        )
+        mconfig["dev"] = "{},{}".format(lib_config["dev"].device_type, lib_config["dev"].device_id)
         fcompile = lib_config["fcompile"]
         if not fcompile:
             fcompile = False
@@ -413,6 +415,7 @@ def __init__(self, mod=None):
         self.fcompile = None
         self.name = None
         self.dev = None
+        self.export_cc = None
         self.cpu_affinity = ""
         self.idx = None
         self.mod = mod
@@ -601,6 +604,7 @@ def get_config(self):
                 "target": module.target,
                 "fcompile": module.fcompile,
                 "dev": module.dev,
+                "export_cc": module.export_cc,
             }
 
             # Creating a map including pipeline inputs and subgraph inputs.
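+            # Note: with "export_cc" recorded above, a per-module entry in the
+            # returned configuration looks roughly like (hypothetical values):
+            #   {"target": "cuda", "fcompile": None, "dev": cuda(0), "export_cc": "nvcc"}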