From 0b30034bac7334bdeb361b69f84fdcc002449019 Mon Sep 17 00:00:00 2001
From: huajsj
Date: Sun, 17 Jul 2022 00:06:56 -0700
Subject: [PATCH] change into cutlass

---
 .../using_with_pipeline_executor.py           | 96 ++++++++++++++-----
 python/tvm/contrib/pipeline_executor.py       |  9 +-
 python/tvm/contrib/pipeline_executor_build.py | 14 ++-
 3 files changed, 89 insertions(+), 30 deletions(-)

diff --git a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
index f693b76126b6..f58edfc43021 100755
--- a/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
+++ b/gallery/how_to/work_with_relay/using_with_pipeline_executor.py
@@ -46,9 +46,9 @@ def get_network():
     out_channels = 16
     batch_size = 1
 
-    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, img_size), "float32"))
+    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, img_size), "float16"))
     dense_weight = relay.var(
-        "data", relay.TensorType((batch_size, 16 * img_size * img_size), "float32")
+        "dweight", relay.TensorType((batch_size, 16 * img_size * img_size), "float16")
     )
     weight = relay.var("weight")
     second_weight = relay.var("second_weight")
@@ -92,20 +92,22 @@ def get_network():
 """
 #subgraphs[0]
 
-def @main(%data: Tensor[(1, 3, img_size, img_size), float32]) {
-  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float32] */;
-  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float32] */, meta[relay.Constant][2] /* ty=Tensor[(16), float32] */, meta[relay.Constant][3] /* ty=Tensor[(16), float32] */, meta[relay.Constant][4] /* ty=Tensor[(16), float32] */) /* ty=(Tensor[(1, 16, img_size, img_size), float32], Tensor[(16), float32], Tensor[(16), float32]) */;
+def @main(%data: Tensor[(1, 3, img_size, img_size), float16]) {
+  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float16] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float16] */;
+  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float16] */, meta[relay.Constant][2] /* ty=Tensor[(16), float16] */, meta[relay.Constant][3] /* ty=Tensor[(16), float16] */, meta[relay.Constant][4] /* ty=Tensor[(16), float16] */) /* ty=(Tensor[(1, 16, img_size, img_size), float16], Tensor[(16), float16], Tensor[(16), float16]) */;
   %2 = %1.0;
-  nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float32] */
+  nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float16] */
 }
 
 #subgraphs[1]
 
-def @main(%data_n_0: Tensor[(1, 16, img_size, img_size), float32]) {
-  nn.conv2d(%data_n_0, meta[relay.Constant][0] /* ty=Tensor[(16, 16, 3, 3), float32] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float32] */
-}
+def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 8), float16] */) {
+  %0 = nn.batch_flatten(%data_n_0) /* ty=Tensor[(1, 1024), float16] */;
+  nn.dense(%0, meta[relay.Constant][0] /* ty=Tensor[(1, 1024), float16] */, units=None) /* ty=Tensor[(1, 1), float16] */
+}
+
 """
 # sphinx_gallery_start_ignore
@@ -113,13 +115,40 @@ def @main(%data_n_0: Tensor[(1, 16, img_size, img_size), float32]) {
 testing.utils.install_request_hook(depth=3)
 # sphinx_gallery_end_ignore
 
+#########################################
+# Build the subgraph with cutlass target.
+# ---------------------------------------
+cutlass = tvm.target.Target(
+    {
+        "kind": "cutlass",
+        # Compute capability of the target GPU, e.g. 80 for Ampere.
+        "sm": 80,
+        "use_3xtf32": True,
+        "split_k_slices": [1],
+        "profile_all_alignments": False,
+        # Stop profiling at the first valid kernel.
+        "find_first_valid": True,
+        "use_multiprocessing": True,
+        "use_fast_math": False,
+        # Directory holding the generated cutlass kernels.
+        "tmp_dir": "./tmp",
+    },
+    host=tvm.target.Target("llvm"),
+)
+
+
+def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"):
+    # Build with a composite target list so the cutlass-partitioned regions are
+    # compiled by the cutlass BYOC backend and the rest by the base target.
+    target = [target, cutlass]
+    lib = relay.build_module.build(
+        mod, target=target, params=params, target_host=target_host, mod_name=mod_name
+    )
+    return lib
+
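+# For illustration only -- a minimal sketch (not part of the pipeline flow below)
+# of how a single cutlass-partitioned module can be built and finalized on its
+# own; partition_for_cutlass and finalize_modules are the TVM helpers from
+# tvm.relay.op.contrib.cutlass and tvm.contrib.cutlass:
+#
+#   mod = partition_for_cutlass(mod)
+#   lib = cutlass_build(mod, tvm.target.Target("cuda", host="llvm"), params=params)
+#   lib = finalize_modules(lib, "compile.so", "./tmp")
+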
 ###########################################################
 # Run the two subgraphs in pipeline with pipeline executor.
 # ---------------------------------------------------------
 # Define a function to do all the codegen and pipeline executor work.
-# To run pipeline executor with dnnl, USE_PIPELINE_EXECUTOR need to get set as ON.
-# and the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+# To run the pipeline executor with cutlass, both 'USE_PIPELINE_EXECUTOR' and
+# 'USE_CUTLASS' need to be set to ON in config.cmake.
 def run_pipeline():
     from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build
+    from tvm.contrib.cutlass import finalize_modules
+    from tvm.relay.op.contrib.cutlass import partition_for_cutlass
@@ -127,12 +156,9 @@ def run_pipeline():
     # Create subgraph pipeline configuration.
     # Associate the subgraph module with a target.
     # Use BYOC to set the codegen of the second subgraph module.
-    # To use dnnl the 'USE_DNNL_CODEGEN' should set as ON in config.cmake and installing MKL-DNN.
+    # To use cutlass, 'USE_CUTLASS' should be set to ON in config.cmake.
     mod0, mod1 = subgraphs[0], subgraphs[1]
-    # mod0 = relay.transform.AnnotateTarget(["dnnl"])(mod0)
-    # mod0 = relay.transform.AnnotateTarget(["cutlass"])(mod0)
-    # mod0 = relay.transform.MergeCompilerRegions()(mod0)
-    # mod0 = relay.transform.PartitionGraph()(mod0)
+    # Apply cutlass as the codegen of the second subgraph module.
     mod1 = partition_for_cutlass(mod1)
     #################################################
     # Get the pipeline executor configuration object.
     pipe_config = pipeline_executor_build.PipelineConfig()
     ###############################################################################
     # Set the cpu affinity of the control flow, for example using cpu 0.
     pipe_config[mod0].cpu_affinity = "0"
+    pipe_config[mod0].export_cc = None
     ##############################################################
-    # Set the compile target of the second subgraph module as LLVM.
+    # Set the compile target of the second subgraph module as CUDA.
     pipe_config[mod1].target = "cuda"
     pipe_config[mod1].dev = tvm.device("cuda", 0)
+    # Build the cutlass-partitioned module with the customized build function.
+    pipe_config[mod1].build_func = cutlass_build
+    # Export the module with nvcc so its cutlass kernels can be compiled.
+    pipe_config[mod1].export_cc = "nvcc"
     #################################################################################
     # Set the cpu affinity of the control flow, for example using cpu 1.
     pipe_config[mod1].cpu_affinity = "1"
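+    # The dataflow connections between the subgraphs are unchanged context elided
+    # from this diff; as in the existing tutorial they look roughly like:
+    #   pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
+    #   pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
+    #   pipe_config[mod1]["output"][0].connect(pipe_config["output"][0])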
@@ -171,7 +200,7 @@ def run_pipeline():
     # sphinx_gallery_start_ignore
     from tvm import testing
 
-    testing.utils.install_request_hook(depth=3)
+    # testing.utils.install_request_hook(depth=3)
     # sphinx_gallery_end_ignore
     ##############################
     # Build the pipeline executor.
@@ -195,7 +224,7 @@ def run_pipeline():
     # Run the pipeline executor.
     # --------------------------
-    # Allocated a input data.
-    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
+    # Allocate input data.
+    data = np.random.uniform(-1, 1, size=data_shape).astype("float16")
     pipeline_module.set_input("data", tvm.nd.array(data))
     ##########################################################################
     # Run the two subgraphs in pipeline mode and get the output asynchronously.
@@ -209,18 +238,39 @@ def run_pipeline():
     # Use graph_executor for verification.
     # ------------------------------------
     # Run these two subgraphs in sequence with graph_executor to get the output.
     target = "llvm"
-    dev = tvm.device(target, 0)
+    dev0 = tvm.device(target, 0)
     lib0 = relay.build_module.build(mod0, target, params=params)
-    lib1 = relay.build_module.build(mod1, target, params=params)
-    module0 = runtime.GraphModule(lib0["default"](dev))
-    module1 = runtime.GraphModule(lib1["default"](dev))
+    module0 = runtime.GraphModule(lib0["default"](dev0))
+    # Reuse the module-level cutlass target defined above so the verification
+    # build uses the same cutlass settings as the pipeline build.
+    cuda = tvm.target.Target("cuda", host=tvm.target.Target("llvm"))
+    lib1 = relay.build_module.build(mod1, [cuda, cutlass], params=params)
+    lib1 = finalize_modules(lib1, "compile.so", "./tmp")
+
+    dev1 = tvm.device("cuda", 0)
+
+    module1 = runtime.GraphModule(lib1["default"](dev1))
+
     module0.set_input("data", data)
     module0.run()
     out_shape = (1, 16, img_size, img_size)
-    out = module0.get_output(0, tvm.nd.empty(out_shape))
+    out = module0.get_output(0, tvm.nd.empty(out_shape, "float16"))
     module1.set_input("data_n_0", out)
     module1.run()
-    out = module1.get_output(0, tvm.nd.empty(out_shape))
+    # The second subgraph ends in a dense layer, so its output shape is (1, 1).
+    out_shape = (1, 1)
+    out = module1.get_output(0, tvm.nd.empty(out_shape, "float16"))
     ####################
     # Verify the result.
     tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy())
diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py
index 5ef309bb2808..f1c4e98a51d7 100644
--- a/python/tvm/contrib/pipeline_executor.py
+++ b/python/tvm/contrib/pipeline_executor.py
@@ -302,11 +302,16 @@ def export_library(self, directory_path):
             self.pipeline_mods[lib_index]["dev"].device_type,
             self.pipeline_mods[lib_index]["dev"].device_id,
         )
-        # Get the graph, lib, and parameters from GraphExecutorFactoryModule.
         lib = self.pipeline_mods[lib_index]["lib"]
         # Export the lib, graph, and parameters to disk.
-        lib.export_library(mconfig["lib_name"])
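+        # If "export_cc" is set (e.g. "nvcc"), use it as the compiler for the
+        # exported shared library; a module containing CUDA/cutlass kernels
+        # cannot be compiled by the default host compiler.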
+        if self.pipeline_mods[lib_index]["export_cc"]:
+            lib.export_library(
+                mconfig["lib_name"], cc=self.pipeline_mods[lib_index]["export_cc"]
+            )
+        else:
+            lib.export_library(mconfig["lib_name"])
+
         with open(mconfig["json_name"], "w") as file_handle:
             file_handle.write(lib.graph_json)
         with open(mconfig["params_name"], "wb") as file_handle:
diff --git a/python/tvm/contrib/pipeline_executor_build.py b/python/tvm/contrib/pipeline_executor_build.py
index 520156b47406..324383ab7ce3 100644
--- a/python/tvm/contrib/pipeline_executor_build.py
+++ b/python/tvm/contrib/pipeline_executor_build.py
@@ -86,7 +86,12 @@ def build(pipe_configs):
         # Use "mod_idx" as the key to create a "module_connection" map which is not only
         # for the module index but also for the module connection used to build the pipeline.
         module_string_config[mod_idx] = pipe_config
-        libs[mod_idx] = {"lib": lib, "dev": dev, "fcompile": mod_config["fcompile"]}
+        libs[mod_idx] = {
+            "lib": lib,
+            "dev": dev,
+            "fcompile": mod_config["fcompile"],
+            "export_cc": mod_config["export_cc"],
+        }
 
     # Creating a text form configuration to record the "input_connection" and the
     # "module_connection" information. The "input_connection" is used to record the
@@ -132,10 +137,7 @@ def export_library(factory, directory_path):
         mconfig["json_name"] = "{}/json{}".format(directory_path, lib_index)
         mconfig["params_name"] = "{}/params{}".format(directory_path, lib_index)
         lib_config = factory.pipeline_mods[lib_index]
-        mconfig["dev"] = "{},{}".format(
-            lib_config["dev"].device_type,
-            lib_config["dev"].device_id,
-        )
+        mconfig["dev"] = "{},{}".format(lib_config["dev"].device_type, lib_config["dev"].device_id)
         fcompile = lib_config["fcompile"]
         if not fcompile:
             fcompile = False
@@ -413,6 +415,7 @@ def __init__(self, mod=None):
         self.fcompile = None
         self.name = None
         self.dev = None
+        self.export_cc = None
         self.cpu_affinity = ""
         self.idx = None
         self.mod = mod
@@ -601,6 +604,7 @@ def get_config(self):
                 "target": module.target,
                 "fcompile": module.fcompile,
                 "dev": module.dev,
+                "export_cc": module.export_cc,
             }
 
             # Creating a map including pipeline inputs and subgraph inputs.
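+            # Note: with "export_cc" recorded above, a per-module entry in the
+            # returned configuration looks roughly like (hypothetical values):
+            #   {"target": "cuda", "fcompile": None, "dev": cuda(0), "export_cc": "nvcc"}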