-
Notifications
You must be signed in to change notification settings - Fork 13
[TUZ-150] Add a simplified access point for Unity Flow #32
Changes from all commits
f7165a1
10fb8c5
9fab56c
1ad1994
f21a17b
d9b0a80
54a62c1
74603ee
0e046da
77df6e8
bf589f3
663f7ae
4d152fe
2feb243
6097df5
2b2cb96
7d67bb1
7c06de5
428400c
1043136
e9cf04e
6c04ac5
69acdfb
908dc8f
25f4d06
05cbe32
bd8e7d3
cb37b82
91dc8ef
a42e98b
bc92a3f
df429c5
c0f148a
736ceca
22c47ee
baedf7f
befdc4e
e7b02f2
9508a18
4d46290
b59ad48
ff8bfa2
bb0c129
1807e6f
846a2c5
850d6a4
a966cf1
f735d93
ad4185c
e8227b9
2e08c8c
68a04a8
5723ebb
72bca0f
f491b96
71437f7
9113fc9
c788135
b6818bb
17cf446
9317ec8
07e0dfb
33c4aab
fd5c73d
733fc00
88852c1
24470c9
fc0540c
91adf7b
53f800d
596d472
b150b1a
449e094
9b5f214
fd35d1e
d8fdd5c
8039f6a
782c632
1b85765
c1439b3
180bead
fe528f6
aa55c05
fada709
06de35e
a9032d9
ed2696a
0d58835
246c4c1
e8a0c4d
4ad8d64
3a64963
b8460eb
defc15b
7645aa7
6e2d7bb
acd0e0b
1950940
cf36b7b
b1f2d53
a8338e6
74f3007
98d0a01
b755a6f
eaaa1fb
89bb68b
02b3a1f
c7d2c38
57c86eb
61c2761
368d9f6
4713b52
28c6825
63ce37d
7a5d313
c29ac7e
4c085d2
8a1e623
67659ac
e57f591
3fa880a
6ee79e1
8423811
781bfe0
475f3c2
38315af
ed5367d
1f04221
22b65bc
88ab730
0e98e6e
284b278
f7f24b7
d103ee2
8f7c343
6de551b
bb34d97
0937202
1878d7b
b12320b
627fb0a
3d72050
1dd7a56
d843a77
ea79707
dfe37fd
ab05b8c
44f4742
61e03a4
e49e924
e7ab17e
dfc0313
ee05ae0
b4d172f
57cf2ff
f5b86c1
447bc59
e68e731
022042c
b9813e8
b5777e1
dfc2931
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
# pylint: disable=invalid-name, wrong-import-position, redefined-builtin | ||
"""OctoML Simplified API utilities.""" | ||
|
||
from . import utils | ||
from .compile import compile | ||
from .octo_model import OctoModel |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
# pylint: disable=invalid-name, wrong-import-position, redefined-builtin, not-callable | ||
"""Simplified interface for TVM Unity Flow.""" | ||
from pathlib import Path | ||
from typing import Union, Optional, Dict, List | ||
import onnx | ||
import tvm | ||
from tvm import relax | ||
from tvm.relax.frontend.onnx import from_onnx | ||
from tvm.relax.backend.contrib.cutlass import partition_for_cutlass | ||
from .utils import get_cuda_target, get_llvm_target | ||
from .octo_model import OctoModel | ||
|
||
|
||
def load_onnx_model(
    model_file: Union[str, Path, onnx.ModelProto], shape_dict: Optional[Dict[str, List]] = None
) -> tvm.IRModule:
    """Convert an input onnx model into a relax module.

    Parameters
    ----------
    model_file : Union[str, Path, onnx.ModelProto]
        An input onnx model to convert. Can either be a path to a model or an already
        loaded onnx protobuf.

    shape_dict : Optional[Dict[str, List]]
        An optional dictionary that maps inputs to specific shapes. If not provided,
        the default values in the onnx graph will be used.

    Returns
    -------
    relax_mod : tvm.IRModule
        A Relax module implementing the input onnx graph.

    Raises
    ------
    TypeError
        If model_file is not a str, Path, or onnx.ModelProto.
    """
    # Check input format and load from disk if a path was given.
    if isinstance(model_file, (Path, str)):
        model_file = onnx.load(model_file)
    elif not isinstance(model_file, onnx.ModelProto):
        # Raise explicitly instead of using `assert`: asserts are stripped when
        # Python runs with optimizations (-O), and input validation must not
        # silently disappear.
        raise TypeError(
            f"model_file must be one of (str, Path, onnx.ModelProto) but got {type(model_file)}"
        )

    # Convert the graph into a relax implementation.
    relax_mod = from_onnx(model_file, shape_dict=shape_dict)

    return relax_mod
|
||
|
||
def offload_cutlass(mod: tvm.IRModule, target: tvm.target.Target) -> tvm.IRModule:
    """Converts appropriate subgraphs to CUTLASS

    Parameters
    ----------
    mod : tvm.IRModule
        The input module that should have subgraphs rewritten to CUTLASS.
    target : tvm.target.Target
        The target used for compilation. Needed to parameterize CUTLASS.

    Returns
    -------
    cutlass_mod : tvm.IRModule
        The input module after the partition_for_cutlass and RunCodegen passes
        are applied. First, subgraphs that cutlass supports are found and
        annotated; then those subgraphs are compiled with nvcc. The result is a
        graph mixing relax operators with external calls to the compiled
        cutlass kernels.
    """
    # CUTLASS kernels are parameterized by the sm version of the target GPU.
    assert target.arch, "Target architecture must be specified."
    # Cutlass only ships kernels up to sm80; newer architectures can still run
    # the sm80 kernels, so clamp the extracted version.
    sm_version = min(int(target.arch.split("_")[1]), 80)

    # Annotate and lift the subgraphs that cutlass can handle.
    partitioned = partition_for_cutlass(mod)

    # Compile the matched subgraphs with the cutlass codegen.
    run_codegen = relax.transform.RunCodegen(
        {"cutlass": {"sm": sm_version, "find_first_valid": True}}
    )
    return run_codegen(partitioned)
|
||
|
||
def compile(
    model: Union[str, Path, onnx.ModelProto],
    target: Optional[tvm.target.Target] = None,
    shape_dict: Optional[Dict[str, List]] = None,
):
    """Entrypoint to compiling a model using the Unity Flow.

    Parameters
    ----------
    model : Union[str, Path, onnx.ModelProto]
        An input onnx model to convert. Can either be a path to a model or an already
        loaded onnx protobuf.

    target : Optional[tvm.target.Target]
        A description of the hardware to compile to. If not provided, one will be extracted for
        the current host machine.

    shape_dict : Optional[Dict[str, List]]
        An optional dictionary that maps inputs to specific shapes. If not provided,
        the default values in the onnx graph will be used.

    Returns
    -------
    octo_model: OctoModel
        A convenience wrapper around the compiled model that provides utility functions.
    """
    # Pick a target for the host machine when none was supplied.
    if target is None:
        target = get_cuda_target() if tvm.cuda(0).exist else get_llvm_target()
        print(f"Auto-selected target {target}")

    # Import the onnx graph into a relax module.
    relax_mod = load_onnx_model(model, shape_dict)

    # Record each input's shape and dtype so random inputs can be
    # generated later if needed.
    input_info = {
        param.name_hint: (
            [dim.value for dim in param.struct_info.shape],
            param.struct_info.dtype,
        )
        for param in relax_mod["main"].params
    }

    is_cuda = target.kind.name == "cuda"

    # On cuda targets, offload supported subgraphs to cutlass when the
    # backend is available.
    if is_cuda:
        if tvm.get_global_func("relax.ext.cutlass", True):
            relax_mod = offload_cutlass(relax_mod, target)
        else:
            print("Cutlass backend not detected. Consider enabling it for better performance.")

    # Lower the remaining relax operators to TIR.
    relax_mod = relax.transform.LegalizeOps()(relax_mod)

    # Apply a default gpu schedule to anything left unscheduled.
    if is_cuda:
        with target, tvm.transform.PassContext(opt_level=3):
            relax_mod = tvm.tir.transform.DefaultGPUSchedule()(relax_mod)

    # Build the module and wrap it in a convenience class.
    executable = relax.build(relax_mod, target)
    return OctoModel(executable, input_info, target=target)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
# pylint: disable=invalid-name, wrong-import-position | ||
"""Wrapper class for compiled models.""" | ||
import json | ||
import tarfile | ||
from pathlib import Path | ||
from typing import Optional, Union, Dict, Tuple, List | ||
import numpy as np | ||
import tvm | ||
from tvm import relax | ||
from tvm.contrib import utils | ||
|
||
|
||
class OctoModel(object):
    """A compiled model wrapper that provides helpful utilities.

    Parameters
    ----------
    exe : Optional[relax.Executable]
        A compiled executable that can be loaded and run by a relax VM.
    input_info : Optional[Dict[str, Tuple[List, str]]]
        Information about the input names, shapes, and types for the VM.
        Will be loaded from memory if possible.
    model_path : Optional[Union[str, Path]]
        The path to a saved OctoModel, one of exe and model_path must
        be specified.
    target : Optional[tvm.target.Target]
        The target being compiled for.
    """

    def __init__(
        self,
        exe: Optional["relax.Executable"] = None,
        input_info: Optional[Dict[str, Tuple[List, str]]] = None,
        model_path: Optional[Union[str, Path]] = None,
        target: Optional["tvm.target.Target"] = None,
    ):
        self.target = target

        if exe is None and model_path is None:
            # NOTE: the message previously referred to "vm", which is not a
            # parameter of this constructor; it now names the real parameter.
            raise ValueError("One of exe and model_path must be provided.")

        # Scratch directory used by save/load for (de)serialization.
        self._tmp_dir = utils.tempdir()

        if model_path is not None:
            # Loading also restores self.target from the saved metadata.
            exe, input_info = self.load(model_path)

        self.dev = tvm.device(self.target.get_target_device_type())
        self.exe = exe
        self.input_info = input_info

        # Create a vm from exe. Profiling is enabled so self.profile() works.
        self.vm = relax.VirtualMachine(self.exe, self.dev, profile=True)

    def save(self, model_path: Union[str, Path]) -> None:
        """Save the OctoModel to disk.

        The current format used is a simple tar of the exported model library
        (exe.so), the input information of the model (input_info.json), and a
        metadata file containing strings such as the target.

        Parameters
        ----------
        model_path : Union[str, Path]
            A full path to save this OctoModel to including the output file name.
            The file will be saved as a tar file so using a ".tar" extension is advised.
        """
        # Only two artifacts need to be saved, the exe and the input struct info.
        # Serialize both to a temp directory.
        exe_path = self._tmp_dir.relpath("exe.so")
        self.exe.mod.export_library(exe_path)
        input_info_path = self._tmp_dir.relpath("input_info.json")
        with open(input_info_path, "w") as fo:
            json.dump(self.input_info, fo)

        # Save additional metadata needed to reconstruct the model on load.
        metadata = {"target": str(self.target)}
        metadata_path = self._tmp_dir.relpath("metadata.json")
        with open(metadata_path, "w") as fo:
            json.dump(metadata, fo)

        # Tar the tempfiles and save to the designated model_path.
        with tarfile.open(model_path, "w") as tar:
            tar.add(exe_path, "exe.so")
            tar.add(input_info_path, "input_info.json")
            tar.add(metadata_path, "metadata.json")

    def load(
        self, model_path: Union[str, Path]
    ) -> "Tuple[relax.Executable, Dict[str, Tuple[List, str]]]":
        """Load a saved OctoModel back into memory.

        Also restores ``self.target`` from the saved metadata as a side effect.

        Parameters
        ----------
        model_path : Union[str, Path]
            The path to the saved OctoModel that will be loaded.

        Returns
        -------
        exe : relax.Executable
            A compiled executable that can be loaded and run by a relax VM.
        input_info : Dict[str, Tuple[List, str]]
            Information about the input names, shapes, and types for the VM.
        """
        # SECURITY NOTE(review): extractall on an untrusted archive allows path
        # traversal; only load archives produced by OctoModel.save (or pass a
        # safe extraction filter on Python >= 3.12).
        t = tarfile.open(model_path)
        t.extractall(self._tmp_dir.relpath("."))

        # Load executable.
        exe_path = self._tmp_dir.relpath("exe.so")
        exe = relax.Executable(tvm.runtime.load_module(exe_path))

        # Load input info.
        input_info_path = self._tmp_dir.relpath("input_info.json")
        with open(input_info_path, "r") as fi:
            input_info = json.load(fi)

        # Load other metadata.
        metadata_path = self._tmp_dir.relpath("metadata.json")
        with open(metadata_path, "r") as fi:
            metadata = json.load(fi)
        self.target = tvm.target.Target(metadata["target"])

        return exe, input_info

    def generate_inputs(self) -> Dict[str, np.ndarray]:
        """Generate random inputs for inference or benchmarking.

        Returns
        -------
        input_dict : Dict[str, np.ndarray]
            A dictionary mapping each model input name to a randomly generated
            array of the recorded shape and dtype.
        """
        input_dict = {}
        for name, (shape, dtype) in self.input_info.items():
            # Values are drawn from a normal distribution then cast, so integer
            # dtypes receive truncated samples.
            input_dict[name] = np.random.normal(size=shape).astype(dtype)

        return input_dict

    def run(self, inputs: Optional[Dict[str, np.ndarray]] = None) -> List[np.ndarray]:
        """Perform an inference of the model.

        Parameters
        ----------
        inputs : Optional[Dict[str, np.ndarray]]
            An optional input dictionary containing the values to perform
            inference with. If not provided, random values will be generated
            instead.

        Returns
        -------
        outputs : List[np.ndarray]
            The output values from the inference.
        """
        # Generate random inputs if none are provided.
        if inputs is None:
            inputs = self.generate_inputs()

        # Assign inputs.
        self.vm.set_input("main", **inputs)
        # Run the model.
        self.vm.invoke_stateful("main")
        # Get and return the outputs, normalized to a list of numpy arrays.
        outputs = self.vm.get_outputs("main")
        if isinstance(outputs, tuple):
            outputs = [output.numpy() for output in outputs]
        else:
            outputs = [outputs.numpy()]
        return outputs

    def profile(self) -> "tvm.runtime.profiling.Report":
        """Measures the model's performance.

        Returns
        -------
        report : tvm.runtime.profiling.Report
            A breakdown of the runtime and per layer metrics.
        """
        inputs = self.generate_inputs()
        self.vm.set_input("main", **inputs)
        report = self.vm.profile("main")
        return report
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a way to introspect the output and see what was offloaded (and why?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can add an IR visitor after the partition/codegen passes to collect information about the lifted subgraphs.