-
Notifications
You must be signed in to change notification settings - Fork 13
[TUZ-150] Add a simplified access point for Unity Flow #32
Changes from all commits
f7165a1
10fb8c5
9fab56c
1ad1994
f21a17b
d9b0a80
54a62c1
74603ee
0e046da
77df6e8
bf589f3
663f7ae
4d152fe
2feb243
6097df5
2b2cb96
7d67bb1
7c06de5
428400c
1043136
e9cf04e
6c04ac5
69acdfb
908dc8f
25f4d06
05cbe32
bd8e7d3
cb37b82
91dc8ef
a42e98b
bc92a3f
df429c5
c0f148a
736ceca
22c47ee
baedf7f
befdc4e
e7b02f2
9508a18
4d46290
b59ad48
ff8bfa2
bb0c129
1807e6f
846a2c5
850d6a4
a966cf1
f735d93
ad4185c
e8227b9
2e08c8c
68a04a8
5723ebb
72bca0f
f491b96
71437f7
9113fc9
c788135
b6818bb
17cf446
9317ec8
07e0dfb
33c4aab
fd5c73d
733fc00
88852c1
24470c9
fc0540c
91adf7b
53f800d
596d472
b150b1a
449e094
9b5f214
fd35d1e
d8fdd5c
8039f6a
782c632
1b85765
c1439b3
180bead
fe528f6
aa55c05
fada709
06de35e
a9032d9
ed2696a
0d58835
246c4c1
e8a0c4d
4ad8d64
3a64963
b8460eb
defc15b
7645aa7
6e2d7bb
acd0e0b
1950940
cf36b7b
b1f2d53
a8338e6
74f3007
98d0a01
b755a6f
eaaa1fb
89bb68b
02b3a1f
c7d2c38
57c86eb
61c2761
368d9f6
4713b52
28c6825
63ce37d
7a5d313
c29ac7e
4c085d2
8a1e623
67659ac
e57f591
3fa880a
6ee79e1
8423811
781bfe0
475f3c2
38315af
ed5367d
1f04221
22b65bc
88ab730
0e98e6e
284b278
f7f24b7
d103ee2
8f7c343
6de551b
bb34d97
0937202
1878d7b
b12320b
627fb0a
3d72050
1dd7a56
d843a77
ea79707
dfe37fd
ab05b8c
44f4742
61e03a4
e49e924
e7ab17e
dfc0313
ee05ae0
b4d172f
57cf2ff
f5b86c1
447bc59
e68e731
022042c
b9813e8
b5777e1
dfc2931
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
# pylint: disable=invalid-name, wrong-import-position, redefined-builtin | ||
"""OctoML Simplified API utilities.""" | ||
|
||
from . import utils | ||
from .compile import compile | ||
from .octo_model import OctoModel |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
# pylint: disable=invalid-name, wrong-import-position, redefined-builtin, not-callable | ||
"""Simplified interface for TVM Unity Flow.""" | ||
from pathlib import Path | ||
from typing import Union, Optional, Dict, List | ||
import onnx | ||
import tvm | ||
from tvm import relax | ||
from tvm.relax.frontend.onnx import from_onnx | ||
from tvm.relax.backend.contrib.cutlass import partition_for_cutlass | ||
from .utils import get_cuda_target, get_llvm_target | ||
from .octo_model import OctoModel | ||
|
||
|
||
def load_onnx_model(
    model_file: Union[str, Path, onnx.ModelProto], shape_dict: Optional[Dict[str, List]] = None
) -> tvm.IRModule:
    """Convert an input onnx model into a relax module.

    Parameters
    ----------
    model_file : Union[str, Path, onnx.ModelProto]
        An input onnx model to convert. Can either be a path to a model or an already
        loaded onnx protobuf.

    shape_dict : Optional[Dict[str, List]]
        An optional dictionary that maps inputs to specific shapes. If not provided,
        the default values in the onnx graph will be used.

    Returns
    -------
    relax_mod : tvm.IRModule
        A Relax module implementing the input onnx graph.

    Raises
    ------
    TypeError
        If model_file is not a str, Path, or onnx.ModelProto.
    """
    # Check input format and load from disk if a path was given.
    if isinstance(model_file, (Path, str)):
        model_file = onnx.load(model_file)
    elif not isinstance(model_file, onnx.ModelProto):
        # Raise explicitly instead of using `assert`: asserts are stripped when
        # Python runs with optimizations (-O), and input validation must not
        # silently disappear.
        raise TypeError(
            f"model_file must be one of (str, Path, onnx.ModelProto) but got {type(model_file)}"
        )

    # Convert the graph into a relax implementation.
    relax_mod = from_onnx(model_file, shape_dict=shape_dict)

    return relax_mod
|
||
|
||
def offload_cutlass(mod: tvm.IRModule, target: tvm.target.Target) -> tvm.IRModule:
    """Converts appropriate subgraphs to CUTLASS

    Parameters
    ----------
    mod : tvm.IRModule
        The input module that should have subgraphs rewritten to CUTLASS.
    target : tvm.target.Target
        The target used for compilation. Needed to parameterize CUTLASS.

    Returns
    -------
    cutlass_mod : tvm.IRModule
        The input module after the partition_for_cutlass and RunCodegen passes
        are applied. First, subgraphs that cutlass supports are found and
        annotated; then those subgraphs are compiled with nvcc. The result is a
        graph mixing relax operators with external calls to the compiled
        cutlass kernels.
    """
    # CUTLASS kernels are parameterized by the sm version of the target GPU.
    assert target.arch, "Target architecture must be specified."
    # Cutlass only ships kernels up to sm80; newer architectures can still run
    # the sm80 kernels, so clamp the extracted version.
    sm_version = min(int(target.arch.split("_")[1]), 80)

    # Annotate and lift the subgraphs that cutlass can handle.
    partitioned = partition_for_cutlass(mod)

    # Compile the matched subgraphs with the cutlass codegen.
    run_codegen = relax.transform.RunCodegen(
        {"cutlass": {"sm": sm_version, "find_first_valid": True}}
    )
    return run_codegen(partitioned)
|
||
|
||
def compile(
    model: Union[str, Path, onnx.ModelProto],
    target: Optional[tvm.target.Target] = None,
    shape_dict: Optional[Dict[str, List]] = None,
):
    """Entrypoint to compiling a model using the Unity Flow.

    Parameters
    ----------
    model : Union[str, Path, onnx.ModelProto]
        An input onnx model to convert. Can either be a path to a model or an already
        loaded onnx protobuf.

    target : Optional[tvm.target.Target]
        A description of the hardware to compile to. If not provided, one will be extracted for
        the current host machine.

    shape_dict : Optional[Dict[str, List]]
        An optional dictionary that maps inputs to specific shapes. If not provided,
        the default values in the onnx graph will be used.

    Returns
    -------
    octo_model: OctoModel
        A convenience wrapper around the compiled model that provides utility functions.
    """
    # Pick a target for the host machine when none was supplied.
    if target is None:
        target = get_cuda_target() if tvm.cuda(0).exist else get_llvm_target()
        print(f"Auto-selected target {target}")

    # Import the onnx graph into a relax module.
    relax_mod = load_onnx_model(model, shape_dict)

    # Record each input's shape and dtype so random inputs can be
    # generated later if needed.
    input_info = {
        param.name_hint: (
            [dim.value for dim in param.struct_info.shape],
            param.struct_info.dtype,
        )
        for param in relax_mod["main"].params
    }

    is_cuda = target.kind.name == "cuda"

    # On cuda targets, offload supported subgraphs to cutlass when the
    # backend is available.
    if is_cuda:
        if tvm.get_global_func("relax.ext.cutlass", True):
            relax_mod = offload_cutlass(relax_mod, target)
        else:
            print("Cutlass backend not detected. Consider enabling it for better performance.")

    # Lower the remaining relax operators to TIR.
    relax_mod = relax.transform.LegalizeOps()(relax_mod)

    # Apply a default gpu schedule to anything left unscheduled.
    if is_cuda:
        with target, tvm.transform.PassContext(opt_level=3):
            relax_mod = tvm.tir.transform.DefaultGPUSchedule()(relax_mod)

    # Build the module and wrap it in a convenience class.
    executable = relax.build(relax_mod, target)
    return OctoModel(executable, input_info, target=target)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, | ||
# software distributed under the License is distributed on an | ||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, either express or implied. See the License for the | ||
# specific language governing permissions and limitations | ||
# under the License. | ||
# pylint: disable=invalid-name, wrong-import-position | ||
"""Wrapper class for compiled models.""" | ||
import json | ||
import tarfile | ||
from pathlib import Path | ||
from typing import Optional, Union, Dict, Tuple, List | ||
import numpy as np | ||
import tvm | ||
from tvm import relax | ||
from tvm.contrib import utils | ||
|
||
|
||
class OctoModel(object):
    """A compiled model wrapper that provides helpful utilities.

    Parameters
    ----------
    exe : Optional[relax.Executable]
        A compiled executable that can be loaded and run by a relax VM.
    input_info : Optional[Dict[str, Tuple[List, str]]]
        Information about the input names, shapes, and types for the VM.
        Will be loaded from memory if possible.
    model_path : Optional[Union[str, Path]]
        The path to a saved OctoModel, one of exe and model_path must
        be specified.
    target : Optional[tvm.target.Target]
        The target being compiled for.
    """

    def __init__(
        self,
        exe: Optional["relax.Executable"] = None,
        input_info: Optional[Dict[str, Tuple[List, str]]] = None,
        model_path: Optional[Union[str, Path]] = None,
        target: Optional["tvm.target.Target"] = None,
    ):
        self.target = target

        if exe is None and model_path is None:
            # NOTE: the message previously referred to "vm", which is not a
            # parameter of this constructor; it now names the real parameter.
            raise ValueError("One of exe and model_path must be provided.")

        # Scratch directory used by save/load for (de)serialization.
        self._tmp_dir = utils.tempdir()

        if model_path is not None:
            # Loading also restores self.target from the saved metadata.
            exe, input_info = self.load(model_path)

        self.dev = tvm.device(self.target.get_target_device_type())
        self.exe = exe
        self.input_info = input_info

        # Create a vm from exe. Profiling is enabled so self.profile() works.
        self.vm = relax.VirtualMachine(self.exe, self.dev, profile=True)

    def save(self, model_path: Union[str, Path]) -> None:
        """Save the OctoModel to disk.

        The current format used is a simple tar of the exported model library
        (exe.so), the input information of the model (input_info.json), and a
        metadata file containing strings such as the target.

        Parameters
        ----------
        model_path : Union[str, Path]
            A full path to save this OctoModel to including the output file name.
            The file will be saved as a tar file so using a ".tar" extension is advised.
        """
        # Only two artifacts need to be saved, the exe and the input struct info.
        # Serialize both to a temp directory.
        exe_path = self._tmp_dir.relpath("exe.so")
        self.exe.mod.export_library(exe_path)
        input_info_path = self._tmp_dir.relpath("input_info.json")
        with open(input_info_path, "w") as fo:
            json.dump(self.input_info, fo)

        # Save additional metadata needed to reconstruct the model on load.
        metadata = {"target": str(self.target)}
        metadata_path = self._tmp_dir.relpath("metadata.json")
        with open(metadata_path, "w") as fo:
            json.dump(metadata, fo)

        # Tar the tempfiles and save to the designated model_path.
        with tarfile.open(model_path, "w") as tar:
            tar.add(exe_path, "exe.so")
            tar.add(input_info_path, "input_info.json")
            tar.add(metadata_path, "metadata.json")

    def load(
        self, model_path: Union[str, Path]
    ) -> "Tuple[relax.Executable, Dict[str, Tuple[List, str]]]":
        """Load a saved OctoModel back into memory.

        Also restores ``self.target`` from the saved metadata as a side effect.

        Parameters
        ----------
        model_path : Union[str, Path]
            The path to the saved OctoModel that will be loaded.

        Returns
        -------
        exe : relax.Executable
            A compiled executable that can be loaded and run by a relax VM.
        input_info : Dict[str, Tuple[List, str]]
            Information about the input names, shapes, and types for the VM.
        """
        # SECURITY NOTE(review): extractall on an untrusted archive allows path
        # traversal; only load archives produced by OctoModel.save (or pass a
        # safe extraction filter on Python >= 3.12).
        t = tarfile.open(model_path)
        t.extractall(self._tmp_dir.relpath("."))

        # Load executable.
        exe_path = self._tmp_dir.relpath("exe.so")
        exe = relax.Executable(tvm.runtime.load_module(exe_path))

        # Load input info.
        input_info_path = self._tmp_dir.relpath("input_info.json")
        with open(input_info_path, "r") as fi:
            input_info = json.load(fi)

        # Load other metadata.
        metadata_path = self._tmp_dir.relpath("metadata.json")
        with open(metadata_path, "r") as fi:
            metadata = json.load(fi)
        self.target = tvm.target.Target(metadata["target"])

        return exe, input_info

    def generate_inputs(self) -> Dict[str, np.ndarray]:
        """Generate random inputs for inference or benchmarking.

        Returns
        -------
        input_dict : Dict[str, np.ndarray]
            A dictionary mapping each model input name to a randomly generated
            array of the recorded shape and dtype.
        """
        input_dict = {}
        for name, (shape, dtype) in self.input_info.items():
            # Values are drawn from a normal distribution then cast, so integer
            # dtypes receive truncated samples.
            input_dict[name] = np.random.normal(size=shape).astype(dtype)

        return input_dict

    def run(self, inputs: Optional[Dict[str, np.ndarray]] = None) -> List[np.ndarray]:
        """Perform an inference of the model.

        Parameters
        ----------
        inputs : Optional[Dict[str, np.ndarray]]
            An optional input dictionary containing the values to perform
            inference with. If not provided, random values will be generated
            instead.

        Returns
        -------
        outputs : List[np.ndarray]
            The output values from the inference.
        """
        # Generate random inputs if none are provided.
        if inputs is None:
            inputs = self.generate_inputs()

        # Assign inputs.
        self.vm.set_input("main", **inputs)
        # Run the model.
        self.vm.invoke_stateful("main")
        # Get and return the outputs, normalized to a list of numpy arrays.
        outputs = self.vm.get_outputs("main")
        if isinstance(outputs, tuple):
            outputs = [output.numpy() for output in outputs]
        else:
            outputs = [outputs.numpy()]
        return outputs

    def profile(self) -> "tvm.runtime.profiling.Report":
        """Measures the model's performance.

        Returns
        -------
        report : tvm.runtime.profiling.Report
            A breakdown of the runtime and per layer metrics.
        """
        inputs = self.generate_inputs()
        self.vm.set_input("main", **inputs)
        report = self.vm.profile("main")
        return report
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a way to introspect the output and see what was offloaded (and why?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can add an IR visitor after the partition/codegen passes to collect information about the lifted subgraphs.