Tinygrad runner (#34261)

* squash
* dmonitoringmodeld: use cl transform (#34235)
* needs cleanup
* only if tici
* bump tinygrad
* check width
* base modelframe
* .
* need to be args
* more cleanup
* no _frame in base
* tici only
* its DrivingModelFrame
* .6 is fair

---------

Co-authored-by: Comma Device <device@comma.ai>

* Update tinygrad
* tg upstream
* bump tg
* bump tg
* debug
* attr
* misc cleanup
* whitespace
* remove
* Add TODOs to make python proc for modelrunners
* whitespace

---------

Co-authored-by: ZwX1616 <zwx1616@gmail.com>
Co-authored-by: Comma Device <device@comma.ai>
Co-authored-by: Maxime Desroches <desroches.maxime@gmail.com>
commaai · Dec 18, 2024 · 17ca638
1 parent ff97a43
commit 17ca638
Show file tree

Hide file tree

Showing 39 changed files with 175 additions and 1,483 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,8 +42,7 @@ dependencies = [
 
   # modeld
   "onnx >= 1.14.0",
-  "onnxruntime >=1.16.3; platform_system == 'Linux' and platform_machine == 'aarch64'",
-  "onnxruntime-gpu >=1.16.3; platform_system == 'Linux' and platform_machine == 'x86_64'",
+  "onnxruntime >=1.16.3",
 
   # logging
   "pyzmq",

diff --git a/release/release_files.py b/release/release_files.py
@@ -54,7 +54,7 @@
   "tools/joystick/",
   "tools/longitudinal_maneuvers/",
 
-  "tinygrad_repo/openpilot/compile2.py",
+  "tinygrad_repo/examples/openpilot/compile3.py",
   "tinygrad_repo/extra/onnx.py",
   "tinygrad_repo/extra/onnx_ops.py",
   "tinygrad_repo/extra/thneed.py",

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
@@ -13,20 +13,6 @@ common_src = [
   "transforms/transform.cc",
 ]
 
-thneed_src_common = [
-  "thneed/thneed_common.cc",
-  "thneed/serialize.cc",
-]
-
-thneed_src_qcom = thneed_src_common + ["thneed/thneed_qcom2.cc"]
-thneed_src_pc = thneed_src_common + ["thneed/thneed_pc.cc"]
-thneed_src = thneed_src_qcom if arch == "larch64" else thneed_src_pc
-
-# SNPE except on Mac and ARM Linux
-snpe_lib = []
-if arch != "Darwin" and arch != "aarch64":
-  common_src += ['runners/snpemodel.cc']
-  snpe_lib += ['SNPE']
 
 # OpenCL is a framework on Mac
 if arch == "Darwin":
@@ -45,34 +31,24 @@ snpe_rpath_pc = f"{Dir('#').abspath}/third_party/snpe/x86_64-linux-clang"
 snpe_rpath = lenvCython['RPATH'] + [snpe_rpath_qcom if arch == "larch64" else snpe_rpath_pc]
 
 cython_libs = envCython["LIBS"] + libs
-snpemodel_lib = lenv.Library('snpemodel', ['runners/snpemodel.cc'])
 commonmodel_lib = lenv.Library('commonmodel', common_src)
-
-lenvCython.Program('runners/runmodel_pyx.so', 'runners/runmodel_pyx.pyx', LIBS=cython_libs, FRAMEWORKS=frameworks)
-lenvCython.Program('runners/snpemodel_pyx.so', 'runners/snpemodel_pyx.pyx', LIBS=[snpemodel_lib, snpe_lib, *cython_libs], FRAMEWORKS=frameworks, RPATH=snpe_rpath)
 lenvCython.Program('models/commonmodel_pyx.so', 'models/commonmodel_pyx.pyx', LIBS=[commonmodel_lib, *cython_libs], FRAMEWORKS=frameworks)
-
-tinygrad_files = ["#"+x for x in glob.glob(env.Dir("#tinygrad_repo").relpath + "/**", recursive=True, root_dir=env.Dir("#").abspath)]
+tinygrad_files = ["#"+x for x in glob.glob(env.Dir("#tinygrad_repo").relpath + "/**", recursive=True, root_dir=env.Dir("#").abspath) if 'pycache' not in x]
 
 # Get model metadata
 fn = File("models/supercombo").abspath
 cmd = f'python3 {Dir("#selfdrive/modeld").abspath}/get_model_metadata.py {fn}.onnx'
 lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"] + tinygrad_files, cmd)
 
-# Build thneed model
-if arch == "larch64" or GetOption('pc_thneed'):
-  tinygrad_opts = []
-  if not GetOption('pc_thneed'):
-    # use FLOAT16 on device for speed + don't cache the CL kernels for space
-    tinygrad_opts += ["FLOAT16=1", "PYOPENCL_NO_CACHE=1"]
-  cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn}.onnx {fn}.thneed"
-
-  lenv.Command(fn + ".thneed", [fn + ".onnx"] + tinygrad_files, cmd)
+# Compile tinygrad model
+pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"'
+if arch == 'larch64':
+  device_string = 'QCOM=1'
+else:
+  device_string = 'CLANG=1 IMAGE=0'
 
-  fn_dm = File("models/dmonitoring_model").abspath
-  cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn_dm}.onnx {fn_dm}.thneed"
-  lenv.Command(fn_dm + ".thneed", [fn_dm + ".onnx"] + tinygrad_files, cmd)
+for model_name in ['supercombo', 'dmonitoring_model']:
+  fn = File(f"models/{model_name}").abspath
+  cmd = f'{pythonpath_string} {device_string} python3 {Dir("#tinygrad_repo").abspath}/examples/openpilot/compile3.py {fn}.onnx {fn}_tinygrad.pkl'
+  lenv.Command(fn + "_tinygrad.pkl", [fn + ".onnx"] + tinygrad_files, cmd)
 
-  thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'OpenCL', 'dl'])
-  thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc'])
-  lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'OpenCL'])
diff --git a/selfdrive/modeld/dmonitoringmodeld b/selfdrive/modeld/dmonitoringmodeld
@@ -1,10 +1,4 @@
 #!/usr/bin/env bash
 
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
-cd "$DIR/../../"
-
-if [ -f "$DIR/libthneed.so" ]; then
-  export LD_PRELOAD="$DIR/libthneed.so"
-fi
-
 exec "$DIR/dmonitoringmodeld.py" "$@"
diff --git a/selfdrive/modeld/dmonitoringmodeld.py b/selfdrive/modeld/dmonitoringmodeld.py
@@ -1,8 +1,17 @@
 #!/usr/bin/env python3
 import os
+from openpilot.system.hardware import TICI
+if TICI:
+  from tinygrad.tensor import Tensor
+  from tinygrad.dtype import dtypes
+  from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
+  os.environ['QCOM'] = '1'
+else:
+  from openpilot.selfdrive.modeld.runners.ort_helpers import make_onnx_cpu_runner
 import gc
 import math
 import time
+import pickle
 import ctypes
 import numpy as np
 from pathlib import Path
@@ -13,21 +22,20 @@
 from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
 from openpilot.common.swaglog import cloudlog
 from openpilot.common.realtime import set_realtime_priority
-from openpilot.common.transformations.model import dmonitoringmodel_intrinsics
+from openpilot.common.transformations.model import dmonitoringmodel_intrinsics, DM_INPUT_SIZE
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
 from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext, MonitoringModelFrame
-from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
 from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid
 
+MODEL_WIDTH, MODEL_HEIGHT = DM_INPUT_SIZE
 CALIB_LEN = 3
 FEATURE_LEN = 512
 OUTPUT_SIZE = 84 + FEATURE_LEN
 
 PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
-MODEL_PATHS = {
-  ModelRunner.THNEED: Path(__file__).parent / 'models/dmonitoring_model.thneed',
-  ModelRunner.ONNX: Path(__file__).parent / 'models/dmonitoring_model.onnx'}
+MODEL_PATH = Path(__file__).parent / 'models/dmonitoring_model.onnx'
+MODEL_PKL_PATH = Path(__file__).parent / 'models/dmonitoring_model_tinygrad.pkl'
 
 class DriverStateResult(ctypes.Structure):
   _fields_ = [
@@ -58,29 +66,42 @@ class DMonitoringModelResult(ctypes.Structure):
 class ModelState:
   inputs: dict[str, np.ndarray]
   output: np.ndarray
-  model: ModelRunner
 
   def __init__(self, cl_ctx):
     assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float)
 
     self.frame = MonitoringModelFrame(cl_ctx)
-    self.output = np.zeros(OUTPUT_SIZE, dtype=np.float32)
-    self.inputs = {
-      'calib': np.zeros(CALIB_LEN, dtype=np.float32)}
+    self.numpy_inputs = {
+      'calib': np.zeros((1, CALIB_LEN), dtype=np.float32),
+    }
 
-    self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, cl_ctx)
-    self.model.addInput("input_img", None)
-    self.model.addInput("calib", self.inputs['calib'])
+    if TICI:
+      self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
+      with open(MODEL_PKL_PATH, "rb") as f:
+        self.model_run = pickle.load(f)
+    else:
+      self.onnx_cpu_runner = make_onnx_cpu_runner(MODEL_PATH)
 
   def run(self, buf:VisionBuf, calib:np.ndarray, transform:np.ndarray) -> tuple[np.ndarray, float]:
-    self.inputs['calib'][:] = calib
-
-    self.model.setInputBuffer("input_img", self.frame.prepare(buf, transform.flatten(), None).view(np.float32))
+    self.numpy_inputs['calib'][0,:] = calib
 
     t1 = time.perf_counter()
-    self.model.execute()
+
+    input_img_cl = self.frame.prepare(buf, transform.flatten())
+    if TICI:
+      # The imgs tensors are backed by opencl memory, only need init once
+      if 'input_img' not in self.tensor_inputs:
+        self.tensor_inputs['input_img'] = qcom_tensor_from_opencl_address(input_img_cl.mem_address, (1, MODEL_WIDTH*MODEL_HEIGHT), dtype=dtypes.uint8)
+    else:
+      self.numpy_inputs['input_img'] = self.frame.buffer_from_cl(input_img_cl).reshape((1, MODEL_WIDTH*MODEL_HEIGHT))
+
+    if TICI:
+      output = self.model_run(**self.tensor_inputs).numpy().flatten()
+    else:
+      output = self.onnx_cpu_runner.run(None, self.numpy_inputs)[0].flatten()
+
     t2 = time.perf_counter()
-    return self.output, t2 - t1
+    return output, t2 - t1
 
 
 def fill_driver_state(msg, ds_result: DriverStateResult):

diff --git a/selfdrive/modeld/modeld b/selfdrive/modeld/modeld
@@ -1,10 +1,4 @@
 #!/usr/bin/env bash
 
 DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
-cd "$DIR/../../"
-
-if [ -f "$DIR/libthneed.so" ]; then
-  export LD_PRELOAD="$DIR/libthneed.so"
-fi
-
 exec "$DIR/modeld.py" "$@"
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
@@ -1,5 +1,15 @@
 #!/usr/bin/env python3
 import os
+from openpilot.system.hardware import TICI
+
+#
+if TICI:
+  from tinygrad.tensor import Tensor
+  from tinygrad.dtype import dtypes
+  from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
+  os.environ['QCOM'] = '1'
+else:
+  from openpilot.selfdrive.modeld.runners.ort_helpers import make_onnx_cpu_runner
 import time
 import pickle
 import numpy as np
@@ -18,22 +28,19 @@
 from openpilot.common.transformations.model import get_warp_matrix
 from openpilot.system import sentry
 from openpilot.selfdrive.controls.lib.desire_helper import DesireHelper
-from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
 from openpilot.selfdrive.modeld.parse_model_outputs import Parser
 from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
 from openpilot.selfdrive.modeld.constants import ModelConstants
 from openpilot.selfdrive.modeld.models.commonmodel_pyx import DrivingModelFrame, CLContext
 
+
 PROCESS_NAME = "selfdrive.modeld.modeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
 
-MODEL_PATHS = {
-  ModelRunner.THNEED: Path(__file__).parent / 'models/supercombo.thneed',
-  ModelRunner.ONNX: Path(__file__).parent / 'models/supercombo.onnx'}
-
+MODEL_PATH = Path(__file__).parent / 'models/supercombo.onnx'
+MODEL_PKL_PATH = Path(__file__).parent / 'models/supercombo_tinygrad.pkl'
 METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'
 
-
 class FrameMeta:
   frame_id: int = 0
   timestamp_sof: int = 0
@@ -44,40 +51,39 @@ def __init__(self, vipc=None):
       self.frame_id, self.timestamp_sof, self.timestamp_eof = vipc.frame_id, vipc.timestamp_sof, vipc.timestamp_eof
 
 class ModelState:
-  frame: DrivingModelFrame
-  wide_frame: DrivingModelFrame
+  frames: dict[str, DrivingModelFrame]
   inputs: dict[str, np.ndarray]
   output: np.ndarray
   prev_desire: np.ndarray  # for tracking the rising edge of the pulse
-  model: ModelRunner
 
   def __init__(self, context: CLContext):
-    self.frame = DrivingModelFrame(context)
-    self.wide_frame = DrivingModelFrame(context)
+    self.frames = {'input_imgs': DrivingModelFrame(context), 'big_input_imgs': DrivingModelFrame(context)}
     self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
     self.full_features_20Hz = np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN, ModelConstants.FEATURE_LEN), dtype=np.float32)
     self.desire_20Hz =  np.zeros((ModelConstants.FULL_HISTORY_BUFFER_LEN + 1, ModelConstants.DESIRE_LEN), dtype=np.float32)
 
     # img buffers are managed in openCL transform code
-    self.inputs = {
-      'desire': np.zeros(ModelConstants.DESIRE_LEN * (ModelConstants.HISTORY_BUFFER_LEN+1), dtype=np.float32),
-      'traffic_convention': np.zeros(ModelConstants.TRAFFIC_CONVENTION_LEN, dtype=np.float32),
-      'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
+    self.numpy_inputs = {
+      'desire': np.zeros((1, (ModelConstants.HISTORY_BUFFER_LEN+1), ModelConstants.DESIRE_LEN), dtype=np.float32),
+      'traffic_convention': np.zeros((1, ModelConstants.TRAFFIC_CONVENTION_LEN), dtype=np.float32),
+      'features_buffer': np.zeros((1, ModelConstants.HISTORY_BUFFER_LEN,  ModelConstants.FEATURE_LEN), dtype=np.float32),
     }
 
     with open(METADATA_PATH, 'rb') as f:
       model_metadata = pickle.load(f)
+    self.input_shapes =  model_metadata['input_shapes']
 
     self.output_slices = model_metadata['output_slices']
     net_output_size = model_metadata['output_shapes']['outputs'][1]
     self.output = np.zeros(net_output_size, dtype=np.float32)
     self.parser = Parser()
 
-    self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, context)
-    self.model.addInput("input_imgs", None)
-    self.model.addInput("big_input_imgs", None)
-    for k,v in self.inputs.items():
-      self.model.addInput(k, v)
+    if TICI:
+      self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
+      with open(MODEL_PKL_PATH, "rb") as f:
+        self.model_run = pickle.load(f)
+    else:
+      self.onnx_cpu_runner = make_onnx_cpu_runner(MODEL_PATH)
 
   def slice_outputs(self, model_outputs: np.ndarray) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in self.output_slices.items()}
@@ -94,24 +100,36 @@ def run(self, buf: VisionBuf, wbuf: VisionBuf, transform: np.ndarray, transform_
 
     self.desire_20Hz[:-1] = self.desire_20Hz[1:]
     self.desire_20Hz[-1] = new_desire
-    self.inputs['desire'][:] = self.desire_20Hz.reshape((25,4,-1)).max(axis=1).flatten()
+    self.numpy_inputs['desire'][:] = self.desire_20Hz.reshape((1,25,4,-1)).max(axis=2)
 
-    self.inputs['traffic_convention'][:] = inputs['traffic_convention']
+    self.numpy_inputs['traffic_convention'][:] = inputs['traffic_convention']
+    imgs_cl = {'input_imgs': self.frames['input_imgs'].prepare(buf, transform.flatten()),
+               'big_input_imgs': self.frames['big_input_imgs'].prepare(wbuf, transform_wide.flatten())}
 
-    self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
-    self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))
+    if TICI:
+      # The imgs tensors are backed by opencl memory, only need init once
+      for key in imgs_cl:
+        if key not in self.tensor_inputs:
+          self.tensor_inputs[key] = qcom_tensor_from_opencl_address(imgs_cl[key].mem_address, self.input_shapes[key], dtype=dtypes.uint8)
+    else:
+      for key in imgs_cl:
+        self.numpy_inputs[key] = self.frames[key].buffer_from_cl(imgs_cl[key]).reshape(self.input_shapes[key])
 
     if prepare_only:
       return None
 
-    self.model.execute()
+    if TICI:
+      self.output = self.model_run(**self.tensor_inputs).numpy().flatten()
+    else:
+      self.output = self.onnx_cpu_runner.run(None, self.numpy_inputs)[0].flatten()
+
     outputs = self.parser.parse_outputs(self.slice_outputs(self.output))
 
     self.full_features_20Hz[:-1] = self.full_features_20Hz[1:]
     self.full_features_20Hz[-1] = outputs['hidden_state'][0, :]
 
     idxs = np.arange(-4,-100,-4)[::-1]
-    self.inputs['features_buffer'][:] = self.full_features_20Hz[idxs].flatten()
+    self.numpy_inputs['features_buffer'][:] = self.full_features_20Hz[idxs]
     return outputs
 
 
@@ -281,7 +299,6 @@ def main(demo=False):
       pm.send('modelV2', modelv2_send)
       pm.send('drivingModelData', drivingdata_send)
       pm.send('cameraOdometry', posenet_send)
-
     last_vipc_frame_id = meta_main.frame_id