From 15e86e26da02d0699b3e3beb7b63cbdc1e188fd4 Mon Sep 17 00:00:00 2001
From: Siva
Date: Tue, 28 Jan 2025 12:00:57 +0530
Subject: [PATCH 1/2] [RUNTIME][CLML] Profiling options enabled for CLML (BYOC
 via JSON Runtime)

Graph debug runtime modifications to accommodate profiling through BYOC
external calls. Updated the TVMC interface to support more formats for
profile dumps. Added CLML helpers that can rebuild C++ CLML sources from
profile dumps. CLML runtime profiling is now controlled by the runtime
profile flag.
---
 apps/cpp_clml/scripts/clml_codegen_json.py    |  51 +++++
 apps/cpp_clml/scripts/compare_npy.py          |  41 ++++
 python/tvm/contrib/debugger/debug_executor.py |  13 ++
 python/tvm/contrib/debugger/debug_result.py   |   4 +
 python/tvm/driver/tvmc/runner.py              |  20 +-
 python/tvm/relay/op/contrib/clml.py           | 155 ++++++++-----
 src/runtime/contrib/clml/clml_runtime.cc      | 210 ++++++++++++++++--
 src/runtime/contrib/clml/clml_runtime.h       |   4 +
 src/runtime/contrib/json/json_runtime.h       |  43 ++++
 .../debug/graph_executor_debug.cc             |  40 +++-
 .../debug/graph_executor_debug.h              |   9 +
 src/runtime/graph_executor/graph_executor.cc  |  26 ++-
 src/runtime/graph_executor/graph_executor.h   |  10 +-
 src/runtime/opencl/opencl_common.h            |   9 +-
 src/runtime/profiling.cc                      |   3 +-
 tests/scripts/setup-adreno-env.sh             |   2 +-
 16 files changed, 537 insertions(+), 103 deletions(-)
 create mode 100644 apps/cpp_clml/scripts/clml_codegen_json.py
 create mode 100644 apps/cpp_clml/scripts/compare_npy.py

diff --git a/apps/cpp_clml/scripts/clml_codegen_json.py b/apps/cpp_clml/scripts/clml_codegen_json.py
new file mode 100644
index 000000000000..c3fbf835d8ee
--- /dev/null
+++ b/apps/cpp_clml/scripts/clml_codegen_json.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
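+#
+# Usage sketch (argument placeholders below are illustrative, not from the
+# original script):
+#   python3 clml_codegen_json.py <codegen_json_file> <output_src_file>
+# Reads a CLML codegen JSON dump (as produced by the profiling/debug flow)
+# and regenerates compilable C++ CLML source from it via clml.CLMLGenSrc.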
+
+import os
+import sys
+import json
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.driver import tvmc
+from tvm.relay.op.contrib import clml
+from tvm.contrib import utils
+from string import Template
+
+
+def main():
+    print("CLML Codegen From JSON")
+    if len(sys.argv) != 3:
+        print("Usage: python clml_codegen_json.py <codegen_json_file> <output_src_file>")
+        return
+
+    with open(sys.argv[1], "r") as file:
+        codegen = json.load(file)
+    (_, gen_src) = clml.CLMLGenSrc(codegen).get_artifacts()
+
+    f_src = open(sys.argv[2], "w")
+    f_src.write("\n".join(gen_src))
+    f_src.close()
+    os.popen("clang-format-15 -i " + sys.argv[2])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/apps/cpp_clml/scripts/compare_npy.py b/apps/cpp_clml/scripts/compare_npy.py
new file mode 100644
index 000000000000..8e3c3a8b630f
--- /dev/null
+++ b/apps/cpp_clml/scripts/compare_npy.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import sys
+import numpy as np
+
+
+def main():
+    print("Compare given numpy array in npz files")
+    if len(sys.argv) != 4:
+        print("Usage: python compare_npy.py <npz_file_1> <npz_file_2> <tensor_name>")
+        return
+
+    in1 = np.load(sys.argv[1])
+    in2 = np.load(sys.argv[2])
+
+    print(sys.argv[1] + "->" + sys.argv[3] + ":", in1[sys.argv[3]].shape)
+    print(sys.argv[2] + "->" + sys.argv[3] + ":", in2[sys.argv[3]].shape)
+
+    np.testing.assert_allclose(in1[sys.argv[3]], in2[sys.argv[3]], rtol=1e-5, atol=1e-5)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
index 785959ce8dd7..b0bd46c123b7 100644
--- a/python/tvm/contrib/debugger/debug_executor.py
+++ b/python/tvm/contrib/debugger/debug_executor.py
@@ -17,6 +17,7 @@
 """Graph debug runtime executes TVM debug packed functions."""

 import logging
+import json
 import os
 import shutil
 import struct
@@ -117,6 +118,7 @@ def __init__(self, module, device, graph_json_str, dump_root):
         self._run_individual_node = module["run_individual_node"]
         self._debug_get_output = module["debug_get_output"]
         self._execute_node = module["execute_node"]
+        self._debug_run_ext_compiler = module["debug_run_ext_compiler"]
         self._get_node_output = module["get_node_output"]
         self._profile = module["profile"]
         self._profile_rpc = module["profile_rpc"]
@@ -223,6 +225,14 @@ def _run_per_layer(self):
                 output_tensors.append(self._get_node_output(i, j))
         self.debug_datum.update_output_tensors(output_tensors)

+    def _run_external_debug(self):
+        ext_trace = self._debug_run_ext_compiler()
+        ext_json = json.loads(ext_trace)
+        for op in ext_json:
+            ext_debug = tvm.get_global_func("runtime.ext.debug." + op["compiler"], True)
+ op["compiler"], True) + if isinstance(ext_debug, tvm.runtime.packed_func.PackedFunc): + ext_debug(op["op"], op["dump"], self._dump_path) + def _run_debug( self, number, @@ -249,6 +259,9 @@ def _run_debug( # Get outputs. self._run_per_layer() + # Run external compiler debug if supported + self._run_external_debug() + def debug_get_output(self, node, out=None): """Run graph up to node and get the output to out diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py index 45caf41e7e58..946afd8a0be3 100644 --- a/python/tvm/contrib/debugger/debug_result.py +++ b/python/tvm/contrib/debugger/debug_result.py @@ -150,6 +150,10 @@ def dump_output_tensor(self): self._cleanup_tensors() output_tensors = self.get_output_tensors() + np_tensors = {} + for key, val in output_tensors.items(): + np_tensors[key] = val.asnumpy() + np.savez(os.path.join(self._dump_path, "output_tensors.npz"), **np_tensors) with open(os.path.join(self._dump_path, "output_tensors.params"), "wb") as param_f: param_f.write(save_tensors(output_tensors)) diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py index 1394936b0a57..4c47a56147b6 100644 --- a/python/tvm/driver/tvmc/runner.py +++ b/python/tvm/driver/tvmc/runner.py @@ -91,6 +91,12 @@ def add_run_parser(subparsers, main_parser, json_params): # pylint: disable=unu "Profiling may also have an impact on inference time, " "making it take longer to be generated.", ) + parser.add_argument( + "--profile-options", + default="table,sort,aggregate,col_sums", + help="Additional options for profiling. Table dump is default" + "comma seperated string of table,csv,json,sort,aggregate,col_sums", + ) parser.add_argument("-v", "--verbose", action="count", default=0, help="increase verbosity.") parser.add_argument( "--end-to-end", @@ -170,6 +176,7 @@ def drive_run(args): repeat=args.repeat, number=args.number, profile=args.profile, + profile_options=args.profile_options, end_to_end=args.end_to_end, ) @@ -359,6 +366,7 @@ def run_module( repeat: int = 10, number: int = 10, profile: bool = False, + profile_options: str = "table,sort,aggregate,col_sums", end_to_end: bool = False, ): """Run a compiled graph executor module locally or remotely with @@ -398,6 +406,8 @@ def run_module( Requires `benchmark` to be set to True. profile : bool Whether to profile the run with the debug executor. + profile_options : string + Additional options for profiling end_to_end : bool Whether to measure the time of memory copies as well as model execution. Turning this on can provide a more realistic estimate @@ -533,7 +543,15 @@ def run_module( logger.info("Running the module with profiling enabled.") report = module.profile() # This print is intentional - print(report) + if profile_options.find("table") != -1: + is_sort = profile_options.find("sort") != -1 + is_aggr = profile_options.find("aggregate") != -1 + is_sum = profile_options.find("col_sums") != -1 + print(report.table(sort=is_sort, aggregate=is_aggr, col_sums=is_sum)) + if profile_options.find("csv") != -1: + print(report.csv()) + if profile_options.find("json") != -1: + print(report.json()) if not benchmark or device == "micro": # TODO(gromero): Fix time_evaluator() for micro targets. 
diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py
index dace7aaab913..6ee303891cd3 100644
--- a/python/tvm/relay/op/contrib/clml.py
+++ b/python/tvm/relay/op/contrib/clml.py
@@ -17,6 +17,7 @@
 # pylint: disable=invalid-name, unused-argument, pointless-exception-statement.
 """CLML Library supported operators."""
 import json
+import os
 from string import Template
 import numpy as np
 import tvm
@@ -29,6 +30,7 @@
 from tvm.relay import function as _function
 from tvm.relay.expr_functor import ExprMutator
 from tvm.relay.expr import Call, TupleGetItem, Var, Constant
+from tvm.relay.backend.executor_factory import GraphExecutorFactoryModule
 from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item, is_tuple
 from .register import register_pattern_table

@@ -159,6 +161,13 @@ def partition_for_clml(mod, params=None, **opts):
     if params:
         mod["main"] = bind_params_by_name(mod["main"], params)

+    pass_context = tvm.get_global_func("transform.GetCurrentPassContext")()
+    target_version = (
+        pass_context.config["relay.ext.clml.target_version"]
+        if "relay.ext.clml.target_version" in pass_context.config
+        else 3
+    )
+
     seq = tvm.transform.Sequential(
         [
             transform.InferType(),
@@ -631,18 +640,35 @@ def __exit__(self, ptype, value, trace):
         self.op.set_attr(self.attr_key, self.older_attr)


+@register_func("runtime.ext.debug.clml")
+def process_debug(op, dump, dump_path):
+    """Dump the required debug information in given path"""
+    dump_json = json.loads(dump)
+
+    graph_json = json.loads(dump_json["graph"])
+    with open(os.path.join(dump_path, op + ".json"), "w") as outfile:
+        json.dump(graph_json, outfile, indent=4, sort_keys=False)
+
+    hex_tensors = dump_json["tensors"]
+    fload = tvm._ffi.get_global_func("runtime.LoadParams")
+    tensor_map = fload(bytearray.fromhex(hex_tensors))
+    np_tensors = {}
+    for key, val in tensor_map.items():
+        np_tensors[key] = val.asnumpy()
+    np.savez(os.path.join(dump_path, op + ".npz"), **np_tensors)
+
+
 class CLMLGetSubModuleSrc:
     """Generates CLML API source for one CLML sub module out of the global TVM module"""

-    def __init__(self, cmod):
+    def __init__(self, codegen):
         """Initialize
         Parameters
         ----------
-        cmod : Module
-            The CLML sub module from TVM module
+        codegen : JSON
+            The CLML sub module as JSON
         """
-        self.cmod = cmod
-        self.codegen = None
+        self.codegen = codegen
         self.nodes = None
         self.node_map = {}
         self.input_meta = []
@@ -833,7 +859,6 @@ def __init__(self, cmod):

     def get_src(self):
         """Returns pair of sub module name and the generated source"""
-        self.codegen = json.loads(self.cmod.get_source("json"))
         self.sub_module_name = self.codegen["symbol"]
         self.nodes = self.codegen["nodes"]
         self.clml_code.append(self.MakeHeader.substitute(module=self.sub_module_name))
@@ -848,7 +873,7 @@ def get_tensor_from_map(
             dtype = str(node["attrs"]["dtype"][0][0])
             if node["op"] == "input":
                 self.clml_code.append("// Input Node")
-                node_out_name = self.sub_module_name + "_" + "input_" + str(node_seq)
+                node_out_name = node["name"]
             else:
                 node_out_name = node["name"]
             if shape is None:
@@ -1267,6 +1292,53 @@ def make_output_tensor(
         return (self.sub_module_name, self.clml_code)


+HEADER_STR = """
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one
+     * or more contributor license agreements.  See the NOTICE file
+     * distributed with this work for additional information
+     * regarding copyright ownership.  The ASF licenses this file
+     * to you under the Apache License, Version 2.0 (the
+     * "License"); you may not use this file except in compliance
+     * with the License.  You may obtain a copy of the License at
+     *
+     *   http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing,
+     * software distributed under the License is distributed on an
+     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+     * KIND, either express or implied.  See the License for the
+     * specific language governing permissions and limitations
+     * under the License.
+     */
+
+    /*!
+     * \\file clml_models.cc
+     * \\brief CLML models for all subgraph in given TVM module.
+     */
+
+    // AUTO GENERATED BY TOOL (clml_codegen.py), PLEASE DO NOT CHANGE THIS FILE!
+    // =========================================================================
+
+    #include <iostream>
+    #include <fstream>
+
+    #include <map>
+    #include <memory>
+    #include <string>
+    #include <utility>
+    #include <vector>
+
+    // Project includes
+    #include "CL/cl.h"
+    #include "CL/cl_qcom_ml_ops.h"
+
+    #include "clml_runner.h"
+
+    using namespace tvm::runtime;
+"""
+
+
 class CLMLGenSrc:
     """Generates CLML API source given a TVM compiled mod"""

@@ -1274,8 +1346,7 @@ def __init__(self, libm):
         """Initialize
         Parameters
         ----------
-        libm : Module
-            Compiled relay module
+        libm : Module or json codegen object
         """
         self.libm = libm
         self.gen_src = []
@@ -1284,55 +1355,12 @@ def __init__(self, libm):
         self.codegen = None
         self.nodes = None

-        self.MakeFileHeader = Template(
-            """/*
-             * Licensed to the Apache Software Foundation (ASF) under one
-             * or more contributor license agreements.  See the NOTICE file
-             * distributed with this work for additional information
-             * regarding copyright ownership.  The ASF licenses this file
-             * to you under the Apache License, Version 2.0 (the
-             * "License"); you may not use this file except in compliance
-             * with the License.  You may obtain a copy of the License at
-             *
-             *   http://www.apache.org/licenses/LICENSE-2.0
-             *
-             * Unless required by applicable law or agreed to in writing,
-             * software distributed under the License is distributed on an
-             * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-             * KIND, either express or implied.  See the License for the
-             * specific language governing permissions and limitations
-             * under the License.
-             */
-
-            /*!
-             * \\file clml_models.cc
-             * \\brief CLML models for all subgraph in given TVM module.
-             */
-
-            // AUTO GENERATED BY TOOL (clml_codegen.py), PLEASE DO NOT CHANGE THIS FILE!
-            // =========================================================================
-
-            #include <iostream>
-            #include <fstream>
-
-            #include <map>
-            #include <memory>
-            #include <string>
-            #include <utility>
-            #include <vector>
-
-            // Project includes
-            #include "CL/cl.h"
-            #include "CL/cl_qcom_ml_ops.h"
-
-            #include "clml_runner.h"
-
-            using namespace tvm::runtime;
-            """
-        )
+        self.MakeFileHeader = Template(HEADER_STR)

     def get_clml_params(self):
         """Returns parameters from the TVM module"""
+        if not isinstance(self.libm, GraphExecutorFactoryModule):
+            return {}
         clml_params = {}

         if self.libm.get_lib().type_key == "const_loader":
@@ -1353,14 +1381,21 @@ def get_clml_params(self):

     def get_artifacts(self):
         """Function that returns params as dict and source as list of source code lines"""
-        self.clml_modules = list(
-            filter(lambda mod: mod.type_key == "clml", self.libm.get_lib().imported_modules)
-        )
         self.clml_builds["file_header"] = [self.MakeFileHeader.substitute()]
+        if isinstance(self.libm, GraphExecutorFactoryModule):
+            self.clml_modules = list(
+                filter(lambda mod: mod.type_key == "clml", self.libm.get_lib().imported_modules)
+            )

-        for cmod in self.clml_modules:
-            (sub_module_name, clml_code) = CLMLGetSubModuleSrc(cmod).get_src()
+            for cmod in self.clml_modules:
+                codegen = json.loads(cmod.get_source("json"))
+                (sub_module_name, clml_code) = CLMLGetSubModuleSrc(codegen).get_src()
+                self.clml_builds[sub_module_name] = clml_code
+        elif isinstance(self.libm, dict):
+            (sub_module_name, clml_code) = CLMLGetSubModuleSrc(self.libm).get_src()
             self.clml_builds[sub_module_name] = clml_code
+        else:
+            raise Exception("Don't know how to handle the input")

         main_code = []
         main_code.append(
diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc
index c580123b1347..d8c0075fcdc1 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -23,11 +23,15 @@
  */
 #include "clml_runtime.h"

+#include <dmlc/json.h>
+
 #ifdef TVM_GRAPH_EXECUTOR_CLML
 #include "clml_memory_planner.h"
 #include "clml_utils.h"
 #endif

+#include <iomanip>
+
 namespace tvm {
 namespace runtime {
 namespace contrib {
@@ -60,23 +64,28 @@ CLMLWorkspace::CLMLWorkspace() {
   result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, reqd_size, extn_buf.data(), nullptr);
   ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo:" << result;
   std::string extensions(extn_buf.data());
-  LOG(WARNING) << "OpenCL Extensions:" << extensions;
+  LOG_CLML << "OpenCL Extensions:" << extensions;

   if (extensions.find("cl_qcom_ml_ops") == std::string::npos) {
     LOG(FATAL) << "CLML Runtime Init: Qualcomm extn not present.\n";
     return;
   }
-  is_recordable_queue = (extensions.find("cl_qcom_recordable_queues") != std::string::npos);
-  is_on_chip_memory = (extensions.find("cl_qcom_onchip_global_memory") != std::string::npos);
-  LOG(WARNING) << "Recordable Queues Support :" << is_recordable_queue;
-  LOG(WARNING) << "On chip Memory Support :" << is_on_chip_memory;
+  if (getenv("CLML_DISABLE_RECORDABLE_QUEUE")) {
+    is_recordable_queue = 0;
+    is_on_chip_memory = 0;
+  } else {
+    is_recordable_queue = (extensions.find("cl_qcom_recordable_queues") != std::string::npos);
+    is_on_chip_memory = (extensions.find("cl_qcom_onchip_global_memory") != std::string::npos);
+    LOG_CLML << "Recordable Queues Support :" << is_recordable_queue;
+    LOG_CLML << "On chip Memory Support :" << is_on_chip_memory;
+  }

   if (is_on_chip_memory) {
     result = clGetDeviceInfo(device_id, CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM,
                              sizeof(onchip_mem_size), &onchip_mem_size, nullptr);
     ICHECK(result == CL_SUCCESS)
"clGetDeviceInfo(CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM):" << result; - LOG(WARNING) << "On chip memory size:" << onchip_mem_size; + LOG_CLML << "On chip memory size:" << onchip_mem_size; } // Query and Get CLML Interface @@ -106,10 +115,6 @@ CLMLWorkspace::CLMLWorkspace() { target_minor = 0; } - // ICHECK(target_minor <= CL_QCOM_ML_OPS_H_MINOR_VERSION) - // << "CLML runtime compiled with minor version " << CL_QCOM_ML_OPS_H_MINOR_VERSION - // << " where as the target supports higher version " << target_minor; - clGetMLInterfaceQCOM(&h_ClmlIntf, target_major, target_minor); ICHECK(nullptr != h_ClmlIntf) << "Couldn't get API interface, target is not supported." @@ -257,6 +262,167 @@ class CLMLRuntime : public JSONRuntimeBase { } } + std::string DebugDump(void) override { + if (cws->is_recordable_queue) { + LOG(FATAL) << "Debugging over recordable queues is not supported yet. You may disable the " + "same by exporting CLML_DISABLE_RECORDABLE_QUEUE at runtime."; + } + cl_command_queue queue = CLML_QUEUE; + Map dump_tensors; + std::ostringstream os; + dmlc::JSONWriter writer(&os); + writer.BeginObject(); + + writer.WriteObjectKeyValue("graph", graph_json_); + + int op_index = 0; + for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) { + int nid = it->first; + auto clml_desc = it->second.first; + auto node = it->second.second; + + if ("kernel" == node.GetOpType()) { + CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[op_index], + this->layer_.descriptorSet, 0, nullptr, nullptr); + OPENCL_CALL(clFinish(queue)); + op_index++; + } + + // Dump tensor to CPU + std::vector shape = node.GetOpShape()[0]; + DLDataType tvm_dtype = node.GetOpDataType()[0]; + NDArray narr = NDArray::Empty(ShapeTuple(shape), tvm_dtype, {kDLCPU, 0}); + CopyDataFromCLMLTensor(clml_desc, narr.operator->()->data); + + // Naming convention + std::string node_name; + bool is_out = false; + for (size_t i = 0; i < outputs_.size(); ++i) { + uint32_t eid = EntryID(outputs_[i]); + is_out = (eid == nid); + } + if (is_out) { + node_name = clml_symbol + "_layer_out_" + std::to_string(nid); + } else if (("const" == node.GetOpType()) || ("input" == node.GetOpType())) { + node_name = node.GetOpName(); + } else { + node_name = node.GetOpName() + "____topo-index:" + std::to_string(nid); + } + dump_tensors.Set(node_name, narr); + } + + const PackedFunc* f = Registry::Get("runtime.SaveParams"); + if (nullptr != f) { + std::string dump_bytes = (*f)(dump_tensors); + std::ostringstream oss; + /*TODO(Siva) HEX encoding doubles the size, look for better encode that can cross the RPC. 
+      for (size_t i = 0; i < dump_bytes.size(); ++i) {
+        oss << std::setw(2) << std::setfill('0') << std::hex
+            << static_cast<int>(static_cast<uint8_t>(dump_bytes[i]));
+      }
+      writer.WriteObjectKeyValue("tensors", oss.str());
+    }
+
+    writer.EndObject();
+    return os.str();
+  }
+
+  void RunProfile(profiling::Profiler* prof) override {
+    cl_command_queue queue = CLML_QUEUE;
+    std::vector<cl_event>& evts = cws->workspace->GetEventQueue(cws->tentry->device);
+
+    for (size_t i = 0; i < input_nodes_.size(); ++i) {
+      auto nid = input_nodes_[i];
+      uint32_t eid = EntryID(nid, 0);
+      if (nodes_[nid].GetOpType() == "input") {
+        // Assuming all inputs are from OpenCL
+        if (kDLOpenCL == data_entry_[eid]->device.device_type) {
+          layer_.in_placeholder[nid]->memory = static_cast<cl_mem>(
+              ((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
+          cl_event cpy_evt = nullptr;
+          cl_event* evt = &cpy_evt;
+          if (cws->workspace->IsProfiling(cws->tentry->device)) {
+            evts.resize(evts.size() + 1);
+            evt = &(evts.back());
+          }
+          std::unordered_map<std::string, ObjectRef> metrics;
+          std::string shape_str;
+          std::vector<int64_t> shape = nodes_[nid].GetOpShape()[0];
+          DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
+          shape_str.append(profiling::ShapeString(shape, tvm_dtype));
+          metrics["Argument Shapes"] = String(shape_str);
+
+          prof->StartCall("CopyIn", cws->tentry->device, metrics);
+          CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.in_placeholder[nid]->tensor,
+                    layer_.in_placeholder[nid]->memory, layer_.inputs[nid]->tensor,
+                    layer_.inputs[nid]->memory, 0, nullptr, evt);
+          prof->StopCall();
+        }
+      }
+    }
+
+    for (size_t i = 0; i < this->layer_.function.size(); ++i) {
+      std::unordered_map<std::string, ObjectRef> metrics;
+      auto node = this->layer_.op_node_map[this->layer_.function[i]].second;
+      std::string shape_str;
+      for (uint32_t j = 0; j < node.GetInputs().size(); ++j) {
+        const JSONGraphNode in_node = nodes_[node.GetInputs()[j].id_];
+        std::vector<int64_t> shape = in_node.GetOpShape()[0];
+        DLDataType tvm_dtype = in_node.GetOpDataType()[0];
+        shape_str.append(profiling::ShapeString(shape, tvm_dtype));
+        shape_str.append(", ");
+      }
+      // Assuming one output per operation
+      std::vector<int64_t> shape = node.GetOpShape()[0];
+      DLDataType tvm_dtype = node.GetOpDataType()[0];
+      shape_str.append(profiling::ShapeString(shape, tvm_dtype));
+      metrics["Argument Shapes"] = String(shape_str);
+
+      // Launch call
+      prof->StartCall(clml_symbol + "-" + this->layer_.layer_names[i], cws->tentry->device,
+                      metrics);
+      queue = CLML_QUEUE;
+      evts.resize(evts.size() + 1);
+      cl_event* evt = &(evts.back());
+      CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet, 0,
+                nullptr, evt);
+      prof->StopCall();
+    }
+
+    for (size_t i = 0; i < outputs_.size(); ++i) {
+      uint32_t eid = EntryID(outputs_[i]);
+
+      // Assuming all outputs are to OpenCL
+      if (kDLOpenCL == data_entry_[eid]->device.device_type) {
+        layer_.out_placeholder[i]->memory = static_cast<cl_mem>(
+            ((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
+        cl_event cpy_evt = nullptr;
+        cl_event* evt = &cpy_evt;
+        if (cws->workspace->IsProfiling(cws->tentry->device)) {
+          evts.resize(evts.size() + 1);
+          evt = &(evts.back());
+        }
+
+        std::unordered_map<std::string, ObjectRef> metrics;
+        std::string shape_str;
+        std::vector<int64_t> shape = nodes_[eid].GetOpShape()[0];
+        DLDataType tvm_dtype = nodes_[eid].GetOpDataType()[0];
+        shape_str.append(profiling::ShapeString(shape, tvm_dtype));
+        metrics["Argument Shapes"] = String(shape_str);
+
+        prof->StartCall("CopyOut", cws->tentry->device, metrics);
+        CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.outputs[i]->tensor,
+                  layer_.outputs[i]->memory, layer_.out_placeholder[i]->tensor,
+                  layer_.out_placeholder[i]->memory, 0, nullptr, evt);
+        prof->StopCall();
+      }
+    }
+
+    return;
+  }
+
   /*!
    * \brief Unpack inputs and outputs and run inference on a given layer.
    *
@@ -305,7 +471,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     int64_t duration = 0;
     if (cws->is_recordable_queue) {
-      if (getenv("CLML_PROFILING")) {
+      if (cws->workspace->IsProfiling(cws->tentry->device)) {
         Timer t;
         auto f = Registry::Get(std::string("profiling.timer.opencl"));
         t = f->operator()(cws->tentry->device);
@@ -324,7 +490,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     } else {
       for (size_t i = 0; i < this->layer_.function.size(); ++i) {
         // Make CLML subgraphs accounted by OpenCLTimerNode.
-        if (getenv("CLML_PROFILING")) {
+        if (cws->workspace->IsProfiling(cws->tentry->device)) {
           Timer t;
           auto f = Registry::Get(std::string("profiling.timer.opencl"));
           t = f->operator()(cws->tentry->device);
@@ -336,16 +502,16 @@ class CLMLRuntime : public JSONRuntimeBase {
                     0, nullptr, evt);
           t->Stop();
           duration += t->SyncAndGetElapsedNanos();
-          LOG(WARNING) << "Layer:" << this->layer_.layer_names[i]
-                       << " Duration:" << t->SyncAndGetElapsedNanos();
+          LOG_CLML << "Layer:" << this->layer_.layer_names[i]
+                   << " Duration:" << t->SyncAndGetElapsedNanos();
         } else {
           CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet,
                     0, nullptr, nullptr);
         }
       }
     }
-    if (getenv("CLML_PROFILING")) {
-      LOG(WARNING) << "Total Duration for " << clml_symbol << " is:" << duration;
+    if (cws->workspace->IsProfiling(cws->tentry->device)) {
+      LOG_CLML << "Total Duration for " << clml_symbol << " is:" << duration;
     }

     for (size_t i = 0; i < outputs_.size(); ++i) {
@@ -616,6 +782,8 @@ class CLMLRuntime : public JSONRuntimeBase {
         else
           LOG(FATAL) << "Unsupported op: " << op_name;
         this->layer_.layer_names.push_back(op_name);
+        // Keep map of function and Node to use in profiling
+        this->layer_.op_node_map.insert({this->layer_.function.back(), std::make_pair(nid, node)});
       } else if (node.GetOpType() != "const") {
         LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType();
       }
@@ -710,11 +878,11 @@ class CLMLRuntime : public JSONRuntimeBase {
                                             this->layer_.tensorMemDescs.data());

     if (cws->is_tuning_run) {
-      LOG(WARNING) << "CLML Tunning In Progress:";
+      LOG_CLML << "CLML Tuning In Progress:";
       // Let the command queue be recreated in profiling mode.
      cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, true);
       for (size_t i = 0; i < this->layer_.function.size(); ++i) {
-        LOG(WARNING) << "CLML Tunning:" << this->layer_.layer_names[i];
+        LOG_CLML << "CLML Tuning:" << this->layer_.layer_names[i];
         CLML_CALL(clTuneMLOpQCOM, CLML_QUEUE, this->layer_.function[i], this->layer_.descriptorSet,
                   this->layer_.tuning_cache, nullptr);
       }
@@ -741,8 +909,8 @@ class CLMLRuntime : public JSONRuntimeBase {
       std::ofstream fs(cws->tuning_file, std::ios::app | std::ios::binary);
       ICHECK(!fs.fail()) << "Cannot open " << cws->tuning_file;
       fs.write(&tune_str[0], tune_str.length());
-      LOG(WARNING) << "CLML: Tuning cache dumped to:" << cws->tuning_file << " size"
-                   << tune_str.length() << " with tuning blob len " << saved_cache.size();
+      LOG_CLML << "CLML: Tuning cache dumped to:" << cws->tuning_file << " size "
+               << tune_str.length() << " with tuning blob len " << saved_cache.size();
     }
     if (cws->is_recordable_queue) {
       for (size_t i = 0; i < this->layer_.function.size(); ++i) {
@@ -1591,6 +1759,8 @@ class CLMLRuntime : public JSONRuntimeBase {
           << "Please build with USE_CLML_GRAPH_EXECUTOR.";
   }
 #endif
+  bool CanDebug() override { return true; }
+
   /*! CLML sub graph symbol in TVM main module */
   std::string clml_symbol;
 };
diff --git a/src/runtime/contrib/clml/clml_runtime.h b/src/runtime/contrib/clml/clml_runtime.h
index f346ce7af696..9dfde2f7820d 100644
--- a/src/runtime/contrib/clml/clml_runtime.h
+++ b/src/runtime/contrib/clml/clml_runtime.h
@@ -164,8 +164,10 @@ static const uint64_t kTVMCLMLTuningCacheMagic = 0x434C4D4C54554E45;

 #define DEBUG_MEMORY_ALLOC false
 #define DEBUG_STATS false
+#define DEBUG_CLML false
 #define LOG_MEM LOG_IF(WARNING, DEBUG_MEMORY_ALLOC)
 #define LOG_STATS LOG_IF(WARNING, DEBUG_STATS)
+#define LOG_CLML LOG_IF(WARNING, DEBUG_CLML)

 namespace tvm {
 namespace runtime {
@@ -235,6 +237,8 @@ class CLMLThreadEntry {
 struct CachedLayer {
   /* List of all created CLML operation handles in graph */
   std::vector<cl_ml_op_qcom> function;
+  /* Map of function and original JsonNode */
+  std::map<cl_ml_op_qcom, std::pair<int, JSONGraphNode>> op_node_map;
   /* The input tensor map */
   std::map<int, std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> inputs;
   /* A place holder Tensor representing TVM NDArray as CLML Tensor */
diff --git a/src/runtime/contrib/json/json_runtime.h b/src/runtime/contrib/json/json_runtime.h
index 8eec0447a189..8e105dab7837 100644
--- a/src/runtime/contrib/json/json_runtime.h
+++ b/src/runtime/contrib/json/json_runtime.h
@@ -27,6 +27,7 @@

 #include <tvm/runtime/module.h>
 #include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/profiling.h>

 #include <string>
 #include <vector>
@@ -69,6 +70,25 @@ class JSONRuntimeBase : public ModuleNode {
   /*! \brief Invoke the execution engine to interpret a specific json runtime. */
   virtual void Run() = 0;

+  /*! \brief Does the backend support debug & profiling */
+  virtual bool CanDebug() { return false; }
+
+  /*!
+   * \brief Invoke the profiler
+   * \param prof Pointer to the profiler
+   */
+  virtual void RunProfile(profiling::Profiler* prof) {
+    LOG(FATAL) << "Not expected to be here: profiling invoked without backend support";
+  }
+
+  /*!
+   * \brief Invoke the debugger
+   * \return External compiler specific debug blob
+   */
+  virtual std::string DebugDump(void) {
+    LOG(FATAL) << "Not expected to be here: debug dump invoked without backend support";
+  }
+
   /*!
    * \brief Get a packed function.
    * \param name The name/symbol of the function.
@@ -88,9 +108,32 @@

       // Bind argument tensors to data entries.
       this->SetInputOutputBuffers(args);

+      // Execute the subgraph.
      this->Run();
     });
+  } else if (this->symbol_name_ + "_debug" == name) {
+    if (!this->CanDebug()) {
+      return PackedFunc(nullptr);
+    }
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      ICHECK(this->initialized_) << "The module has not been initialized";
+
+      // Bind argument tensors to data entries.
+      this->SetInputOutputBuffers(args);
+
+      if (rv->IsObjectRef<String>()) {
+        String purpose = *rv;
+        if ("debug_dump" == purpose) {
+          *rv = this->DebugDump();
+        }
+      } else {
+        // Profile the subgraph.
+        profiling::Profiler* prof = static_cast<profiling::Profiler*>(rv->value().v_handle);
+        this->RunProfile(prof);
+      }
+    });
   } else if ("__init_" + this->symbol_name_ == name) {
     // The function to initialize constant tensors.
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc
index 892a13b46bb4..a9cd4d544d3b 100644
--- a/src/runtime/graph_executor/debug/graph_executor_debug.cc
+++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc
@@ -213,6 +213,9 @@ PackedFunc GraphExecutorDebug::GetFunction(const String& name,
   } else if (name == "execute_node") {
     return PackedFunc(
         [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->ExecuteNode(args[0]); });
+  } else if (name == "debug_run_ext_compiler") {
+    return PackedFunc(
+        [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->DebugRunExtCompiler(); });
   } else if (name == "get_node_output") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
       *rv = this->GetNodeOutput(args[0], args[1]);
@@ -320,6 +323,31 @@ void GraphExecutorDebug::ExecuteNode(int node) {
   last_executed_node_ = end_ind;
 }

+std::string GraphExecutorDebug::DebugRunExtCompiler(void) {
+  std::ostringstream os;
+  dmlc::JSONWriter writer(&os);
+  writer.BeginArray();
+  for (size_t i = 0; i < op_execs_.size(); ++i) {
+    if (!nodes_[i].param.compiler.empty() && op_profile_execs_[i]) {
+      TVMRetValue rv;
+      rv = String("debug_dump");
+      this->op_profile_execs_[i](&rv);
+      std::string debug_ret = rv;
+
+      writer.BeginObject();
+      writer.WriteObjectKeyValue("compiler", nodes_[i].param.compiler);
+      writer.WriteObjectKeyValue("op", nodes_[i].param.func_name);
+      writer.WriteObjectKeyValue("dump", debug_ret);
+      writer.EndObject();
+    } else {
+      if (op_execs_[i]) op_execs_[i]();
+    }
+  }
+  writer.EndArray();
+
+  return os.str();
+}
+
 void GraphExecutorDebug::DebugGetNodeOutput(int index, DLTensor* data_out) {
   ICHECK_LT(static_cast<size_t>(index), op_execs_.size());
   uint32_t eid = index;
@@ -386,9 +414,15 @@ profiling::Report GraphExecutorDebug::Profile(Array<profiling::MetricCollector>
       metrics["Hash"] = Downcast<String>(nodes_[i].param.attrs.at("hash"));
     }
     metrics["Argument Shapes"] = profiling::ShapeString(shapes);
-    prof.StartCall(nodes_[i].param.func_name, dev, metrics);
-    op_execs_[i]();
-    prof.StopCall();
+    if (!nodes_[i].param.compiler.empty() && op_profile_execs_[i]) {
+      TVMRetValue rv;
+      rv = static_cast<void*>(&prof);
+      this->op_profile_execs_[i](&rv);
+    } else {
+      prof.StartCall(nodes_[i].param.func_name, dev, metrics);
+      op_execs_[i]();
+      prof.StopCall();
+    }
     }
   }
   prof.Stop();
diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.h b/src/runtime/graph_executor/debug/graph_executor_debug.h
index 382083056604..8ede2a3a5f84 100644
--- a/src/runtime/graph_executor/debug/graph_executor_debug.h
+++ b/src/runtime/graph_executor/debug/graph_executor_debug.h
@@ -98,6 +98,15 @@ class GraphExecutorDebug : public GraphExecutor {
    */
   void ExecuteNode(int node);

+  /*!
+   * \brief Debug external compilers if supported.
+   *
+   * This method invokes the external compilers to generate any debug trace info.
+   *
+   * \return Returns serialized debug trace information to the caller
+   */
+  std::string DebugRunExtCompiler(void);
+
   /*!
    * \brief Returns index-th output of node.
    *
diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc
index 6324da9c27ef..1b1051322c49 100644
--- a/src/runtime/graph_executor/graph_executor.cc
+++ b/src/runtime/graph_executor/graph_executor.cc
@@ -503,6 +503,7 @@ void GraphExecutor::SetupStorage() {

 void GraphExecutor::SetupOpExecs() {
   op_execs_.resize(this->GetNumOfNodes());
+  op_profile_execs_.resize(this->GetNumOfNodes());
   input_dltensors_.resize(num_node_entries());
   output_dltensors_.resize(num_node_entries());
   both_output_opinput_dltensors_.resize(num_node_entries());
@@ -532,7 +533,7 @@ void GraphExecutor::SetupOpExecs() {
     ICHECK(inode.op_type == "tvm_op") << "Can only take tvm_op as op";

     std::shared_ptr<OpArgs> op_args = nullptr;
-    std::tie(op_execs_[nid], op_args) = CreateTVMOp(inode.param, args);
+    std::tie(op_execs_[nid], op_profile_execs_[nid], op_args) = CreateTVMOp(inode.param, args);

     for (size_t i = 0; i < inode.inputs.size(); i++) {
       uint32_t input_eid = this->entry_id(inode.inputs[i]);
@@ -581,8 +582,9 @@ void GraphExecutor::SetupOpExecs() {
   }
 }

-std::pair<std::function<void()>, std::shared_ptr<GraphExecutor::OpArgs>> GraphExecutor::CreateTVMOp(
-    const TVMOpParam& param, const std::vector<DLTensor>& args) {
+std::tuple<std::function<void()>, std::function<void(TVMRetValue*)>,
+           std::shared_ptr<GraphExecutor::OpArgs>>
+GraphExecutor::CreateTVMOp(const TVMOpParam& param, const std::vector<DLTensor>& args) {
   std::shared_ptr<GraphExecutor::OpArgs> arg_ptr = std::make_shared<GraphExecutor::OpArgs>();
   // setup address.
   arg_ptr->args = args;
@@ -604,7 +606,7 @@ std::pair<std::function<void()>, std::shared_ptr<GraphExecutor::OpArgs>> GraphEx
   }

   if (param.func_name == "__nop") {
-    return {[]() {}, arg_ptr};
+    return {[]() {}, [](TVMRetValue* rv) {}, arg_ptr};
   } else if (param.func_name == "__copy") {
     // Perform cross device data copy.
     // Directly copy data from the input to the output.
     auto fexec = [arg_ptr]() {
       DLTensor* from = static_cast<DLTensor*>(arg_ptr->arg_values[0].v_handle);
       DLTensor* to = static_cast<DLTensor*>(arg_ptr->arg_values[1].v_handle);
       TVM_CCALL(TVMArrayCopyFromTo(from, to, nullptr));
     };
-    return {fexec, arg_ptr};
+    return {fexec, [](TVMRetValue* rv) {}, arg_ptr};
   }

   // Get compiled function from the module that contains both host and device
   // code.
   tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, true);
   ICHECK(pf != nullptr) << "no such function in module: " << param.func_name;
-
   auto fexec = [arg_ptr, pf]() {
     TVMRetValue rv;
     TVMArgs targs(arg_ptr->arg_values.data(), arg_ptr->arg_tcodes.data(),
                   static_cast<int>(arg_ptr->arg_values.size()));
     pf.CallPacked(targs, &rv);
   };
-  return {fexec, arg_ptr};
+
+  pf = module_.GetFunction(param.func_name + "_debug", true);
+  std::function<void(TVMRetValue*)> fexec_profile = nullptr;
+  if (pf != nullptr) {
+    fexec_profile = [arg_ptr, pf](TVMRetValue* rv) {
+      TVMArgs targs(arg_ptr->arg_values.data(), arg_ptr->arg_tcodes.data(),
+                    static_cast<int>(arg_ptr->arg_values.size()));
+      pf.CallPacked(targs, rv);
+    };
+  }
+
+  return {fexec, fexec_profile, arg_ptr};
 }

 PackedFunc GraphExecutor::GetFunction(const String& name, const ObjectPtr<Object>& sptr_to_self) {
diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h
index 53e2801d574e..cfdba8916baa 100644
--- a/src/runtime/graph_executor/graph_executor.h
+++ b/src/runtime/graph_executor/graph_executor.h
@@ -56,6 +56,7 @@ using memory::MemoryManager;

 /*! \brief operator attributes about tvm op */
 struct TVMOpParam {
   std::string func_name;
+  std::string compiler;
   std::unordered_map<std::string, std::string> attrs;
   uint32_t num_inputs;
   uint32_t num_outputs;
@@ -272,6 +273,9 @@ class TVM_DLL GraphExecutor : public ModuleNode {
       if (key == "func_name") {
         param->func_name = value;
         bitmask |= 1;
+      }
+      if (key == "Compiler") {
+        param->compiler = value;
       } else if (key == "num_inputs") {
         param->num_inputs = strtoul(value.c_str(), nullptr, 10);
         bitmask |= 2;
@@ -440,8 +444,8 @@ class TVM_DLL GraphExecutor : public ModuleNode {
    * \param args The arguments to the functor, including inputs and outputs.
    * \return The created executor.
    */
-  std::pair<std::function<void()>, std::shared_ptr<OpArgs>> CreateTVMOp(
-      const TVMOpParam& attrs, const std::vector<DLTensor>& args);
+  std::tuple<std::function<void()>, std::function<void(TVMRetValue*)>, std::shared_ptr<OpArgs>>
+  CreateTVMOp(const TVMOpParam& attrs, const std::vector<DLTensor>& args);
   // Get node entry index.
   uint32_t entry_id(uint32_t nid, uint32_t index) const { return node_row_ptr_[nid] + index; }
   // Get node entry index.
@@ -486,6 +490,8 @@ class TVM_DLL GraphExecutor : public ModuleNode {
   std::vector<size_t> data_alignment_;
   /*! \brief Operator on each node. */
   std::vector<std::function<void()>> op_execs_;
+  /*! \brief Profilable Operator on each node. */
+  std::vector<std::function<void(TVMRetValue*)>> op_profile_execs_;
   /*! \brief Linked parameter lookup function. */
   PackedFunc lookup_linked_param_;
   /*! \brief Module's _lookup_linked_param function, used by DefaultLookupLinkedParam. */
diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index e0abd1841b64..2e9b05edcb58 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -514,7 +514,7 @@ class OpenCLTimerNode : public TimerNode {
         cl::OpenCLWorkspace::Global()->GetEventQueue(dev_).clear();
         // Very first call of Start() leads to the recreation of
         // OpenCL command queue in profiling mode. This allows to run profile after inference.
- recreateCommandQueue(); + cl::OpenCLWorkspace::Global()->EnableQueueProfiling(dev_, true); } ++count_timer_execs; // set new first idx in event queue @@ -549,7 +549,7 @@ class OpenCLTimerNode : public TimerNode { // Profiling session ends, recreate clCommandQueue in non-profiling mode // This will disable collection of cl_events in case of executing inference after profile if (count_timer_execs == 0) { - recreateCommandQueue(); + cl::OpenCLWorkspace::Global()->EnableQueueProfiling(dev_, false); event_start_idxs.clear(); } } @@ -565,11 +565,6 @@ class OpenCLTimerNode : public TimerNode { private: int64_t duration; Device dev_; - - void recreateCommandQueue() { - cl::OpenCLWorkspace::Global()->EnableQueueProfiling( - dev_, !cl::OpenCLWorkspace::Global()->IsProfiling(dev_)); - } }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 6a42d840b206..83be98556a9e 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -671,7 +671,7 @@ Report Profiler::Report() { rows.push_back(row); } - // the last couple of call frames are the overall times + // the last frames are the overall times double overall_time_us = 0; std::unordered_map> device_metrics; for (size_t i = 0; i < devs_.size(); i++) { @@ -776,7 +776,6 @@ Report Report::FromJSON(String json) { configuration = parse_metrics(&reader); } } - return Report(calls, device_metrics, configuration); } diff --git a/tests/scripts/setup-adreno-env.sh b/tests/scripts/setup-adreno-env.sh index b0c3559bf081..a35a633e1dfd 100755 --- a/tests/scripts/setup-adreno-env.sh +++ b/tests/scripts/setup-adreno-env.sh @@ -112,7 +112,7 @@ case ${ENVIRONMENT} in adb forward tcp:$((LISTEN_PORT + 1)) tcp:$((LISTEN_PORT + 1)) adb forward tcp:$((LISTEN_PORT + 2)) tcp:$((LISTEN_PORT + 2)) adb forward tcp:$((LISTEN_PORT + 3)) tcp:$((LISTEN_PORT + 3)) - adb shell "cd ${TARGET_FOLDER}; killall -9 tvm_rpc-${USER}; sleep 2; export CLML_PROFILING=1; export CLML_IS_TUNING_RUN=1; export CLML_TUNING_CACHE=clml.bin; LD_LIBRARY_PATH=${TARGET_FOLDER}/ ./tvm_rpc-${USER} server --host=0.0.0.0 --port=${LISTEN_PORT} --port-end=$((LISTEN_PORT + 10)) --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=${RPC_DEVICE_KEY}" + adb shell "cd ${TARGET_FOLDER}; killall -9 tvm_rpc-${USER}; sleep 2; export CLML_DISABLE_RECORDABLE_QUEUE=1; export CLML_IS_TUNING_RUN=1; export CLML_TUNING_CACHE=clml.bin; LD_LIBRARY_PATH=${TARGET_FOLDER}/ ./tvm_rpc-${USER} server --host=0.0.0.0 --port=${LISTEN_PORT} --port-end=$((LISTEN_PORT + 10)) --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=${RPC_DEVICE_KEY}" ;; "query") From 0b621c7f910fa1a5c0a00f3fce600376eee87fdf Mon Sep 17 00:00:00 2001 From: Siva Date: Tue, 28 Jan 2025 15:39:51 +0530 Subject: [PATCH 2/2] lint --- python/tvm/relay/op/contrib/clml.py | 7 ------- .../test_hexagon/test_relax_2d_buffer_allocation.py | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py index 6ee303891cd3..3f79acbdc8f1 100644 --- a/python/tvm/relay/op/contrib/clml.py +++ b/python/tvm/relay/op/contrib/clml.py @@ -161,13 +161,6 @@ def partition_for_clml(mod, params=None, **opts): if params: mod["main"] = bind_params_by_name(mod["main"], params) - pass_context = tvm.get_global_func("transform.GetCurrentPassContext")() - target_version = ( - pass_context.config["relay.ext.clml.target_version"] - if "relay.ext.clml.target_version" in pass_context.config - else 3 - ) - seq = tvm.transform.Sequential( [ transform.InferType(), diff --git 
a/tests/python/contrib/test_hexagon/test_relax_2d_buffer_allocation.py b/tests/python/contrib/test_hexagon/test_relax_2d_buffer_allocation.py index 6eaa1179ba17..62ece5c6b88c 100644 --- a/tests/python/contrib/test_hexagon/test_relax_2d_buffer_allocation.py +++ b/tests/python/contrib/test_hexagon/test_relax_2d_buffer_allocation.py @@ -17,6 +17,7 @@ """Relax hexagon 2d VTCM allocation test.""" import numpy as np +import pytest import tvm import tvm.contrib.hexagon @@ -25,7 +26,6 @@ from tvm.script import ir as I from tvm.script import relax as R from tvm.script import tir as T -import pytest # pylint: disable=missing-docstring,no-self-argument,invalid-name